sha512block_ppc64x.s raw

   1  // Copyright 2016 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  // Based on CRYPTOGAMS code with the following comment:
   6  // # ====================================================================
   7  // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   8  // # project. The module is, however, dual licensed under OpenSSL and
   9  // # CRYPTOGAMS licenses depending on where you obtain it. For further
  10  // # details see http://www.openssl.org/~appro/cryptogams/.
  11  // # ====================================================================
  12  
  13  //go:build (ppc64 || ppc64le) && !purego
  14  
  15  #include "textflag.h"
  16  
  17  // SHA512 block routine. See sha512block.go for Go equivalent.
  18  //
  19  // The algorithm is detailed in FIPS 180-4:
  20  //
  21  //  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
  22  //
  23  // Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
  25  //
  26  // a = H0
  27  // b = H1
  28  // c = H2
  29  // d = H3
  30  // e = H4
  31  // f = H5
  32  // g = H6
  33  // h = H7
  34  //
  35  // for t = 0 to 79 {
  36  //    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
  37  //    T2 = BIGSIGMA0(a) + Maj(a,b,c)
  38  //    h = g
  39  //    g = f
  40  //    f = e
  41  //    e = d + T1
  42  //    d = c
  43  //    c = b
  44  //    b = a
  45  //    a = T1 + T2
  46  // }
  47  //
  48  // H0 = a + H0
  49  // H1 = b + H1
  50  // H2 = c + H2
  51  // H3 = d + H3
  52  // H4 = e + H4
  53  // H5 = f + H5
  54  // H6 = g + H6
  55  // H7 = h + H7
  56  
  57  #define CTX	R3
  58  #define INP	R4
  59  #define END	R5
  60  #define TBL	R6
  61  #define CNT	R8
  62  #define LEN	R9
  63  #define TEMP	R12
  64  
  65  #define TBL_STRT R7 // Pointer to start of kcon table.
  66  
  67  #define R_x000	R0
  68  #define R_x010	R10
  69  #define R_x020	R25
  70  #define R_x030	R26
  71  #define R_x040	R14
  72  #define R_x050	R15
  73  #define R_x060	R16
  74  #define R_x070	R17
  75  #define R_x080	R18
  76  #define R_x090	R19
  77  #define R_x0a0	R20
  78  #define R_x0b0	R21
  79  #define R_x0c0	R22
  80  #define R_x0d0	R23
  81  #define R_x0e0	R24
  82  #define R_x0f0	R28
  83  #define R_x100	R29
  84  #define R_x110	R27
  85  
  86  
  87  // V0-V7 are A-H
  88  // V8-V23 are used for the message schedule
  89  #define KI	V24
  90  #define FUNC	V25
  91  #define S0	V26
  92  #define S1	V27
  93  #define s0	V28
  94  #define s1	V29
  95  #define LEMASK	V31	// Permutation control register for little endian
  96  
  97  // VPERM is needed on LE to switch the bytes
  98  
  99  #ifdef GOARCH_ppc64le
 100  #define VPERMLE(va,vb,vc,vt) VPERM va, vb, vc, vt
 101  #else
 102  #define VPERMLE(va,vb,vc,vt)
 103  #endif
 104  
// kcon is the table of SHA-512 round constants Kt (t = 0..79) followed by
// two auxiliary 16-byte entries.
//
// 2 copies of each Kt, to fill both doublewords of a vector register.
// Entry t therefore lives at byte offset t*0x10, which is why the round
// macros can fetch it with a single indexed LVX.
DATA  ·kcon+0x000(SB)/8, $0x428a2f98d728ae22
DATA  ·kcon+0x008(SB)/8, $0x428a2f98d728ae22
DATA  ·kcon+0x010(SB)/8, $0x7137449123ef65cd
DATA  ·kcon+0x018(SB)/8, $0x7137449123ef65cd
DATA  ·kcon+0x020(SB)/8, $0xb5c0fbcfec4d3b2f
DATA  ·kcon+0x028(SB)/8, $0xb5c0fbcfec4d3b2f
DATA  ·kcon+0x030(SB)/8, $0xe9b5dba58189dbbc
DATA  ·kcon+0x038(SB)/8, $0xe9b5dba58189dbbc
DATA  ·kcon+0x040(SB)/8, $0x3956c25bf348b538
DATA  ·kcon+0x048(SB)/8, $0x3956c25bf348b538
DATA  ·kcon+0x050(SB)/8, $0x59f111f1b605d019
DATA  ·kcon+0x058(SB)/8, $0x59f111f1b605d019
DATA  ·kcon+0x060(SB)/8, $0x923f82a4af194f9b
DATA  ·kcon+0x068(SB)/8, $0x923f82a4af194f9b
DATA  ·kcon+0x070(SB)/8, $0xab1c5ed5da6d8118
DATA  ·kcon+0x078(SB)/8, $0xab1c5ed5da6d8118
DATA  ·kcon+0x080(SB)/8, $0xd807aa98a3030242
DATA  ·kcon+0x088(SB)/8, $0xd807aa98a3030242
DATA  ·kcon+0x090(SB)/8, $0x12835b0145706fbe
DATA  ·kcon+0x098(SB)/8, $0x12835b0145706fbe
DATA  ·kcon+0x0A0(SB)/8, $0x243185be4ee4b28c
DATA  ·kcon+0x0A8(SB)/8, $0x243185be4ee4b28c
DATA  ·kcon+0x0B0(SB)/8, $0x550c7dc3d5ffb4e2
DATA  ·kcon+0x0B8(SB)/8, $0x550c7dc3d5ffb4e2
DATA  ·kcon+0x0C0(SB)/8, $0x72be5d74f27b896f
DATA  ·kcon+0x0C8(SB)/8, $0x72be5d74f27b896f
DATA  ·kcon+0x0D0(SB)/8, $0x80deb1fe3b1696b1
DATA  ·kcon+0x0D8(SB)/8, $0x80deb1fe3b1696b1
DATA  ·kcon+0x0E0(SB)/8, $0x9bdc06a725c71235
DATA  ·kcon+0x0E8(SB)/8, $0x9bdc06a725c71235
DATA  ·kcon+0x0F0(SB)/8, $0xc19bf174cf692694
DATA  ·kcon+0x0F8(SB)/8, $0xc19bf174cf692694
DATA  ·kcon+0x100(SB)/8, $0xe49b69c19ef14ad2
DATA  ·kcon+0x108(SB)/8, $0xe49b69c19ef14ad2
DATA  ·kcon+0x110(SB)/8, $0xefbe4786384f25e3
DATA  ·kcon+0x118(SB)/8, $0xefbe4786384f25e3
DATA  ·kcon+0x120(SB)/8, $0x0fc19dc68b8cd5b5
DATA  ·kcon+0x128(SB)/8, $0x0fc19dc68b8cd5b5
DATA  ·kcon+0x130(SB)/8, $0x240ca1cc77ac9c65
DATA  ·kcon+0x138(SB)/8, $0x240ca1cc77ac9c65
DATA  ·kcon+0x140(SB)/8, $0x2de92c6f592b0275
DATA  ·kcon+0x148(SB)/8, $0x2de92c6f592b0275
DATA  ·kcon+0x150(SB)/8, $0x4a7484aa6ea6e483
DATA  ·kcon+0x158(SB)/8, $0x4a7484aa6ea6e483
DATA  ·kcon+0x160(SB)/8, $0x5cb0a9dcbd41fbd4
DATA  ·kcon+0x168(SB)/8, $0x5cb0a9dcbd41fbd4
DATA  ·kcon+0x170(SB)/8, $0x76f988da831153b5
DATA  ·kcon+0x178(SB)/8, $0x76f988da831153b5
DATA  ·kcon+0x180(SB)/8, $0x983e5152ee66dfab
DATA  ·kcon+0x188(SB)/8, $0x983e5152ee66dfab
DATA  ·kcon+0x190(SB)/8, $0xa831c66d2db43210
DATA  ·kcon+0x198(SB)/8, $0xa831c66d2db43210
DATA  ·kcon+0x1A0(SB)/8, $0xb00327c898fb213f
DATA  ·kcon+0x1A8(SB)/8, $0xb00327c898fb213f
DATA  ·kcon+0x1B0(SB)/8, $0xbf597fc7beef0ee4
DATA  ·kcon+0x1B8(SB)/8, $0xbf597fc7beef0ee4
DATA  ·kcon+0x1C0(SB)/8, $0xc6e00bf33da88fc2
DATA  ·kcon+0x1C8(SB)/8, $0xc6e00bf33da88fc2
DATA  ·kcon+0x1D0(SB)/8, $0xd5a79147930aa725
DATA  ·kcon+0x1D8(SB)/8, $0xd5a79147930aa725
DATA  ·kcon+0x1E0(SB)/8, $0x06ca6351e003826f
DATA  ·kcon+0x1E8(SB)/8, $0x06ca6351e003826f
DATA  ·kcon+0x1F0(SB)/8, $0x142929670a0e6e70
DATA  ·kcon+0x1F8(SB)/8, $0x142929670a0e6e70
DATA  ·kcon+0x200(SB)/8, $0x27b70a8546d22ffc
DATA  ·kcon+0x208(SB)/8, $0x27b70a8546d22ffc
DATA  ·kcon+0x210(SB)/8, $0x2e1b21385c26c926
DATA  ·kcon+0x218(SB)/8, $0x2e1b21385c26c926
DATA  ·kcon+0x220(SB)/8, $0x4d2c6dfc5ac42aed
DATA  ·kcon+0x228(SB)/8, $0x4d2c6dfc5ac42aed
DATA  ·kcon+0x230(SB)/8, $0x53380d139d95b3df
DATA  ·kcon+0x238(SB)/8, $0x53380d139d95b3df
DATA  ·kcon+0x240(SB)/8, $0x650a73548baf63de
DATA  ·kcon+0x248(SB)/8, $0x650a73548baf63de
DATA  ·kcon+0x250(SB)/8, $0x766a0abb3c77b2a8
DATA  ·kcon+0x258(SB)/8, $0x766a0abb3c77b2a8
DATA  ·kcon+0x260(SB)/8, $0x81c2c92e47edaee6
DATA  ·kcon+0x268(SB)/8, $0x81c2c92e47edaee6
DATA  ·kcon+0x270(SB)/8, $0x92722c851482353b
DATA  ·kcon+0x278(SB)/8, $0x92722c851482353b
DATA  ·kcon+0x280(SB)/8, $0xa2bfe8a14cf10364
DATA  ·kcon+0x288(SB)/8, $0xa2bfe8a14cf10364
DATA  ·kcon+0x290(SB)/8, $0xa81a664bbc423001
DATA  ·kcon+0x298(SB)/8, $0xa81a664bbc423001
DATA  ·kcon+0x2A0(SB)/8, $0xc24b8b70d0f89791
DATA  ·kcon+0x2A8(SB)/8, $0xc24b8b70d0f89791
DATA  ·kcon+0x2B0(SB)/8, $0xc76c51a30654be30
DATA  ·kcon+0x2B8(SB)/8, $0xc76c51a30654be30
DATA  ·kcon+0x2C0(SB)/8, $0xd192e819d6ef5218
DATA  ·kcon+0x2C8(SB)/8, $0xd192e819d6ef5218
DATA  ·kcon+0x2D0(SB)/8, $0xd69906245565a910
DATA  ·kcon+0x2D8(SB)/8, $0xd69906245565a910
DATA  ·kcon+0x2E0(SB)/8, $0xf40e35855771202a
DATA  ·kcon+0x2E8(SB)/8, $0xf40e35855771202a
DATA  ·kcon+0x2F0(SB)/8, $0x106aa07032bbd1b8
DATA  ·kcon+0x2F8(SB)/8, $0x106aa07032bbd1b8
DATA  ·kcon+0x300(SB)/8, $0x19a4c116b8d2d0c8
DATA  ·kcon+0x308(SB)/8, $0x19a4c116b8d2d0c8
DATA  ·kcon+0x310(SB)/8, $0x1e376c085141ab53
DATA  ·kcon+0x318(SB)/8, $0x1e376c085141ab53
DATA  ·kcon+0x320(SB)/8, $0x2748774cdf8eeb99
DATA  ·kcon+0x328(SB)/8, $0x2748774cdf8eeb99
DATA  ·kcon+0x330(SB)/8, $0x34b0bcb5e19b48a8
DATA  ·kcon+0x338(SB)/8, $0x34b0bcb5e19b48a8
DATA  ·kcon+0x340(SB)/8, $0x391c0cb3c5c95a63
DATA  ·kcon+0x348(SB)/8, $0x391c0cb3c5c95a63
DATA  ·kcon+0x350(SB)/8, $0x4ed8aa4ae3418acb
DATA  ·kcon+0x358(SB)/8, $0x4ed8aa4ae3418acb
DATA  ·kcon+0x360(SB)/8, $0x5b9cca4f7763e373
DATA  ·kcon+0x368(SB)/8, $0x5b9cca4f7763e373
DATA  ·kcon+0x370(SB)/8, $0x682e6ff3d6b2b8a3
DATA  ·kcon+0x378(SB)/8, $0x682e6ff3d6b2b8a3
DATA  ·kcon+0x380(SB)/8, $0x748f82ee5defb2fc
DATA  ·kcon+0x388(SB)/8, $0x748f82ee5defb2fc
DATA  ·kcon+0x390(SB)/8, $0x78a5636f43172f60
DATA  ·kcon+0x398(SB)/8, $0x78a5636f43172f60
DATA  ·kcon+0x3A0(SB)/8, $0x84c87814a1f0ab72
DATA  ·kcon+0x3A8(SB)/8, $0x84c87814a1f0ab72
DATA  ·kcon+0x3B0(SB)/8, $0x8cc702081a6439ec
DATA  ·kcon+0x3B8(SB)/8, $0x8cc702081a6439ec
DATA  ·kcon+0x3C0(SB)/8, $0x90befffa23631e28
DATA  ·kcon+0x3C8(SB)/8, $0x90befffa23631e28
DATA  ·kcon+0x3D0(SB)/8, $0xa4506cebde82bde9
DATA  ·kcon+0x3D8(SB)/8, $0xa4506cebde82bde9
DATA  ·kcon+0x3E0(SB)/8, $0xbef9a3f7b2c67915
DATA  ·kcon+0x3E8(SB)/8, $0xbef9a3f7b2c67915
DATA  ·kcon+0x3F0(SB)/8, $0xc67178f2e372532b
DATA  ·kcon+0x3F8(SB)/8, $0xc67178f2e372532b
DATA  ·kcon+0x400(SB)/8, $0xca273eceea26619c
DATA  ·kcon+0x408(SB)/8, $0xca273eceea26619c
DATA  ·kcon+0x410(SB)/8, $0xd186b8c721c0c207
DATA  ·kcon+0x418(SB)/8, $0xd186b8c721c0c207
DATA  ·kcon+0x420(SB)/8, $0xeada7dd6cde0eb1e
DATA  ·kcon+0x428(SB)/8, $0xeada7dd6cde0eb1e
DATA  ·kcon+0x430(SB)/8, $0xf57d4f7fee6ed178
DATA  ·kcon+0x438(SB)/8, $0xf57d4f7fee6ed178
DATA  ·kcon+0x440(SB)/8, $0x06f067aa72176fba
DATA  ·kcon+0x448(SB)/8, $0x06f067aa72176fba
DATA  ·kcon+0x450(SB)/8, $0x0a637dc5a2c898a6
DATA  ·kcon+0x458(SB)/8, $0x0a637dc5a2c898a6
DATA  ·kcon+0x460(SB)/8, $0x113f9804bef90dae
DATA  ·kcon+0x468(SB)/8, $0x113f9804bef90dae
DATA  ·kcon+0x470(SB)/8, $0x1b710b35131c471b
DATA  ·kcon+0x478(SB)/8, $0x1b710b35131c471b
DATA  ·kcon+0x480(SB)/8, $0x28db77f523047d84
DATA  ·kcon+0x488(SB)/8, $0x28db77f523047d84
DATA  ·kcon+0x490(SB)/8, $0x32caab7b40c72493
DATA  ·kcon+0x498(SB)/8, $0x32caab7b40c72493
DATA  ·kcon+0x4A0(SB)/8, $0x3c9ebe0a15c9bebc
DATA  ·kcon+0x4A8(SB)/8, $0x3c9ebe0a15c9bebc
DATA  ·kcon+0x4B0(SB)/8, $0x431d67c49c100d4c
DATA  ·kcon+0x4B8(SB)/8, $0x431d67c49c100d4c
DATA  ·kcon+0x4C0(SB)/8, $0x4cc5d4becb3e42b6
DATA  ·kcon+0x4C8(SB)/8, $0x4cc5d4becb3e42b6
DATA  ·kcon+0x4D0(SB)/8, $0x597f299cfc657e2a
DATA  ·kcon+0x4D8(SB)/8, $0x597f299cfc657e2a
DATA  ·kcon+0x4E0(SB)/8, $0x5fcb6fab3ad6faec
DATA  ·kcon+0x4E8(SB)/8, $0x5fcb6fab3ad6faec
DATA  ·kcon+0x4F0(SB)/8, $0x6c44198c4a475817
DATA  ·kcon+0x4F8(SB)/8, $0x6c44198c4a475817
// Zero entry. The round macros always fetch one constant ahead and pre-add
// it to the next round's h, so the next-to-last round of a block loads this
// entry and the last round adds it; being zero, it changes nothing.
DATA  ·kcon+0x500(SB)/8, $0x0000000000000000
DATA  ·kcon+0x508(SB)/8, $0x0000000000000000
// Not a round constant: the last round of a block leaves this entry in KI,
// and blockPOWER then uses KI as the VPERM control vector that repacks the
// hash state before it is stored back.
DATA  ·kcon+0x510(SB)/8, $0x1011121314151617
DATA  ·kcon+0x518(SB)/8, $0x0001020304050607
// 80 constants * 16 bytes + 2 auxiliary 16-byte entries = 1312 bytes.
GLOBL ·kcon(SB), RODATA, $1312
 271  
// SHA512ROUND0 performs one SHA-512 round without touching the message
// schedule.
//
// Arguments:
//   a..h  vector registers currently holding the working variables a..h
//   xi    vector register holding the message word for this round
//   idx   index register; the 16 bytes at TBL+idx are loaded into KI at the
//         end of the round
//
// On entry h must already include this round's constant: the previous round
// (or the explicit VADDUDM before the first round) added it. The macro:
//   - builds Ch(e,f,g) in FUNC with VSEL and BIGSIGMA1(e) in S1, and adds
//     both, together with xi, into h, giving T1;
//   - builds Maj(a,b,c) in FUNC (VXOR followed by VSEL) and adds it to
//     BIGSIGMA0(a) in S0, giving T2;
//   - adds the KI loaded by the previous round into g, which is the next
//     round's h;
//   - adds T1 into d (the new e) and T2 into h (the new a).
// No registers are moved: the caller rotates the a..h arguments by one
// position on each successive invocation instead.
//
// Clobbers FUNC, S0, S1 and KI. Instructions of the independent dependency
// chains are interleaved; keep the order when editing.
#define SHA512ROUND0(a, b, c, d, e, f, g, h, xi, idx) \
	VSEL		g, f, e, FUNC; \
	VSHASIGMAD	$15, e, $1, S1; \
	VADDUDM		xi, h, h; \
	VSHASIGMAD	$0, a, $1, S0; \
	VADDUDM		FUNC, h, h; \
	VXOR		b, a, FUNC; \
	VADDUDM		S1, h, h; \
	VSEL		b, c, FUNC, FUNC; \
	VADDUDM		KI, g, g; \
	VADDUDM		h, d, d; \
	VADDUDM		FUNC, S0, S0; \
	LVX		(TBL)(idx), KI; \
	VADDUDM		S0, h, h
 286  
// SHA512ROUND1 performs one SHA-512 round and, interleaved with it, expands
// one word of the message schedule.
//
// Arguments:
//   a..h   vector registers currently holding the working variables a..h
//   xi     vector register holding the message word consumed by this round
//   xj     schedule register updated in place
//   xj_1   schedule register 1 position after xj  (input to SIGMA0)
//   xj_9   schedule register 9 positions after xj (added unchanged)
//   xj_14  schedule register 14 positions after xj (input to SIGMA1)
//   idx    index register; the 16 bytes at TBL+idx are loaded into KI at the
//          end of the round
//
// The round part is the same instruction sequence as SHA512ROUND0: h must
// already include this round's constant, T1 is accumulated in h and added
// to d, T2 is then added to h, and the previously loaded KI is added to g
// for the next round.
//
// The schedule part computes
//   xj = xj + xj_9 + SIGMA0(xj_1) + SIGMA1(xj_14)
// so the 16 schedule registers work as a circular window: the oldest word
// is overwritten by the word 16 positions later.
//
// Clobbers FUNC, S0, S1, s0, s1 and KI. Instructions of the independent
// dependency chains are interleaved; keep the order when editing.
#define SHA512ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14, idx) \
	VSHASIGMAD	$0, xj_1, $0, s0; \
	VSEL		g, f, e, FUNC; \
	VSHASIGMAD	$15, e, $1, S1; \
	VADDUDM		xi, h, h; \
	VSHASIGMAD	$0, a, $1, S0; \
	VSHASIGMAD	$15, xj_14, $0, s1; \
	VADDUDM		FUNC, h, h; \
	VXOR		b, a, FUNC; \
	VADDUDM		xj_9, xj, xj; \
	VADDUDM		S1, h, h; \
	VSEL		b, c, FUNC, FUNC; \
	VADDUDM		KI, g, g; \
	VADDUDM		h, d, d; \
	VADDUDM		FUNC, S0, S0; \
	VADDUDM		s0, xj, xj; \
	LVX		(TBL)(idx), KI; \
	VADDUDM		S0, h, h; \
	VADDUDM		s1, xj, xj
 306  
 307  // func blockPOWER(dig *Digest, p []byte)
 308  TEXT ·blockPOWER(SB),0,$0-32
 309  	MOVD	dig+0(FP), CTX
 310  	MOVD	p_base+8(FP), INP
 311  	MOVD	p_len+16(FP), LEN
 312  
 313  	SRD	$6, LEN
 314  	SLD	$6, LEN
 315  
 316  	ADD	INP, LEN, END
 317  
 318  	CMP	INP, END
 319  	BEQ	end
 320  
 321  	MOVD	$·kcon(SB), TBL_STRT
 322  
 323  	MOVD	R0, CNT
 324  	MOVWZ	$0x010, R_x010
 325  	MOVWZ	$0x020, R_x020
 326  	MOVWZ	$0x030, R_x030
 327  	MOVD	$0x040, R_x040
 328  	MOVD	$0x050, R_x050
 329  	MOVD	$0x060, R_x060
 330  	MOVD	$0x070, R_x070
 331  	MOVD	$0x080, R_x080
 332  	MOVD	$0x090, R_x090
 333  	MOVD	$0x0a0, R_x0a0
 334  	MOVD	$0x0b0, R_x0b0
 335  	MOVD	$0x0c0, R_x0c0
 336  	MOVD	$0x0d0, R_x0d0
 337  	MOVD	$0x0e0, R_x0e0
 338  	MOVD	$0x0f0, R_x0f0
 339  	MOVD	$0x100, R_x100
 340  	MOVD	$0x110, R_x110
 341  
 342  
 343  #ifdef GOARCH_ppc64le
 344  	// Generate the mask used with VPERM for LE
 345  	MOVWZ	$8, TEMP
 346  	LVSL	(TEMP)(R0), LEMASK
 347  	VSPLTISB	$0x0F, KI
 348  	VXOR	KI, LEMASK, LEMASK
 349  #endif
 350  
 351  	LXVD2X	(CTX)(R_x000), VS32	// v0 = vs32
 352  	LXVD2X	(CTX)(R_x010), VS34	// v2 = vs34
 353  	LXVD2X	(CTX)(R_x020), VS36	// v4 = vs36
 354  
 355  	// unpack the input values into vector registers
 356  	VSLDOI	$8, V0, V0, V1
 357  	LXVD2X	(CTX)(R_x030), VS38	// v6 = vs38
 358  	VSLDOI	$8, V2, V2, V3
 359  	VSLDOI	$8, V4, V4, V5
 360  	VSLDOI	$8, V6, V6, V7
 361  
 362  loop:
 363  	MOVD	TBL_STRT, TBL
 364  	LVX	(TBL)(R_x000), KI
 365  
 366  	LXVD2X	(INP)(R0), VS40	// load v8 (=vs40) in advance
 367  	ADD	$16, INP
 368  
 369  	// Copy V0-V7 to VS24-VS31
 370  
 371  	XXLOR	V0, V0, VS24
 372  	XXLOR	V1, V1, VS25
 373  	XXLOR	V2, V2, VS26
 374  	XXLOR	V3, V3, VS27
 375  	XXLOR	V4, V4, VS28
 376  	XXLOR	V5, V5, VS29
 377  	XXLOR	V6, V6, VS30
 378  	XXLOR	V7, V7, VS31
 379  
 380  	VADDUDM	KI, V7, V7	// h+K[i]
 381  	LVX	(TBL)(R_x010), KI
 382  
 383  	VPERMLE(V8,V8,LEMASK,V8)
 384  	SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8, R_x020)
 385  	LXVD2X	(INP)(R_x000), VS42	// load v10 (=vs42) in advance
 386  	VSLDOI	$8, V8, V8, V9
 387  	SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9, R_x030)
 388  	VPERMLE(V10,V10,LEMASK,V10)
 389  	SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10, R_x040)
 390  	LXVD2X	(INP)(R_x010), VS44	// load v12 (=vs44) in advance
 391  	VSLDOI	$8, V10, V10, V11
 392  	SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11, R_x050)
 393  	VPERMLE(V12,V12,LEMASK,V12)
 394  	SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12, R_x060)
 395  	LXVD2X	(INP)(R_x020), VS46	// load v14 (=vs46) in advance
 396  	VSLDOI	$8, V12, V12, V13
 397  	SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13, R_x070)
 398  	VPERMLE(V14,V14,LEMASK,V14)
 399  	SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14, R_x080)
 400  	LXVD2X	(INP)(R_x030), VS48	// load v16 (=vs48) in advance
 401  	VSLDOI	$8, V14, V14, V15
 402  	SHA512ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15, R_x090)
 403  	VPERMLE(V16,V16,LEMASK,V16)
 404  	SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16, R_x0a0)
 405  	LXVD2X	(INP)(R_x040), VS50	// load v18 (=vs50) in advance
 406  	VSLDOI	$8, V16, V16, V17
 407  	SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17, R_x0b0)
 408  	VPERMLE(V18,V18,LEMASK,V18)
 409  	SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18, R_x0c0)
 410  	LXVD2X	(INP)(R_x050), VS52	// load v20 (=vs52) in advance
 411  	VSLDOI	$8, V18, V18, V19
 412  	SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19, R_x0d0)
 413  	VPERMLE(V20,V20,LEMASK,V20)
 414  	SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20, R_x0e0)
 415  	LXVD2X	(INP)(R_x060), VS54	// load v22 (=vs54) in advance
 416  	VSLDOI	$8, V20, V20, V21
 417  	SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21, R_x0f0)
 418  	VPERMLE(V22,V22,LEMASK,V22)
 419  	SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22, R_x100)
 420  	VSLDOI	$8, V22, V22, V23
 421  	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x110)
 422  
 423  	MOVWZ	$4, TEMP
 424  	MOVWZ	TEMP, CTR
 425  	ADD	$0x120, TBL
 426  	ADD	$0x70, INP
 427  
 428  L16_xx:
 429  	SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23, R_x000)
 430  	SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8, R_x010)
 431  	SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9, R_x020)
 432  	SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10, R_x030)
 433  	SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11, R_x040)
 434  	SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12, R_x050)
 435  	SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13, R_x060)
 436  	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14, R_x070)
 437  	SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15, R_x080)
 438  	SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16, R_x090)
 439  	SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17, R_x0a0)
 440  	SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18, R_x0b0)
 441  	SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19, R_x0c0)
 442  	SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20, R_x0d0)
 443  	SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21, R_x0e0)
 444  	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x0f0)
 445  	ADD	$0x100, TBL
 446  
 447  	BDNZ	L16_xx
 448  
 449  	XXLOR	VS24, VS24, V10
 450  	XXLOR	VS25, VS25, V11
 451  	XXLOR	VS26, VS26, V12
 452  	XXLOR	VS27, VS27, V13
 453  	XXLOR	VS28, VS28, V14
 454  	XXLOR	VS29, VS29, V15
 455  	XXLOR	VS30, VS30, V16
 456  	XXLOR	VS31, VS31, V17
 457  	VADDUDM	V10, V0, V0
 458  	VADDUDM	V11, V1, V1
 459  	VADDUDM	V12, V2, V2
 460  	VADDUDM	V13, V3, V3
 461  	VADDUDM	V14, V4, V4
 462  	VADDUDM	V15, V5, V5
 463  	VADDUDM	V16, V6, V6
 464  	VADDUDM	V17, V7, V7
 465  
 466  	CMPU	INP, END
 467  	BLT	loop
 468  
 469  #ifdef GOARCH_ppc64le
 470  	VPERM	V0, V1, KI, V0
 471  	VPERM	V2, V3, KI, V2
 472  	VPERM	V4, V5, KI, V4
 473  	VPERM	V6, V7, KI, V6
 474  #else
 475  	VPERM	V1, V0, KI, V0
 476  	VPERM	V3, V2, KI, V2
 477  	VPERM	V5, V4, KI, V4
 478  	VPERM	V7, V6, KI, V6
 479  #endif
 480  	STXVD2X	VS32, (CTX+R_x000)	// v0 = vs32
 481  	STXVD2X	VS34, (CTX+R_x010)	// v2 = vs34
 482  	STXVD2X	VS36, (CTX+R_x020)	// v4 = vs36
 483  	STXVD2X	VS38, (CTX+R_x030)	// v6 = vs38
 484  
 485  end:
 486  	RET
 487  
 488