sha256block_ppc64x.s raw

   1  // Copyright 2016 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  //go:build (ppc64 || ppc64le) && !purego
   6  
   7  // Based on CRYPTOGAMS code with the following comment:
   8  // # ====================================================================
   9  // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10  // # project. The module is, however, dual licensed under OpenSSL and
  11  // # CRYPTOGAMS licenses depending on where you obtain it. For further
  12  // # details see http://www.openssl.org/~appro/cryptogams/.
  13  // # ====================================================================
  14  
  15  #include "textflag.h"
  16  
  17  // SHA256 block routine. See sha256block.go for Go equivalent.
  18  //
  19  // The algorithm is detailed in FIPS 180-4:
  20  //
  21  //  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
  22  //
// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
  25  //
  26  // a = H0
  27  // b = H1
  28  // c = H2
  29  // d = H3
  30  // e = H4
  31  // f = H5
  32  // g = H6
  33  // h = H7
  34  //
  35  // for t = 0 to 63 {
  36  //    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
  37  //    T2 = BIGSIGMA0(a) + Maj(a,b,c)
  38  //    h = g
  39  //    g = f
  40  //    f = e
  41  //    e = d + T1
  42  //    d = c
  43  //    c = b
  44  //    b = a
  45  //    a = T1 + T2
  46  // }
  47  //
  48  // H0 = a + H0
  49  // H1 = b + H1
  50  // H2 = c + H2
  51  // H3 = d + H3
  52  // H4 = e + H4
  53  // H5 = f + H5
  54  // H6 = g + H6
  55  // H7 = h + H7
  56  
  57  #define CTX	R3
  58  #define INP	R4
  59  #define END	R5
  60  #define TBL	R6 // Pointer into kcon table
  61  #define LEN	R9
  62  #define TEMP	R12
  63  
  64  #define TBL_STRT	R7 // Pointer to start of kcon table.
  65  
  66  #define R_x000	R0
  67  #define R_x010	R8
  68  #define R_x020	R10
  69  #define R_x030	R11
  70  #define R_x040	R14
  71  #define R_x050	R15
  72  #define R_x060	R16
  73  #define R_x070	R17
  74  #define R_x080	R18
  75  #define R_x090	R19
  76  #define R_x0a0	R20
  77  #define R_x0b0	R21
  78  #define R_x0c0	R22
  79  #define R_x0d0	R23
  80  #define R_x0e0	R24
  81  #define R_x0f0	R25
  82  #define R_x100	R26
  83  #define R_x110	R27
  84  
  85  
  86  // V0-V7 are A-H
  87  // V8-V23 are used for the message schedule
  88  #define KI	V24
  89  #define FUNC	V25
  90  #define S0	V26
  91  #define S1	V27
  92  #define s0	V28
  93  #define s1	V29
  94  #define LEMASK	V31 // Permutation control register for little endian
  95  
  96  // 4 copies of each Kt, to fill all 4 words of a vector register
  97  DATA  ·kcon+0x000(SB)/8, $0x428a2f98428a2f98
  98  DATA  ·kcon+0x008(SB)/8, $0x428a2f98428a2f98
  99  DATA  ·kcon+0x010(SB)/8, $0x7137449171374491
 100  DATA  ·kcon+0x018(SB)/8, $0x7137449171374491
 101  DATA  ·kcon+0x020(SB)/8, $0xb5c0fbcfb5c0fbcf
 102  DATA  ·kcon+0x028(SB)/8, $0xb5c0fbcfb5c0fbcf
 103  DATA  ·kcon+0x030(SB)/8, $0xe9b5dba5e9b5dba5
 104  DATA  ·kcon+0x038(SB)/8, $0xe9b5dba5e9b5dba5
 105  DATA  ·kcon+0x040(SB)/8, $0x3956c25b3956c25b
 106  DATA  ·kcon+0x048(SB)/8, $0x3956c25b3956c25b
 107  DATA  ·kcon+0x050(SB)/8, $0x59f111f159f111f1
 108  DATA  ·kcon+0x058(SB)/8, $0x59f111f159f111f1
 109  DATA  ·kcon+0x060(SB)/8, $0x923f82a4923f82a4
 110  DATA  ·kcon+0x068(SB)/8, $0x923f82a4923f82a4
 111  DATA  ·kcon+0x070(SB)/8, $0xab1c5ed5ab1c5ed5
 112  DATA  ·kcon+0x078(SB)/8, $0xab1c5ed5ab1c5ed5
 113  DATA  ·kcon+0x080(SB)/8, $0xd807aa98d807aa98
 114  DATA  ·kcon+0x088(SB)/8, $0xd807aa98d807aa98
 115  DATA  ·kcon+0x090(SB)/8, $0x12835b0112835b01
 116  DATA  ·kcon+0x098(SB)/8, $0x12835b0112835b01
 117  DATA  ·kcon+0x0A0(SB)/8, $0x243185be243185be
 118  DATA  ·kcon+0x0A8(SB)/8, $0x243185be243185be
 119  DATA  ·kcon+0x0B0(SB)/8, $0x550c7dc3550c7dc3
 120  DATA  ·kcon+0x0B8(SB)/8, $0x550c7dc3550c7dc3
 121  DATA  ·kcon+0x0C0(SB)/8, $0x72be5d7472be5d74
 122  DATA  ·kcon+0x0C8(SB)/8, $0x72be5d7472be5d74
 123  DATA  ·kcon+0x0D0(SB)/8, $0x80deb1fe80deb1fe
 124  DATA  ·kcon+0x0D8(SB)/8, $0x80deb1fe80deb1fe
 125  DATA  ·kcon+0x0E0(SB)/8, $0x9bdc06a79bdc06a7
 126  DATA  ·kcon+0x0E8(SB)/8, $0x9bdc06a79bdc06a7
 127  DATA  ·kcon+0x0F0(SB)/8, $0xc19bf174c19bf174
 128  DATA  ·kcon+0x0F8(SB)/8, $0xc19bf174c19bf174
 129  DATA  ·kcon+0x100(SB)/8, $0xe49b69c1e49b69c1
 130  DATA  ·kcon+0x108(SB)/8, $0xe49b69c1e49b69c1
 131  DATA  ·kcon+0x110(SB)/8, $0xefbe4786efbe4786
 132  DATA  ·kcon+0x118(SB)/8, $0xefbe4786efbe4786
 133  DATA  ·kcon+0x120(SB)/8, $0x0fc19dc60fc19dc6
 134  DATA  ·kcon+0x128(SB)/8, $0x0fc19dc60fc19dc6
 135  DATA  ·kcon+0x130(SB)/8, $0x240ca1cc240ca1cc
 136  DATA  ·kcon+0x138(SB)/8, $0x240ca1cc240ca1cc
 137  DATA  ·kcon+0x140(SB)/8, $0x2de92c6f2de92c6f
 138  DATA  ·kcon+0x148(SB)/8, $0x2de92c6f2de92c6f
 139  DATA  ·kcon+0x150(SB)/8, $0x4a7484aa4a7484aa
 140  DATA  ·kcon+0x158(SB)/8, $0x4a7484aa4a7484aa
 141  DATA  ·kcon+0x160(SB)/8, $0x5cb0a9dc5cb0a9dc
 142  DATA  ·kcon+0x168(SB)/8, $0x5cb0a9dc5cb0a9dc
 143  DATA  ·kcon+0x170(SB)/8, $0x76f988da76f988da
 144  DATA  ·kcon+0x178(SB)/8, $0x76f988da76f988da
 145  DATA  ·kcon+0x180(SB)/8, $0x983e5152983e5152
 146  DATA  ·kcon+0x188(SB)/8, $0x983e5152983e5152
 147  DATA  ·kcon+0x190(SB)/8, $0xa831c66da831c66d
 148  DATA  ·kcon+0x198(SB)/8, $0xa831c66da831c66d
 149  DATA  ·kcon+0x1A0(SB)/8, $0xb00327c8b00327c8
 150  DATA  ·kcon+0x1A8(SB)/8, $0xb00327c8b00327c8
 151  DATA  ·kcon+0x1B0(SB)/8, $0xbf597fc7bf597fc7
 152  DATA  ·kcon+0x1B8(SB)/8, $0xbf597fc7bf597fc7
 153  DATA  ·kcon+0x1C0(SB)/8, $0xc6e00bf3c6e00bf3
 154  DATA  ·kcon+0x1C8(SB)/8, $0xc6e00bf3c6e00bf3
 155  DATA  ·kcon+0x1D0(SB)/8, $0xd5a79147d5a79147
 156  DATA  ·kcon+0x1D8(SB)/8, $0xd5a79147d5a79147
 157  DATA  ·kcon+0x1E0(SB)/8, $0x06ca635106ca6351
 158  DATA  ·kcon+0x1E8(SB)/8, $0x06ca635106ca6351
 159  DATA  ·kcon+0x1F0(SB)/8, $0x1429296714292967
 160  DATA  ·kcon+0x1F8(SB)/8, $0x1429296714292967
 161  DATA  ·kcon+0x200(SB)/8, $0x27b70a8527b70a85
 162  DATA  ·kcon+0x208(SB)/8, $0x27b70a8527b70a85
 163  DATA  ·kcon+0x210(SB)/8, $0x2e1b21382e1b2138
 164  DATA  ·kcon+0x218(SB)/8, $0x2e1b21382e1b2138
 165  DATA  ·kcon+0x220(SB)/8, $0x4d2c6dfc4d2c6dfc
 166  DATA  ·kcon+0x228(SB)/8, $0x4d2c6dfc4d2c6dfc
 167  DATA  ·kcon+0x230(SB)/8, $0x53380d1353380d13
 168  DATA  ·kcon+0x238(SB)/8, $0x53380d1353380d13
 169  DATA  ·kcon+0x240(SB)/8, $0x650a7354650a7354
 170  DATA  ·kcon+0x248(SB)/8, $0x650a7354650a7354
 171  DATA  ·kcon+0x250(SB)/8, $0x766a0abb766a0abb
 172  DATA  ·kcon+0x258(SB)/8, $0x766a0abb766a0abb
 173  DATA  ·kcon+0x260(SB)/8, $0x81c2c92e81c2c92e
 174  DATA  ·kcon+0x268(SB)/8, $0x81c2c92e81c2c92e
 175  DATA  ·kcon+0x270(SB)/8, $0x92722c8592722c85
 176  DATA  ·kcon+0x278(SB)/8, $0x92722c8592722c85
 177  DATA  ·kcon+0x280(SB)/8, $0xa2bfe8a1a2bfe8a1
 178  DATA  ·kcon+0x288(SB)/8, $0xa2bfe8a1a2bfe8a1
 179  DATA  ·kcon+0x290(SB)/8, $0xa81a664ba81a664b
 180  DATA  ·kcon+0x298(SB)/8, $0xa81a664ba81a664b
 181  DATA  ·kcon+0x2A0(SB)/8, $0xc24b8b70c24b8b70
 182  DATA  ·kcon+0x2A8(SB)/8, $0xc24b8b70c24b8b70
 183  DATA  ·kcon+0x2B0(SB)/8, $0xc76c51a3c76c51a3
 184  DATA  ·kcon+0x2B8(SB)/8, $0xc76c51a3c76c51a3
 185  DATA  ·kcon+0x2C0(SB)/8, $0xd192e819d192e819
 186  DATA  ·kcon+0x2C8(SB)/8, $0xd192e819d192e819
 187  DATA  ·kcon+0x2D0(SB)/8, $0xd6990624d6990624
 188  DATA  ·kcon+0x2D8(SB)/8, $0xd6990624d6990624
 189  DATA  ·kcon+0x2E0(SB)/8, $0xf40e3585f40e3585
 190  DATA  ·kcon+0x2E8(SB)/8, $0xf40e3585f40e3585
 191  DATA  ·kcon+0x2F0(SB)/8, $0x106aa070106aa070
 192  DATA  ·kcon+0x2F8(SB)/8, $0x106aa070106aa070
 193  DATA  ·kcon+0x300(SB)/8, $0x19a4c11619a4c116
 194  DATA  ·kcon+0x308(SB)/8, $0x19a4c11619a4c116
 195  DATA  ·kcon+0x310(SB)/8, $0x1e376c081e376c08
 196  DATA  ·kcon+0x318(SB)/8, $0x1e376c081e376c08
 197  DATA  ·kcon+0x320(SB)/8, $0x2748774c2748774c
 198  DATA  ·kcon+0x328(SB)/8, $0x2748774c2748774c
 199  DATA  ·kcon+0x330(SB)/8, $0x34b0bcb534b0bcb5
 200  DATA  ·kcon+0x338(SB)/8, $0x34b0bcb534b0bcb5
 201  DATA  ·kcon+0x340(SB)/8, $0x391c0cb3391c0cb3
 202  DATA  ·kcon+0x348(SB)/8, $0x391c0cb3391c0cb3
 203  DATA  ·kcon+0x350(SB)/8, $0x4ed8aa4a4ed8aa4a
 204  DATA  ·kcon+0x358(SB)/8, $0x4ed8aa4a4ed8aa4a
 205  DATA  ·kcon+0x360(SB)/8, $0x5b9cca4f5b9cca4f
 206  DATA  ·kcon+0x368(SB)/8, $0x5b9cca4f5b9cca4f
 207  DATA  ·kcon+0x370(SB)/8, $0x682e6ff3682e6ff3
 208  DATA  ·kcon+0x378(SB)/8, $0x682e6ff3682e6ff3
 209  DATA  ·kcon+0x380(SB)/8, $0x748f82ee748f82ee
 210  DATA  ·kcon+0x388(SB)/8, $0x748f82ee748f82ee
 211  DATA  ·kcon+0x390(SB)/8, $0x78a5636f78a5636f
 212  DATA  ·kcon+0x398(SB)/8, $0x78a5636f78a5636f
 213  DATA  ·kcon+0x3A0(SB)/8, $0x84c8781484c87814
 214  DATA  ·kcon+0x3A8(SB)/8, $0x84c8781484c87814
 215  DATA  ·kcon+0x3B0(SB)/8, $0x8cc702088cc70208
 216  DATA  ·kcon+0x3B8(SB)/8, $0x8cc702088cc70208
 217  DATA  ·kcon+0x3C0(SB)/8, $0x90befffa90befffa
 218  DATA  ·kcon+0x3C8(SB)/8, $0x90befffa90befffa
 219  DATA  ·kcon+0x3D0(SB)/8, $0xa4506ceba4506ceb
 220  DATA  ·kcon+0x3D8(SB)/8, $0xa4506ceba4506ceb
 221  DATA  ·kcon+0x3E0(SB)/8, $0xbef9a3f7bef9a3f7
 222  DATA  ·kcon+0x3E8(SB)/8, $0xbef9a3f7bef9a3f7
 223  DATA  ·kcon+0x3F0(SB)/8, $0xc67178f2c67178f2
 224  DATA  ·kcon+0x3F8(SB)/8, $0xc67178f2c67178f2
 225  DATA  ·kcon+0x400(SB)/8, $0x0000000000000000
 226  DATA  ·kcon+0x408(SB)/8, $0x0000000000000000
 227  
 228  #ifdef GOARCH_ppc64le
 229  DATA  ·kcon+0x410(SB)/8, $0x1011121310111213 // permutation control vectors
 230  DATA  ·kcon+0x418(SB)/8, $0x1011121300010203
 231  DATA  ·kcon+0x420(SB)/8, $0x1011121310111213
 232  DATA  ·kcon+0x428(SB)/8, $0x0405060700010203
 233  DATA  ·kcon+0x430(SB)/8, $0x1011121308090a0b
 234  DATA  ·kcon+0x438(SB)/8, $0x0405060700010203
 235  #else
 236  DATA  ·kcon+0x410(SB)/8, $0x1011121300010203
 237  DATA  ·kcon+0x418(SB)/8, $0x1011121310111213 // permutation control vectors
 238  DATA  ·kcon+0x420(SB)/8, $0x0405060700010203
 239  DATA  ·kcon+0x428(SB)/8, $0x1011121310111213
 240  DATA  ·kcon+0x430(SB)/8, $0x0001020304050607
 241  DATA  ·kcon+0x438(SB)/8, $0x08090a0b10111213
 242  #endif
 243  
 244  GLOBL ·kcon(SB), RODATA, $1088
 245  
// SHA256ROUND0 performs one compression round without extending the message
// schedule.
//
//   a..h  vector registers holding the working variables for this round
//   xi    vector register added into h as the message word Wt
//   idx   GPR holding the offset from TBL of the next constant to load
//
// h must already include Kt on entry. The macro adds the loaded KI to g, and
// callers rotate the register arguments so that this round's g is passed as
// h one round later. In terms of the pseudocode at the top of the file:
//
//   FUNC = VSEL(g, f, e)             Ch(e,f,g)
//   h   += xi + FUNC + S1            T1, with S1 = VSHASIGMAW $15, e, $1
//   d   += h                         e = d + T1
//   FUNC = VSEL(b, c, a XOR b)       Maj(a,b,c)
//   h   += S0 + FUNC                 a = T1 + T2, with S0 = VSHASIGMAW $0, a, $1
//
// Clobbers FUNC, S0 and S1, and replaces KI with the vector at (TBL)(idx).
#define SHA256ROUND0(a, b, c, d, e, f, g, h, xi, idx) \
	VSEL		g, f, e, FUNC; \
	VSHASIGMAW	$15, e, $1, S1; \
	VADDUWM		xi, h, h; \
	VSHASIGMAW	$0, a, $1, S0; \
	VADDUWM		FUNC, h, h; \
	VXOR		b, a, FUNC; \
	VADDUWM		S1, h, h; \
	VSEL		b, c, FUNC, FUNC; \
	VADDUWM		KI, g, g; \
	VADDUWM		h, d, d; \
	VADDUWM		FUNC, S0, S0; \
	LVX		(TBL)(idx), KI; \
	VADDUWM		S0, h, h
 260  
// SHA256ROUND1 performs the same compression round as SHA256ROUND0 and, mixed
// in with it, advances one message schedule register by 16 positions.
//
//   a..h, xi, idx  as for SHA256ROUND0
//   xj             schedule register that is updated in place
//   xj_1           schedule register 1 position after xj
//   xj_9           schedule register 9 positions after xj
//   xj_14          schedule register 14 positions after xj
//
// The schedule update is
//
//   xj += xj_9 + s0 + s1
//
// with s0 = VSHASIGMAW $0, xj_1, $0 and s1 = VSHASIGMAW $15, xj_14, $0. This is
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16 with xj holding Wt-16.
//
// Clobbers FUNC, S0, S1, s0 and s1, and replaces KI with the vector at
// (TBL)(idx).
#define SHA256ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14, idx) \
	VSHASIGMAW	$0, xj_1, $0, s0; \
	VSEL		g, f, e, FUNC; \
	VSHASIGMAW	$15, e, $1, S1; \
	VADDUWM		xi, h, h; \
	VSHASIGMAW	$0, a, $1, S0; \
	VSHASIGMAW	$15, xj_14, $0, s1; \
	VADDUWM		FUNC, h, h; \
	VXOR		b, a, FUNC; \
	VADDUWM		xj_9, xj, xj; \
	VADDUWM		S1, h, h; \
	VSEL		b, c, FUNC, FUNC; \
	VADDUWM		KI, g, g; \
	VADDUWM		h, d, d; \
	VADDUWM		FUNC, S0, S0; \
	VADDUWM		s0, xj, xj; \
	LVX		(TBL)(idx), KI; \
	VADDUWM		S0, h, h; \
	VADDUWM		s1, xj, xj
 280  
 281  #ifdef GOARCH_ppc64le
 282  #define VPERMLE(va,vb,vc,vt) VPERM va, vb, vc, vt
 283  #else
 284  #define VPERMLE(va,vb,vc,vt)
 285  #endif
 286  
// blockPOWER runs the SHA-256 compression function over every complete
// 64-byte block of p and writes the updated 32-byte state back to dig.
// Bytes after the last complete block are ignored; if p holds no complete
// block, dig is neither read nor written.
//
// func blockPOWER(dig *Digest, p []byte)
TEXT ·blockPOWER(SB),0,$0-32
	MOVD	dig+0(FP), CTX
	MOVD	p_base+8(FP), INP
	MOVD	p_len+16(FP), LEN

	// Round the length down to a multiple of 64 and compute the end pointer.
	SRD	$6, LEN
	SLD	$6, LEN
	ADD	INP, LEN, END

	// Nothing to do when there is no complete block.
	CMP	INP, END
	BEQ	end

	MOVD	$·kcon(SB), TBL_STRT
	MOVD	$0x10, R_x010

#ifdef GOARCH_ppc64le
	// Build LEMASK, the control vector VPERMLE applies to each loaded
	// message vector: the LVSL result for address 8 with every byte XORed
	// with 0x0F. KI is only a temporary here; it is reloaded at loop.
	MOVWZ	$8, TEMP
	LVSL	(TEMP)(R0), LEMASK
	VSPLTISB	$0x0F, KI
	VXOR	KI, LEMASK, LEMASK
#endif

	// Load the 32-byte state: first 16 bytes into V0, next 16 into V4.
	LXVW4X	(CTX)(R_x000), V0
	LXVW4X	(CTX)(R_x010), V4

	// unpack the input values into vector registers
	// (V1-V3 are V0 rotated left by 4, 8 and 12 bytes; likewise V5-V7 from V4)
	VSLDOI	$4, V0, V0, V1
	VSLDOI	$8, V0, V0, V2
	VSLDOI	$12, V0, V0, V3
	VSLDOI	$4, V4, V4, V5
	VSLDOI	$8, V4, V4, V6
	VSLDOI	$12, V4, V4, V7

	// Load the remaining index registers with their byte offsets.
	MOVD	$0x020, R_x020
	MOVD	$0x030, R_x030
	MOVD	$0x040, R_x040
	MOVD	$0x050, R_x050
	MOVD	$0x060, R_x060
	MOVD	$0x070, R_x070
	MOVD	$0x080, R_x080
	MOVD	$0x090, R_x090
	MOVD	$0x0a0, R_x0a0
	MOVD	$0x0b0, R_x0b0
	MOVD	$0x0c0, R_x0c0
	MOVD	$0x0d0, R_x0d0
	MOVD	$0x0e0, R_x0e0
	MOVD	$0x0f0, R_x0f0
	MOVD	$0x100, R_x100
	MOVD	$0x110, R_x110

	// One iteration per 64-byte block.
loop:
	// Rewind the constant pointer and fetch the first kcon entry.
	MOVD	TBL_STRT, TBL
	LVX	(TBL)(R_x000), KI

	LXVD2X	(INP)(R_x000), V8 // load v8 in advance

	// Offload to VSR24-31 (aka FPR24-31)
	// (copy of the incoming state, added back after the 64 rounds)
	XXLOR	V0, V0, VS24
	XXLOR	V1, V1, VS25
	XXLOR	V2, V2, VS26
	XXLOR	V3, V3, VS27
	XXLOR	V4, V4, VS28
	XXLOR	V5, V5, VS29
	XXLOR	V6, V6, VS30
	XXLOR	V7, V7, VS31

	// The round macros expect h to already include the round constant, so
	// add the first one here and preload the second one.
	VADDUWM	KI, V7, V7        // h+K[i]
	LVX	(TBL)(R_x010), KI

	// First 16 rounds. The block is loaded 16 bytes at a time into V8, V12,
	// V16 and V20; each VSLDOI rotates the previous schedule register left by
	// 4 bytes into the next one, filling V8-V23. The a..h arguments rotate by
	// one register per round. The 16th round uses SHA256ROUND1 so that the
	// schedule update of V8 is already done when L16_xx starts.
	VPERMLE(V8, V8, LEMASK, V8)
	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8, R_x020)
	VSLDOI	$4, V8, V8, V9
	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9, R_x030)
	VSLDOI	$4, V9, V9, V10
	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10, R_x040)
	LXVD2X	(INP)(R_x010), V12 // load v12 in advance
	VSLDOI	$4, V10, V10, V11
	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11, R_x050)
	VPERMLE(V12, V12, LEMASK, V12)
	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12, R_x060)
	VSLDOI	$4, V12, V12, V13
	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13, R_x070)
	VSLDOI	$4, V13, V13, V14
	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14, R_x080)
	LXVD2X	(INP)(R_x020), V16 // load v16 in advance
	VSLDOI	$4, V14, V14, V15
	SHA256ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15, R_x090)
	VPERMLE(V16, V16, LEMASK, V16)
	SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16, R_x0a0)
	VSLDOI	$4, V16, V16, V17
	SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17, R_x0b0)
	VSLDOI	$4, V17, V17, V18
	SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18, R_x0c0)
	VSLDOI	$4, V18, V18, V19
	LXVD2X	(INP)(R_x030), V20 // load v20 in advance
	SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19, R_x0d0)
	VPERMLE(V20, V20, LEMASK, V20)
	SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20, R_x0e0)
	VSLDOI	$4, V20, V20, V21
	SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21, R_x0f0)
	VSLDOI	$4, V21, V21, V22
	SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22, R_x100)
	VSLDOI	$4, V22, V22, V23
	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x110)

	// Remaining 48 rounds: 3 passes of 16 rounds, counted in CTR. TBL skips
	// the 18 entries (0x000-0x110) already fetched, and INP moves to the next
	// block.
	MOVD	$3, TEMP
	MOVD	TEMP, CTR
	ADD	$0x120, TBL
	ADD	$0x40, INP

L16_xx:
	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23, R_x000)
	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8, R_x010)
	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9, R_x020)
	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10, R_x030)
	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11, R_x040)
	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12, R_x050)
	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13, R_x060)
	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14, R_x070)
	SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15, R_x080)
	SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16, R_x090)
	SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17, R_x0a0)
	SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18, R_x0b0)
	SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19, R_x0c0)
	SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20, R_x0d0)
	SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21, R_x0e0)
	SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x0f0)
	ADD	$0x100, TBL

	BDNZ	L16_xx

	// Add the state saved in VS24-VS31 back into V0-V7. V10-V17 are free to
	// use as temporaries because the message schedule is no longer needed.
	XXLOR	VS24, VS24, V10

	XXLOR	VS25, VS25, V11
	VADDUWM	V10, V0, V0
	XXLOR	VS26, VS26, V12
	VADDUWM	V11, V1, V1
	XXLOR	VS27, VS27, V13
	VADDUWM	V12, V2, V2
	XXLOR	VS28, VS28, V14
	VADDUWM	V13, V3, V3
	XXLOR	VS29, VS29, V15
	VADDUWM	V14, V4, V4
	XXLOR	VS30, VS30, V16
	VADDUWM	V15, V5, V5
	XXLOR	VS31, VS31, V17
	VADDUWM	V16, V6, V6
	VADDUWM	V17, V7, V7

	// Continue while unprocessed blocks remain.
	CMPU	INP, END
	BLT	loop

	// Repack the state. Here TBL is kcon+0x420 (0x120 + 3*0x100), and KI
	// holds the vector at kcon+0x410, fetched by the last round through
	// R_x0f0 while TBL was kcon+0x320. KI, V8 and V9 are therefore the three
	// permutation control vectors, and the VPERMs fold V0-V3 into V0 and
	// V4-V7 into V4.
	LVX	(TBL)(R_x000), V8
	VPERM	V0, V1, KI, V0
	LVX	(TBL)(R_x010), V9
	VPERM	V4, V5, KI, V4
	VPERM	V0, V2, V8, V0
	VPERM	V4, V6, V8, V4
	VPERM	V0, V3, V9, V0
	VPERM	V4, V7, V9, V4
	// Write the 32-byte state back to dig.
	STXVD2X	V0, (CTX+R_x000)
	STXVD2X	V4, (CTX+R_x010)

end:
	RET
 453  
 454