sha256block_amd64_avx2.go

   1  // Copyright 2024 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package main
   6  
   7  import (
   8  	. "github.com/mmcloughlin/avo/build"
   9  	. "github.com/mmcloughlin/avo/operand"
  10  	. "github.com/mmcloughlin/avo/reg"
  11  )
  12  
  13  // The avx2-version is described in an Intel White-Paper:
  14  // "Fast SHA-256 Implementations on Intel Architecture Processors"
  15  // To find it, surf to http://www.intel.com/p/en_US/embedded
  16  // and search for that title.
  17  // AVX2 version by Intel, same algorithm as code in Linux kernel:
  18  // https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha256-avx2-asm.S
  19  // by
  20  //     James Guilford <james.guilford@intel.com>
  21  //     Kirk Yap <kirk.s.yap@intel.com>
  22  //     Tim Chen <tim.c.chen@linux.intel.com>
  23  
// blockAVX2 generates the body of the blockAVX2 assembly routine, which
// hashes the input p into the digest pointed to by dig. Blocks are
// processed two at a time: the message schedule for a pair is computed
// together (one block per 128-bit YMM lane), the first block is hashed by
// avx2_loop1/avx2_loop2, and the second by avx2_loop3 from the schedule
// saved on the stack.
func blockAVX2() {
	Implement("blockAVX2")
	// Frame: 512-byte K+W transfer area plus saved INP and INP_END
	// (STACK_SIZE = 528; the allocation is 536 — presumably alignment
	// slack, TODO confirm).
	AllocLocal(536)

	Load(Param("dig"), CTX) // d.h[8]
	Load(Param("p").Base(), INP)
	Load(Param("p").Len(), NUM_BYTES)

	LEAQ(Mem{Base: INP, Index: NUM_BYTES, Scale: 1, Disp: -64}, NUM_BYTES) // Pointer to the last block
	MOVQ(NUM_BYTES, Mem{Base: SP}.Offset(_INP_END))

	// If the last block is also the first, skip the two-block pipeline.
	CMPQ(NUM_BYTES, INP)
	JE(LabelRef("avx2_only_one_block"))

	Comment("Load initial digest")
	// Shadow the CTX register with a memory view of the digest words.
	CTX := Mem{Base: CTX}
	MOVL(CTX.Offset(0), a)  //  a = H0
	MOVL(CTX.Offset(4), b)  //  b = H1
	MOVL(CTX.Offset(8), c)  //  c = H2
	MOVL(CTX.Offset(12), d) //  d = H3
	MOVL(CTX.Offset(16), e) //  e = H4
	MOVL(CTX.Offset(20), f) //  f = H5
	MOVL(CTX.Offset(24), g) //  g = H6
	MOVL(CTX.Offset(28), h) //  h = H7

	// Emit the labeled sections; control falls through in this order
	// unless a branch above (or within a section) redirects it.
	avx2_loop0()
	avx2_last_block_enter()
	avx2_loop1()
	avx2_loop2()
	avx2_loop3()
	avx2_do_last_block()
	avx2_only_one_block()
	done_hash()
}
  58  
// avx2_loop0 begins one pass of the two-block pipeline: it reads 128
// bytes from INP (the current block and the one after it), converts them
// to big-endian, and transposes them so each XDWORD register holds four
// message words — with one block per 128-bit lane, matching the +16 lane
// offsets avx2_loop3 uses later for the second block. Finally it points
// TBL at the round-constant table.
func avx2_loop0() {
	Label("avx2_loop0")
	Comment("at each iteration works with one block (512 bit)")
	VMOVDQU(Mem{Base: INP}.Offset(0*32), XTMP0)
	VMOVDQU(Mem{Base: INP}.Offset(1*32), XTMP1)
	VMOVDQU(Mem{Base: INP}.Offset(2*32), XTMP2)
	VMOVDQU(Mem{Base: INP}.Offset(3*32), XTMP3)

	flip_mask := flip_mask_DATA()

	VMOVDQU(flip_mask, BYTE_FLIP_MASK)

	Comment("Apply Byte Flip Mask: LE -> BE")
	VPSHUFB(BYTE_FLIP_MASK, XTMP0, XTMP0)
	VPSHUFB(BYTE_FLIP_MASK, XTMP1, XTMP1)
	VPSHUFB(BYTE_FLIP_MASK, XTMP2, XTMP2)
	VPSHUFB(BYTE_FLIP_MASK, XTMP3, XTMP3)

	Comment("Transpose data into high/low parts")
	VPERM2I128(Imm(0x20), XTMP2, XTMP0, XDWORD0) //  w3,  w2,  w1,  w0
	VPERM2I128(Imm(0x31), XTMP2, XTMP0, XDWORD1) //  w7,  w6,  w5,  w4
	VPERM2I128(Imm(0x20), XTMP3, XTMP1, XDWORD2) // w11, w10,  w9,  w8
	VPERM2I128(Imm(0x31), XTMP3, XTMP1, XDWORD3) // w15, w14, w13, w12

	K256 := K256_DATA()
	LEAQ(K256, TBL) // Loading address of table with round-specific constants
}
  86  
// avx2_last_block_enter advances INP past the block just loaded, saves
// the pointer for the digest-update phase, and clears SRND — the byte
// offset into the K+W transfer area that also serves as the round
// counter for the loops that follow.
func avx2_last_block_enter() {
	Label("avx2_last_block_enter")
	ADDQ(Imm(64), INP)
	MOVQ(INP, Mem{Base: SP}.Offset(_INP))
	XORQ(SRND, SRND) // SRND = 0
}
  93  
// avx2_loop1 emits rounds 0-47 (w0 - w47). Each loop iteration performs
// 16 compression rounds on the first block while scheduling the next 16
// message words for both lanes: K[t]+W[t] for each 4-round group is
// precomputed with VPADDD and spilled to the SRND-indexed transfer area
// so the scalar rounds can consume it (the high lane is consumed later by
// avx2_loop3). Note the working variables rotate one position per round.
// SRND advances 4*32 bytes per iteration, so the loop runs three times
// (SRND < 3*4*32).
func avx2_loop1() {
	Label("avx2_loop1")

	Comment("Do 4 rounds and scheduling")
	VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((0 * 32)), XDWORD0, XFER)
	VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32))
	roundAndSchedN0(_XFER+0*32, a, b, c, d, e, f, g, h, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	roundAndSchedN1(_XFER+0*32, h, a, b, c, d, e, f, g, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	roundAndSchedN2(_XFER+0*32, g, h, a, b, c, d, e, f, XDWORD0, XDWORD1, XDWORD2, XDWORD3)
	roundAndSchedN3(_XFER+0*32, f, g, h, a, b, c, d, e, XDWORD0, XDWORD1, XDWORD2, XDWORD3)

	Comment("Do 4 rounds and scheduling")
	VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER)
	VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32))
	roundAndSchedN0(_XFER+1*32, e, f, g, h, a, b, c, d, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	roundAndSchedN1(_XFER+1*32, d, e, f, g, h, a, b, c, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	roundAndSchedN2(_XFER+1*32, c, d, e, f, g, h, a, b, XDWORD1, XDWORD2, XDWORD3, XDWORD0)
	roundAndSchedN3(_XFER+1*32, b, c, d, e, f, g, h, a, XDWORD1, XDWORD2, XDWORD3, XDWORD0)

	Comment("Do 4 rounds and scheduling")
	VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((2 * 32)), XDWORD2, XFER)
	VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+2*32))
	roundAndSchedN0(_XFER+2*32, a, b, c, d, e, f, g, h, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	roundAndSchedN1(_XFER+2*32, h, a, b, c, d, e, f, g, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	roundAndSchedN2(_XFER+2*32, g, h, a, b, c, d, e, f, XDWORD2, XDWORD3, XDWORD0, XDWORD1)
	roundAndSchedN3(_XFER+2*32, f, g, h, a, b, c, d, e, XDWORD2, XDWORD3, XDWORD0, XDWORD1)

	Comment("Do 4 rounds and scheduling")
	VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset((3 * 32)), XDWORD3, XFER)
	VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+3*32))
	roundAndSchedN0(_XFER+3*32, e, f, g, h, a, b, c, d, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	roundAndSchedN1(_XFER+3*32, d, e, f, g, h, a, b, c, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	roundAndSchedN2(_XFER+3*32, c, d, e, f, g, h, a, b, XDWORD3, XDWORD0, XDWORD1, XDWORD2)
	roundAndSchedN3(_XFER+3*32, b, c, d, e, f, g, h, a, XDWORD3, XDWORD0, XDWORD1, XDWORD2)

	ADDQ(Imm(4*32), SRND)
	CMPQ(SRND, U32(3*4*32))
	JB(LabelRef("avx2_loop1"))
}
 134  
// avx2_loop2 emits the final 16 rounds (w48 - w63), which need no message
// scheduling. Each iteration performs 8 rounds from XDWORD0/XDWORD1 and
// then rotates XDWORD2/XDWORD3 down for the next pass. Afterwards the
// working variables are folded into the digest; if the saved input
// pointer has passed the last-block pointer there is no second block to
// finish, otherwise SRND is reset and control falls through to
// avx2_loop3.
func avx2_loop2() {
	Label("avx2_loop2")
	VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(0*32), XDWORD0, XFER)
	VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+0*32))
	// The trailing (old_h) argument is ignored by doRoundN0; h is passed
	// as a placeholder.
	doRoundN0(_XFER+0*32, a, b, c, d, e, f, g, h, h)
	doRoundN1(_XFER+0*32, h, a, b, c, d, e, f, g, h)
	doRoundN2(_XFER+0*32, g, h, a, b, c, d, e, f, g)
	doRoundN3(_XFER+0*32, f, g, h, a, b, c, d, e, f)

	VPADDD(Mem{Base: TBL, Scale: 1, Index: SRND}.Offset(1*32), XDWORD1, XFER)
	VMOVDQU(XFER, Mem{Base: SP, Scale: 1, Index: SRND}.Offset(_XFER+1*32))
	doRoundN0(_XFER+1*32, e, f, g, h, a, b, c, d, e)
	doRoundN1(_XFER+1*32, d, e, f, g, h, a, b, c, d)
	doRoundN2(_XFER+1*32, c, d, e, f, g, h, a, b, c)
	doRoundN3(_XFER+1*32, b, c, d, e, f, g, h, a, b)

	ADDQ(Imm(2*32), SRND)

	// Rotate the remaining schedule down for the next 8 rounds.
	VMOVDQU(XDWORD2, XDWORD0)
	VMOVDQU(XDWORD3, XDWORD1)

	CMPQ(SRND, U32(4*4*32))
	JB(LabelRef("avx2_loop2"))

	// Reload CTX: SRND shares RSI with it and has clobbered it.
	Load(Param("dig"), CTX) // d.h[8]
	MOVQ(Mem{Base: SP}.Offset(_INP), INP)

	// digest += working variables
	registers := []GPPhysical{a, b, c, d, e, f, g, h}
	for i, reg := range registers {
		addm(Mem{Base: CTX}.Offset(i*4), reg)
	}

	CMPQ(Mem{Base: SP}.Offset(_INP_END), INP)
	JB(LabelRef("done_hash"))

	// Restart the round offset for the second block (avx2_loop3).
	XORQ(SRND, SRND)
}
 173  
// avx2_loop3 hashes the second block of the pair using the K+W values
// saved at lane offset +16 in the transfer area during loops 1 and 2 —
// no scheduling work is needed. It then folds the state into the digest
// and branches three ways: more input remains -> avx2_loop0 for another
// pair; input exhausted -> done_hash; exactly one block left (equal
// case) falls through to avx2_do_last_block.
func avx2_loop3() {
	Label("avx2_loop3")
	// doRoundN0 ignores its trailing (old_h) argument.
	doRoundN0(_XFER+0*32+16, a, b, c, d, e, f, g, h, a)
	doRoundN1(_XFER+0*32+16, h, a, b, c, d, e, f, g, h)
	doRoundN2(_XFER+0*32+16, g, h, a, b, c, d, e, f, g)
	doRoundN3(_XFER+0*32+16, f, g, h, a, b, c, d, e, f)

	doRoundN0(_XFER+1*32+16, e, f, g, h, a, b, c, d, e)
	doRoundN1(_XFER+1*32+16, d, e, f, g, h, a, b, c, d)
	doRoundN2(_XFER+1*32+16, c, d, e, f, g, h, a, b, c)
	doRoundN3(_XFER+1*32+16, b, c, d, e, f, g, h, a, b)

	ADDQ(Imm(2*32), SRND)
	CMPQ(SRND, U32(4*4*32))
	JB(LabelRef("avx2_loop3"))

	// Reload CTX (clobbered via its alias SRND) and advance INP past the
	// second block of the pair.
	Load(Param("dig"), CTX) // d.h[8]
	MOVQ(Mem{Base: SP}.Offset(_INP), INP)
	ADDQ(Imm(64), INP)

	// digest += working variables
	registers := []GPPhysical{a, b, c, d, e, f, g, h}
	for i, reg := range registers {
		addm(Mem{Base: CTX}.Offset(i*4), reg)
	}

	CMPQ(Mem{Base: SP}.Offset(_INP_END), INP)
	JA(LabelRef("avx2_loop0"))
	JB(LabelRef("done_hash"))
	// Equal: exactly one block remains; fall through to avx2_do_last_block.
}
 204  
// avx2_do_last_block loads the final 64-byte block into the XMM halves
// (low 128-bit lanes) of the schedule registers, byte-swaps it to
// big-endian, points TBL at the constant table, and re-enters the shared
// round machinery at avx2_last_block_enter. On this path only the low
// lanes carry message data.
func avx2_do_last_block() {
	Label("avx2_do_last_block")
	VMOVDQU(Mem{Base: INP}.Offset(0), XWORD0)
	VMOVDQU(Mem{Base: INP}.Offset(16), XWORD1)
	VMOVDQU(Mem{Base: INP}.Offset(32), XWORD2)
	VMOVDQU(Mem{Base: INP}.Offset(48), XWORD3)

	// The mask is loaded into the full YMM register; the shuffles below
	// use its XMM alias (X_BYTE_FLIP_MASK = X13 aliases BYTE_FLIP_MASK = Y13).
	flip_mask := flip_mask_DATA()
	VMOVDQU(flip_mask, BYTE_FLIP_MASK)

	VPSHUFB(X_BYTE_FLIP_MASK, XWORD0, XWORD0)
	VPSHUFB(X_BYTE_FLIP_MASK, XWORD1, XWORD1)
	VPSHUFB(X_BYTE_FLIP_MASK, XWORD2, XWORD2)
	VPSHUFB(X_BYTE_FLIP_MASK, XWORD3, XWORD3)

	K256 := K256_DATA()
	LEAQ(K256, TBL)

	JMP(LabelRef("avx2_last_block_enter"))
}
 225  
 226  // Load initial digest
 227  func avx2_only_one_block() {
 228  	Label("avx2_only_one_block")
 229  	registers := []GPPhysical{a, b, c, d, e, f, g, h}
 230  	for i, reg := range registers {
 231  		MOVL(Mem{Base: CTX}.Offset(i*4), reg)
 232  	}
 233  	JMP(LabelRef("avx2_do_last_block"))
 234  }
 235  
// done_hash emits the routine's epilogue: VZEROUPPER clears the upper
// YMM state (avoiding AVX/SSE transition penalties in subsequent code),
// then the function returns.
func done_hash() {
	Label("done_hash")
	VZEROUPPER()
	RET()
}
 241  
 242  // addm (mem), reg
 243  //   - Add reg to mem using reg-mem add and store
 244  func addm(P1 Mem, P2 GPPhysical) {
 245  	ADDL(P2, P1)
 246  	MOVL(P1, P2)
 247  }
 248  
var (
	// Message-schedule registers: each holds four 32-bit message words,
	// one block per 128-bit lane on the two-block path.
	XDWORD0 VecPhysical = Y4
	XDWORD1             = Y5
	XDWORD2             = Y6
	XDWORD3             = Y7

	// XMM aliases of the low 128 bits of XDWORD0..3, used by the
	// single/last-block path.
	XWORD0 = X4
	XWORD1 = X5
	XWORD2 = X6
	XWORD3 = X7

	// Scratch registers for the message-schedule computation.
	XTMP0 = Y0
	XTMP1 = Y1
	XTMP2 = Y2
	XTMP3 = Y3
	XTMP4 = Y8
	XTMP5 = Y11

	// Holds K[t]+W[t] before it is spilled to the stack transfer area.
	XFER = Y9

	BYTE_FLIP_MASK   = Y13 // mask to convert LE -> BE
	X_BYTE_FLIP_MASK = X13 // low 128-bit alias of BYTE_FLIP_MASK

	// NUM_BYTES shares RDX with e (EDX); it is saved to _INP_END before
	// the digest is loaded into the working registers.
	NUM_BYTES GPPhysical = RDX
	INP                  = RDI

	CTX = RSI // Beginning of digest in memory (a, b, c, ... , h)

	// SHA-256 working variables (32-bit views).
	a = EAX
	b = EBX
	c = ECX
	d = R8L
	e = EDX
	f = R9L
	g = R10L
	h = R11L

	// old_h aliases h; the doRoundN1..N3 helpers shadow it with a
	// parameter naming the register that held h in the previous round.
	old_h = R11L

	TBL = RBP // pointer to the K256 round-constant table

	SRND = RSI // SRND is same register as CTX

	T1 = R12L // scratch for the S0 rotations and a&c

	// Per-round scalar scratch registers.
	y0 = R13L
	y1 = R14L
	y2 = R15L
	y3 = EDI

	// Offsets
	XFER_SIZE    = 2 * 64 * 4 // 64 rounds x 4 bytes x 2 lanes of K+W data
	INP_END_SIZE = 8          // saved pointer to the last input block
	INP_SIZE     = 8          // saved current input pointer

	_XFER      = 0
	_INP_END   = _XFER + XFER_SIZE
	_INP       = _INP_END + INP_END_SIZE
	STACK_SIZE = _INP + INP_SIZE
)
 309  
// roundAndSchedN0 emits round N+0 of a 4-round group, interleaved with
// the first stage of scheduling four future message words: it forms
// W[-7]+W[-16] in XTMP0 and begins the "W[-15] ror 7" part of the s0
// term. disp is the byte offset (within the SRND-indexed transfer area)
// of the precomputed K+W value consumed by this round.
func roundAndSchedN0(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
	//                                                                 #############################  RND N + 0 ############################//
	MOVL(a, y3)           //                                           y3 = a
	RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
	RORXL(Imm(11), e, y1) //                                           y1 = e >> 11

	ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c
	VPALIGNR(Imm(4), XDWORD2, XDWORD3, XTMP0)                       // XTMP0 = W[-7]
	MOVL(f, y2)                                                     // y2 = f
	RORXL(Imm(13), a, T1)                                           // T1 = a >> 13

	XORL(y1, y0)                  //                                   y0 = (e>>25) ^ (e>>11)
	XORL(g, y2)                   //                                   y2 = f^g
	VPADDD(XDWORD0, XTMP0, XTMP0) //                                   XTMP0 = W[-7] + W[-16]
	RORXL(Imm(6), e, y1)          //                                   y1 = (e >> 6)

	ANDL(e, y2)           //                                           y2 = (f^g)&e
	XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
	ADDL(h, d)            //                                           d = k + w + h + d

	ANDL(b, y3)                               //                       y3 = (a|c)&b
	VPALIGNR(Imm(4), XDWORD0, XDWORD1, XTMP1) //                       XTMP1 = W[-15]
	XORL(T1, y1)                              //                       y1 = (a>>22) ^ (a>>13)
	RORXL(Imm(2), a, T1)                      //                       T1 = (a >> 2)

	XORL(g, y2)                  //                                    y2 = CH = ((f^g)&e)^g
	VPSRLD(Imm(7), XTMP1, XTMP2) //
	XORL(T1, y1)                 //                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)                  //                                    T1 = a
	ANDL(c, T1)                  //                                    T1 = a&c

	ADDL(y0, y2)                    //                                 y2 = S1 + CH
	VPSLLD(Imm(32-7), XTMP1, XTMP3) //
	ORL(T1, y3)                     //                                 y3 = MAJ = (a|c)&b)|(a&c)
	ADDL(y1, h)                     //                                 h = k + w + h + S0

	ADDL(y2, d)               //                                       d = k + w + h + d + S1 + CH = d + t1
	VPOR(XTMP2, XTMP3, XTMP3) //                                       XTMP3 = W[-15] ror 7

	VPSRLD(Imm(18), XTMP1, XTMP2)
	ADDL(y2, h) //                                                     h = k + w + h + S0 + S1 + CH = t1 + S0
	ADDL(y3, h) //                                                     h = t1 + S0 + MAJ
}
 355  
// roundAndSchedN1 emits round N+1 and the second scheduling stage: it
// finishes s0 = (W[-15] ror 7) ^ (W[-15] ror 18) ^ (W[-15] >> 3), adds
// it into the running sum in XTMP0, and starts the s1 term from W[-2].
func roundAndSchedN1(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
	//                                                                 ################################### RND N + 1 ############################
	MOVL(a, y3)                                                     // y3 = a
	RORXL(Imm(25), e, y0)                                           // y0 = e >> 25
	RORXL(Imm(11), e, y1)                                           // y1 = e >> 11
	ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c

	VPSRLD(Imm(3), XTMP1, XTMP4) //                                    XTMP4 = W[-15] >> 3
	MOVL(f, y2)                  //                                    y2 = f
	RORXL(Imm(13), a, T1)        //                                    T1 = a >> 13
	XORL(y1, y0)                 //                                    y0 = (e>>25) ^ (e>>11)
	XORL(g, y2)                  //                                    y2 = f^g

	RORXL(Imm(6), e, y1)  //                                           y1 = (e >> 6)
	XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
	ANDL(e, y2)           //                                           y2 = (f^g)&e
	ADDL(h, d)            //                                           d = k + w + h + d

	VPSLLD(Imm(32-18), XTMP1, XTMP1)
	ANDL(b, y3)  //                                                    y3 = (a|c)&b
	XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13)

	VPXOR(XTMP1, XTMP3, XTMP3)
	RORXL(Imm(2), a, T1) //                                            T1 = (a >> 2)
	XORL(g, y2)          //                                            y2 = CH = ((f^g)&e)^g

	VPXOR(XTMP2, XTMP3, XTMP3) //                                      XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	XORL(T1, y1)               //                                      y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)                //                                      T1 = a
	ANDL(c, T1)                //                                      T1 = a&c
	ADDL(y0, y2)               //                                      y2 = S1 + CH

	VPXOR(XTMP4, XTMP3, XTMP1)         //                              XTMP1 = s0
	VPSHUFD(Imm(0xFA), XDWORD3, XTMP2) //                              XTMP2 = W[-2] {BBAA}
	ORL(T1, y3)                        //                              y3 = MAJ = (a|c)&b)|(a&c)
	ADDL(y1, h)                        //                              h = k + w + h + S0

	VPADDD(XTMP1, XTMP0, XTMP0) //                                     XTMP0 = W[-16] + W[-7] + s0
	ADDL(y2, d)                 //                                     d = k + w + h + d + S1 + CH = d + t1
	ADDL(y2, h)                 //                                     h = k + w + h + S0 + S1 + CH = t1 + S0
	ADDL(y3, h)                 //                                     h = t1 + S0 + MAJ

	VPSRLD(Imm(10), XTMP2, XTMP4) //                                   XTMP4 = W[-2] >> 10 {BBAA}
}
 402  
// roundAndSchedN2 emits round N+2 and the third scheduling stage: it
// completes s1 for the lower two of the four new words (lane pattern
// {00BA}, via the shuff_00BA shuffle) and adds it into XTMP0, producing
// the first two finished schedule words.
func roundAndSchedN2(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
	//                                                                 ################################### RND N + 2 ############################
	var shuff_00BA Mem = shuff_00BA_DATA()

	MOVL(a, y3)                                                     // y3 = a
	RORXL(Imm(25), e, y0)                                           // y0 = e >> 25
	ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h) // h = k + w + h

	VPSRLQ(Imm(19), XTMP2, XTMP3) //                                   XTMP3 = W[-2] ror 19 {xBxA}
	RORXL(Imm(11), e, y1)         //                                   y1 = e >> 11
	ORL(c, y3)                    //                                   y3 = a|c
	MOVL(f, y2)                   //                                   y2 = f
	XORL(g, y2)                   //                                   y2 = f^g

	RORXL(Imm(13), a, T1)         //                                   T1 = a >> 13
	XORL(y1, y0)                  //                                   y0 = (e>>25) ^ (e>>11)
	VPSRLQ(Imm(17), XTMP2, XTMP2) //                                   XTMP2 = W[-2] ror 17 {xBxA}
	ANDL(e, y2)                   //                                   y2 = (f^g)&e

	RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
	VPXOR(XTMP3, XTMP2, XTMP2)
	ADDL(h, d)  //                                                     d = k + w + h + d
	ANDL(b, y3) //                                                     y3 = (a|c)&b

	XORL(y1, y0)               //                                      y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(22), a, y1)      //                                      y1 = a >> 22
	VPXOR(XTMP2, XTMP4, XTMP4) //                                      XTMP4 = s1 {xBxA}
	XORL(g, y2)                //                                      y2 = CH = ((f^g)&e)^g

	VPSHUFB(shuff_00BA, XTMP4, XTMP4) //                               XTMP4 = s1 {00BA}

	XORL(T1, y1)                //                                     y1 = (a>>22) ^ (a>>13)
	RORXL(Imm(2), a, T1)        //                                     T1 = (a >> 2)
	VPADDD(XTMP4, XTMP0, XTMP0) //                                     XTMP0 = {..., ..., W[1], W[0]}

	XORL(T1, y1)                   //                                  y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)                    //                                  T1 = a
	ANDL(c, T1)                    //                                  T1 = a&c
	ADDL(y0, y2)                   //                                  y2 = S1 + CH
	VPSHUFD(Imm(80), XTMP0, XTMP2) //                                  XTMP2 = W[-2] {DDCC}

	ORL(T1, y3) //                                                     y3 = MAJ = (a|c)&b)|(a&c)
	ADDL(y1, h) //                                                     h = k + w + h + S0
	ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1
	ADDL(y2, h) //                                                     h = k + w + h + S0 + S1 + CH = t1 + S0

	ADDL(y3, h) //                                                     h = t1 + S0 + MAJ
}
 451  
// roundAndSchedN3 emits round N+3 and the final scheduling stage: it
// computes s1 for the upper two new words (lane pattern {DC00}, via the
// shuff_DC00 shuffle) and adds it in, leaving the four newly scheduled
// message words in XDWORD0.
func roundAndSchedN3(disp int, a, b, c, d, e, f, g, h GPPhysical, XDWORD0, XDWORD1, XDWORD2, XDWORD3 VecPhysical) {
	//                                                                 ################################### RND N + 3 ############################
	var shuff_DC00 Mem = shuff_DC00_DATA()

	MOVL(a, y3)                                                     // y3 = a
	RORXL(Imm(25), e, y0)                                           // y0 = e >> 25
	RORXL(Imm(11), e, y1)                                           // y1 = e >> 11
	ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c

	VPSRLD(Imm(10), XTMP2, XTMP5) //                                   XTMP5 = W[-2] >> 10 {DDCC}
	MOVL(f, y2)                   //                                   y2 = f
	RORXL(Imm(13), a, T1)         //                                   T1 = a >> 13
	XORL(y1, y0)                  //                                   y0 = (e>>25) ^ (e>>11)
	XORL(g, y2)                   //                                   y2 = f^g

	VPSRLQ(Imm(19), XTMP2, XTMP3) //                                   XTMP3 = W[-2] ror 19 {xDxC}
	RORXL(Imm(6), e, y1)          //                                   y1 = (e >> 6)
	ANDL(e, y2)                   //                                   y2 = (f^g)&e
	ADDL(h, d)                    //                                   d = k + w + h + d
	ANDL(b, y3)                   //                                   y3 = (a|c)&b

	VPSRLQ(Imm(17), XTMP2, XTMP2) //                                   XTMP2 = W[-2] ror 17 {xDxC}
	XORL(y1, y0)                  //                                   y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	XORL(g, y2)                   //                                   y2 = CH = ((f^g)&e)^g

	VPXOR(XTMP3, XTMP2, XTMP2)
	RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
	ADDL(y0, y2)          //                                           y2 = S1 + CH

	VPXOR(XTMP2, XTMP5, XTMP5) //                                      XTMP5 = s1 {xDxC}
	XORL(T1, y1)               //                                      y1 = (a>>22) ^ (a>>13)
	ADDL(y2, d)                //                                      d = k + w + h + d + S1 + CH = d + t1

	RORXL(Imm(2), a, T1) //                                            T1 = (a >> 2)

	VPSHUFB(shuff_DC00, XTMP5, XTMP5) //                               XTMP5 = s1 {DC00}

	VPADDD(XTMP0, XTMP5, XDWORD0) //                                   XDWORD0 = {W[3], W[2], W[1], W[0]}
	XORL(T1, y1)                  //                                   y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)                   //                                   T1 = a
	ANDL(c, T1)                   //                                   T1 = a&c
	ORL(T1, y3)                   //                                   y3 = MAJ = (a|c)&b)|(a&c)

	ADDL(y1, h) //                                                     h = k + w + h + S0
	ADDL(y2, h) //                                                     h = k + w + h + S0 + S1 + CH = t1 + S0
	ADDL(y3, h) //                                                     h = t1 + S0 + MAJ
}
 500  
// doRoundN0 emits round N+0 of a 4-round group with no message
// scheduling. The trailing old_h parameter is never referenced in this
// variant (there is no pending h accumulation when a group starts); it
// appears to exist only to keep the signature uniform with doRoundN1..N3.
// Note h is left one ADDL short here — the S1+CH and MAJ contributions
// are applied to it at the start of the next round (see doRoundN1).
func doRoundN0(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
	//                                                                 ################################### RND N + 0 ###########################
	MOVL(f, y2)           //                                           y2 = f
	RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
	RORXL(Imm(11), e, y1) //                                           y1 = e >> 11
	XORL(g, y2)           //                                           y2 = f^g

	XORL(y1, y0)         //                                            y0 = (e>>25) ^ (e>>11)
	RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
	ANDL(e, y2)          //                                            y2 = (f^g)&e

	XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(13), a, T1) //                                           T1 = a >> 13
	XORL(g, y2)           //                                           y2 = CH = ((f^g)&e)^g
	RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
	MOVL(a, y3)           //                                           y3 = a

	XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
	RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
	ADDL(Mem{Base: SP, Disp: disp + 0*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c

	XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)  //                                                    T1 = a
	ANDL(b, y3)  //                                                    y3 = (a|c)&b
	ANDL(c, T1)  //                                                    T1 = a&c
	ADDL(y0, y2) //                                                    y2 = S1 + CH

	ADDL(h, d)  //                                                     d = k + w + h + d
	ORL(T1, y3) //                                                     y3 = MAJ = (a|c)&b)|(a&c)
	ADDL(y1, h) //                                                     h = k + w + h + S0
	ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1
}
 534  
// doRoundN1 emits round N+1 with no message scheduling. The two ADDLs
// into old_h (the register that was h in the previous round) finish that
// round's h — its S1+CH (y2) and MAJ (y3) contributions are deferred
// into this round to shorten the dependency chain.
func doRoundN1(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
	//                                                                 ################################### RND N + 1 ###########################
	ADDL(y2, old_h)       //                                           h = k + w + h + S0 + S1 + CH = t1 + S0
	MOVL(f, y2)           //                                           y2 = f
	RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
	RORXL(Imm(11), e, y1) //                                           y1 = e >> 11
	XORL(g, y2)           //                                           y2 = f^g

	XORL(y1, y0)         //                                            y0 = (e>>25) ^ (e>>11)
	RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
	ANDL(e, y2)          //                                            y2 = (f^g)&e
	ADDL(y3, old_h)      //                                            h = t1 + S0 + MAJ

	XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(13), a, T1) //                                           T1 = a >> 13
	XORL(g, y2)           //                                           y2 = CH = ((f^g)&e)^g
	RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
	MOVL(a, y3)           //                                           y3 = a

	XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
	RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
	ADDL(Mem{Base: SP, Disp: disp + 1*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c

	XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)  //                                                    T1 = a
	ANDL(b, y3)  //                                                    y3 = (a|c)&b
	ANDL(c, T1)  //                                                    T1 = a&c
	ADDL(y0, y2) //                                                    y2 = S1 + CH

	ADDL(h, d)  //                                                     d = k + w + h + d
	ORL(T1, y3) //                                                     y3 = MAJ = (a|c)&b)|(a&c)
	ADDL(y1, h) //                                                     h = k + w + h + S0

	ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1
}
 571  
// doRoundN2 emits round N+2 with no message scheduling. Like doRoundN1,
// it first completes the previous round's h via old_h (deferred y2/y3
// adds), then performs its own round reading the K+W word at disp+2*4.
func doRoundN2(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
	//                                                                 ################################### RND N + 2 ##############################
	ADDL(y2, old_h)       //                                           h = k + w + h + S0 + S1 + CH = t1 + S0
	MOVL(f, y2)           //                                           y2 = f
	RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
	RORXL(Imm(11), e, y1) //                                           y1 = e >> 11
	XORL(g, y2)           //                                           y2 = f^g

	XORL(y1, y0)         //                                            y0 = (e>>25) ^ (e>>11)
	RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
	ANDL(e, y2)          //                                            y2 = (f^g)&e
	ADDL(y3, old_h)      //                                            h = t1 + S0 + MAJ

	XORL(y1, y0)          //                                           y0 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(13), a, T1) //                                           T1 = a >> 13
	XORL(g, y2)           //                                           y2 = CH = ((f^g)&e)^g
	RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
	MOVL(a, y3)           //                                           y3 = a

	XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
	RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
	ADDL(Mem{Base: SP, Disp: disp + 2*4, Scale: 1, Index: SRND}, h) // h = k + w + h
	ORL(c, y3)                                                      // y3 = a|c

	XORL(T1, y1) //                                                    y1 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)  //                                                    T1 = a
	ANDL(b, y3)  //                                                    y3 = (a|c)&b
	ANDL(c, T1)  //                                                    T1 = a&c
	ADDL(y0, y2) //                                                    y2 = S1 + CH

	ADDL(h, d)  //                                                     d = k + w + h + d
	ORL(T1, y3) //                                                     y3 = MAJ = (a|c)&b)|(a&c)
	ADDL(y1, h) //                                                     h = k + w + h + S0

	ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1
}
 608  
// doRoundN3 emits the last of four unrolled SHA-256 rounds.
//
// Like doRoundN2 it begins by folding the previous round's deferred
// S1+CH (y2) and MAJ (y3) into old_h. Because it is the final round of
// the quad, it also folds its OWN y2/y3 into h at the end (the trailing
// two ADDLs) instead of deferring them.
// disp is the byte displacement of this round's message-schedule word
// relative to SP+SRND.
func doRoundN3(disp int, a, b, c, d, e, f, g, h, old_h GPPhysical) {
	//                                                                 ################################### RND N + 3 ###########################
	ADDL(y2, old_h)       //                                           old_h = k + w + h + S0 + S1 + CH = t1 + S0 (completes previous round)
	MOVL(f, y2)           //                                           y2 = f
	RORXL(Imm(25), e, y0) //                                           y0 = e >> 25
	RORXL(Imm(11), e, y1) //                                           y1 = e >> 11
	XORL(g, y2)           //                                           y2 = f^g

	XORL(y1, y0)         //                                            y0 = (e>>25) ^ (e>>11)
	RORXL(Imm(6), e, y1) //                                            y1 = (e >> 6)
	ANDL(e, y2)          //                                            y2 = (f^g)&e
	ADDL(y3, old_h)      //                                            old_h = t1 + S0 + MAJ (previous round fully done)

	XORL(y1, y0)          //                                           y0 = S1 = (e>>25) ^ (e>>11) ^ (e>>6)
	RORXL(Imm(13), a, T1) //                                           T1 = a >> 13
	XORL(g, y2)           //                                           y2 = CH = ((f^g)&e)^g
	RORXL(Imm(22), a, y1) //                                           y1 = a >> 22
	MOVL(a, y3)           //                                           y3 = a

	XORL(T1, y1)                                                    // y1 = (a>>22) ^ (a>>13)
	RORXL(Imm(2), a, T1)                                            // T1 = (a >> 2)
	ADDL(Mem{Base: SP, Disp: disp + 3*4, Scale: 1, Index: SRND}, h) // h = k + w + h (fourth word of this quad)
	ORL(c, y3)                                                      // y3 = a|c

	XORL(T1, y1) //                                                    y1 = S0 = (a>>22) ^ (a>>13) ^ (a>>2)
	MOVL(a, T1)  //                                                    T1 = a
	ANDL(b, y3)  //                                                    y3 = (a|c)&b
	ANDL(c, T1)  //                                                    T1 = a&c
	ADDL(y0, y2) //                                                    y2 = S1 + CH

	ADDL(h, d)  //                                                     d = k + w + h + d
	ORL(T1, y3) //                                                     y3 = MAJ = ((a|c)&b)|(a&c)
	ADDL(y1, h) //                                                     h = k + w + h + S0

	ADDL(y2, d) //                                                     d = k + w + h + d + S1 + CH = d + t1

	ADDL(y2, h) //                                                     h = k + w + h + S0 + S1 + CH = t1 + S0 (no later round to defer to)

	ADDL(y3, h) //                                                     h = t1 + S0 + MAJ
}
 649  
// Pointers for memoizing the DATA-section symbols: each *_DATA function
// below stores the GLOBL it creates here on first call, so later calls
// return the same Mem reference instead of redefining the symbol.
var flip_maskPtr, shuff_00BAPtr, shuff_DC00Ptr, K256Ptr *Mem
 652  
 653  // shuffle byte order from LE to BE
 654  func flip_mask_DATA() Mem {
 655  	if flip_maskPtr != nil {
 656  		return *flip_maskPtr
 657  	}
 658  
 659  	flip_mask := GLOBL("flip_mask", RODATA)
 660  	flip_maskPtr = &flip_mask
 661  
 662  	DATA(0x00, U64(0x0405060700010203))
 663  	DATA(0x08, U64(0x0c0d0e0f08090a0b))
 664  	DATA(0x10, U64(0x0405060700010203))
 665  	DATA(0x18, U64(0x0c0d0e0f08090a0b))
 666  	return flip_mask
 667  }
 668  
 669  // shuffle xBxA -> 00BA
 670  func shuff_00BA_DATA() Mem {
 671  	if shuff_00BAPtr != nil {
 672  		return *shuff_00BAPtr
 673  	}
 674  
 675  	shuff_00BA := GLOBL("shuff_00BA", RODATA)
 676  	shuff_00BAPtr = &shuff_00BA
 677  
 678  	DATA(0x00, U64(0x0b0a090803020100))
 679  	DATA(0x08, U64(0xFFFFFFFFFFFFFFFF))
 680  	DATA(0x10, U64(0x0b0a090803020100))
 681  	DATA(0x18, U64(0xFFFFFFFFFFFFFFFF))
 682  	return shuff_00BA
 683  }
 684  
 685  // shuffle xDxC -> DC00
 686  func shuff_DC00_DATA() Mem {
 687  	if shuff_DC00Ptr != nil {
 688  		return *shuff_DC00Ptr
 689  	}
 690  
 691  	shuff_DC00 := GLOBL("shuff_DC00", RODATA)
 692  	shuff_DC00Ptr = &shuff_DC00
 693  
 694  	DATA(0x00, U64(0xFFFFFFFFFFFFFFFF))
 695  	DATA(0x08, U64(0x0b0a090803020100))
 696  	DATA(0x10, U64(0xFFFFFFFFFFFFFFFF))
 697  	DATA(0x18, U64(0x0b0a090803020100))
 698  	return shuff_DC00
 699  }
 700  
 701  // Round specific constants
 702  func K256_DATA() Mem {
 703  	if K256Ptr != nil {
 704  		return *K256Ptr
 705  	}
 706  
 707  	K256 := GLOBL("K256", NOPTR+RODATA)
 708  	K256Ptr = &K256
 709  
 710  	offset_idx := 0
 711  
 712  	for i := 0; i < len(_K); i += 4 {
 713  		DATA((offset_idx+0)*4, U32(_K[i+0])) // k1
 714  		DATA((offset_idx+1)*4, U32(_K[i+1])) // k2
 715  		DATA((offset_idx+2)*4, U32(_K[i+2])) // k3
 716  		DATA((offset_idx+3)*4, U32(_K[i+3])) // k4
 717  
 718  		DATA((offset_idx+4)*4, U32(_K[i+0])) // k1
 719  		DATA((offset_idx+5)*4, U32(_K[i+1])) // k2
 720  		DATA((offset_idx+6)*4, U32(_K[i+2])) // k3
 721  		DATA((offset_idx+7)*4, U32(_K[i+3])) // k4
 722  		offset_idx += 8
 723  	}
 724  	return K256
 725  }
 726