// sha1block_amd64_asm.go — avo generator source (raw-listing header repaired)

   1  // Copyright 2024 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package main
   6  
   7  import (
   8  	. "github.com/mmcloughlin/avo/build"
   9  	. "github.com/mmcloughlin/avo/operand"
  10  	. "github.com/mmcloughlin/avo/reg"
  11  )
  12  
  13  //go:generate go run . -out ../sha1block_amd64.s -pkg sha1
  14  
  15  // AVX2 version by Intel, same algorithm as code in Linux kernel:
  16  // https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S
  17  // Authors:
  18  // Ilya Albrekht <ilya.albrekht@intel.com>
  19  // Maxim Locktyukhin <maxim.locktyukhin@intel.com>
  20  // Ronen Zohar <ronen.zohar@intel.com>
  21  // Chandramouli Narayanan <mouli@linux.intel.com>
  22  
// main drives avo code generation for the crypto/sha1 package: it emits
// the AVX2/BMI2 implementation (blockAVX2) and the SHA-NI one
// (blockSHANI, defined elsewhere in this package), guarded by the
// !purego build constraint, into ../sha1block_amd64.s (see go:generate
// directive above).
func main() {
	Package("crypto/sha1")
	ConstraintExpr("!purego")
	blockAVX2()
	blockSHANI()
	Generate()
}
  30  
  31  // This is the implementation using AVX2, BMI1 and BMI2. It is based on:
  32  // "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
  33  // From http://software.intel.com/en-us/articles
  34  // (look for improving-the-performance-of-the-secure-hash-algorithm-1)
  35  // This implementation is 2x unrolled, and interleaves vector instructions,
  36  // used to precompute W, with scalar computation of current round
  37  // for optimal scheduling.
  38  
  39  // Trivial helper macros.
  40  
  41  func UPDATE_HASH(A, TB, C, D, E GPPhysical) {
  42  	ADDL(Mem{Base: R9}, A)
  43  	MOVL(A, Mem{Base: R9})
  44  	ADDL(Mem{Base: R9}.Offset(4), TB)
  45  	MOVL(TB, Mem{Base: R9}.Offset(4))
  46  	ADDL(Mem{Base: R9}.Offset(8), C)
  47  	MOVL(C, Mem{Base: R9}.Offset(8))
  48  	ADDL(Mem{Base: R9}.Offset(12), D)
  49  	MOVL(D, Mem{Base: R9}.Offset(12))
  50  	ADDL(Mem{Base: R9}.Offset(16), E)
  51  	MOVL(E, Mem{Base: R9}.Offset(16))
  52  }
  53  
  54  // Helper macros for PRECALC, which does precomputations
  55  
// PRECALC_0 loads a 16-byte chunk of the block pointed to by R10 into X0.
func PRECALC_0(OFFSET int) {
	VMOVDQU(Mem{Base: R10}.Offset(OFFSET), X0)
}
  59  
// PRECALC_1 loads the matching 16-byte chunk of the block pointed to by
// R13 into the upper 128-bit lane of Y0 (lower lane was filled by
// PRECALC_0), so one ymm holds data from both blocks.
func PRECALC_1(OFFSET int) {
	VINSERTI128(Imm(1), Mem{Base: R13}.Offset(OFFSET), Y0, Y0)
}
  63  
// PRECALC_2 byte-shuffles Y0 through the swap mask held in Y10 into
// YREG (per the PRECALC_00_15 header, X10/Y10 holds the swap mask used
// to convert message words to big-endian order).
func PRECALC_2(YREG VecPhysical) {
	VPSHUFB(Y10, Y0, YREG)
}
  67  
// PRECALC_4 adds the round constants at *R8+K_OFFSET to YREG, producing
// W+K in Y0.
func PRECALC_4(YREG VecPhysical, K_OFFSET int) {
	VPADDD(Mem{Base: R8}.Offset(K_OFFSET), YREG, Y0)
}
  71  
// PRECALC_7 stores the W+K vector in Y0 to the temp buffer at *R14.
// The destination offset is doubled because the buffer interleaves
// precomputed values for two 64-byte blocks.
func PRECALC_7(OFFSET int) {
	VMOVDQU(Y0, Mem{Base: R14}.Offset(OFFSET*2))
}
  75  
  76  // Message scheduling pre-compute for rounds 0-15
  77  //
  78  //   - R13 is a pointer to even 64-byte block
  79  //   - R10 is a pointer to odd 64-byte block
  80  //   - R14 is a pointer to temp buffer
  81  //   - X0 is used as temp register
  82  //   - YREG is clobbered as part of computation
  83  //   - OFFSET chooses 16 byte chunk within a block
  84  //   - R8 is a pointer to constants block
  85  //   - K_OFFSET chooses K constants relevant to this round
  86  //   - X10 holds swap mask
// PRECALC_00_15 runs the load/swap/add-K/store pipeline for one 16-byte
// chunk of each of the two input blocks (see the register contract in
// the comment above).
func PRECALC_00_15(OFFSET int, YREG VecPhysical) {
	PRECALC_0(OFFSET)
	PRECALC_1(OFFSET)
	PRECALC_2(YREG)
	PRECALC_4(YREG, 0x0) // rounds 0-15 all use the first K constant
	PRECALC_7(OFFSET)
}
  94  
  95  // Helper macros for PRECALC_16_31
  96  
// PRECALC_16 starts one vector step of the w[16..31] schedule:
// REG gets w[i-14] (alignment of w[i-16]/w[i-12]) and Y0 gets w[i-3]
// (byte shift of w[i-4]).
func PRECALC_16(REG_SUB_16, REG_SUB_12, REG_SUB_4, REG VecPhysical) {
	VPALIGNR(Imm(8), REG_SUB_16, REG_SUB_12, REG) // w[i-14]
	VPSRLDQ(Imm(4), REG_SUB_4, Y0)                // w[i-3]
}
 101  
// PRECALC_17 xors w[i-8] into REG and w[i-16] into Y0, continuing the
// w[i] = w[i-3]^w[i-8]^w[i-14]^w[i-16] accumulation.
func PRECALC_17(REG_SUB_16, REG_SUB_8, REG VecPhysical) {
	VPXOR(REG_SUB_8, REG, REG)
	VPXOR(REG_SUB_16, Y0, Y0)
}
 106  
// PRECALC_18 completes the four-way xor in REG, then byte-shifts it
// left by 12 into Y9 — scratch used by PRECALC_20/21 for the cross-lane
// part of the rotate (Y9 is the second temp named in the PRECALC_16_31
// header).
func PRECALC_18(REG VecPhysical) {
	VPXOR(Y0, REG, REG)
	VPSLLDQ(Imm(12), REG, Y9)
}
 111  
// PRECALC_19 starts the per-word rotate-left-by-1: Y0 = REG << 1,
// REG = REG >> 31.
func PRECALC_19(REG VecPhysical) {
	VPSLLD(Imm(1), REG, Y0)
	VPSRLD(Imm(31), REG, REG)
}
 116  
// PRECALC_20 ors the two rotate halves into Y0 (completing rol 1), then
// starts the rol 2 of the Y9 correction term: REG = Y9 << 2.
func PRECALC_20(REG VecPhysical) {
	VPOR(REG, Y0, Y0)
	VPSLLD(Imm(2), Y9, REG)
}
 121  
// PRECALC_21 finishes the rol 2 of the correction term (Y9 >>= 30) and
// folds the low half (REG) into Y0.
func PRECALC_21(REG VecPhysical) {
	VPSRLD(Imm(30), Y9, Y9)
	VPXOR(REG, Y0, Y0)
}
 126  
// PRECALC_23 produces the final w vector in REG (Y0 ^ remaining
// correction in Y9), adds the round constants at *R8+K_OFFSET, and
// stores W+K to the temp buffer at *R14+OFFSET.
func PRECALC_23(REG VecPhysical, K_OFFSET, OFFSET int) {
	VPXOR(Y9, Y0, REG)
	VPADDD(Mem{Base: R8}.Offset(K_OFFSET), REG, Y0)
	VMOVDQU(Y0, Mem{Base: R14}.Offset(OFFSET))
}
 132  
 133  // Message scheduling pre-compute for rounds 16-31
 134  //   - calculating last 32 w[i] values in 8 XMM registers
 135  //   - pre-calculate K+w[i] values and store to mem
 136  //   - for later load by ALU add instruction.
 137  //   - "brute force" vectorization for rounds 16-31 only
 138  //   - due to w[i]->w[i-3] dependency.
 139  //   - clobbers 5 input ymm registers REG_SUB*
 140  //   - uses X0 and X9 as temp registers
 141  //   - As always, R8 is a pointer to constants block
 142  //   - and R14 is a pointer to temp buffer
// PRECALC_16_31 emits the full PRECALC_16..23 sequence for one vector
// of w values (register contract and rationale in the header comment
// above).
func PRECALC_16_31(REG, REG_SUB_4, REG_SUB_8, REG_SUB_12, REG_SUB_16 VecPhysical, K_OFFSET, OFFSET int) {
	PRECALC_16(REG_SUB_16, REG_SUB_12, REG_SUB_4, REG)
	PRECALC_17(REG_SUB_16, REG_SUB_8, REG)
	PRECALC_18(REG)
	PRECALC_19(REG)
	PRECALC_20(REG)
	PRECALC_21(REG)
	PRECALC_23(REG, K_OFFSET, OFFSET)
}
 152  
 153  // Helper macros for PRECALC_32_79
 154  
// PRECALC_32 forms w[i-6] in Y0 by aligning w[i-8] and w[i-4]
// (first term of the rol-2 recurrence; see PRECALC_32_79 header).
func PRECALC_32(REG_SUB_8, REG_SUB_4 VecPhysical) {
	VPALIGNR(Imm(8), REG_SUB_8, REG_SUB_4, Y0)
}
 158  
// PRECALC_33 xors w[i-28] into REG (which holds w[i-32]).
func PRECALC_33(REG_SUB_28, REG VecPhysical) {
	VPXOR(REG_SUB_28, REG, REG)
}
 162  
// PRECALC_34 xors w[i-16] into Y0 (which holds w[i-6]).
func PRECALC_34(REG_SUB_16 VecPhysical) {
	VPXOR(REG_SUB_16, Y0, Y0)
}
 166  
// PRECALC_35 combines the two partial xors: REG now holds
// w[i-6]^w[i-16]^w[i-28]^w[i-32].
func PRECALC_35(REG VecPhysical) {
	VPXOR(Y0, REG, REG)
}
 170  
// PRECALC_36 starts the rotate-left-by-2: Y0 = REG << 2.
func PRECALC_36(REG VecPhysical) {
	VPSLLD(Imm(2), REG, Y0)
}
 174  
// PRECALC_37 finishes the rotate: REG = (REG >> 30) | (REG << 2),
// yielding the final w[i] vector.
func PRECALC_37(REG VecPhysical) {
	VPSRLD(Imm(30), REG, REG)
	VPOR(REG, Y0, REG)
}
 179  
// PRECALC_39 adds the round constants at *R8+K_OFFSET to the finished w
// vector and stores W+K to the temp buffer at *R14+OFFSET.
func PRECALC_39(REG VecPhysical, K_OFFSET, OFFSET int) {
	VPADDD(Mem{Base: R8}.Offset(K_OFFSET), REG, Y0)
	VMOVDQU(Y0, Mem{Base: R14}.Offset(OFFSET))
}
 184  
 185  // Message scheduling pre-compute for rounds 32-79
 186  // In SHA-1 specification we have:
 187  // w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 188  // Which is the same as:
 189  // w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 190  // This allows for more efficient vectorization,
 191  // since w[i]->w[i-3] dependency is broken
 192  
// PRECALC_32_79 emits the PRECALC_32..39 sequence for one vector of w
// values, using the dependency-broken rol-2 recurrence described in the
// header comment above.
func PRECALC_32_79(REG, REG_SUB_4, REG_SUB_8, REG_SUB_16, REG_SUB_28 VecPhysical, K_OFFSET, OFFSET int) {
	PRECALC_32(REG_SUB_8, REG_SUB_4)
	PRECALC_33(REG_SUB_28, REG)
	PRECALC_34(REG_SUB_16)
	PRECALC_35(REG)
	PRECALC_36(REG)
	PRECALC_37(REG)
	PRECALC_39(REG, K_OFFSET, OFFSET)
}
 202  
// PRECALC emits the complete message-schedule precomputation: W+K for
// all 80 rounds of two interleaved blocks, written to the temp buffer.
// K_OFFSET advances (0, 0x20, 0x40, 0x60) as the rounds cross the four
// SHA-1 K-constant boundaries.
func PRECALC() {
	// Rounds 0-15: load, byte-swap, add K.
	PRECALC_00_15(0, Y15)
	PRECALC_00_15(0x10, Y14)
	PRECALC_00_15(0x20, Y13)
	PRECALC_00_15(0x30, Y12)
	// Rounds 16-31: brute-force vectorization (w[i-3] dependency).
	PRECALC_16_31(Y8, Y12, Y13, Y14, Y15, 0, 0x80)
	PRECALC_16_31(Y7, Y8, Y12, Y13, Y14, 0x20, 0xa0)
	PRECALC_16_31(Y5, Y7, Y8, Y12, Y13, 0x20, 0xc0)
	PRECALC_16_31(Y3, Y5, Y7, Y8, Y12, 0x20, 0xe0)
	// Rounds 32-79: dependency-broken rol-2 recurrence.
	PRECALC_32_79(Y15, Y3, Y5, Y8, Y14, 0x20, 0x100)
	PRECALC_32_79(Y14, Y15, Y3, Y7, Y13, 0x20, 0x120)
	PRECALC_32_79(Y13, Y14, Y15, Y5, Y12, 0x40, 0x140)
	PRECALC_32_79(Y12, Y13, Y14, Y3, Y8, 0x40, 0x160)
	PRECALC_32_79(Y8, Y12, Y13, Y15, Y7, 0x40, 0x180)
	PRECALC_32_79(Y7, Y8, Y12, Y14, Y5, 0x40, 0x1a0)
	PRECALC_32_79(Y5, Y7, Y8, Y13, Y3, 0x40, 0x1c0)
	PRECALC_32_79(Y3, Y5, Y7, Y12, Y15, 0x60, 0x1e0)
	PRECALC_32_79(Y15, Y3, Y5, Y8, Y14, 0x60, 0x200)
	PRECALC_32_79(Y14, Y15, Y3, Y7, Y13, 0x60, 0x220)
	PRECALC_32_79(Y13, Y14, Y15, Y5, Y12, 0x60, 0x240)
	PRECALC_32_79(Y12, Y13, Y14, Y3, Y8, 0x60, 0x260)
}
 225  
 226  // Macros calculating individual rounds have general form
 227  // CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST
 228  // CALC_ROUND_{PRE,POST} macros follow
 229  
// CALC_F1_PRE starts one F1 round: add the precomputed W+K word at
// *R15+OFFSET into e, begin F1 with the ANDN term in EBP (completed in
// CALC_F1_POST, where F1 = (b&c) ^ (~b&d)), fold in the previous
// round's F (carried in REG_B), and produce a rol 5 of a in R12L plus
// the rotated b the next round will use.
func CALC_F1_PRE(OFFSET int, REG_A, REG_B, REG_C, REG_E GPPhysical) {
	ADDL(Mem{Base: R15}.Offset(OFFSET), REG_E)
	ANDNL(REG_C, REG_A, EBP)
	LEAL(Mem{Base: REG_E, Index: REG_B, Scale: 1}, REG_E) // Add F from the previous round
	RORXL(Imm(0x1b), REG_A, R12L)                         // a rol 5 (ror 27)
	RORXL(Imm(2), REG_A, REG_B)                           // b rol 30 (ror 2) for next round
}
 237  
// CALC_F1_POST completes F1 in REG_A (for consumption by the next
// round's PRE) and adds the rol-5 term held in R12 into e.
func CALC_F1_POST(REG_A, REG_B, REG_E GPPhysical) {
	ANDL(REG_B, REG_A)                                  // b&c
	XORL(EBP, REG_A)                                    // F1 = (b&c) ^ (~b&d)
	LEAL(Mem{Base: REG_E, Index: R12, Scale: 1}, REG_E) // E += A >>> 5
}
 243  
 244  // Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX
 245  
// CALC_0 is the first round: there is no previous round to have left an
// F value behind, so it computes F1 for the round by hand (first five
// instructions) before running the standard PRE/PRECALC/POST pattern.
func CALC_0() {
	MOVL(ESI, EBX) // Precalculating first round
	RORXL(Imm(2), ESI, ESI)
	ANDNL(EAX, EBX, EBP)
	ANDL(EDI, EBX)
	XORL(EBP, EBX)
	CALC_F1_PRE(0x0, ECX, EBX, EDI, EDX)
	PRECALC_0(0x80)
	CALC_F1_POST(ECX, ESI, EDX)
}
 256  
 257  func CALC_1() {
 258  	CALC_F1_PRE(0x4, EDX, ECX, ESI, EAX)
 259  	PRECALC_1(0x80)
 260  	CALC_F1_POST(EDX, EBX, EAX)
 261  }
 262  
 263  func CALC_2() {
 264  	CALC_F1_PRE(0x8, EAX, EDX, EBX, EDI)
 265  	PRECALC_2(Y15)
 266  	CALC_F1_POST(EAX, ECX, EDI)
 267  }
 268  
 269  func CALC_3() {
 270  	CALC_F1_PRE(0xc, EDI, EAX, ECX, ESI)
 271  	CALC_F1_POST(EDI, EDX, ESI)
 272  }
 273  
 274  func CALC_4() {
 275  	CALC_F1_PRE(0x20, ESI, EDI, EDX, EBX)
 276  	PRECALC_4(Y15, 0x0)
 277  	CALC_F1_POST(ESI, EAX, EBX)
 278  }
 279  
 280  func CALC_5() {
 281  	CALC_F1_PRE(0x24, EBX, ESI, EAX, ECX)
 282  	CALC_F1_POST(EBX, EDI, ECX)
 283  }
 284  
 285  func CALC_6() {
 286  	CALC_F1_PRE(0x28, ECX, EBX, EDI, EDX)
 287  	CALC_F1_POST(ECX, ESI, EDX)
 288  }
 289  
 290  func CALC_7() {
 291  	CALC_F1_PRE(0x2c, EDX, ECX, ESI, EAX)
 292  	PRECALC_7(0x0)
 293  	CALC_F1_POST(EDX, EBX, EAX)
 294  }
 295  
 296  func CALC_8() {
 297  	CALC_F1_PRE(0x40, EAX, EDX, EBX, EDI)
 298  	PRECALC_0(0x90)
 299  	CALC_F1_POST(EAX, ECX, EDI)
 300  }
 301  
 302  func CALC_9() {
 303  	CALC_F1_PRE(0x44, EDI, EAX, ECX, ESI)
 304  	PRECALC_1(0x90)
 305  	CALC_F1_POST(EDI, EDX, ESI)
 306  }
 307  
 308  func CALC_10() {
 309  	CALC_F1_PRE(0x48, ESI, EDI, EDX, EBX)
 310  	PRECALC_2(Y14)
 311  	CALC_F1_POST(ESI, EAX, EBX)
 312  }
 313  
 314  func CALC_11() {
 315  	CALC_F1_PRE(0x4c, EBX, ESI, EAX, ECX)
 316  	CALC_F1_POST(EBX, EDI, ECX)
 317  }
 318  
 319  func CALC_12() {
 320  	CALC_F1_PRE(0x60, ECX, EBX, EDI, EDX)
 321  	PRECALC_4(Y14, 0x0)
 322  	CALC_F1_POST(ECX, ESI, EDX)
 323  }
 324  
 325  func CALC_13() {
 326  	CALC_F1_PRE(0x64, EDX, ECX, ESI, EAX)
 327  	CALC_F1_POST(EDX, EBX, EAX)
 328  }
 329  
 330  func CALC_14() {
 331  	CALC_F1_PRE(0x68, EAX, EDX, EBX, EDI)
 332  	CALC_F1_POST(EAX, ECX, EDI)
 333  }
 334  
 335  func CALC_15() {
 336  	CALC_F1_PRE(0x6c, EDI, EAX, ECX, ESI)
 337  	PRECALC_7(0x10)
 338  	CALC_F1_POST(EDI, EDX, ESI)
 339  }
 340  
 341  func CALC_16() {
 342  	CALC_F1_PRE(0x80, ESI, EDI, EDX, EBX)
 343  	PRECALC_0(0xa0)
 344  	CALC_F1_POST(ESI, EAX, EBX)
 345  }
 346  
 347  func CALC_17() {
 348  	CALC_F1_PRE(0x84, EBX, ESI, EAX, ECX)
 349  	PRECALC_1(0xa0)
 350  	CALC_F1_POST(EBX, EDI, ECX)
 351  }
 352  
 353  func CALC_18() {
 354  	CALC_F1_PRE(0x88, ECX, EBX, EDI, EDX)
 355  	PRECALC_2(Y13)
 356  	CALC_F1_POST(ECX, ESI, EDX)
 357  }
 358  
// CALC_F2_PRE starts one F2 (parity) round: add the precomputed W+K
// word at *R15+OFFSET into e, fold in the previous round's F (carried
// in REG_B), and produce a rol 5 of a in R12L plus the rotated b for
// the next round.
func CALC_F2_PRE(OFFSET int, REG_A, REG_B, REG_E GPPhysical) {
	ADDL(Mem{Base: R15}.Offset(OFFSET), REG_E)
	LEAL(Mem{Base: REG_E, Index: REG_B, Scale: 1}, REG_E) // Add F from the previous round
	RORXL(Imm(0x1b), REG_A, R12L)                         // a rol 5 (ror 27)
	RORXL(Imm(2), REG_A, REG_B)                           // b rol 30 (ror 2) for next round
}
 365  
// CALC_F2_POST computes F2 = b^c^d into REG_A (consumed by the next
// round's PRE) and adds the rol-5 term into e; the ADDL is interleaved
// between the xors to aid scheduling.
func CALC_F2_POST(REG_A, REG_B, REG_C, REG_E GPPhysical) {
	XORL(REG_B, REG_A)
	ADDL(R12L, REG_E)
	XORL(REG_C, REG_A)
}
 371  
 372  func CALC_19() {
 373  	CALC_F2_PRE(0x8c, EDX, ECX, EAX)
 374  	CALC_F2_POST(EDX, EBX, ESI, EAX)
 375  }
 376  
 377  func CALC_20() {
 378  	CALC_F2_PRE(0xa0, EAX, EDX, EDI)
 379  	PRECALC_4(Y13, 0x0)
 380  	CALC_F2_POST(EAX, ECX, EBX, EDI)
 381  }
 382  
 383  func CALC_21() {
 384  	CALC_F2_PRE(0xa4, EDI, EAX, ESI)
 385  	CALC_F2_POST(EDI, EDX, ECX, ESI)
 386  }
 387  
 388  func CALC_22() {
 389  	CALC_F2_PRE(0xa8, ESI, EDI, EBX)
 390  	CALC_F2_POST(ESI, EAX, EDX, EBX)
 391  }
 392  
 393  func CALC_23() {
 394  	CALC_F2_PRE(0xac, EBX, ESI, ECX)
 395  	PRECALC_7(0x20)
 396  	CALC_F2_POST(EBX, EDI, EAX, ECX)
 397  }
 398  
 399  func CALC_24() {
 400  	CALC_F2_PRE(0xc0, ECX, EBX, EDX)
 401  	PRECALC_0(0xb0)
 402  	CALC_F2_POST(ECX, ESI, EDI, EDX)
 403  }
 404  
 405  func CALC_25() {
 406  	CALC_F2_PRE(0xc4, EDX, ECX, EAX)
 407  	PRECALC_1(0xb0)
 408  	CALC_F2_POST(EDX, EBX, ESI, EAX)
 409  }
 410  
 411  func CALC_26() {
 412  	CALC_F2_PRE(0xc8, EAX, EDX, EDI)
 413  	PRECALC_2(Y12)
 414  	CALC_F2_POST(EAX, ECX, EBX, EDI)
 415  }
 416  
 417  func CALC_27() {
 418  	CALC_F2_PRE(0xcc, EDI, EAX, ESI)
 419  	CALC_F2_POST(EDI, EDX, ECX, ESI)
 420  }
 421  
 422  func CALC_28() {
 423  	CALC_F2_PRE(0xe0, ESI, EDI, EBX)
 424  	PRECALC_4(Y12, 0x0)
 425  	CALC_F2_POST(ESI, EAX, EDX, EBX)
 426  }
 427  
 428  func CALC_29() {
 429  	CALC_F2_PRE(0xe4, EBX, ESI, ECX)
 430  	CALC_F2_POST(EBX, EDI, EAX, ECX)
 431  }
 432  
 433  func CALC_30() {
 434  	CALC_F2_PRE(0xe8, ECX, EBX, EDX)
 435  	CALC_F2_POST(ECX, ESI, EDI, EDX)
 436  }
 437  
 438  func CALC_31() {
 439  	CALC_F2_PRE(0xec, EDX, ECX, EAX)
 440  	PRECALC_7(0x30)
 441  	CALC_F2_POST(EDX, EBX, ESI, EAX)
 442  }
 443  
 444  func CALC_32() {
 445  	CALC_F2_PRE(0x100, EAX, EDX, EDI)
 446  	PRECALC_16(Y15, Y14, Y12, Y8)
 447  	CALC_F2_POST(EAX, ECX, EBX, EDI)
 448  }
 449  
 450  func CALC_33() {
 451  	CALC_F2_PRE(0x104, EDI, EAX, ESI)
 452  	PRECALC_17(Y15, Y13, Y8)
 453  	CALC_F2_POST(EDI, EDX, ECX, ESI)
 454  }
 455  
 456  func CALC_34() {
 457  	CALC_F2_PRE(0x108, ESI, EDI, EBX)
 458  	PRECALC_18(Y8)
 459  	CALC_F2_POST(ESI, EAX, EDX, EBX)
 460  }
 461  
 462  func CALC_35() {
 463  	CALC_F2_PRE(0x10c, EBX, ESI, ECX)
 464  	PRECALC_19(Y8)
 465  	CALC_F2_POST(EBX, EDI, EAX, ECX)
 466  }
 467  
 468  func CALC_36() {
 469  	CALC_F2_PRE(0x120, ECX, EBX, EDX)
 470  	PRECALC_20(Y8)
 471  	CALC_F2_POST(ECX, ESI, EDI, EDX)
 472  }
 473  
 474  func CALC_37() {
 475  	CALC_F2_PRE(0x124, EDX, ECX, EAX)
 476  	PRECALC_21(Y8)
 477  	CALC_F2_POST(EDX, EBX, ESI, EAX)
 478  }
 479  
 480  func CALC_38() {
 481  	CALC_F2_PRE(0x128, EAX, EDX, EDI)
 482  	CALC_F2_POST(EAX, ECX, EBX, EDI)
 483  }
 484  
// CALC_F3_PRE adds the precomputed W+K word at *R15+OFFSET into e; the
// rest of the F3 round lives in CALC_F3_POST.
func CALC_F3_PRE(OFFSET int, REG_E GPPhysical) {
	ADDL(Mem{Base: R15}.Offset(OFFSET), REG_E)
}
 488  
// CALC_F3_POST performs the rest of an F3 (majority) round: fold in the
// previous round's F (REG_TB), compute
// F3 = (a&b) | ((a|b)&c) into REG_A for the next round, produce the
// rotated b in REG_TB, and add the rol-5 term into e.
func CALC_F3_POST(REG_A, REG_B, REG_C, REG_E, REG_TB GPPhysical) {
	LEAL(Mem{Base: REG_E, Index: REG_TB, Scale: 1}, REG_E) // Add F from the previous round
	MOVL(REG_B, EBP)
	ORL(REG_A, EBP)              // a|b
	RORXL(Imm(0x1b), REG_A, R12L) // a rol 5 (ror 27)
	RORXL(Imm(2), REG_A, REG_TB)  // b rol 30 for next round
	ANDL(REG_C, EBP)             // (a|b)&c
	ANDL(REG_B, REG_A)           // a&b
	ORL(EBP, REG_A)              // majority
	ADDL(R12L, REG_E)
}
 500  
 501  func CALC_39() {
 502  	CALC_F3_PRE(0x12c, ESI)
 503  	PRECALC_23(Y8, 0x0, 0x80)
 504  	CALC_F3_POST(EDI, EDX, ECX, ESI, EAX)
 505  }
 506  
 507  func CALC_40() {
 508  	CALC_F3_PRE(0x140, EBX)
 509  	PRECALC_16(Y14, Y13, Y8, Y7)
 510  	CALC_F3_POST(ESI, EAX, EDX, EBX, EDI)
 511  }
 512  
 513  func CALC_41() {
 514  	CALC_F3_PRE(0x144, ECX)
 515  	PRECALC_17(Y14, Y12, Y7)
 516  	CALC_F3_POST(EBX, EDI, EAX, ECX, ESI)
 517  }
 518  
 519  func CALC_42() {
 520  	CALC_F3_PRE(0x148, EDX)
 521  	PRECALC_18(Y7)
 522  	CALC_F3_POST(ECX, ESI, EDI, EDX, EBX)
 523  }
 524  
 525  func CALC_43() {
 526  	CALC_F3_PRE(0x14c, EAX)
 527  	PRECALC_19(Y7)
 528  	CALC_F3_POST(EDX, EBX, ESI, EAX, ECX)
 529  }
 530  
 531  func CALC_44() {
 532  	CALC_F3_PRE(0x160, EDI)
 533  	PRECALC_20(Y7)
 534  	CALC_F3_POST(EAX, ECX, EBX, EDI, EDX)
 535  }
 536  
 537  func CALC_45() {
 538  	CALC_F3_PRE(0x164, ESI)
 539  	PRECALC_21(Y7)
 540  	CALC_F3_POST(EDI, EDX, ECX, ESI, EAX)
 541  }
 542  
 543  func CALC_46() {
 544  	CALC_F3_PRE(0x168, EBX)
 545  	CALC_F3_POST(ESI, EAX, EDX, EBX, EDI)
 546  }
 547  
 548  func CALC_47() {
 549  	CALC_F3_PRE(0x16c, ECX)
 550  	VPXOR(Y9, Y0, Y7)
 551  	VPADDD(Mem{Base: R8}.Offset(0x20), Y7, Y0)
 552  	VMOVDQU(Y0, Mem{Base: R14}.Offset(0xa0))
 553  	CALC_F3_POST(EBX, EDI, EAX, ECX, ESI)
 554  }
 555  
 556  func CALC_48() {
 557  	CALC_F3_PRE(0x180, EDX)
 558  	PRECALC_16(Y13, Y12, Y7, Y5)
 559  	CALC_F3_POST(ECX, ESI, EDI, EDX, EBX)
 560  }
 561  
 562  func CALC_49() {
 563  	CALC_F3_PRE(0x184, EAX)
 564  	PRECALC_17(Y13, Y8, Y5)
 565  	CALC_F3_POST(EDX, EBX, ESI, EAX, ECX)
 566  }
 567  
 568  func CALC_50() {
 569  	CALC_F3_PRE(0x188, EDI)
 570  	PRECALC_18(Y5)
 571  	CALC_F3_POST(EAX, ECX, EBX, EDI, EDX)
 572  }
 573  
 574  func CALC_51() {
 575  	CALC_F3_PRE(0x18c, ESI)
 576  	PRECALC_19(Y5)
 577  	CALC_F3_POST(EDI, EDX, ECX, ESI, EAX)
 578  }
 579  
 580  func CALC_52() {
 581  	CALC_F3_PRE(0x1a0, EBX)
 582  	PRECALC_20(Y5)
 583  	CALC_F3_POST(ESI, EAX, EDX, EBX, EDI)
 584  }
 585  
 586  func CALC_53() {
 587  	CALC_F3_PRE(0x1a4, ECX)
 588  	PRECALC_21(Y5)
 589  	CALC_F3_POST(EBX, EDI, EAX, ECX, ESI)
 590  }
 591  
 592  func CALC_54() {
 593  	CALC_F3_PRE(0x1a8, EDX)
 594  	CALC_F3_POST(ECX, ESI, EDI, EDX, EBX)
 595  }
 596  
 597  func CALC_55() {
 598  	CALC_F3_PRE(0x1ac, EAX)
 599  	PRECALC_23(Y5, 0x20, 0xc0)
 600  	CALC_F3_POST(EDX, EBX, ESI, EAX, ECX)
 601  }
 602  
 603  func CALC_56() {
 604  	CALC_F3_PRE(0x1c0, EDI)
 605  	PRECALC_16(Y12, Y8, Y5, Y3)
 606  	CALC_F3_POST(EAX, ECX, EBX, EDI, EDX)
 607  }
 608  
 609  func CALC_57() {
 610  	CALC_F3_PRE(0x1c4, ESI)
 611  	PRECALC_17(Y12, Y7, Y3)
 612  	CALC_F3_POST(EDI, EDX, ECX, ESI, EAX)
 613  }
 614  
 615  func CALC_58() {
 616  	CALC_F3_PRE(0x1c8, EBX)
 617  	PRECALC_18(Y3)
 618  	CALC_F3_POST(ESI, EAX, EDX, EBX, EDI)
 619  }
 620  
 621  func CALC_59() {
 622  	CALC_F2_PRE(0x1cc, EBX, ESI, ECX)
 623  	PRECALC_19(Y3)
 624  	CALC_F2_POST(EBX, EDI, EAX, ECX)
 625  }
 626  
 627  func CALC_60() {
 628  	CALC_F2_PRE(0x1e0, ECX, EBX, EDX)
 629  	PRECALC_20(Y3)
 630  	CALC_F2_POST(ECX, ESI, EDI, EDX)
 631  }
 632  
 633  func CALC_61() {
 634  	CALC_F2_PRE(0x1e4, EDX, ECX, EAX)
 635  	PRECALC_21(Y3)
 636  	CALC_F2_POST(EDX, EBX, ESI, EAX)
 637  }
 638  
 639  func CALC_62() {
 640  	CALC_F2_PRE(0x1e8, EAX, EDX, EDI)
 641  	CALC_F2_POST(EAX, ECX, EBX, EDI)
 642  }
 643  
 644  func CALC_63() {
 645  	CALC_F2_PRE(0x1ec, EDI, EAX, ESI)
 646  	PRECALC_23(Y3, 0x20, 0xe0)
 647  	CALC_F2_POST(EDI, EDX, ECX, ESI)
 648  }
 649  
 650  func CALC_64() {
 651  	CALC_F2_PRE(0x200, ESI, EDI, EBX)
 652  	PRECALC_32(Y5, Y3)
 653  	CALC_F2_POST(ESI, EAX, EDX, EBX)
 654  }
 655  
 656  func CALC_65() {
 657  	CALC_F2_PRE(0x204, EBX, ESI, ECX)
 658  	PRECALC_33(Y14, Y15)
 659  	CALC_F2_POST(EBX, EDI, EAX, ECX)
 660  }
 661  
 662  func CALC_66() {
 663  	CALC_F2_PRE(0x208, ECX, EBX, EDX)
 664  	PRECALC_34(Y8)
 665  	CALC_F2_POST(ECX, ESI, EDI, EDX)
 666  }
 667  
 668  func CALC_67() {
 669  	CALC_F2_PRE(0x20c, EDX, ECX, EAX)
 670  	PRECALC_35(Y15)
 671  	CALC_F2_POST(EDX, EBX, ESI, EAX)
 672  }
 673  
 674  func CALC_68() {
 675  	CALC_F2_PRE(0x220, EAX, EDX, EDI)
 676  	PRECALC_36(Y15)
 677  	CALC_F2_POST(EAX, ECX, EBX, EDI)
 678  }
 679  
 680  func CALC_69() {
 681  	CALC_F2_PRE(0x224, EDI, EAX, ESI)
 682  	PRECALC_37(Y15)
 683  	CALC_F2_POST(EDI, EDX, ECX, ESI)
 684  }
 685  
 686  func CALC_70() {
 687  	CALC_F2_PRE(0x228, ESI, EDI, EBX)
 688  	CALC_F2_POST(ESI, EAX, EDX, EBX)
 689  }
 690  
 691  func CALC_71() {
 692  	CALC_F2_PRE(0x22c, EBX, ESI, ECX)
 693  	PRECALC_39(Y15, 0x20, 0x100)
 694  	CALC_F2_POST(EBX, EDI, EAX, ECX)
 695  }
 696  
 697  func CALC_72() {
 698  	CALC_F2_PRE(0x240, ECX, EBX, EDX)
 699  	PRECALC_32(Y3, Y15)
 700  	CALC_F2_POST(ECX, ESI, EDI, EDX)
 701  }
 702  
 703  func CALC_73() {
 704  	CALC_F2_PRE(0x244, EDX, ECX, EAX)
 705  	PRECALC_33(Y13, Y14)
 706  	CALC_F2_POST(EDX, EBX, ESI, EAX)
 707  }
 708  
 709  func CALC_74() {
 710  	CALC_F2_PRE(0x248, EAX, EDX, EDI)
 711  	PRECALC_34(Y7)
 712  	CALC_F2_POST(EAX, ECX, EBX, EDI)
 713  }
 714  
 715  func CALC_75() {
 716  	CALC_F2_PRE(0x24c, EDI, EAX, ESI)
 717  	PRECALC_35(Y14)
 718  	CALC_F2_POST(EDI, EDX, ECX, ESI)
 719  }
 720  
 721  func CALC_76() {
 722  	CALC_F2_PRE(0x260, ESI, EDI, EBX)
 723  	PRECALC_36(Y14)
 724  	CALC_F2_POST(ESI, EAX, EDX, EBX)
 725  }
 726  
 727  func CALC_77() {
 728  	CALC_F2_PRE(0x264, EBX, ESI, ECX)
 729  	PRECALC_37(Y14)
 730  	CALC_F2_POST(EBX, EDI, EAX, ECX)
 731  }
 732  
 733  func CALC_78() {
 734  	CALC_F2_PRE(0x268, ECX, EBX, EDX)
 735  	CALC_F2_POST(ECX, ESI, EDI, EDX)
 736  }
 737  
// CALC_79 is written out by hand rather than via CALC_F2_PRE/POST: it
// omits the rotate-for-next-round (RORXL 2) and the F computation,
// since no subsequent round in this run consumes them.
func CALC_79() {
	ADDL(Mem{Base: R15}.Offset(0x26c), EAX)
	LEAL(Mem{Base: AX, Index: CX, Scale: 1}, EAX) // add previous round's F
	RORXL(Imm(0x1b), EDX, R12L)
	PRECALC_39(Y14, 0x20, 0x120)
	ADDL(R12L, EAX)
}
 745  
// CALC_80 is similar to CALC_0: the incoming F value is not available
// from a previous CALC_F*_PRE, so it recomputes F1 by hand before
// resuming the standard PRE/PRECALC/POST pattern.
func CALC_80() {
	MOVL(ECX, EDX)
	RORXL(Imm(2), ECX, ECX)
	ANDNL(ESI, EDX, EBP)
	ANDL(EBX, EDX)
	XORL(EBP, EDX)
	CALC_F1_PRE(0x10, EAX, EDX, EBX, EDI)
	PRECALC_32(Y15, Y14)
	CALC_F1_POST(EAX, ECX, EDI)
}
 757  
 758  func CALC_81() {
 759  	CALC_F1_PRE(0x14, EDI, EAX, ECX, ESI)
 760  	PRECALC_33(Y12, Y13)
 761  	CALC_F1_POST(EDI, EDX, ESI)
 762  }
 763  
 764  func CALC_82() {
 765  	CALC_F1_PRE(0x18, ESI, EDI, EDX, EBX)
 766  	PRECALC_34(Y5)
 767  	CALC_F1_POST(ESI, EAX, EBX)
 768  }
 769  
 770  func CALC_83() {
 771  	CALC_F1_PRE(0x1c, EBX, ESI, EAX, ECX)
 772  	PRECALC_35(Y13)
 773  	CALC_F1_POST(EBX, EDI, ECX)
 774  }
 775  
 776  func CALC_84() {
 777  	CALC_F1_PRE(0x30, ECX, EBX, EDI, EDX)
 778  	PRECALC_36(Y13)
 779  	CALC_F1_POST(ECX, ESI, EDX)
 780  }
 781  
 782  func CALC_85() {
 783  	CALC_F1_PRE(0x34, EDX, ECX, ESI, EAX)
 784  	PRECALC_37(Y13)
 785  	CALC_F1_POST(EDX, EBX, EAX)
 786  }
 787  
 788  func CALC_86() {
 789  	CALC_F1_PRE(0x38, EAX, EDX, EBX, EDI)
 790  	CALC_F1_POST(EAX, ECX, EDI)
 791  }
 792  
 793  func CALC_87() {
 794  	CALC_F1_PRE(0x3c, EDI, EAX, ECX, ESI)
 795  	PRECALC_39(Y13, 0x40, 0x140)
 796  	CALC_F1_POST(EDI, EDX, ESI)
 797  }
 798  
 799  func CALC_88() {
 800  	CALC_F1_PRE(0x50, ESI, EDI, EDX, EBX)
 801  	PRECALC_32(Y14, Y13)
 802  	CALC_F1_POST(ESI, EAX, EBX)
 803  }
 804  
 805  func CALC_89() {
 806  	CALC_F1_PRE(0x54, EBX, ESI, EAX, ECX)
 807  	PRECALC_33(Y8, Y12)
 808  	CALC_F1_POST(EBX, EDI, ECX)
 809  }
 810  
 811  func CALC_90() {
 812  	CALC_F1_PRE(0x58, ECX, EBX, EDI, EDX)
 813  	PRECALC_34(Y3)
 814  	CALC_F1_POST(ECX, ESI, EDX)
 815  }
 816  
 817  func CALC_91() {
 818  	CALC_F1_PRE(0x5c, EDX, ECX, ESI, EAX)
 819  	PRECALC_35(Y12)
 820  	CALC_F1_POST(EDX, EBX, EAX)
 821  }
 822  
 823  func CALC_92() {
 824  	CALC_F1_PRE(0x70, EAX, EDX, EBX, EDI)
 825  	PRECALC_36(Y12)
 826  	CALC_F1_POST(EAX, ECX, EDI)
 827  }
 828  
 829  func CALC_93() {
 830  	CALC_F1_PRE(0x74, EDI, EAX, ECX, ESI)
 831  	PRECALC_37(Y12)
 832  	CALC_F1_POST(EDI, EDX, ESI)
 833  }
 834  
 835  func CALC_94() {
 836  	CALC_F1_PRE(0x78, ESI, EDI, EDX, EBX)
 837  	CALC_F1_POST(ESI, EAX, EBX)
 838  }
 839  
 840  func CALC_95() {
 841  	CALC_F1_PRE(0x7c, EBX, ESI, EAX, ECX)
 842  	PRECALC_39(Y12, 0x40, 0x160)
 843  	CALC_F1_POST(EBX, EDI, ECX)
 844  }
 845  
 846  func CALC_96() {
 847  	CALC_F1_PRE(0x90, ECX, EBX, EDI, EDX)
 848  	PRECALC_32(Y13, Y12)
 849  	CALC_F1_POST(ECX, ESI, EDX)
 850  }
 851  
 852  func CALC_97() {
 853  	CALC_F1_PRE(0x94, EDX, ECX, ESI, EAX)
 854  	PRECALC_33(Y7, Y8)
 855  	CALC_F1_POST(EDX, EBX, EAX)
 856  }
 857  
 858  func CALC_98() {
 859  	CALC_F1_PRE(0x98, EAX, EDX, EBX, EDI)
 860  	PRECALC_34(Y15)
 861  	CALC_F1_POST(EAX, ECX, EDI)
 862  }
 863  
 864  func CALC_99() {
 865  	CALC_F2_PRE(0x9c, EDI, EAX, ESI)
 866  	PRECALC_35(Y8)
 867  	CALC_F2_POST(EDI, EDX, ECX, ESI)
 868  }
 869  
 870  func CALC_100() {
 871  	CALC_F2_PRE(0xb0, ESI, EDI, EBX)
 872  	PRECALC_36(Y8)
 873  	CALC_F2_POST(ESI, EAX, EDX, EBX)
 874  }
 875  
 876  func CALC_101() {
 877  	CALC_F2_PRE(0xb4, EBX, ESI, ECX)
 878  	PRECALC_37(Y8)
 879  	CALC_F2_POST(EBX, EDI, EAX, ECX)
 880  }
 881  
 882  func CALC_102() {
 883  	CALC_F2_PRE(0xb8, ECX, EBX, EDX)
 884  	CALC_F2_POST(ECX, ESI, EDI, EDX)
 885  }
 886  
 887  func CALC_103() {
 888  	CALC_F2_PRE(0xbc, EDX, ECX, EAX)
 889  	PRECALC_39(Y8, 0x40, 0x180)
 890  	CALC_F2_POST(EDX, EBX, ESI, EAX)
 891  }
 892  
 893  func CALC_104() {
 894  	CALC_F2_PRE(0xd0, EAX, EDX, EDI)
 895  	PRECALC_32(Y12, Y8)
 896  	CALC_F2_POST(EAX, ECX, EBX, EDI)
 897  }
 898  
 899  func CALC_105() {
 900  	CALC_F2_PRE(0xd4, EDI, EAX, ESI)
 901  	PRECALC_33(Y5, Y7)
 902  	CALC_F2_POST(EDI, EDX, ECX, ESI)
 903  }
 904  
 905  func CALC_106() {
 906  	CALC_F2_PRE(0xd8, ESI, EDI, EBX)
 907  	PRECALC_34(Y14)
 908  	CALC_F2_POST(ESI, EAX, EDX, EBX)
 909  }
 910  
 911  func CALC_107() {
 912  	CALC_F2_PRE(0xdc, EBX, ESI, ECX)
 913  	PRECALC_35(Y7)
 914  	CALC_F2_POST(EBX, EDI, EAX, ECX)
 915  }
 916  
 917  func CALC_108() {
 918  	CALC_F2_PRE(0xf0, ECX, EBX, EDX)
 919  	PRECALC_36(Y7)
 920  	CALC_F2_POST(ECX, ESI, EDI, EDX)
 921  }
 922  
 923  func CALC_109() {
 924  	CALC_F2_PRE(0xf4, EDX, ECX, EAX)
 925  	PRECALC_37(Y7)
 926  	CALC_F2_POST(EDX, EBX, ESI, EAX)
 927  }
 928  
 929  func CALC_110() {
 930  	CALC_F2_PRE(0xf8, EAX, EDX, EDI)
 931  	CALC_F2_POST(EAX, ECX, EBX, EDI)
 932  }
 933  
 934  func CALC_111() {
 935  	CALC_F2_PRE(0xfc, EDI, EAX, ESI)
 936  	PRECALC_39(Y7, 0x40, 0x1a0)
 937  	CALC_F2_POST(EDI, EDX, ECX, ESI)
 938  }
 939  
 940  func CALC_112() {
 941  	CALC_F2_PRE(0x110, ESI, EDI, EBX)
 942  	PRECALC_32(Y8, Y7)
 943  	CALC_F2_POST(ESI, EAX, EDX, EBX)
 944  }
 945  
 946  func CALC_113() {
 947  	CALC_F2_PRE(0x114, EBX, ESI, ECX)
 948  	PRECALC_33(Y3, Y5)
 949  	CALC_F2_POST(EBX, EDI, EAX, ECX)
 950  }
 951  
 952  func CALC_114() {
 953  	CALC_F2_PRE(0x118, ECX, EBX, EDX)
 954  	PRECALC_34(Y13)
 955  	CALC_F2_POST(ECX, ESI, EDI, EDX)
 956  }
 957  
 958  func CALC_115() {
 959  	CALC_F2_PRE(0x11c, EDX, ECX, EAX)
 960  	PRECALC_35(Y5)
 961  	CALC_F2_POST(EDX, EBX, ESI, EAX)
 962  }
 963  
 964  func CALC_116() {
 965  	CALC_F2_PRE(0x130, EAX, EDX, EDI)
 966  	PRECALC_36(Y5)
 967  	CALC_F2_POST(EAX, ECX, EBX, EDI)
 968  }
 969  
 970  func CALC_117() {
 971  	CALC_F2_PRE(0x134, EDI, EAX, ESI)
 972  	PRECALC_37(Y5)
 973  	CALC_F2_POST(EDI, EDX, ECX, ESI)
 974  }
 975  
 976  func CALC_118() {
 977  	CALC_F2_PRE(0x138, ESI, EDI, EBX)
 978  	CALC_F2_POST(ESI, EAX, EDX, EBX)
 979  }
 980  
 981  func CALC_119() {
 982  	CALC_F3_PRE(0x13c, ECX)
 983  	PRECALC_39(Y5, 0x40, 0x1c0)
 984  	CALC_F3_POST(EBX, EDI, EAX, ECX, ESI)
 985  }
 986  
 987  func CALC_120() {
 988  	CALC_F3_PRE(0x150, EDX)
 989  	PRECALC_32(Y7, Y5)
 990  	CALC_F3_POST(ECX, ESI, EDI, EDX, EBX)
 991  }
 992  
 993  func CALC_121() {
 994  	CALC_F3_PRE(0x154, EAX)
 995  	PRECALC_33(Y15, Y3)
 996  	CALC_F3_POST(EDX, EBX, ESI, EAX, ECX)
 997  }
 998  
 999  func CALC_122() {
1000  	CALC_F3_PRE(0x158, EDI)
1001  	PRECALC_34(Y12)
1002  	CALC_F3_POST(EAX, ECX, EBX, EDI, EDX)
1003  }
1004  
1005  func CALC_123() {
1006  	CALC_F3_PRE(0x15c, ESI)
1007  	PRECALC_35(Y3)
1008  	CALC_F3_POST(EDI, EDX, ECX, ESI, EAX)
1009  }
1010  
1011  func CALC_124() {
1012  	CALC_F3_PRE(0x170, EBX)
1013  	PRECALC_36(Y3)
1014  	CALC_F3_POST(ESI, EAX, EDX, EBX, EDI)
1015  }
1016  
1017  func CALC_125() {
1018  	CALC_F3_PRE(0x174, ECX)
1019  	PRECALC_37(Y3)
1020  	CALC_F3_POST(EBX, EDI, EAX, ECX, ESI)
1021  }
1022  
1023  func CALC_126() {
1024  	CALC_F3_PRE(0x178, EDX)
1025  	CALC_F3_POST(ECX, ESI, EDI, EDX, EBX)
1026  }
1027  
1028  func CALC_127() {
1029  	CALC_F3_PRE(0x17c, EAX)
1030  	PRECALC_39(Y3, 0x60, 0x1e0)
1031  	CALC_F3_POST(EDX, EBX, ESI, EAX, ECX)
1032  }
1033  
1034  func CALC_128() {
1035  	CALC_F3_PRE(0x190, EDI)
1036  	PRECALC_32(Y5, Y3)
1037  	CALC_F3_POST(EAX, ECX, EBX, EDI, EDX)
1038  }
1039  
1040  func CALC_129() {
1041  	CALC_F3_PRE(0x194, ESI)
1042  	PRECALC_33(Y14, Y15)
1043  	CALC_F3_POST(EDI, EDX, ECX, ESI, EAX)
1044  }
1045  
1046  func CALC_130() {
1047  	CALC_F3_PRE(0x198, EBX)
1048  	PRECALC_34(Y8)
1049  	CALC_F3_POST(ESI, EAX, EDX, EBX, EDI)
1050  }
1051  
1052  func CALC_131() {
1053  	CALC_F3_PRE(0x19c, ECX)
1054  	PRECALC_35(Y15)
1055  	CALC_F3_POST(EBX, EDI, EAX, ECX, ESI)
1056  }
1057  
1058  func CALC_132() {
1059  	CALC_F3_PRE(0x1b0, EDX)
1060  	PRECALC_36(Y15)
1061  	CALC_F3_POST(ECX, ESI, EDI, EDX, EBX)
1062  }
1063  
1064  func CALC_133() {
1065  	CALC_F3_PRE(0x1b4, EAX)
1066  	PRECALC_37(Y15)
1067  	CALC_F3_POST(EDX, EBX, ESI, EAX, ECX)
1068  }
1069  
1070  func CALC_134() {
1071  	CALC_F3_PRE(0x1b8, EDI)
1072  	CALC_F3_POST(EAX, ECX, EBX, EDI, EDX)
1073  }
1074  
1075  func CALC_135() {
1076  	CALC_F3_PRE(0x1bc, ESI)
1077  	PRECALC_39(Y15, 0x60, 0x200)
1078  	CALC_F3_POST(EDI, EDX, ECX, ESI, EAX)
1079  }
1080  
1081  func CALC_136() {
1082  	CALC_F3_PRE(0x1d0, EBX)
1083  	PRECALC_32(Y3, Y15)
1084  	CALC_F3_POST(ESI, EAX, EDX, EBX, EDI)
1085  }
1086  
1087  func CALC_137() {
1088  	CALC_F3_PRE(0x1d4, ECX)
1089  	PRECALC_33(Y13, Y14)
1090  	CALC_F3_POST(EBX, EDI, EAX, ECX, ESI)
1091  }
1092  
1093  func CALC_138() {
1094  	CALC_F3_PRE(0x1d8, EDX)
1095  	PRECALC_34(Y7)
1096  	CALC_F3_POST(ECX, ESI, EDI, EDX, EBX)
1097  }
1098  
1099  func CALC_139() {
1100  	CALC_F2_PRE(0x1dc, EDX, ECX, EAX)
1101  	PRECALC_35(Y14)
1102  	CALC_F2_POST(EDX, EBX, ESI, EAX)
1103  }
1104  
1105  func CALC_140() {
1106  	CALC_F2_PRE(0x1f0, EAX, EDX, EDI)
1107  	PRECALC_36(Y14)
1108  	CALC_F2_POST(EAX, ECX, EBX, EDI)
1109  }
1110  
1111  func CALC_141() {
1112  	CALC_F2_PRE(0x1f4, EDI, EAX, ESI)
1113  	PRECALC_37(Y14)
1114  	CALC_F2_POST(EDI, EDX, ECX, ESI)
1115  }
1116  
1117  func CALC_142() {
1118  	CALC_F2_PRE(0x1f8, ESI, EDI, EBX)
1119  	CALC_F2_POST(ESI, EAX, EDX, EBX)
1120  }
1121  
1122  func CALC_143() {
1123  	CALC_F2_PRE(0x1fc, EBX, ESI, ECX)
1124  	PRECALC_39(Y14, 0x60, 0x220)
1125  	CALC_F2_POST(EBX, EDI, EAX, ECX)
1126  }
1127  
1128  func CALC_144() {
1129  	CALC_F2_PRE(0x210, ECX, EBX, EDX)
1130  	PRECALC_32(Y15, Y14)
1131  	CALC_F2_POST(ECX, ESI, EDI, EDX)
1132  }
1133  
1134  func CALC_145() {
1135  	CALC_F2_PRE(0x214, EDX, ECX, EAX)
1136  	PRECALC_33(Y12, Y13)
1137  	CALC_F2_POST(EDX, EBX, ESI, EAX)
1138  }
1139  
1140  func CALC_146() {
1141  	CALC_F2_PRE(0x218, EAX, EDX, EDI)
1142  	PRECALC_34(Y5)
1143  	CALC_F2_POST(EAX, ECX, EBX, EDI)
1144  }
1145  
1146  func CALC_147() {
1147  	CALC_F2_PRE(0x21c, EDI, EAX, ESI)
1148  	PRECALC_35(Y13)
1149  	CALC_F2_POST(EDI, EDX, ECX, ESI)
1150  }
1151  
1152  func CALC_148() {
1153  	CALC_F2_PRE(0x230, ESI, EDI, EBX)
1154  	PRECALC_36(Y13)
1155  	CALC_F2_POST(ESI, EAX, EDX, EBX)
1156  }
1157  
1158  func CALC_149() {
1159  	CALC_F2_PRE(0x234, EBX, ESI, ECX)
1160  	PRECALC_37(Y13)
1161  	CALC_F2_POST(EBX, EDI, EAX, ECX)
1162  }
1163  
1164  func CALC_150() {
1165  	CALC_F2_PRE(0x238, ECX, EBX, EDX)
1166  	CALC_F2_POST(ECX, ESI, EDI, EDX)
1167  }
1168  
1169  func CALC_151() {
1170  	CALC_F2_PRE(0x23c, EDX, ECX, EAX)
1171  	PRECALC_39(Y13, 0x60, 0x240)
1172  	CALC_F2_POST(EDX, EBX, ESI, EAX)
1173  }
1174  
1175  func CALC_152() {
1176  	CALC_F2_PRE(0x250, EAX, EDX, EDI)
1177  	PRECALC_32(Y14, Y13)
1178  	CALC_F2_POST(EAX, ECX, EBX, EDI)
1179  }
1180  
1181  func CALC_153() {
1182  	CALC_F2_PRE(0x254, EDI, EAX, ESI)
1183  	PRECALC_33(Y8, Y12)
1184  	CALC_F2_POST(EDI, EDX, ECX, ESI)
1185  }
1186  
1187  func CALC_154() {
1188  	CALC_F2_PRE(0x258, ESI, EDI, EBX)
1189  	PRECALC_34(Y3)
1190  	CALC_F2_POST(ESI, EAX, EDX, EBX)
1191  }
1192  
1193  func CALC_155() {
1194  	CALC_F2_PRE(0x25c, EBX, ESI, ECX)
1195  	PRECALC_35(Y12)
1196  	CALC_F2_POST(EBX, EDI, EAX, ECX)
1197  }
1198  
// CALC_156 emits one F2-style scalar round (WK at 0x270), interleaved
// with vector precalc step PRECALC_36.
func CALC_156() {
	CALC_F2_PRE(0x270, ECX, EBX, EDX)
	PRECALC_36(Y12)
	CALC_F2_POST(ECX, ESI, EDI, EDX)
}
1204  
// CALC_157 emits one F2-style scalar round (WK at 0x274), interleaved
// with vector precalc step PRECALC_37.
func CALC_157() {
	CALC_F2_PRE(0x274, EDX, ECX, EAX)
	PRECALC_37(Y12)
	CALC_F2_POST(EDX, EBX, ESI, EAX)
}
1210  
// CALC_158 emits one F2-style scalar round (WK at 0x278).
// No vector precalc step is interleaved with this round.
func CALC_158() {
	CALC_F2_PRE(0x278, EAX, EDX, EDI)
	CALC_F2_POST(EAX, ECX, EBX, EDI)
}
1215  
// CALC_159 emits the final round of the second block inline rather than
// through the F2 PRE/POST helpers: it adds the last WK value (frame
// offset 0x27c) and the rotated b term into e, with one PRECALC_39
// vector step interleaved. No POST step is emitted since there is no
// following round to feed.
func CALC_159() {
	ADDL(Mem{Base: R15}.Offset(0x27c), ESI)
	LEAL(Mem{Base: SI, Index: AX, Scale: 1}, ESI)
	RORXL(Imm(0x1b), EDI, R12L)
	PRECALC_39(Y12, 0x60, 0x260)
	ADDL(R12L, ESI)
}
1223  
// CALC emits the main computation: it loads the five 32-bit digest words
// from the digest pointer (R9) into the working registers, sets up the
// two precomputed-WK stack buffers in R14/R15, precomputes WK for the
// first two blocks, and then emits the loop head and its unrolled body.
func CALC() {
	// Load h0..h4 from the digest.
	MOVL(Mem{Base: R9}, ECX)
	MOVL(Mem{Base: R9}.Offset(4), ESI)
	MOVL(Mem{Base: R9}.Offset(8), EDI)
	MOVL(Mem{Base: R9}.Offset(12), EAX)
	MOVL(Mem{Base: R9}.Offset(16), EDX)
	MOVQ(RSP, R14)                             // first WK buffer (bottom of frame)
	LEAQ(Mem{Base: SP}.Offset(2*4*80+32), R15) // second WK buffer
	PRECALC() // Precalc WK for first 2 blocks
	// Swap the buffer pointers so the rounds read the buffer just filled
	// while the interleaved precalc writes the other one.
	XCHGQ(R15, R14)
	loop_avx2()
	begin()
}
1237  
// the main loop; its body is fully unrolled (see begin)
// loop_avx2 emits the top of the main loop: once R10 has been set to the
// sentinel value held in R8, the last block has been processed, so clear
// the upper YMM state and return; otherwise fall through to the unrolled
// body at "begin".
func loop_avx2() {
	Label("loop")
	CMPQ(R10, R8) // we use R8 value (set below) as a signal of a last block
	JNE(LabelRef("begin"))
	VZEROUPPER()
	RET()
}
1246  
// begin emits the fully unrolled loop body. Rounds 0-79 process one
// block (updating the hash with UPDATE_HASH) and rounds 80-159 process
// the next, while the vector precalc steps interleaved into the CALC_*
// rounds compute WK for the following pair of blocks. Block pointers
// R10/R13 are advanced mid-stream and replaced by the sentinel in R8
// once past the limit in R11.
func begin() {
	Label("begin")
	CALC_0()
	CALC_1()
	CALC_2()
	CALC_3()
	CALC_4()
	CALC_5()
	CALC_6()
	CALC_7()
	CALC_8()
	CALC_9()
	CALC_10()
	CALC_11()
	CALC_12()
	CALC_13()
	CALC_14()
	CALC_15()
	CALC_16()
	CALC_17()
	CALC_18()
	CALC_19()
	CALC_20()
	CALC_21()
	CALC_22()
	CALC_23()
	CALC_24()
	CALC_25()
	CALC_26()
	CALC_27()
	CALC_28()
	CALC_29()
	CALC_30()
	CALC_31()
	CALC_32()
	CALC_33()
	CALC_34()
	CALC_35()
	CALC_36()
	CALC_37()
	CALC_38()
	CALC_39()
	CALC_40()
	CALC_41()
	CALC_42()
	CALC_43()
	CALC_44()
	CALC_45()
	CALC_46()
	CALC_47()
	CALC_48()
	CALC_49()
	CALC_50()
	CALC_51()
	CALC_52()
	CALC_53()
	CALC_54()
	CALC_55()
	CALC_56()
	CALC_57()
	CALC_58()
	CALC_59()
	ADDQ(Imm(128), R10) // move to next even-64-byte block
	CMPQ(R10, R11)      // is current block the last one?
	CMOVQCC(R8, R10)    // signal the last iteration smartly
	CALC_60()
	CALC_61()
	CALC_62()
	CALC_63()
	CALC_64()
	CALC_65()
	CALC_66()
	CALC_67()
	CALC_68()
	CALC_69()
	CALC_70()
	CALC_71()
	CALC_72()
	CALC_73()
	CALC_74()
	CALC_75()
	CALC_76()
	CALC_77()
	CALC_78()
	CALC_79()
	UPDATE_HASH(EAX, EDX, EBX, ESI, EDI)
	CMPQ(R10, R8) // is current block the last one?
	JE(LabelRef("loop"))
	MOVL(EDX, ECX) // re-seed the register assignment for rounds 80-159
	CALC_80()
	CALC_81()
	CALC_82()
	CALC_83()
	CALC_84()
	CALC_85()
	CALC_86()
	CALC_87()
	CALC_88()
	CALC_89()
	CALC_90()
	CALC_91()
	CALC_92()
	CALC_93()
	CALC_94()
	CALC_95()
	CALC_96()
	CALC_97()
	CALC_98()
	CALC_99()
	CALC_100()
	CALC_101()
	CALC_102()
	CALC_103()
	CALC_104()
	CALC_105()
	CALC_106()
	CALC_107()
	CALC_108()
	CALC_109()
	CALC_110()
	CALC_111()
	CALC_112()
	CALC_113()
	CALC_114()
	CALC_115()
	CALC_116()
	CALC_117()
	CALC_118()
	CALC_119()
	CALC_120()
	CALC_121()
	CALC_122()
	CALC_123()
	CALC_124()
	CALC_125()
	CALC_126()
	CALC_127()
	CALC_128()
	CALC_129()
	CALC_130()
	CALC_131()
	CALC_132()
	CALC_133()
	CALC_134()
	CALC_135()
	CALC_136()
	CALC_137()
	CALC_138()
	CALC_139()
	ADDQ(Imm(128), R13) // move to next even-64-byte block
	CMPQ(R13, R11)      // is current block the last one?
	CMOVQCC(R8, R10)    // if so, make R10 the sentinel so the loop head exits
	CALC_140()
	CALC_141()
	CALC_142()
	CALC_143()
	CALC_144()
	CALC_145()
	CALC_146()
	CALC_147()
	CALC_148()
	CALC_149()
	CALC_150()
	CALC_151()
	CALC_152()
	CALC_153()
	CALC_154()
	CALC_155()
	CALC_156()
	CALC_157()
	CALC_158()
	CALC_159()
	UPDATE_HASH(ESI, EDI, EDX, ECX, EBX)
	// Rotate the five state registers (via R12L as scratch) back into the
	// assignment expected at the top of the loop.
	MOVL(ESI, R12L)
	MOVL(EDI, ESI)
	MOVL(EDX, EDI)
	MOVL(EBX, EDX)
	MOVL(ECX, EAX)
	MOVL(R12L, ECX)
	XCHGQ(R15, R14) // swap the two precomputed-WK buffers for the next pass
	JMP(LabelRef("loop"))
}
1429  
// blockAVX2 emits the AVX2/BMI1/BMI2 SHA-1 block function.
// Register roles: R9 = digest pointer, R10 = current block pointer,
// R13 = pointer to the second block of each pair, R11 = end-of-data
// limit, and R8 (the round-constant table pointer) doubles as the
// "past the last block" sentinel value compared against in the loop.
func blockAVX2() {
	Implement("blockAVX2")
	// Stack frame: two precomputed-WK buffers of 2*4*80 bytes each
	// (see the R15 offset in CALC) plus alignment slack.
	AllocLocal(1408)

	Load(Param("dig"), RDI)
	Load(Param("p").Base(), RSI)
	Load(Param("p").Len(), RDX)
	SHRQ(Imm(6), RDX) // round the length down to a
	SHLQ(Imm(6), RDX) // whole number of 64-byte blocks

	K_XMM_AR := K_XMM_AR_DATA()
	LEAQ(K_XMM_AR, R8)

	MOVQ(RDI, R9)
	MOVQ(RSI, R10)
	LEAQ(Mem{Base: SI}.Offset(64), R13) // second block of the first pair

	ADDQ(RSI, RDX)
	ADDQ(Imm(64), RDX)
	MOVQ(RDX, R11)

	// If there is no second block, point R13 at the sentinel instead.
	CMPQ(R13, R11)
	CMOVQCC(R8, R13)

	BSWAP_SHUFB_CTL := BSWAP_SHUFB_CTL_DATA()
	VMOVDQU(BSWAP_SHUFB_CTL, Y10) // byte-shuffle control mask kept in Y10
	CALC()
}
1458  
1459  // ##~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~##
1460  
// Memoization pointers for the data-section symbols, so each GLOBL is
// declared only once even if its accessor function is called repeatedly.
var (
	K_XMM_AR_ptr, BSWAP_SHUFB_CTL_ptr *Mem
)
1465  
// _K holds the four SHA-1 round constants (FIPS 180-4), one per group of
// 20 rounds; K_XMM_AR_DATA replicates each of them to fill vector lanes.

var _K = []uint32{
	0x5A827999, // rounds 0-19
	0x6ED9EBA1, // rounds 20-39
	0x8F1BBCDC, // rounds 40-59
	0xCA62C1D6, // rounds 60-79
}
1474  
1475  func K_XMM_AR_DATA() Mem {
1476  	if K_XMM_AR_ptr != nil {
1477  		return *K_XMM_AR_ptr
1478  	}
1479  
1480  	K_XMM_AR := GLOBL("K_XMM_AR", RODATA)
1481  	K_XMM_AR_ptr = &K_XMM_AR
1482  
1483  	offset_idx := 0
1484  	for _, v := range _K {
1485  		DATA((offset_idx+0)*4, U32(v))
1486  		DATA((offset_idx+1)*4, U32(v))
1487  		DATA((offset_idx+2)*4, U32(v))
1488  		DATA((offset_idx+3)*4, U32(v))
1489  		DATA((offset_idx+4)*4, U32(v))
1490  		DATA((offset_idx+5)*4, U32(v))
1491  		DATA((offset_idx+6)*4, U32(v))
1492  		DATA((offset_idx+7)*4, U32(v))
1493  		offset_idx += 8
1494  	}
1495  	return K_XMM_AR
1496  }
1497  
// BSWAP_SHUFB_CTL_CONSTANTS is the byte-shuffle control pattern emitted
// into the BSWAP_SHUFB_CTL data section and loaded into Y10 by blockAVX2.
// Each dword lists byte indices 03,02,01,00-reversed (0x00010203 ...),
// i.e. a mask that — presumably via VPSHUFB; confirm against the PRECALC
// helpers — reverses byte order within each 32-bit word. The four-dword
// pattern is repeated to fill both 128-bit halves of a YMM register.
var BSWAP_SHUFB_CTL_CONSTANTS = [8]uint32{
	0x00010203,
	0x04050607,
	0x08090a0b,
	0x0c0d0e0f,
	0x00010203,
	0x04050607,
	0x08090a0b,
	0x0c0d0e0f,
}
1508  
1509  func BSWAP_SHUFB_CTL_DATA() Mem {
1510  	if BSWAP_SHUFB_CTL_ptr != nil {
1511  		return *BSWAP_SHUFB_CTL_ptr
1512  	}
1513  
1514  	BSWAP_SHUFB_CTL := GLOBL("BSWAP_SHUFB_CTL", RODATA)
1515  	BSWAP_SHUFB_CTL_ptr = &BSWAP_SHUFB_CTL
1516  	for i, v := range BSWAP_SHUFB_CTL_CONSTANTS {
1517  
1518  		DATA(i*4, U32(v))
1519  	}
1520  	return BSWAP_SHUFB_CTL
1521  }
1522