// sha256block_amd64.s

   1  // Code generated by command: go run sha256block_amd64_asm.go -out ../sha256block_amd64.s. DO NOT EDIT.
   2  
   3  //go:build !purego
   4  
   5  #include "textflag.h"
   6  
// func blockAVX2(dig *Digest, p []byte)
// Requires: AVX, AVX2, BMI2
//
// AVX2 SHA-256 block function: folds p (a whole number of 64-byte blocks)
// into the eight 32-bit state words at *dig. Two consecutive blocks are
// processed per outer iteration: their message schedules are computed
// interleaved in the two 128-bit lanes of the YMM registers, then the
// scalar compression rounds are run for the first block (even 16-byte
// stack slots) and afterwards for the second block (odd slots).
//
// Register roles (established by the loads below):
//   AX,BX,CX,R8,DX,R9,R10,R11 = working variables a,b,c,d,e,f,g,h
//   SI = round/byte offset into K256<> and the stack schedule
//        (also briefly holds dig when loading/storing state)
//   DI = message pointer (saved at 520(SP)); reused as scratch in rounds
//   BP = base of the doubled K256<> round-constant table
//   512(SP) = &p[len-64], pointer to the final block
//   520(SP) = pointer to the next unprocessed block
TEXT ·blockAVX2(SB), $536-32
	MOVQ dig+0(FP), SI
	MOVQ p_base+8(FP), DI
	MOVQ p_len+16(FP), DX
	LEAQ -64(DI)(DX*1), DX  // DX = address of the last 64-byte block
	MOVQ DX, 512(SP)
	CMPQ DX, DI
	JE   avx2_only_one_block  // exactly one block: take the XMM-load path

	// Load initial digest
	MOVL (SI), AX   // a
	MOVL 4(SI), BX  // b
	MOVL 8(SI), CX  // c
	MOVL 12(SI), R8 // d
	MOVL 16(SI), DX // e
	MOVL 20(SI), R9 // f
	MOVL 24(SI), R10 // g
	MOVL 28(SI), R11 // h

avx2_loop0:
	// at each iteration works with one block (512 bit)
	// (two 64-byte blocks are loaded; lanes are interleaved below)
	VMOVDQU (DI), Y0
	VMOVDQU 32(DI), Y1
	VMOVDQU 64(DI), Y2
	VMOVDQU 96(DI), Y3
	VMOVDQU flip_mask<>+0(SB), Y13

	// Apply Byte Flip Mask: LE -> BE
	VPSHUFB Y13, Y0, Y0
	VPSHUFB Y13, Y1, Y1
	VPSHUFB Y13, Y2, Y2
	VPSHUFB Y13, Y3, Y3

	// Transpose data into high/low parts
	// Y4..Y7 = message words W0..W15, first block in the low lane,
	// second block in the high lane of each register.
	VPERM2I128 $0x20, Y2, Y0, Y4
	VPERM2I128 $0x31, Y2, Y0, Y5
	VPERM2I128 $0x20, Y3, Y1, Y6
	VPERM2I128 $0x31, Y3, Y1, Y7
	LEAQ       K256<>+0(SB), BP

avx2_last_block_enter:
	// Advance past the block(s) just loaded, remember position, and
	// reset SI (round offset) for the schedule+round loop.
	ADDQ $0x40, DI
	MOVQ DI, 520(SP)
	XORQ SI, SI

avx2_loop1:
	// Rounds 0..47: each pass does 4x(4 rounds + message scheduling);
	// W+K values for both lanes are spilled to the stack at (SP)(SI*1).
	// Do 4 rounds and scheduling
	VPADDD   (BP)(SI*1), Y4, Y9
	VMOVDQU  Y9, (SP)(SI*1)
	MOVL     AX, DI
	RORXL    $0x19, DX, R13
	RORXL    $0x0b, DX, R14
	ADDL     (SP)(SI*1), R11
	ORL      CX, DI
	VPALIGNR $0x04, Y6, Y7, Y0
	MOVL     R9, R15
	RORXL    $0x0d, AX, R12
	XORL     R14, R13
	XORL     R10, R15
	VPADDD   Y4, Y0, Y0
	RORXL    $0x06, DX, R14
	ANDL     DX, R15
	XORL     R14, R13
	RORXL    $0x16, AX, R14
	ADDL     R11, R8
	ANDL     BX, DI
	VPALIGNR $0x04, Y4, Y5, Y1
	XORL     R12, R14
	RORXL    $0x02, AX, R12
	XORL     R10, R15
	VPSRLD   $0x07, Y1, Y2
	XORL     R12, R14
	MOVL     AX, R12
	ANDL     CX, R12
	ADDL     R13, R15
	VPSLLD   $0x19, Y1, Y3
	ORL      R12, DI
	ADDL     R14, R11
	ADDL     R15, R8
	VPOR     Y2, Y3, Y3
	VPSRLD   $0x12, Y1, Y2
	ADDL     R15, R11
	ADDL     DI, R11
	MOVL     R11, DI
	RORXL    $0x19, R8, R13
	RORXL    $0x0b, R8, R14
	ADDL     4(SP)(SI*1), R10
	ORL      BX, DI
	VPSRLD   $0x03, Y1, Y8
	MOVL     DX, R15
	RORXL    $0x0d, R11, R12
	XORL     R14, R13
	XORL     R9, R15
	RORXL    $0x06, R8, R14
	XORL     R14, R13
	RORXL    $0x16, R11, R14
	ANDL     R8, R15
	ADDL     R10, CX
	VPSLLD   $0x0e, Y1, Y1
	ANDL     AX, DI
	XORL     R12, R14
	VPXOR    Y1, Y3, Y3
	RORXL    $0x02, R11, R12
	XORL     R9, R15
	VPXOR    Y2, Y3, Y3
	XORL     R12, R14
	MOVL     R11, R12
	ANDL     BX, R12
	ADDL     R13, R15
	VPXOR    Y8, Y3, Y1
	VPSHUFD  $0xfa, Y7, Y2
	ORL      R12, DI
	ADDL     R14, R10
	VPADDD   Y1, Y0, Y0
	ADDL     R15, CX
	ADDL     R15, R10
	ADDL     DI, R10
	VPSRLD   $0x0a, Y2, Y8
	MOVL     R10, DI
	RORXL    $0x19, CX, R13
	ADDL     8(SP)(SI*1), R9
	VPSRLQ   $0x13, Y2, Y3
	RORXL    $0x0b, CX, R14
	ORL      AX, DI
	MOVL     R8, R15
	XORL     DX, R15
	RORXL    $0x0d, R10, R12
	XORL     R14, R13
	VPSRLQ   $0x11, Y2, Y2
	ANDL     CX, R15
	RORXL    $0x06, CX, R14
	VPXOR    Y3, Y2, Y2
	ADDL     R9, BX
	ANDL     R11, DI
	XORL     R14, R13
	RORXL    $0x16, R10, R14
	VPXOR    Y2, Y8, Y8
	XORL     DX, R15
	VPSHUFB  shuff_00BA<>+0(SB), Y8, Y8
	XORL     R12, R14
	RORXL    $0x02, R10, R12
	VPADDD   Y8, Y0, Y0
	XORL     R12, R14
	MOVL     R10, R12
	ANDL     AX, R12
	ADDL     R13, R15
	VPSHUFD  $0x50, Y0, Y2
	ORL      R12, DI
	ADDL     R14, R9
	ADDL     R15, BX
	ADDL     R15, R9
	ADDL     DI, R9
	MOVL     R9, DI
	RORXL    $0x19, BX, R13
	RORXL    $0x0b, BX, R14
	ADDL     12(SP)(SI*1), DX
	ORL      R11, DI
	VPSRLD   $0x0a, Y2, Y11
	MOVL     CX, R15
	RORXL    $0x0d, R9, R12
	XORL     R14, R13
	XORL     R8, R15
	VPSRLQ   $0x13, Y2, Y3
	RORXL    $0x06, BX, R14
	ANDL     BX, R15
	ADDL     DX, AX
	ANDL     R10, DI
	VPSRLQ   $0x11, Y2, Y2
	XORL     R14, R13
	XORL     R8, R15
	VPXOR    Y3, Y2, Y2
	RORXL    $0x16, R9, R14
	ADDL     R13, R15
	VPXOR    Y2, Y11, Y11
	XORL     R12, R14
	ADDL     R15, AX
	RORXL    $0x02, R9, R12
	VPSHUFB  shuff_DC00<>+0(SB), Y11, Y11
	VPADDD   Y0, Y11, Y4
	XORL     R12, R14
	MOVL     R9, R12
	ANDL     R11, R12
	ORL      R12, DI
	ADDL     R14, DX
	ADDL     R15, DX
	ADDL     DI, DX

	// Do 4 rounds and scheduling
	VPADDD   32(BP)(SI*1), Y5, Y9
	VMOVDQU  Y9, 32(SP)(SI*1)
	MOVL     DX, DI
	RORXL    $0x19, AX, R13
	RORXL    $0x0b, AX, R14
	ADDL     32(SP)(SI*1), R8
	ORL      R10, DI
	VPALIGNR $0x04, Y7, Y4, Y0
	MOVL     BX, R15
	RORXL    $0x0d, DX, R12
	XORL     R14, R13
	XORL     CX, R15
	VPADDD   Y5, Y0, Y0
	RORXL    $0x06, AX, R14
	ANDL     AX, R15
	XORL     R14, R13
	RORXL    $0x16, DX, R14
	ADDL     R8, R11
	ANDL     R9, DI
	VPALIGNR $0x04, Y5, Y6, Y1
	XORL     R12, R14
	RORXL    $0x02, DX, R12
	XORL     CX, R15
	VPSRLD   $0x07, Y1, Y2
	XORL     R12, R14
	MOVL     DX, R12
	ANDL     R10, R12
	ADDL     R13, R15
	VPSLLD   $0x19, Y1, Y3
	ORL      R12, DI
	ADDL     R14, R8
	ADDL     R15, R11
	VPOR     Y2, Y3, Y3
	VPSRLD   $0x12, Y1, Y2
	ADDL     R15, R8
	ADDL     DI, R8
	MOVL     R8, DI
	RORXL    $0x19, R11, R13
	RORXL    $0x0b, R11, R14
	ADDL     36(SP)(SI*1), CX
	ORL      R9, DI
	VPSRLD   $0x03, Y1, Y8
	MOVL     AX, R15
	RORXL    $0x0d, R8, R12
	XORL     R14, R13
	XORL     BX, R15
	RORXL    $0x06, R11, R14
	XORL     R14, R13
	RORXL    $0x16, R8, R14
	ANDL     R11, R15
	ADDL     CX, R10
	VPSLLD   $0x0e, Y1, Y1
	ANDL     DX, DI
	XORL     R12, R14
	VPXOR    Y1, Y3, Y3
	RORXL    $0x02, R8, R12
	XORL     BX, R15
	VPXOR    Y2, Y3, Y3
	XORL     R12, R14
	MOVL     R8, R12
	ANDL     R9, R12
	ADDL     R13, R15
	VPXOR    Y8, Y3, Y1
	VPSHUFD  $0xfa, Y4, Y2
	ORL      R12, DI
	ADDL     R14, CX
	VPADDD   Y1, Y0, Y0
	ADDL     R15, R10
	ADDL     R15, CX
	ADDL     DI, CX
	VPSRLD   $0x0a, Y2, Y8
	MOVL     CX, DI
	RORXL    $0x19, R10, R13
	ADDL     40(SP)(SI*1), BX
	VPSRLQ   $0x13, Y2, Y3
	RORXL    $0x0b, R10, R14
	ORL      DX, DI
	MOVL     R11, R15
	XORL     AX, R15
	RORXL    $0x0d, CX, R12
	XORL     R14, R13
	VPSRLQ   $0x11, Y2, Y2
	ANDL     R10, R15
	RORXL    $0x06, R10, R14
	VPXOR    Y3, Y2, Y2
	ADDL     BX, R9
	ANDL     R8, DI
	XORL     R14, R13
	RORXL    $0x16, CX, R14
	VPXOR    Y2, Y8, Y8
	XORL     AX, R15
	VPSHUFB  shuff_00BA<>+0(SB), Y8, Y8
	XORL     R12, R14
	RORXL    $0x02, CX, R12
	VPADDD   Y8, Y0, Y0
	XORL     R12, R14
	MOVL     CX, R12
	ANDL     DX, R12
	ADDL     R13, R15
	VPSHUFD  $0x50, Y0, Y2
	ORL      R12, DI
	ADDL     R14, BX
	ADDL     R15, R9
	ADDL     R15, BX
	ADDL     DI, BX
	MOVL     BX, DI
	RORXL    $0x19, R9, R13
	RORXL    $0x0b, R9, R14
	ADDL     44(SP)(SI*1), AX
	ORL      R8, DI
	VPSRLD   $0x0a, Y2, Y11
	MOVL     R10, R15
	RORXL    $0x0d, BX, R12
	XORL     R14, R13
	XORL     R11, R15
	VPSRLQ   $0x13, Y2, Y3
	RORXL    $0x06, R9, R14
	ANDL     R9, R15
	ADDL     AX, DX
	ANDL     CX, DI
	VPSRLQ   $0x11, Y2, Y2
	XORL     R14, R13
	XORL     R11, R15
	VPXOR    Y3, Y2, Y2
	RORXL    $0x16, BX, R14
	ADDL     R13, R15
	VPXOR    Y2, Y11, Y11
	XORL     R12, R14
	ADDL     R15, DX
	RORXL    $0x02, BX, R12
	VPSHUFB  shuff_DC00<>+0(SB), Y11, Y11
	VPADDD   Y0, Y11, Y5
	XORL     R12, R14
	MOVL     BX, R12
	ANDL     R8, R12
	ORL      R12, DI
	ADDL     R14, AX
	ADDL     R15, AX
	ADDL     DI, AX

	// Do 4 rounds and scheduling
	VPADDD   64(BP)(SI*1), Y6, Y9
	VMOVDQU  Y9, 64(SP)(SI*1)
	MOVL     AX, DI
	RORXL    $0x19, DX, R13
	RORXL    $0x0b, DX, R14
	ADDL     64(SP)(SI*1), R11
	ORL      CX, DI
	VPALIGNR $0x04, Y4, Y5, Y0
	MOVL     R9, R15
	RORXL    $0x0d, AX, R12
	XORL     R14, R13
	XORL     R10, R15
	VPADDD   Y6, Y0, Y0
	RORXL    $0x06, DX, R14
	ANDL     DX, R15
	XORL     R14, R13
	RORXL    $0x16, AX, R14
	ADDL     R11, R8
	ANDL     BX, DI
	VPALIGNR $0x04, Y6, Y7, Y1
	XORL     R12, R14
	RORXL    $0x02, AX, R12
	XORL     R10, R15
	VPSRLD   $0x07, Y1, Y2
	XORL     R12, R14
	MOVL     AX, R12
	ANDL     CX, R12
	ADDL     R13, R15
	VPSLLD   $0x19, Y1, Y3
	ORL      R12, DI
	ADDL     R14, R11
	ADDL     R15, R8
	VPOR     Y2, Y3, Y3
	VPSRLD   $0x12, Y1, Y2
	ADDL     R15, R11
	ADDL     DI, R11
	MOVL     R11, DI
	RORXL    $0x19, R8, R13
	RORXL    $0x0b, R8, R14
	ADDL     68(SP)(SI*1), R10
	ORL      BX, DI
	VPSRLD   $0x03, Y1, Y8
	MOVL     DX, R15
	RORXL    $0x0d, R11, R12
	XORL     R14, R13
	XORL     R9, R15
	RORXL    $0x06, R8, R14
	XORL     R14, R13
	RORXL    $0x16, R11, R14
	ANDL     R8, R15
	ADDL     R10, CX
	VPSLLD   $0x0e, Y1, Y1
	ANDL     AX, DI
	XORL     R12, R14
	VPXOR    Y1, Y3, Y3
	RORXL    $0x02, R11, R12
	XORL     R9, R15
	VPXOR    Y2, Y3, Y3
	XORL     R12, R14
	MOVL     R11, R12
	ANDL     BX, R12
	ADDL     R13, R15
	VPXOR    Y8, Y3, Y1
	VPSHUFD  $0xfa, Y5, Y2
	ORL      R12, DI
	ADDL     R14, R10
	VPADDD   Y1, Y0, Y0
	ADDL     R15, CX
	ADDL     R15, R10
	ADDL     DI, R10
	VPSRLD   $0x0a, Y2, Y8
	MOVL     R10, DI
	RORXL    $0x19, CX, R13
	ADDL     72(SP)(SI*1), R9
	VPSRLQ   $0x13, Y2, Y3
	RORXL    $0x0b, CX, R14
	ORL      AX, DI
	MOVL     R8, R15
	XORL     DX, R15
	RORXL    $0x0d, R10, R12
	XORL     R14, R13
	VPSRLQ   $0x11, Y2, Y2
	ANDL     CX, R15
	RORXL    $0x06, CX, R14
	VPXOR    Y3, Y2, Y2
	ADDL     R9, BX
	ANDL     R11, DI
	XORL     R14, R13
	RORXL    $0x16, R10, R14
	VPXOR    Y2, Y8, Y8
	XORL     DX, R15
	VPSHUFB  shuff_00BA<>+0(SB), Y8, Y8
	XORL     R12, R14
	RORXL    $0x02, R10, R12
	VPADDD   Y8, Y0, Y0
	XORL     R12, R14
	MOVL     R10, R12
	ANDL     AX, R12
	ADDL     R13, R15
	VPSHUFD  $0x50, Y0, Y2
	ORL      R12, DI
	ADDL     R14, R9
	ADDL     R15, BX
	ADDL     R15, R9
	ADDL     DI, R9
	MOVL     R9, DI
	RORXL    $0x19, BX, R13
	RORXL    $0x0b, BX, R14
	ADDL     76(SP)(SI*1), DX
	ORL      R11, DI
	VPSRLD   $0x0a, Y2, Y11
	MOVL     CX, R15
	RORXL    $0x0d, R9, R12
	XORL     R14, R13
	XORL     R8, R15
	VPSRLQ   $0x13, Y2, Y3
	RORXL    $0x06, BX, R14
	ANDL     BX, R15
	ADDL     DX, AX
	ANDL     R10, DI
	VPSRLQ   $0x11, Y2, Y2
	XORL     R14, R13
	XORL     R8, R15
	VPXOR    Y3, Y2, Y2
	RORXL    $0x16, BX, R14
	ADDL     R13, R15
	VPXOR    Y2, Y11, Y11
	XORL     R12, R14
	ADDL     R15, AX
	RORXL    $0x02, R9, R12
	VPSHUFB  shuff_DC00<>+0(SB), Y11, Y11
	VPADDD   Y0, Y11, Y6
	XORL     R12, R14
	MOVL     R9, R12
	ANDL     R11, R12
	ORL      R12, DI
	ADDL     R14, DX
	ADDL     R15, DX
	ADDL     DI, DX

	// Do 4 rounds and scheduling
	VPADDD   96(BP)(SI*1), Y7, Y9
	VMOVDQU  Y9, 96(SP)(SI*1)
	MOVL     DX, DI
	RORXL    $0x19, AX, R13
	RORXL    $0x0b, AX, R14
	ADDL     96(SP)(SI*1), R8
	ORL      R10, DI
	VPALIGNR $0x04, Y5, Y6, Y0
	MOVL     BX, R15
	RORXL    $0x0d, DX, R12
	XORL     R14, R13
	XORL     CX, R15
	VPADDD   Y7, Y0, Y0
	RORXL    $0x06, AX, R14
	ANDL     AX, R15
	XORL     R14, R13
	RORXL    $0x16, DX, R14
	ADDL     R8, R11
	ANDL     R9, DI
	VPALIGNR $0x04, Y7, Y4, Y1
	XORL     R12, R14
	RORXL    $0x02, DX, R12
	XORL     CX, R15
	VPSRLD   $0x07, Y1, Y2
	XORL     R12, R14
	MOVL     DX, R12
	ANDL     R10, R12
	ADDL     R13, R15
	VPSLLD   $0x19, Y1, Y3
	ORL      R12, DI
	ADDL     R14, R8
	ADDL     R15, R11
	VPOR     Y2, Y3, Y3
	VPSRLD   $0x12, Y1, Y2
	ADDL     R15, R8
	ADDL     DI, R8
	MOVL     R8, DI
	RORXL    $0x19, R11, R13
	RORXL    $0x0b, R11, R14
	ADDL     100(SP)(SI*1), CX
	ORL      R9, DI
	VPSRLD   $0x03, Y1, Y8
	MOVL     AX, R15
	RORXL    $0x0d, R8, R12
	XORL     R14, R13
	XORL     BX, R15
	RORXL    $0x06, R11, R14
	XORL     R14, R13
	RORXL    $0x16, R8, R14
	ANDL     R11, R15
	ADDL     CX, R10
	VPSLLD   $0x0e, Y1, Y1
	ANDL     DX, DI
	XORL     R12, R14
	VPXOR    Y1, Y3, Y3
	RORXL    $0x02, R8, R12
	XORL     BX, R15
	VPXOR    Y2, Y3, Y3
	XORL     R12, R14
	MOVL     R8, R12
	ANDL     R9, R12
	ADDL     R13, R15
	VPXOR    Y8, Y3, Y1
	VPSHUFD  $0xfa, Y6, Y2
	ORL      R12, DI
	ADDL     R14, CX
	VPADDD   Y1, Y0, Y0
	ADDL     R15, R10
	ADDL     R15, CX
	ADDL     DI, CX
	VPSRLD   $0x0a, Y2, Y8
	MOVL     CX, DI
	RORXL    $0x19, R10, R13
	ADDL     104(SP)(SI*1), BX
	VPSRLQ   $0x13, Y2, Y3
	RORXL    $0x0b, R10, R14
	ORL      DX, DI
	MOVL     R11, R15
	XORL     AX, R15
	RORXL    $0x0d, CX, R12
	XORL     R14, R13
	VPSRLQ   $0x11, Y2, Y2
	ANDL     R10, R15
	RORXL    $0x06, R10, R14
	VPXOR    Y3, Y2, Y2
	ADDL     BX, R9
	ANDL     R8, DI
	XORL     R14, R13
	RORXL    $0x16, CX, R14
	VPXOR    Y2, Y8, Y8
	XORL     AX, R15
	VPSHUFB  shuff_00BA<>+0(SB), Y8, Y8
	XORL     R12, R14
	RORXL    $0x02, CX, R12
	VPADDD   Y8, Y0, Y0
	XORL     R12, R14
	MOVL     CX, R12
	ANDL     DX, R12
	ADDL     R13, R15
	VPSHUFD  $0x50, Y0, Y2
	ORL      R12, DI
	ADDL     R14, BX
	ADDL     R15, R9
	ADDL     R15, BX
	ADDL     DI, BX
	MOVL     BX, DI
	RORXL    $0x19, R9, R13
	RORXL    $0x0b, R9, R14
	ADDL     108(SP)(SI*1), AX
	ORL      R8, DI
	VPSRLD   $0x0a, Y2, Y11
	MOVL     R10, R15
	RORXL    $0x0d, BX, R12
	XORL     R14, R13
	XORL     R11, R15
	VPSRLQ   $0x13, Y2, Y3
	RORXL    $0x06, R9, R14
	ANDL     R9, R15
	ADDL     AX, DX
	ANDL     CX, DI
	VPSRLQ   $0x11, Y2, Y2
	XORL     R14, R13
	XORL     R11, R15
	VPXOR    Y3, Y2, Y2
	RORXL    $0x16, BX, R14
	ADDL     R13, R15
	VPXOR    Y2, Y11, Y11
	XORL     R12, R14
	ADDL     R15, DX
	RORXL    $0x02, BX, R12
	VPSHUFB  shuff_DC00<>+0(SB), Y11, Y11
	VPADDD   Y0, Y11, Y7
	XORL     R12, R14
	MOVL     BX, R12
	ANDL     R8, R12
	ORL      R12, DI
	ADDL     R14, AX
	ADDL     R15, AX
	ADDL     DI, AX
	ADDQ     $0x80, SI
	CMPQ     SI, $0x00000180  // 3 passes x 16 rounds = rounds 0..47
	JB       avx2_loop1

avx2_loop2:
	// Rounds 48..63 of the first block: no more scheduling needed,
	// just W+K spills and scalar rounds (8 rounds per pass, 2 passes).
	VPADDD  (BP)(SI*1), Y4, Y9
	VMOVDQU Y9, (SP)(SI*1)
	MOVL    R9, R15
	RORXL   $0x19, DX, R13
	RORXL   $0x0b, DX, R14
	XORL    R10, R15
	XORL    R14, R13
	RORXL   $0x06, DX, R14
	ANDL    DX, R15
	XORL    R14, R13
	RORXL   $0x0d, AX, R12
	XORL    R10, R15
	RORXL   $0x16, AX, R14
	MOVL    AX, DI
	XORL    R12, R14
	RORXL   $0x02, AX, R12
	ADDL    (SP)(SI*1), R11
	ORL     CX, DI
	XORL    R12, R14
	MOVL    AX, R12
	ANDL    BX, DI
	ANDL    CX, R12
	ADDL    R13, R15
	ADDL    R11, R8
	ORL     R12, DI
	ADDL    R14, R11
	ADDL    R15, R8
	ADDL    R15, R11
	MOVL    DX, R15
	RORXL   $0x19, R8, R13
	RORXL   $0x0b, R8, R14
	XORL    R9, R15
	XORL    R14, R13
	RORXL   $0x06, R8, R14
	ANDL    R8, R15
	ADDL    DI, R11
	XORL    R14, R13
	RORXL   $0x0d, R11, R12
	XORL    R9, R15
	RORXL   $0x16, R11, R14
	MOVL    R11, DI
	XORL    R12, R14
	RORXL   $0x02, R11, R12
	ADDL    4(SP)(SI*1), R10
	ORL     BX, DI
	XORL    R12, R14
	MOVL    R11, R12
	ANDL    AX, DI
	ANDL    BX, R12
	ADDL    R13, R15
	ADDL    R10, CX
	ORL     R12, DI
	ADDL    R14, R10
	ADDL    R15, CX
	ADDL    R15, R10
	MOVL    R8, R15
	RORXL   $0x19, CX, R13
	RORXL   $0x0b, CX, R14
	XORL    DX, R15
	XORL    R14, R13
	RORXL   $0x06, CX, R14
	ANDL    CX, R15
	ADDL    DI, R10
	XORL    R14, R13
	RORXL   $0x0d, R10, R12
	XORL    DX, R15
	RORXL   $0x16, R10, R14
	MOVL    R10, DI
	XORL    R12, R14
	RORXL   $0x02, R10, R12
	ADDL    8(SP)(SI*1), R9
	ORL     AX, DI
	XORL    R12, R14
	MOVL    R10, R12
	ANDL    R11, DI
	ANDL    AX, R12
	ADDL    R13, R15
	ADDL    R9, BX
	ORL     R12, DI
	ADDL    R14, R9
	ADDL    R15, BX
	ADDL    R15, R9
	MOVL    CX, R15
	RORXL   $0x19, BX, R13
	RORXL   $0x0b, BX, R14
	XORL    R8, R15
	XORL    R14, R13
	RORXL   $0x06, BX, R14
	ANDL    BX, R15
	ADDL    DI, R9
	XORL    R14, R13
	RORXL   $0x0d, R9, R12
	XORL    R8, R15
	RORXL   $0x16, R9, R14
	MOVL    R9, DI
	XORL    R12, R14
	RORXL   $0x02, R9, R12
	ADDL    12(SP)(SI*1), DX
	ORL     R11, DI
	XORL    R12, R14
	MOVL    R9, R12
	ANDL    R10, DI
	ANDL    R11, R12
	ADDL    R13, R15
	ADDL    DX, AX
	ORL     R12, DI
	ADDL    R14, DX
	ADDL    R15, AX
	ADDL    R15, DX
	ADDL    DI, DX
	VPADDD  32(BP)(SI*1), Y5, Y9
	VMOVDQU Y9, 32(SP)(SI*1)
	MOVL    BX, R15
	RORXL   $0x19, AX, R13
	RORXL   $0x0b, AX, R14
	XORL    CX, R15
	XORL    R14, R13
	RORXL   $0x06, AX, R14
	ANDL    AX, R15
	XORL    R14, R13
	RORXL   $0x0d, DX, R12
	XORL    CX, R15
	RORXL   $0x16, DX, R14
	MOVL    DX, DI
	XORL    R12, R14
	RORXL   $0x02, DX, R12
	ADDL    32(SP)(SI*1), R8
	ORL     R10, DI
	XORL    R12, R14
	MOVL    DX, R12
	ANDL    R9, DI
	ANDL    R10, R12
	ADDL    R13, R15
	ADDL    R8, R11
	ORL     R12, DI
	ADDL    R14, R8
	ADDL    R15, R11
	ADDL    R15, R8
	MOVL    AX, R15
	RORXL   $0x19, R11, R13
	RORXL   $0x0b, R11, R14
	XORL    BX, R15
	XORL    R14, R13
	RORXL   $0x06, R11, R14
	ANDL    R11, R15
	ADDL    DI, R8
	XORL    R14, R13
	RORXL   $0x0d, R8, R12
	XORL    BX, R15
	RORXL   $0x16, R8, R14
	MOVL    R8, DI
	XORL    R12, R14
	RORXL   $0x02, R8, R12
	ADDL    36(SP)(SI*1), CX
	ORL     R9, DI
	XORL    R12, R14
	MOVL    R8, R12
	ANDL    DX, DI
	ANDL    R9, R12
	ADDL    R13, R15
	ADDL    CX, R10
	ORL     R12, DI
	ADDL    R14, CX
	ADDL    R15, R10
	ADDL    R15, CX
	MOVL    R11, R15
	RORXL   $0x19, R10, R13
	RORXL   $0x0b, R10, R14
	XORL    AX, R15
	XORL    R14, R13
	RORXL   $0x06, R10, R14
	ANDL    R10, R15
	ADDL    DI, CX
	XORL    R14, R13
	RORXL   $0x0d, CX, R12
	XORL    AX, R15
	RORXL   $0x16, CX, R14
	MOVL    CX, DI
	XORL    R12, R14
	RORXL   $0x02, CX, R12
	ADDL    40(SP)(SI*1), BX
	ORL     DX, DI
	XORL    R12, R14
	MOVL    CX, R12
	ANDL    R8, DI
	ANDL    DX, R12
	ADDL    R13, R15
	ADDL    BX, R9
	ORL     R12, DI
	ADDL    R14, BX
	ADDL    R15, R9
	ADDL    R15, BX
	MOVL    R10, R15
	RORXL   $0x19, R9, R13
	RORXL   $0x0b, R9, R14
	XORL    R11, R15
	XORL    R14, R13
	RORXL   $0x06, R9, R14
	ANDL    R9, R15
	ADDL    DI, BX
	XORL    R14, R13
	RORXL   $0x0d, BX, R12
	XORL    R11, R15
	RORXL   $0x16, BX, R14
	MOVL    BX, DI
	XORL    R12, R14
	RORXL   $0x02, BX, R12
	ADDL    44(SP)(SI*1), AX
	ORL     R8, DI
	XORL    R12, R14
	MOVL    BX, R12
	ANDL    CX, DI
	ANDL    R8, R12
	ADDL    R13, R15
	ADDL    AX, DX
	ORL     R12, DI
	ADDL    R14, AX
	ADDL    R15, DX
	ADDL    R15, AX
	ADDL    DI, AX
	ADDQ    $0x40, SI
	VMOVDQU Y6, Y4  // shift the remaining schedule down for the next pass
	VMOVDQU Y7, Y5
	CMPQ    SI, $0x00000200
	JB      avx2_loop2
	// First block done: fold the working vars back into *dig and
	// reload the (updated) state for the second block's rounds.
	MOVQ    dig+0(FP), SI
	MOVQ    520(SP), DI
	ADDL    AX, (SI)
	MOVL    (SI), AX
	ADDL    BX, 4(SI)
	MOVL    4(SI), BX
	ADDL    CX, 8(SI)
	MOVL    8(SI), CX
	ADDL    R8, 12(SI)
	MOVL    12(SI), R8
	ADDL    DX, 16(SI)
	MOVL    16(SI), DX
	ADDL    R9, 20(SI)
	MOVL    20(SI), R9
	ADDL    R10, 24(SI)
	MOVL    24(SI), R10
	ADDL    R11, 28(SI)
	MOVL    28(SI), R11
	CMPQ    512(SP), DI
	JB      done_hash  // the just-finished block was the last one
	XORQ    SI, SI

avx2_loop3:
	// All 64 rounds for the second interleaved block, consuming the
	// W+K values already spilled to the odd (high-lane) stack slots:
	// 16(SP)/20/24/28 then 48(SP)/52/56/60, advancing 0x40 per pass.
	MOVL  R9, R15
	RORXL $0x19, DX, R13
	RORXL $0x0b, DX, R14
	XORL  R10, R15
	XORL  R14, R13
	RORXL $0x06, DX, R14
	ANDL  DX, R15
	XORL  R14, R13
	RORXL $0x0d, AX, R12
	XORL  R10, R15
	RORXL $0x16, AX, R14
	MOVL  AX, DI
	XORL  R12, R14
	RORXL $0x02, AX, R12
	ADDL  16(SP)(SI*1), R11
	ORL   CX, DI
	XORL  R12, R14
	MOVL  AX, R12
	ANDL  BX, DI
	ANDL  CX, R12
	ADDL  R13, R15
	ADDL  R11, R8
	ORL   R12, DI
	ADDL  R14, R11
	ADDL  R15, R8
	ADDL  R15, R11
	MOVL  DX, R15
	RORXL $0x19, R8, R13
	RORXL $0x0b, R8, R14
	XORL  R9, R15
	XORL  R14, R13
	RORXL $0x06, R8, R14
	ANDL  R8, R15
	ADDL  DI, R11
	XORL  R14, R13
	RORXL $0x0d, R11, R12
	XORL  R9, R15
	RORXL $0x16, R11, R14
	MOVL  R11, DI
	XORL  R12, R14
	RORXL $0x02, R11, R12
	ADDL  20(SP)(SI*1), R10
	ORL   BX, DI
	XORL  R12, R14
	MOVL  R11, R12
	ANDL  AX, DI
	ANDL  BX, R12
	ADDL  R13, R15
	ADDL  R10, CX
	ORL   R12, DI
	ADDL  R14, R10
	ADDL  R15, CX
	ADDL  R15, R10
	MOVL  R8, R15
	RORXL $0x19, CX, R13
	RORXL $0x0b, CX, R14
	XORL  DX, R15
	XORL  R14, R13
	RORXL $0x06, CX, R14
	ANDL  CX, R15
	ADDL  DI, R10
	XORL  R14, R13
	RORXL $0x0d, R10, R12
	XORL  DX, R15
	RORXL $0x16, R10, R14
	MOVL  R10, DI
	XORL  R12, R14
	RORXL $0x02, R10, R12
	ADDL  24(SP)(SI*1), R9
	ORL   AX, DI
	XORL  R12, R14
	MOVL  R10, R12
	ANDL  R11, DI
	ANDL  AX, R12
	ADDL  R13, R15
	ADDL  R9, BX
	ORL   R12, DI
	ADDL  R14, R9
	ADDL  R15, BX
	ADDL  R15, R9
	MOVL  CX, R15
	RORXL $0x19, BX, R13
	RORXL $0x0b, BX, R14
	XORL  R8, R15
	XORL  R14, R13
	RORXL $0x06, BX, R14
	ANDL  BX, R15
	ADDL  DI, R9
	XORL  R14, R13
	RORXL $0x0d, R9, R12
	XORL  R8, R15
	RORXL $0x16, R9, R14
	MOVL  R9, DI
	XORL  R12, R14
	RORXL $0x02, R9, R12
	ADDL  28(SP)(SI*1), DX
	ORL   R11, DI
	XORL  R12, R14
	MOVL  R9, R12
	ANDL  R10, DI
	ANDL  R11, R12
	ADDL  R13, R15
	ADDL  DX, AX
	ORL   R12, DI
	ADDL  R14, DX
	ADDL  R15, AX
	ADDL  R15, DX
	ADDL  DI, DX
	MOVL  BX, R15
	RORXL $0x19, AX, R13
	RORXL $0x0b, AX, R14
	XORL  CX, R15
	XORL  R14, R13
	RORXL $0x06, AX, R14
	ANDL  AX, R15
	XORL  R14, R13
	RORXL $0x0d, DX, R12
	XORL  CX, R15
	RORXL $0x16, DX, R14
	MOVL  DX, DI
	XORL  R12, R14
	RORXL $0x02, DX, R12
	ADDL  48(SP)(SI*1), R8
	ORL   R10, DI
	XORL  R12, R14
	MOVL  DX, R12
	ANDL  R9, DI
	ANDL  R10, R12
	ADDL  R13, R15
	ADDL  R8, R11
	ORL   R12, DI
	ADDL  R14, R8
	ADDL  R15, R11
	ADDL  R15, R8
	MOVL  AX, R15
	RORXL $0x19, R11, R13
	RORXL $0x0b, R11, R14
	XORL  BX, R15
	XORL  R14, R13
	RORXL $0x06, R11, R14
	ANDL  R11, R15
	ADDL  DI, R8
	XORL  R14, R13
	RORXL $0x0d, R8, R12
	XORL  BX, R15
	RORXL $0x16, R8, R14
	MOVL  R8, DI
	XORL  R12, R14
	RORXL $0x02, R8, R12
	ADDL  52(SP)(SI*1), CX
	ORL   R9, DI
	XORL  R12, R14
	MOVL  R8, R12
	ANDL  DX, DI
	ANDL  R9, R12
	ADDL  R13, R15
	ADDL  CX, R10
	ORL   R12, DI
	ADDL  R14, CX
	ADDL  R15, R10
	ADDL  R15, CX
	MOVL  R11, R15
	RORXL $0x19, R10, R13
	RORXL $0x0b, R10, R14
	XORL  AX, R15
	XORL  R14, R13
	RORXL $0x06, R10, R14
	ANDL  R10, R15
	ADDL  DI, CX
	XORL  R14, R13
	RORXL $0x0d, CX, R12
	XORL  AX, R15
	RORXL $0x16, CX, R14
	MOVL  CX, DI
	XORL  R12, R14
	RORXL $0x02, CX, R12
	ADDL  56(SP)(SI*1), BX
	ORL   DX, DI
	XORL  R12, R14
	MOVL  CX, R12
	ANDL  R8, DI
	ANDL  DX, R12
	ADDL  R13, R15
	ADDL  BX, R9
	ORL   R12, DI
	ADDL  R14, BX
	ADDL  R15, R9
	ADDL  R15, BX
	MOVL  R10, R15
	RORXL $0x19, R9, R13
	RORXL $0x0b, R9, R14
	XORL  R11, R15
	XORL  R14, R13
	RORXL $0x06, R9, R14
	ANDL  R9, R15
	ADDL  DI, BX
	XORL  R14, R13
	RORXL $0x0d, BX, R12
	XORL  R11, R15
	RORXL $0x16, BX, R14
	MOVL  BX, DI
	XORL  R12, R14
	RORXL $0x02, BX, R12
	ADDL  60(SP)(SI*1), AX
	ORL   R8, DI
	XORL  R12, R14
	MOVL  BX, R12
	ANDL  CX, DI
	ANDL  R8, R12
	ADDL  R13, R15
	ADDL  AX, DX
	ORL   R12, DI
	ADDL  R14, AX
	ADDL  R15, DX
	ADDL  R15, AX
	ADDL  DI, AX
	ADDQ  $0x40, SI
	CMPQ  SI, $0x00000200
	JB    avx2_loop3
	// Second block done: fold state, advance the data pointer past it,
	// and decide whether to loop, stop, or handle one trailing block.
	MOVQ  dig+0(FP), SI
	MOVQ  520(SP), DI
	ADDQ  $0x40, DI
	ADDL  AX, (SI)
	MOVL  (SI), AX
	ADDL  BX, 4(SI)
	MOVL  4(SI), BX
	ADDL  CX, 8(SI)
	MOVL  8(SI), CX
	ADDL  R8, 12(SI)
	MOVL  12(SI), R8
	ADDL  DX, 16(SI)
	MOVL  16(SI), DX
	ADDL  R9, 20(SI)
	MOVL  20(SI), R9
	ADDL  R10, 24(SI)
	MOVL  24(SI), R10
	ADDL  R11, 28(SI)
	MOVL  28(SI), R11
	CMPQ  512(SP), DI
	JA    avx2_loop0  // >= 2 blocks remain
	JB    done_hash   // none remain; equal falls through: 1 block left

avx2_do_last_block:
	// A single trailing block: load 64 bytes into XMM halves so the
	// schedule's low lanes carry the data (high lanes are unused).
	VMOVDQU (DI), X4
	VMOVDQU 16(DI), X5
	VMOVDQU 32(DI), X6
	VMOVDQU 48(DI), X7
	VMOVDQU flip_mask<>+0(SB), Y13
	VPSHUFB X13, X4, X4
	VPSHUFB X13, X5, X5
	VPSHUFB X13, X6, X6
	VPSHUFB X13, X7, X7
	LEAQ    K256<>+0(SB), BP
	JMP     avx2_last_block_enter

avx2_only_one_block:
	// p holds exactly one block: load the digest here (the main path's
	// load was skipped) and reuse the single-block tail above.
	MOVL (SI), AX
	MOVL 4(SI), BX
	MOVL 8(SI), CX
	MOVL 12(SI), R8
	MOVL 16(SI), DX
	MOVL 20(SI), R9
	MOVL 24(SI), R10
	MOVL 28(SI), R11
	JMP  avx2_do_last_block

done_hash:
	VZEROUPPER  // avoid AVX->SSE transition penalties in callers
	RET
1140  
// VPSHUFB control mask that byte-swaps each 32-bit word within a lane,
// converting the little-endian message load to SHA-256's big-endian
// word order. Duplicated across both 128-bit halves for YMM use.
DATA flip_mask<>+0(SB)/8, $0x0405060700010203
DATA flip_mask<>+8(SB)/8, $0x0c0d0e0f08090a0b
DATA flip_mask<>+16(SB)/8, $0x0405060700010203
DATA flip_mask<>+24(SB)/8, $0x0c0d0e0f08090a0b
GLOBL flip_mask<>(SB), RODATA, $32
1146  
1147  DATA K256<>+0(SB)/4, $0x428a2f98
1148  DATA K256<>+4(SB)/4, $0x71374491
1149  DATA K256<>+8(SB)/4, $0xb5c0fbcf
1150  DATA K256<>+12(SB)/4, $0xe9b5dba5
1151  DATA K256<>+16(SB)/4, $0x428a2f98
1152  DATA K256<>+20(SB)/4, $0x71374491
1153  DATA K256<>+24(SB)/4, $0xb5c0fbcf
1154  DATA K256<>+28(SB)/4, $0xe9b5dba5
1155  DATA K256<>+32(SB)/4, $0x3956c25b
1156  DATA K256<>+36(SB)/4, $0x59f111f1
1157  DATA K256<>+40(SB)/4, $0x923f82a4
1158  DATA K256<>+44(SB)/4, $0xab1c5ed5
1159  DATA K256<>+48(SB)/4, $0x3956c25b
1160  DATA K256<>+52(SB)/4, $0x59f111f1
1161  DATA K256<>+56(SB)/4, $0x923f82a4
1162  DATA K256<>+60(SB)/4, $0xab1c5ed5
1163  DATA K256<>+64(SB)/4, $0xd807aa98
1164  DATA K256<>+68(SB)/4, $0x12835b01
1165  DATA K256<>+72(SB)/4, $0x243185be
1166  DATA K256<>+76(SB)/4, $0x550c7dc3
1167  DATA K256<>+80(SB)/4, $0xd807aa98
1168  DATA K256<>+84(SB)/4, $0x12835b01
1169  DATA K256<>+88(SB)/4, $0x243185be
1170  DATA K256<>+92(SB)/4, $0x550c7dc3
1171  DATA K256<>+96(SB)/4, $0x72be5d74
1172  DATA K256<>+100(SB)/4, $0x80deb1fe
1173  DATA K256<>+104(SB)/4, $0x9bdc06a7
1174  DATA K256<>+108(SB)/4, $0xc19bf174
1175  DATA K256<>+112(SB)/4, $0x72be5d74
1176  DATA K256<>+116(SB)/4, $0x80deb1fe
1177  DATA K256<>+120(SB)/4, $0x9bdc06a7
1178  DATA K256<>+124(SB)/4, $0xc19bf174
1179  DATA K256<>+128(SB)/4, $0xe49b69c1
1180  DATA K256<>+132(SB)/4, $0xefbe4786
1181  DATA K256<>+136(SB)/4, $0x0fc19dc6
1182  DATA K256<>+140(SB)/4, $0x240ca1cc
1183  DATA K256<>+144(SB)/4, $0xe49b69c1
1184  DATA K256<>+148(SB)/4, $0xefbe4786
1185  DATA K256<>+152(SB)/4, $0x0fc19dc6
1186  DATA K256<>+156(SB)/4, $0x240ca1cc
1187  DATA K256<>+160(SB)/4, $0x2de92c6f
1188  DATA K256<>+164(SB)/4, $0x4a7484aa
1189  DATA K256<>+168(SB)/4, $0x5cb0a9dc
1190  DATA K256<>+172(SB)/4, $0x76f988da
1191  DATA K256<>+176(SB)/4, $0x2de92c6f
1192  DATA K256<>+180(SB)/4, $0x4a7484aa
1193  DATA K256<>+184(SB)/4, $0x5cb0a9dc
1194  DATA K256<>+188(SB)/4, $0x76f988da
1195  DATA K256<>+192(SB)/4, $0x983e5152
1196  DATA K256<>+196(SB)/4, $0xa831c66d
1197  DATA K256<>+200(SB)/4, $0xb00327c8
1198  DATA K256<>+204(SB)/4, $0xbf597fc7
1199  DATA K256<>+208(SB)/4, $0x983e5152
1200  DATA K256<>+212(SB)/4, $0xa831c66d
1201  DATA K256<>+216(SB)/4, $0xb00327c8
1202  DATA K256<>+220(SB)/4, $0xbf597fc7
1203  DATA K256<>+224(SB)/4, $0xc6e00bf3
1204  DATA K256<>+228(SB)/4, $0xd5a79147
1205  DATA K256<>+232(SB)/4, $0x06ca6351
1206  DATA K256<>+236(SB)/4, $0x14292967
1207  DATA K256<>+240(SB)/4, $0xc6e00bf3
1208  DATA K256<>+244(SB)/4, $0xd5a79147
1209  DATA K256<>+248(SB)/4, $0x06ca6351
1210  DATA K256<>+252(SB)/4, $0x14292967
1211  DATA K256<>+256(SB)/4, $0x27b70a85
1212  DATA K256<>+260(SB)/4, $0x2e1b2138
1213  DATA K256<>+264(SB)/4, $0x4d2c6dfc
1214  DATA K256<>+268(SB)/4, $0x53380d13
1215  DATA K256<>+272(SB)/4, $0x27b70a85
1216  DATA K256<>+276(SB)/4, $0x2e1b2138
1217  DATA K256<>+280(SB)/4, $0x4d2c6dfc
1218  DATA K256<>+284(SB)/4, $0x53380d13
1219  DATA K256<>+288(SB)/4, $0x650a7354
1220  DATA K256<>+292(SB)/4, $0x766a0abb
1221  DATA K256<>+296(SB)/4, $0x81c2c92e
1222  DATA K256<>+300(SB)/4, $0x92722c85
1223  DATA K256<>+304(SB)/4, $0x650a7354
1224  DATA K256<>+308(SB)/4, $0x766a0abb
1225  DATA K256<>+312(SB)/4, $0x81c2c92e
1226  DATA K256<>+316(SB)/4, $0x92722c85
1227  DATA K256<>+320(SB)/4, $0xa2bfe8a1
1228  DATA K256<>+324(SB)/4, $0xa81a664b
1229  DATA K256<>+328(SB)/4, $0xc24b8b70
1230  DATA K256<>+332(SB)/4, $0xc76c51a3
1231  DATA K256<>+336(SB)/4, $0xa2bfe8a1
1232  DATA K256<>+340(SB)/4, $0xa81a664b
1233  DATA K256<>+344(SB)/4, $0xc24b8b70
1234  DATA K256<>+348(SB)/4, $0xc76c51a3
1235  DATA K256<>+352(SB)/4, $0xd192e819
1236  DATA K256<>+356(SB)/4, $0xd6990624
1237  DATA K256<>+360(SB)/4, $0xf40e3585
1238  DATA K256<>+364(SB)/4, $0x106aa070
1239  DATA K256<>+368(SB)/4, $0xd192e819
1240  DATA K256<>+372(SB)/4, $0xd6990624
1241  DATA K256<>+376(SB)/4, $0xf40e3585
1242  DATA K256<>+380(SB)/4, $0x106aa070
1243  DATA K256<>+384(SB)/4, $0x19a4c116
1244  DATA K256<>+388(SB)/4, $0x1e376c08
1245  DATA K256<>+392(SB)/4, $0x2748774c
1246  DATA K256<>+396(SB)/4, $0x34b0bcb5
1247  DATA K256<>+400(SB)/4, $0x19a4c116
1248  DATA K256<>+404(SB)/4, $0x1e376c08
1249  DATA K256<>+408(SB)/4, $0x2748774c
1250  DATA K256<>+412(SB)/4, $0x34b0bcb5
1251  DATA K256<>+416(SB)/4, $0x391c0cb3
1252  DATA K256<>+420(SB)/4, $0x4ed8aa4a
1253  DATA K256<>+424(SB)/4, $0x5b9cca4f
1254  DATA K256<>+428(SB)/4, $0x682e6ff3
1255  DATA K256<>+432(SB)/4, $0x391c0cb3
1256  DATA K256<>+436(SB)/4, $0x4ed8aa4a
1257  DATA K256<>+440(SB)/4, $0x5b9cca4f
1258  DATA K256<>+444(SB)/4, $0x682e6ff3
1259  DATA K256<>+448(SB)/4, $0x748f82ee
1260  DATA K256<>+452(SB)/4, $0x78a5636f
1261  DATA K256<>+456(SB)/4, $0x84c87814
1262  DATA K256<>+460(SB)/4, $0x8cc70208
1263  DATA K256<>+464(SB)/4, $0x748f82ee
1264  DATA K256<>+468(SB)/4, $0x78a5636f
1265  DATA K256<>+472(SB)/4, $0x84c87814
1266  DATA K256<>+476(SB)/4, $0x8cc70208
1267  DATA K256<>+480(SB)/4, $0x90befffa
1268  DATA K256<>+484(SB)/4, $0xa4506ceb
1269  DATA K256<>+488(SB)/4, $0xbef9a3f7
1270  DATA K256<>+492(SB)/4, $0xc67178f2
1271  DATA K256<>+496(SB)/4, $0x90befffa
1272  DATA K256<>+500(SB)/4, $0xa4506ceb
1273  DATA K256<>+504(SB)/4, $0xbef9a3f7
1274  DATA K256<>+508(SB)/4, $0xc67178f2
1275  GLOBL K256<>(SB), RODATA|NOPTR, $512
1276  
// shuff_00BA: PSHUFB control mask. Within each 128-bit lane it gathers
// source bytes 0-3 and 8-11 (i.e. dwords 0 and 2) into the low qword;
// the 0xff bytes have their high bit set, so PSHUFB writes zeroes into
// the high qword. NOTE(review): referenced by the AVX2 message-schedule
// code outside this chunk — confirm usage there.
DATA shuff_00BA<>+0(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+8(SB)/8, $0xffffffffffffffff
DATA shuff_00BA<>+16(SB)/8, $0x0b0a090803020100
DATA shuff_00BA<>+24(SB)/8, $0xffffffffffffffff
GLOBL shuff_00BA<>(SB), RODATA, $32
1282  
// shuff_DC00: mirror of shuff_00BA. Within each 128-bit lane the 0xff
// bytes (high bit set) zero the low qword, and source bytes 0-3 and
// 8-11 (dwords 0 and 2) are placed into the high qword by PSHUFB.
// NOTE(review): referenced by the AVX2 message-schedule code outside
// this chunk — confirm usage there.
DATA shuff_DC00<>+0(SB)/8, $0xffffffffffffffff
DATA shuff_DC00<>+8(SB)/8, $0x0b0a090803020100
DATA shuff_DC00<>+16(SB)/8, $0xffffffffffffffff
DATA shuff_DC00<>+24(SB)/8, $0x0b0a090803020100
GLOBL shuff_DC00<>(SB), RODATA, $32
1288  
// func blockSHANI(dig *Digest, p []byte)
// Requires: AVX, SHA, SSE2, SSE4.1, SSSE3
//
// SHA-256 block function using the x86 SHA extensions (SHA-NI).
// Processes every whole 64-byte block of p, updating the digest in place.
//
// Register roles:
//   DI    - pointer to the digest state (8 x 32-bit words, 32 bytes)
//   SI    - current read position in p
//   DX    - end of the last whole 64-byte block (loop bound)
//   AX    - base of the K256<> round-constant table
//   X1,X2 - running hash state, held in the two-register layout consumed
//           by SHA256RNDS2 (NOTE(review): this is presumably the ABEF/CDGH
//           layout from the Intel SDM — confirm against the instruction
//           reference)
//   X3-X6 - rolling 4x16-byte message-schedule window
//   X8    - flip_mask<> byte-swap mask applied via PSHUFB on each load
//   X9,X10- saved state for the end-of-block feed-forward addition
TEXT ·blockSHANI(SB), $0-32
	MOVQ    dig+0(FP), DI
	MOVQ    p_base+8(FP), SI
	MOVQ    p_len+16(FP), DX
	// Round the length down to a multiple of 64; if no complete block
	// remains, return without touching the digest.
	SHRQ    $0x06, DX
	SHLQ    $0x06, DX
	CMPQ    DX, $0x00
	JEQ     done
	ADDQ    SI, DX                  // DX = p + usable length (loop end)
	// Load the digest and shuffle the eight words across X1/X2 into the
	// register layout SHA256RNDS2 operates on; the inverse shuffle is
	// applied before the store at the end.
	VMOVDQU (DI), X1
	VMOVDQU 16(DI), X2
	PSHUFD  $0xb1, X1, X1
	PSHUFD  $0x1b, X2, X2
	VMOVDQA X1, X7
	PALIGNR $0x08, X2, X1
	PBLENDW $0xf0, X7, X2
	VMOVDQA flip_mask<>+0(SB), X8   // big-endian byte-swap mask for message words
	LEAQ    K256<>+0(SB), AX        // round constants

roundLoop:
	// save hash values for addition after rounds
	VMOVDQA X1, X9
	VMOVDQA X2, X10

	// do rounds 0-59
	// Each group below: load/byte-swap a 16-byte message word into one of
	// X3-X6, add the matching K256 constants, run 2x SHA256RNDS2 (low then
	// high qword of X0 via PSHUFD $0x0e), and advance the message schedule
	// with PALIGNR/PADDD + SHA256MSG1/SHA256MSG2.
	VMOVDQU     (SI), X0
	PSHUFB      X8, X0
	VMOVDQA     X0, X3
	PADDD       (AX), X0
	SHA256RNDS2 X0, X1, X2
	PSHUFD      $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	VMOVDQU     16(SI), X0
	PSHUFB      X8, X0
	VMOVDQA     X0, X4
	PADDD       32(AX), X0
	SHA256RNDS2 X0, X1, X2
	PSHUFD      $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1  X4, X3
	VMOVDQU     32(SI), X0
	PSHUFB      X8, X0
	VMOVDQA     X0, X5
	PADDD       64(AX), X0
	SHA256RNDS2 X0, X1, X2
	PSHUFD      $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1  X5, X4
	VMOVDQU     48(SI), X0
	PSHUFB      X8, X0
	VMOVDQA     X0, X6
	PADDD       96(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA     X6, X7
	PALIGNR     $0x04, X5, X7
	PADDD       X7, X3
	SHA256MSG2  X6, X3
	PSHUFD      $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1  X6, X5
	// From here on the schedule is self-feeding: X3-X6 rotate through the
	// roles of "oldest" to "newest" message word every four rounds.
	VMOVDQA     X3, X0
	PADDD       128(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA     X3, X7
	PALIGNR     $0x04, X6, X7
	PADDD       X7, X4
	SHA256MSG2  X3, X4
	PSHUFD      $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1  X3, X6
	VMOVDQA     X4, X0
	PADDD       160(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA     X4, X7
	PALIGNR     $0x04, X3, X7
	PADDD       X7, X5
	SHA256MSG2  X4, X5
	PSHUFD      $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1  X4, X3
	VMOVDQA     X5, X0
	PADDD       192(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA     X5, X7
	PALIGNR     $0x04, X4, X7
	PADDD       X7, X6
	SHA256MSG2  X5, X6
	PSHUFD      $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1  X5, X4
	VMOVDQA     X6, X0
	PADDD       224(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA     X6, X7
	PALIGNR     $0x04, X5, X7
	PADDD       X7, X3
	SHA256MSG2  X6, X3
	PSHUFD      $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1  X6, X5
	VMOVDQA     X3, X0
	PADDD       256(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA     X3, X7
	PALIGNR     $0x04, X6, X7
	PADDD       X7, X4
	SHA256MSG2  X3, X4
	PSHUFD      $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1  X3, X6
	VMOVDQA     X4, X0
	PADDD       288(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA     X4, X7
	PALIGNR     $0x04, X3, X7
	PADDD       X7, X5
	SHA256MSG2  X4, X5
	PSHUFD      $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1  X4, X3
	VMOVDQA     X5, X0
	PADDD       320(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA     X5, X7
	PALIGNR     $0x04, X4, X7
	PADDD       X7, X6
	SHA256MSG2  X5, X6
	PSHUFD      $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1  X5, X4
	VMOVDQA     X6, X0
	PADDD       352(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA     X6, X7
	PALIGNR     $0x04, X5, X7
	PADDD       X7, X3
	SHA256MSG2  X6, X3
	PSHUFD      $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1  X6, X5
	VMOVDQA     X3, X0
	PADDD       384(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA     X3, X7
	PALIGNR     $0x04, X6, X7
	PADDD       X7, X4
	SHA256MSG2  X3, X4
	PSHUFD      $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	SHA256MSG1  X3, X6
	VMOVDQA     X4, X0
	PADDD       416(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA     X4, X7
	PALIGNR     $0x04, X3, X7
	PADDD       X7, X5
	SHA256MSG2  X4, X5
	PSHUFD      $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1
	VMOVDQA     X5, X0
	PADDD       448(AX), X0
	SHA256RNDS2 X0, X1, X2
	VMOVDQA     X5, X7
	PALIGNR     $0x04, X4, X7
	PADDD       X7, X6
	SHA256MSG2  X5, X6
	PSHUFD      $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1

	// do rounds 60-63
	// Final group: no further schedule updates are needed past round 63.
	VMOVDQA     X6, X0
	PADDD       480(AX), X0
	SHA256RNDS2 X0, X1, X2
	PSHUFD      $0x0e, X0, X0
	SHA256RNDS2 X0, X2, X1

	// add current hash values with previously saved
	// (the per-block feed-forward of the compression function)
	PADDD X9, X1
	PADDD X10, X2

	// advance data pointer; loop until buffer empty
	ADDQ $0x40, SI
	CMPQ DX, SI
	JNE  roundLoop

	// write hash values back in the correct order
	// (inverse of the entry shuffle, then unaligned stores to *dig)
	PSHUFD  $0x1b, X1, X1
	PSHUFD  $0xb1, X2, X2
	VMOVDQA X1, X7
	PBLENDW $0xf0, X2, X1
	PALIGNR $0x08, X7, X2
	VMOVDQU X1, (DI)
	VMOVDQU X2, 16(DI)

done:
	RET
1487