blocks_amd64.s raw

   1  // Code generated by command: go run gen_amd64_compress_asm.go -out ../compress/blocks_amd64.s -stubs ../compress/blocks_amd64.go -pkg compress. DO NOT EDIT.
   2  
   3  //go:build !purego
   4  
   5  #include "textflag.h"
   6  
   // first_8_blake_consts: 32-byte read-only table (RODATA|NOPTR); the <> suffix
   // makes the symbol local to this file. It is written as four little-endian
   // quadwords, which gives eight 32-bit words in memory order:
   //   +0:  0x243f6a88 0x85a308d3 0x13198a2e 0x03707344
   //   +16: 0xa4093822 0x299f31d0 0x082efa98 0xec4e6c89
   // At the top of compressLoop, blocksSSE2 XORs the 16 bytes at +0 with X0
   // (loaded from 32(AX)) to build row2, and XORs the 16 bytes at +16 with the
   // counter words spread by PSHUFD $0x50 to build row3; the row2/row3 comment
   // there labels these words c0..c7.
   // NOTE(review): presumably the first eight BLAKE-256 round constants; confirm
   // against the generator named in the file header.
   7  DATA first_8_blake_consts<>+0(SB)/8, $0x85a308d3243f6a88
   8  DATA first_8_blake_consts<>+8(SB)/8, $0x0370734413198a2e
   9  DATA first_8_blake_consts<>+16(SB)/8, $0x299f31d0a4093822
  10  DATA first_8_blake_consts<>+24(SB)/8, $0xec4e6c89082efa98
  11  GLOBL first_8_blake_consts<>(SB), RODATA|NOPTR, $32
  12  
  // permuted_blake_consts: 640-byte read-only table (RODATA|NOPTR), local to
  // this file. blocksSSE2 consumes it in 16-byte vectors: every column step
  // and every diagonal step loads two consecutive vectors with MOVOU, XORs
  // each with four message words gathered from the stack scratch area, and
  // adds the result into X1. One round therefore uses 64 bytes, and the table
  // holds ten such 64-byte groups.
  //
  // Every 32-bit word in the table is one of sixteen values: the eight words
  // of first_8_blake_consts plus 0x452821e6, 0x38d01377, 0xbe5466cf,
  // 0x34e90c6c, 0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5 and 0xb5470917. Only the
  // order of the words differs from vector to vector.
  // NOTE(review): presumably the order follows the BLAKE per-round permutation
  // so that each constant lines up with the message word it is paired with;
  // confirm against the generator named in the file header.
  //
  // +0..+63: Round 1 (column step: +0, +16; diagonal step: +32, +48).
  13  DATA permuted_blake_consts<>+0(SB)/8, $0x0370734485a308d3
  14  DATA permuted_blake_consts<>+8(SB)/8, $0xec4e6c89299f31d0
  15  DATA permuted_blake_consts<>+16(SB)/8, $0x13198a2e243f6a88
  16  DATA permuted_blake_consts<>+24(SB)/8, $0x082efa98a4093822
  17  DATA permuted_blake_consts<>+32(SB)/8, $0x34e90c6c38d01377
  18  DATA permuted_blake_consts<>+40(SB)/8, $0xb5470917c97c50dd
  19  DATA permuted_blake_consts<>+48(SB)/8, $0xbe5466cf452821e6
  20  DATA permuted_blake_consts<>+56(SB)/8, $0x3f84d5b5c0ac29b7
  // +64..+127: Round 2 (column step: +64, +80; diagonal step: +96, +112).
  21  DATA permuted_blake_consts<>+64(SB)/8, $0x452821e6be5466cf
  22  DATA permuted_blake_consts<>+72(SB)/8, $0x082efa98b5470917
  23  DATA permuted_blake_consts<>+80(SB)/8, $0xa40938223f84d5b5
  24  DATA permuted_blake_consts<>+88(SB)/8, $0xc97c50dd38d01377
  25  DATA permuted_blake_consts<>+96(SB)/8, $0x13198a2ec0ac29b7
  26  DATA permuted_blake_consts<>+104(SB)/8, $0x03707344ec4e6c89
  27  DATA permuted_blake_consts<>+112(SB)/8, $0x243f6a8885a308d3
  28  DATA permuted_blake_consts<>+120(SB)/8, $0x299f31d034e90c6c
  // +128..+191: Round 3 (column step: +128, +144; diagonal step: +160, +176).
  29  DATA permuted_blake_consts<>+128(SB)/8, $0x243f6a88452821e6
  30  DATA permuted_blake_consts<>+136(SB)/8, $0xc97c50dd13198a2e
  31  DATA permuted_blake_consts<>+144(SB)/8, $0xc0ac29b734e90c6c
  32  DATA permuted_blake_consts<>+152(SB)/8, $0xb5470917299f31d0
  33  DATA permuted_blake_consts<>+160(SB)/8, $0x082efa983f84d5b5
  34  DATA permuted_blake_consts<>+168(SB)/8, $0xa409382285a308d3
  35  DATA permuted_blake_consts<>+176(SB)/8, $0x03707344be5466cf
  36  DATA permuted_blake_consts<>+184(SB)/8, $0x38d01377ec4e6c89
  // +192..+255: Round 4 (column step: +192, +208; diagonal step: +224, +240).
  37  DATA permuted_blake_consts<>+192(SB)/8, $0x85a308d338d01377
  38  DATA permuted_blake_consts<>+200(SB)/8, $0x3f84d5b5c0ac29b7
  39  DATA permuted_blake_consts<>+208(SB)/8, $0x03707344ec4e6c89
  40  DATA permuted_blake_consts<>+216(SB)/8, $0x34e90c6cc97c50dd
  41  DATA permuted_blake_consts<>+224(SB)/8, $0xbe5466cf082efa98
  42  DATA permuted_blake_consts<>+232(SB)/8, $0x452821e6243f6a88
  43  DATA permuted_blake_consts<>+240(SB)/8, $0x299f31d013198a2e
  44  DATA permuted_blake_consts<>+248(SB)/8, $0xb5470917a4093822
  // +256..+319: Round 5 (column step: +256, +272; diagonal step: +288, +304).
  45  DATA permuted_blake_consts<>+256(SB)/8, $0xec4e6c89243f6a88
  46  DATA permuted_blake_consts<>+264(SB)/8, $0xb5470917a4093822
  47  DATA permuted_blake_consts<>+272(SB)/8, $0x299f31d038d01377
  48  DATA permuted_blake_consts<>+280(SB)/8, $0xbe5466cf13198a2e
  49  DATA permuted_blake_consts<>+288(SB)/8, $0xc0ac29b785a308d3
  50  DATA permuted_blake_consts<>+296(SB)/8, $0xc97c50dd452821e6
  51  DATA permuted_blake_consts<>+304(SB)/8, $0x34e90c6c3f84d5b5
  52  DATA permuted_blake_consts<>+312(SB)/8, $0x03707344082efa98
  // +320..+383: Round 6 (column step: +320, +336; diagonal step: +352, +368).
  53  DATA permuted_blake_consts<>+320(SB)/8, $0xbe5466cfc0ac29b7
  54  DATA permuted_blake_consts<>+328(SB)/8, $0x0370734434e90c6c
  55  DATA permuted_blake_consts<>+336(SB)/8, $0x082efa9813198a2e
  56  DATA permuted_blake_consts<>+344(SB)/8, $0x452821e6243f6a88
  57  DATA permuted_blake_consts<>+352(SB)/8, $0x299f31d0c97c50dd
  58  DATA permuted_blake_consts<>+360(SB)/8, $0x38d013773f84d5b5
  59  DATA permuted_blake_consts<>+368(SB)/8, $0xec4e6c89a4093822
  60  DATA permuted_blake_consts<>+376(SB)/8, $0x85a308d3b5470917
  // +384..+447: Round 7 (column step: +384, +400; diagonal step: +416, +432).
  61  DATA permuted_blake_consts<>+384(SB)/8, $0xb5470917299f31d0
  62  DATA permuted_blake_consts<>+392(SB)/8, $0xbe5466cfc97c50dd
  63  DATA permuted_blake_consts<>+400(SB)/8, $0x85a308d3c0ac29b7
  64  DATA permuted_blake_consts<>+408(SB)/8, $0xa40938223f84d5b5
  65  DATA permuted_blake_consts<>+416(SB)/8, $0x03707344ec4e6c89
  66  DATA permuted_blake_consts<>+424(SB)/8, $0x34e90c6c13198a2e
  67  DATA permuted_blake_consts<>+432(SB)/8, $0x082efa98243f6a88
  68  DATA permuted_blake_consts<>+440(SB)/8, $0x452821e638d01377
  // +448..+511: Round 8 (column step: +448, +464; diagonal step: +480, +496).
  69  DATA permuted_blake_consts<>+448(SB)/8, $0x3f84d5b534e90c6c
  70  DATA permuted_blake_consts<>+456(SB)/8, $0x38d0137785a308d3
  71  DATA permuted_blake_consts<>+464(SB)/8, $0xec4e6c89c97c50dd
  72  DATA permuted_blake_consts<>+472(SB)/8, $0x03707344c0ac29b7
  73  DATA permuted_blake_consts<>+480(SB)/8, $0xa4093822243f6a88
  74  DATA permuted_blake_consts<>+488(SB)/8, $0xbe5466cf082efa98
  75  DATA permuted_blake_consts<>+496(SB)/8, $0xb5470917299f31d0
  76  DATA permuted_blake_consts<>+504(SB)/8, $0x13198a2e452821e6
  // +512..+575: Round 9 (column step: +512, +528; diagonal step: +544, +560).
  77  DATA permuted_blake_consts<>+512(SB)/8, $0x38d01377b5470917
  78  DATA permuted_blake_consts<>+520(SB)/8, $0x452821e603707344
  79  DATA permuted_blake_consts<>+528(SB)/8, $0x3f84d5b5082efa98
  80  DATA permuted_blake_consts<>+536(SB)/8, $0x243f6a8834e90c6c
  81  DATA permuted_blake_consts<>+544(SB)/8, $0xec4e6c8913198a2e
  82  DATA permuted_blake_consts<>+552(SB)/8, $0x299f31d0a4093822
  83  DATA permuted_blake_consts<>+560(SB)/8, $0xc97c50ddc0ac29b7
  84  DATA permuted_blake_consts<>+568(SB)/8, $0xbe5466cf85a308d3
  // +576..+639: last 64-byte group; presumably Round 10, TODO confirm against
  // the remainder of blocksSSE2.
  85  DATA permuted_blake_consts<>+576(SB)/8, $0xa409382213198a2e
  86  DATA permuted_blake_consts<>+584(SB)/8, $0x299f31d0082efa98
  87  DATA permuted_blake_consts<>+592(SB)/8, $0x452821e6be5466cf
  88  DATA permuted_blake_consts<>+600(SB)/8, $0x85a308d3ec4e6c89
  89  DATA permuted_blake_consts<>+608(SB)/8, $0x3f84d5b534e90c6c
  90  DATA permuted_blake_consts<>+616(SB)/8, $0x243f6a88c0ac29b7
  91  DATA permuted_blake_consts<>+624(SB)/8, $0x38d01377b5470917
  92  DATA permuted_blake_consts<>+632(SB)/8, $0xc97c50dd03707344
  93  GLOBL permuted_blake_consts<>(SB), RODATA|NOPTR, $640
  94  
  // shuffle_rotr8_4x32: 16-byte read-only, file-local table of byte indices.
  // In memory order the bytes are
  //   01 02 03 00 | 05 06 07 04 | 09 0a 0b 08 | 0d 0e 0f 0c
  // so, read as a byte-shuffle control, byte i of each 32-bit lane selects
  // source byte (i+1) mod 4 of the same lane, which rotates a little-endian
  // 32-bit lane right by 8 bits.
  // NOTE(review): blocksSSE2 rotates with PSRLL/PSLLL/PXOR and does not use this
  // mask in its visible part; presumably it is a PSHUFB operand for another
  // variant elsewhere in this file. TODO confirm.
  95  DATA shuffle_rotr8_4x32<>+0(SB)/8, $0x0407060500030201
  96  DATA shuffle_rotr8_4x32<>+8(SB)/8, $0x0c0f0e0d080b0a09
  97  GLOBL shuffle_rotr8_4x32<>(SB), RODATA|NOPTR, $16
  98  
  // shuffle_rotr16_4x32: 16-byte read-only, file-local table of byte indices.
  // In memory order the bytes are
  //   02 03 00 01 | 06 07 04 05 | 0a 0b 08 09 | 0e 0f 0c 0d
  // so, read as a byte-shuffle control, the two 16-bit halves of each 32-bit
  // lane are exchanged, which is a rotation of the lane by 16 bits.
  // NOTE(review): blocksSSE2 rotates with PSRLL/PSLLL/PXOR and does not use this
  // mask in its visible part; presumably it is a PSHUFB operand for another
  // variant elsewhere in this file. TODO confirm.
  99  DATA shuffle_rotr16_4x32<>+0(SB)/8, $0x0504070601000302
 100  DATA shuffle_rotr16_4x32<>+8(SB)/8, $0x0d0c0f0e09080b0a
 101  GLOBL shuffle_rotr16_4x32<>(SB), RODATA|NOPTR, $16
 102  
 // shuffle_le_to_be_4x32: 16-byte read-only, file-local table of byte indices.
 // In memory order the bytes are
 //   03 02 01 00 | 07 06 05 04 | 0b 0a 09 08 | 0f 0e 0d 0c
 // so, read as a byte-shuffle control, the four bytes of each 32-bit lane are
 // reversed (a per-lane endianness swap).
 // NOTE(review): blocksSSE2 byte-swaps the message with scalar BSWAPL and does
 // not use this mask in its visible part; presumably it is a PSHUFB operand for
 // another variant elsewhere in this file. TODO confirm.
 103  DATA shuffle_le_to_be_4x32<>+0(SB)/8, $0x0405060700010203
 104  DATA shuffle_le_to_be_4x32<>+8(SB)/8, $0x0c0d0e0f08090a0b
 105  GLOBL shuffle_le_to_be_4x32<>(SB), RODATA|NOPTR, $16
 106  
 107  // func blocksSSE2(state *State, msg []byte, counter uint64)
 108  // Requires: SSE2
 109  TEXT ·blocksSSE2(SB), $64-40
 110  	MOVQ state+0(FP), AX
 111  	MOVQ counter+32(FP), CX
 112  	MOVQ msg_base+8(FP), DX
 113  	MOVQ msg_len+16(FP), BX
 114  
 115  	// Convert message len to number of blocks for loop counter.
 116  	SHRQ $0x06, BX
 117  
 118  	// Initialize state matrix.
 119  	// row0 = |v0  v1  v2  v3|   |  h0     h1     h2     h3 |
 120  	// row1 = |v4  v5  v6  v7|   |  h4     h5     h6     h7 |
 121  	MOVOU 32(AX), X0
 122  	MOVOU (AX), X1
 123  	MOVOU 16(AX), X2
 124  
 125  compressLoop:
 126  	// row2 = |v8  v9  va  vb| = |s0^c0  s1^c1  s2^c2  s3^c3|
 127  	// row3 = |vc  vd  ve  vf|   |t0^c4  t0^c5  t1^c6  t1^c7|
 128  	MOVOU  first_8_blake_consts<>+0(SB), X3
 129  	PXOR   X0, X3
 130  	MOVD   CX, X4
 131  	PSHUFD $0x50, X4, X4
 132  	PXOR   first_8_blake_consts<>+16(SB), X4
 133  	MOVO   X1, X5
 134  	MOVO   X2, X6
 135  
 136  	// Convert message to big endian.
 137  	MOVL   (DX), SI
 138  	MOVL   4(DX), DI
 139  	MOVL   8(DX), R8
 140  	MOVL   12(DX), R9
 141  	MOVL   16(DX), R10
 142  	MOVL   20(DX), R11
 143  	MOVL   24(DX), R12
 144  	MOVL   28(DX), R13
 145  	BSWAPL SI
 146  	MOVL   SI, (SP)
 147  	BSWAPL DI
 148  	MOVL   DI, 4(SP)
 149  	BSWAPL R8
 150  	MOVL   R8, 8(SP)
 151  	BSWAPL R9
 152  	MOVL   R9, 12(SP)
 153  	BSWAPL R10
 154  	MOVL   R10, 16(SP)
 155  	BSWAPL R11
 156  	MOVL   R11, 20(SP)
 157  	BSWAPL R12
 158  	MOVL   R12, 24(SP)
 159  	BSWAPL R13
 160  	MOVL   R13, 28(SP)
 161  	MOVL   32(DX), SI
 162  	MOVL   36(DX), DI
 163  	MOVL   40(DX), R8
 164  	MOVL   44(DX), R9
 165  	MOVL   48(DX), R10
 166  	MOVL   52(DX), R11
 167  	MOVL   56(DX), R12
 168  	MOVL   60(DX), R13
 169  	BSWAPL SI
 170  	MOVL   SI, 32(SP)
 171  	BSWAPL DI
 172  	MOVL   DI, 36(SP)
 173  	BSWAPL R8
 174  	MOVL   R8, 40(SP)
 175  	BSWAPL R9
 176  	MOVL   R9, 44(SP)
 177  	BSWAPL R10
 178  	MOVL   R10, 48(SP)
 179  	BSWAPL R11
 180  	MOVL   R11, 52(SP)
 181  	BSWAPL R12
 182  	MOVL   R12, 56(SP)
 183  	BSWAPL R13
 184  	MOVL   R13, 60(SP)
 185  
 186  	// Round 1 column step.
 187  	MOVD       24(SP), X9
 188  	MOVD       16(SP), X7
 189  	MOVOA      X7, X8
 190  	PUNPCKLLQ  X9, X8
 191  	MOVD       8(SP), X7
 192  	MOVD       (SP), X9
 193  	PUNPCKLLQ  X7, X9
 194  	PUNPCKLQDQ X8, X9
 195  	MOVOU      permuted_blake_consts<>+0(SB), X8
 196  	PXOR       X9, X8
 197  	PADDD      X8, X1
 198  	MOVD       28(SP), X9
 199  	MOVD       20(SP), X7
 200  	MOVOA      X7, X8
 201  	PUNPCKLLQ  X9, X8
 202  	MOVD       12(SP), X7
 203  	MOVD       4(SP), X9
 204  	PUNPCKLLQ  X7, X9
 205  	PUNPCKLQDQ X8, X9
 206  	MOVOU      permuted_blake_consts<>+16(SB), X8
 207  	PXOR       X9, X8
 208  	PADDD      X2, X1
 209  	PXOR       X1, X4
 210  	MOVO       X4, X7
 211  	PSRLL      $0x10, X7
 212  	PSLLL      $0x10, X4
 213  	PXOR       X7, X4
 214  	PADDD      X4, X3
 215  	PXOR       X3, X2
 216  	MOVO       X2, X7
 217  	PSRLL      $0x0c, X7
 218  	PSLLL      $0x14, X2
 219  	PXOR       X7, X2
 220  	PADDD      X8, X1
 221  	PADDD      X2, X1
 222  	PXOR       X1, X4
 223  	MOVO       X4, X7
 224  	PSRLL      $0x08, X7
 225  	PSLLL      $0x18, X4
 226  	PXOR       X7, X4
 227  	PADDD      X4, X3
 228  	PXOR       X3, X2
 229  	MOVO       X2, X7
 230  	PSRLL      $0x07, X7
 231  	PSLLL      $0x19, X2
 232  	PXOR       X7, X2
 233  
 234  	// Round 1 diagonal step part 1: diagonalize.
 235  	PSHUFD $0x39, X2, X2
 236  	PSHUFD $0x4e, X3, X3
 237  	PSHUFD $0x93, X4, X4
 238  
 239  	// Round 1 diagonal step part 2: column step.
 240  	MOVD       56(SP), X9
 241  	MOVD       48(SP), X7
 242  	MOVOA      X7, X8
 243  	PUNPCKLLQ  X9, X8
 244  	MOVD       40(SP), X7
 245  	MOVD       32(SP), X9
 246  	PUNPCKLLQ  X7, X9
 247  	PUNPCKLQDQ X8, X9
 248  	MOVOU      permuted_blake_consts<>+32(SB), X8
 249  	PXOR       X9, X8
 250  	PADDD      X8, X1
 251  	MOVD       60(SP), X9
 252  	MOVD       52(SP), X7
 253  	MOVOA      X7, X8
 254  	PUNPCKLLQ  X9, X8
 255  	MOVD       44(SP), X7
 256  	MOVD       36(SP), X9
 257  	PUNPCKLLQ  X7, X9
 258  	PUNPCKLQDQ X8, X9
 259  	MOVOU      permuted_blake_consts<>+48(SB), X8
 260  	PXOR       X9, X8
 261  	PADDD      X2, X1
 262  	PXOR       X1, X4
 263  	MOVO       X4, X7
 264  	PSRLL      $0x10, X7
 265  	PSLLL      $0x10, X4
 266  	PXOR       X7, X4
 267  	PADDD      X4, X3
 268  	PXOR       X3, X2
 269  	MOVO       X2, X7
 270  	PSRLL      $0x0c, X7
 271  	PSLLL      $0x14, X2
 272  	PXOR       X7, X2
 273  	PADDD      X8, X1
 274  	PADDD      X2, X1
 275  	PXOR       X1, X4
 276  	MOVO       X4, X7
 277  	PSRLL      $0x08, X7
 278  	PSLLL      $0x18, X4
 279  	PXOR       X7, X4
 280  	PADDD      X4, X3
 281  	PXOR       X3, X2
 282  	MOVO       X2, X7
 283  	PSRLL      $0x07, X7
 284  	PSLLL      $0x19, X2
 285  	PXOR       X7, X2
 286  
 287  	// Round 1 diagonal step part 3: undiagonalize.
 288  	PSHUFD $0x93, X2, X2
 289  	PSHUFD $0x4e, X3, X3
 290  	PSHUFD $0x39, X4, X4
 291  
 292  	// Round 2 column step.
 293  	MOVD       52(SP), X9
 294  	MOVD       36(SP), X7
 295  	MOVOA      X7, X8
 296  	PUNPCKLLQ  X9, X8
 297  	MOVD       16(SP), X7
 298  	MOVD       56(SP), X9
 299  	PUNPCKLLQ  X7, X9
 300  	PUNPCKLQDQ X8, X9
 301  	MOVOU      permuted_blake_consts<>+64(SB), X8
 302  	PXOR       X9, X8
 303  	PADDD      X8, X1
 304  	MOVD       24(SP), X9
 305  	MOVD       60(SP), X7
 306  	MOVOA      X7, X8
 307  	PUNPCKLLQ  X9, X8
 308  	MOVD       32(SP), X7
 309  	MOVD       40(SP), X9
 310  	PUNPCKLLQ  X7, X9
 311  	PUNPCKLQDQ X8, X9
 312  	MOVOU      permuted_blake_consts<>+80(SB), X8
 313  	PXOR       X9, X8
 314  	PADDD      X2, X1
 315  	PXOR       X1, X4
 316  	MOVO       X4, X7
 317  	PSRLL      $0x10, X7
 318  	PSLLL      $0x10, X4
 319  	PXOR       X7, X4
 320  	PADDD      X4, X3
 321  	PXOR       X3, X2
 322  	MOVO       X2, X7
 323  	PSRLL      $0x0c, X7
 324  	PSLLL      $0x14, X2
 325  	PXOR       X7, X2
 326  	PADDD      X8, X1
 327  	PADDD      X2, X1
 328  	PXOR       X1, X4
 329  	MOVO       X4, X7
 330  	PSRLL      $0x08, X7
 331  	PSLLL      $0x18, X4
 332  	PXOR       X7, X4
 333  	PADDD      X4, X3
 334  	PXOR       X3, X2
 335  	MOVO       X2, X7
 336  	PSRLL      $0x07, X7
 337  	PSLLL      $0x19, X2
 338  	PXOR       X7, X2
 339  
 340  	// Round 2 diagonal step part 1: diagonalize.
 341  	PSHUFD $0x39, X2, X2
 342  	PSHUFD $0x4e, X3, X3
 343  	PSHUFD $0x93, X4, X4
 344  
 345  	// Round 2 diagonal step part 2: column step.
 346  	MOVD       20(SP), X9
 347  	MOVD       44(SP), X7
 348  	MOVOA      X7, X8
 349  	PUNPCKLLQ  X9, X8
 350  	MOVD       (SP), X7
 351  	MOVD       4(SP), X9
 352  	PUNPCKLLQ  X7, X9
 353  	PUNPCKLQDQ X8, X9
 354  	MOVOU      permuted_blake_consts<>+96(SB), X8
 355  	PXOR       X9, X8
 356  	PADDD      X8, X1
 357  	MOVD       12(SP), X9
 358  	MOVD       28(SP), X7
 359  	MOVOA      X7, X8
 360  	PUNPCKLLQ  X9, X8
 361  	MOVD       8(SP), X7
 362  	MOVD       48(SP), X9
 363  	PUNPCKLLQ  X7, X9
 364  	PUNPCKLQDQ X8, X9
 365  	MOVOU      permuted_blake_consts<>+112(SB), X8
 366  	PXOR       X9, X8
 367  	PADDD      X2, X1
 368  	PXOR       X1, X4
 369  	MOVO       X4, X7
 370  	PSRLL      $0x10, X7
 371  	PSLLL      $0x10, X4
 372  	PXOR       X7, X4
 373  	PADDD      X4, X3
 374  	PXOR       X3, X2
 375  	MOVO       X2, X7
 376  	PSRLL      $0x0c, X7
 377  	PSLLL      $0x14, X2
 378  	PXOR       X7, X2
 379  	PADDD      X8, X1
 380  	PADDD      X2, X1
 381  	PXOR       X1, X4
 382  	MOVO       X4, X7
 383  	PSRLL      $0x08, X7
 384  	PSLLL      $0x18, X4
 385  	PXOR       X7, X4
 386  	PADDD      X4, X3
 387  	PXOR       X3, X2
 388  	MOVO       X2, X7
 389  	PSRLL      $0x07, X7
 390  	PSLLL      $0x19, X2
 391  	PXOR       X7, X2
 392  
 393  	// Round 2 diagonal step part 3: undiagonalize.
 394  	PSHUFD $0x93, X2, X2
 395  	PSHUFD $0x4e, X3, X3
 396  	PSHUFD $0x39, X4, X4
 397  
 398  	// Round 3 column step.
 399  	MOVD       60(SP), X9
 400  	MOVD       20(SP), X7
 401  	MOVOA      X7, X8
 402  	PUNPCKLLQ  X9, X8
 403  	MOVD       48(SP), X7
 404  	MOVD       44(SP), X9
 405  	PUNPCKLLQ  X7, X9
 406  	PUNPCKLQDQ X8, X9
 407  	MOVOU      permuted_blake_consts<>+128(SB), X8
 408  	PXOR       X9, X8
 409  	PADDD      X8, X1
 410  	MOVD       52(SP), X9
 411  	MOVD       8(SP), X7
 412  	MOVOA      X7, X8
 413  	PUNPCKLLQ  X9, X8
 414  	MOVD       (SP), X7
 415  	MOVD       32(SP), X9
 416  	PUNPCKLLQ  X7, X9
 417  	PUNPCKLQDQ X8, X9
 418  	MOVOU      permuted_blake_consts<>+144(SB), X8
 419  	PXOR       X9, X8
 420  	PADDD      X2, X1
 421  	PXOR       X1, X4
 422  	MOVO       X4, X7
 423  	PSRLL      $0x10, X7
 424  	PSLLL      $0x10, X4
 425  	PXOR       X7, X4
 426  	PADDD      X4, X3
 427  	PXOR       X3, X2
 428  	MOVO       X2, X7
 429  	PSRLL      $0x0c, X7
 430  	PSLLL      $0x14, X2
 431  	PXOR       X7, X2
 432  	PADDD      X8, X1
 433  	PADDD      X2, X1
 434  	PXOR       X1, X4
 435  	MOVO       X4, X7
 436  	PSRLL      $0x08, X7
 437  	PSLLL      $0x18, X4
 438  	PXOR       X7, X4
 439  	PADDD      X4, X3
 440  	PXOR       X3, X2
 441  	MOVO       X2, X7
 442  	PSRLL      $0x07, X7
 443  	PSLLL      $0x19, X2
 444  	PXOR       X7, X2
 445  
 446  	// Round 3 diagonal step part 1: diagonalize.
 447  	PSHUFD $0x39, X2, X2
 448  	PSHUFD $0x4e, X3, X3
 449  	PSHUFD $0x93, X4, X4
 450  
 451  	// Round 3 diagonal step part 2: column step.
 452  	MOVD       36(SP), X9
 453  	MOVD       28(SP), X7
 454  	MOVOA      X7, X8
 455  	PUNPCKLLQ  X9, X8
 456  	MOVD       12(SP), X7
 457  	MOVD       40(SP), X9
 458  	PUNPCKLLQ  X7, X9
 459  	PUNPCKLQDQ X8, X9
 460  	MOVOU      permuted_blake_consts<>+160(SB), X8
 461  	PXOR       X9, X8
 462  	PADDD      X8, X1
 463  	MOVD       16(SP), X9
 464  	MOVD       4(SP), X7
 465  	MOVOA      X7, X8
 466  	PUNPCKLLQ  X9, X8
 467  	MOVD       24(SP), X7
 468  	MOVD       56(SP), X9
 469  	PUNPCKLLQ  X7, X9
 470  	PUNPCKLQDQ X8, X9
 471  	MOVOU      permuted_blake_consts<>+176(SB), X8
 472  	PXOR       X9, X8
 473  	PADDD      X2, X1
 474  	PXOR       X1, X4
 475  	MOVO       X4, X7
 476  	PSRLL      $0x10, X7
 477  	PSLLL      $0x10, X4
 478  	PXOR       X7, X4
 479  	PADDD      X4, X3
 480  	PXOR       X3, X2
 481  	MOVO       X2, X7
 482  	PSRLL      $0x0c, X7
 483  	PSLLL      $0x14, X2
 484  	PXOR       X7, X2
 485  	PADDD      X8, X1
 486  	PADDD      X2, X1
 487  	PXOR       X1, X4
 488  	MOVO       X4, X7
 489  	PSRLL      $0x08, X7
 490  	PSLLL      $0x18, X4
 491  	PXOR       X7, X4
 492  	PADDD      X4, X3
 493  	PXOR       X3, X2
 494  	MOVO       X2, X7
 495  	PSRLL      $0x07, X7
 496  	PSLLL      $0x19, X2
 497  	PXOR       X7, X2
 498  
 499  	// Round 3 diagonal step part 3: undiagonalize.
 500  	PSHUFD $0x93, X2, X2
 501  	PSHUFD $0x4e, X3, X3
 502  	PSHUFD $0x39, X4, X4
 503  
 504  	// Round 4 column step.
 505  	MOVD       44(SP), X9
 506  	MOVD       52(SP), X7
 507  	MOVOA      X7, X8
 508  	PUNPCKLLQ  X9, X8
 509  	MOVD       12(SP), X7
 510  	MOVD       28(SP), X9
 511  	PUNPCKLLQ  X7, X9
 512  	PUNPCKLQDQ X8, X9
 513  	MOVOU      permuted_blake_consts<>+192(SB), X8
 514  	PXOR       X9, X8
 515  	PADDD      X8, X1
 516  	MOVD       56(SP), X9
 517  	MOVD       48(SP), X7
 518  	MOVOA      X7, X8
 519  	PUNPCKLLQ  X9, X8
 520  	MOVD       4(SP), X7
 521  	MOVD       36(SP), X9
 522  	PUNPCKLLQ  X7, X9
 523  	PUNPCKLQDQ X8, X9
 524  	MOVOU      permuted_blake_consts<>+208(SB), X8
 525  	PXOR       X9, X8
 526  	PADDD      X2, X1
 527  	PXOR       X1, X4
 528  	MOVO       X4, X7
 529  	PSRLL      $0x10, X7
 530  	PSLLL      $0x10, X4
 531  	PXOR       X7, X4
 532  	PADDD      X4, X3
 533  	PXOR       X3, X2
 534  	MOVO       X2, X7
 535  	PSRLL      $0x0c, X7
 536  	PSLLL      $0x14, X2
 537  	PXOR       X7, X2
 538  	PADDD      X8, X1
 539  	PADDD      X2, X1
 540  	PXOR       X1, X4
 541  	MOVO       X4, X7
 542  	PSRLL      $0x08, X7
 543  	PSLLL      $0x18, X4
 544  	PXOR       X7, X4
 545  	PADDD      X4, X3
 546  	PXOR       X3, X2
 547  	MOVO       X2, X7
 548  	PSRLL      $0x07, X7
 549  	PSLLL      $0x19, X2
 550  	PXOR       X7, X2
 551  
 552  	// Round 4 diagonal step part 1: diagonalize.
 553  	PSHUFD $0x39, X2, X2
 554  	PSHUFD $0x4e, X3, X3
 555  	PSHUFD $0x93, X4, X4
 556  
 557  	// Round 4 diagonal step part 2: column step.
 558  	MOVD       60(SP), X9
 559  	MOVD       16(SP), X7
 560  	MOVOA      X7, X8
 561  	PUNPCKLLQ  X9, X8
 562  	MOVD       20(SP), X7
 563  	MOVD       8(SP), X9
 564  	PUNPCKLLQ  X7, X9
 565  	PUNPCKLQDQ X8, X9
 566  	MOVOU      permuted_blake_consts<>+224(SB), X8
 567  	PXOR       X9, X8
 568  	PADDD      X8, X1
 569  	MOVD       32(SP), X9
 570  	MOVD       (SP), X7
 571  	MOVOA      X7, X8
 572  	PUNPCKLLQ  X9, X8
 573  	MOVD       40(SP), X7
 574  	MOVD       24(SP), X9
 575  	PUNPCKLLQ  X7, X9
 576  	PUNPCKLQDQ X8, X9
 577  	MOVOU      permuted_blake_consts<>+240(SB), X8
 578  	PXOR       X9, X8
 579  	PADDD      X2, X1
 580  	PXOR       X1, X4
 581  	MOVO       X4, X7
 582  	PSRLL      $0x10, X7
 583  	PSLLL      $0x10, X4
 584  	PXOR       X7, X4
 585  	PADDD      X4, X3
 586  	PXOR       X3, X2
 587  	MOVO       X2, X7
 588  	PSRLL      $0x0c, X7
 589  	PSLLL      $0x14, X2
 590  	PXOR       X7, X2
 591  	PADDD      X8, X1
 592  	PADDD      X2, X1
 593  	PXOR       X1, X4
 594  	MOVO       X4, X7
 595  	PSRLL      $0x08, X7
 596  	PSLLL      $0x18, X4
 597  	PXOR       X7, X4
 598  	PADDD      X4, X3
 599  	PXOR       X3, X2
 600  	MOVO       X2, X7
 601  	PSRLL      $0x07, X7
 602  	PSLLL      $0x19, X2
 603  	PXOR       X7, X2
 604  
 605  	// Round 4 diagonal step part 3: undiagonalize.
 606  	PSHUFD $0x93, X2, X2
 607  	PSHUFD $0x4e, X3, X3
 608  	PSHUFD $0x39, X4, X4
 609  
 610  	// Round 5 column step.
 611  	MOVD       40(SP), X9
 612  	MOVD       8(SP), X7
 613  	MOVOA      X7, X8
 614  	PUNPCKLLQ  X9, X8
 615  	MOVD       20(SP), X7
 616  	MOVD       36(SP), X9
 617  	PUNPCKLLQ  X7, X9
 618  	PUNPCKLQDQ X8, X9
 619  	MOVOU      permuted_blake_consts<>+256(SB), X8
 620  	PXOR       X9, X8
 621  	PADDD      X8, X1
 622  	MOVD       60(SP), X9
 623  	MOVD       16(SP), X7
 624  	MOVOA      X7, X8
 625  	PUNPCKLLQ  X9, X8
 626  	MOVD       28(SP), X7
 627  	MOVD       (SP), X9
 628  	PUNPCKLLQ  X7, X9
 629  	PUNPCKLQDQ X8, X9
 630  	MOVOU      permuted_blake_consts<>+272(SB), X8
 631  	PXOR       X9, X8
 632  	PADDD      X2, X1
 633  	PXOR       X1, X4
 634  	MOVO       X4, X7
 635  	PSRLL      $0x10, X7
 636  	PSLLL      $0x10, X4
 637  	PXOR       X7, X4
 638  	PADDD      X4, X3
 639  	PXOR       X3, X2
 640  	MOVO       X2, X7
 641  	PSRLL      $0x0c, X7
 642  	PSLLL      $0x14, X2
 643  	PXOR       X7, X2
 644  	PADDD      X8, X1
 645  	PADDD      X2, X1
 646  	PXOR       X1, X4
 647  	MOVO       X4, X7
 648  	PSRLL      $0x08, X7
 649  	PSLLL      $0x18, X4
 650  	PXOR       X7, X4
 651  	PADDD      X4, X3
 652  	PXOR       X3, X2
 653  	MOVO       X2, X7
 654  	PSRLL      $0x07, X7
 655  	PSLLL      $0x19, X2
 656  	PXOR       X7, X2
 657  
 658  	// Round 5 diagonal step part 1: diagonalize.
 659  	PSHUFD $0x39, X2, X2
 660  	PSHUFD $0x4e, X3, X3
 661  	PSHUFD $0x93, X4, X4
 662  
 663  	// Round 5 diagonal step part 2: column step.
 664  	MOVD       12(SP), X9
 665  	MOVD       24(SP), X7
 666  	MOVOA      X7, X8
 667  	PUNPCKLLQ  X9, X8
 668  	MOVD       44(SP), X7
 669  	MOVD       56(SP), X9
 670  	PUNPCKLLQ  X7, X9
 671  	PUNPCKLQDQ X8, X9
 672  	MOVOU      permuted_blake_consts<>+288(SB), X8
 673  	PXOR       X9, X8
 674  	PADDD      X8, X1
 675  	MOVD       52(SP), X9
 676  	MOVD       32(SP), X7
 677  	MOVOA      X7, X8
 678  	PUNPCKLLQ  X9, X8
 679  	MOVD       48(SP), X7
 680  	MOVD       4(SP), X9
 681  	PUNPCKLLQ  X7, X9
 682  	PUNPCKLQDQ X8, X9
 683  	MOVOU      permuted_blake_consts<>+304(SB), X8
 684  	PXOR       X9, X8
 685  	PADDD      X2, X1
 686  	PXOR       X1, X4
 687  	MOVO       X4, X7
 688  	PSRLL      $0x10, X7
 689  	PSLLL      $0x10, X4
 690  	PXOR       X7, X4
 691  	PADDD      X4, X3
 692  	PXOR       X3, X2
 693  	MOVO       X2, X7
 694  	PSRLL      $0x0c, X7
 695  	PSLLL      $0x14, X2
 696  	PXOR       X7, X2
 697  	PADDD      X8, X1
 698  	PADDD      X2, X1
 699  	PXOR       X1, X4
 700  	MOVO       X4, X7
 701  	PSRLL      $0x08, X7
 702  	PSLLL      $0x18, X4
 703  	PXOR       X7, X4
 704  	PADDD      X4, X3
 705  	PXOR       X3, X2
 706  	MOVO       X2, X7
 707  	PSRLL      $0x07, X7
 708  	PSLLL      $0x19, X2
 709  	PXOR       X7, X2
 710  
 711  	// Round 5 diagonal step part 3: undiagonalize.
 712  	PSHUFD $0x93, X2, X2
 713  	PSHUFD $0x4e, X3, X3
 714  	PSHUFD $0x39, X4, X4
 715  
 716  	// Round 6 column step.
 717  	MOVD       32(SP), X9
 718  	MOVD       (SP), X7
 719  	MOVOA      X7, X8
 720  	PUNPCKLLQ  X9, X8
 721  	MOVD       24(SP), X7
 722  	MOVD       8(SP), X9
 723  	PUNPCKLLQ  X7, X9
 724  	PUNPCKLQDQ X8, X9
 725  	MOVOU      permuted_blake_consts<>+320(SB), X8
 726  	PXOR       X9, X8
 727  	PADDD      X8, X1
 728  	MOVD       12(SP), X9
 729  	MOVD       44(SP), X7
 730  	MOVOA      X7, X8
 731  	PUNPCKLLQ  X9, X8
 732  	MOVD       40(SP), X7
 733  	MOVD       48(SP), X9
 734  	PUNPCKLLQ  X7, X9
 735  	PUNPCKLQDQ X8, X9
 736  	MOVOU      permuted_blake_consts<>+336(SB), X8
 737  	PXOR       X9, X8
 738  	PADDD      X2, X1
 739  	PXOR       X1, X4
 740  	MOVO       X4, X7
 741  	PSRLL      $0x10, X7
 742  	PSLLL      $0x10, X4
 743  	PXOR       X7, X4
 744  	PADDD      X4, X3
 745  	PXOR       X3, X2
 746  	MOVO       X2, X7
 747  	PSRLL      $0x0c, X7
 748  	PSLLL      $0x14, X2
 749  	PXOR       X7, X2
 750  	PADDD      X8, X1
 751  	PADDD      X2, X1
 752  	PXOR       X1, X4
 753  	MOVO       X4, X7
 754  	PSRLL      $0x08, X7
 755  	PSLLL      $0x18, X4
 756  	PXOR       X7, X4
 757  	PADDD      X4, X3
 758  	PXOR       X3, X2
 759  	MOVO       X2, X7
 760  	PSRLL      $0x07, X7
 761  	PSLLL      $0x19, X2
 762  	PXOR       X7, X2
 763  
 764  	// Round 6 diagonal step part 1: diagonalize.
 765  	PSHUFD $0x39, X2, X2
 766  	PSHUFD $0x4e, X3, X3
 767  	PSHUFD $0x93, X4, X4
 768  
 769  	// Round 6 diagonal step part 2: column step.
 770  	MOVD       4(SP), X9
 771  	MOVD       60(SP), X7
 772  	MOVOA      X7, X8
 773  	PUNPCKLLQ  X9, X8
 774  	MOVD       28(SP), X7
 775  	MOVD       16(SP), X9
 776  	PUNPCKLLQ  X7, X9
 777  	PUNPCKLQDQ X8, X9
 778  	MOVOU      permuted_blake_consts<>+352(SB), X8
 779  	PXOR       X9, X8
 780  	PADDD      X8, X1
 781  	MOVD       36(SP), X9
 782  	MOVD       56(SP), X7
 783  	MOVOA      X7, X8
 784  	PUNPCKLLQ  X9, X8
 785  	MOVD       20(SP), X7
 786  	MOVD       52(SP), X9
 787  	PUNPCKLLQ  X7, X9
 788  	PUNPCKLQDQ X8, X9
 789  	MOVOU      permuted_blake_consts<>+368(SB), X8
 790  	PXOR       X9, X8
 791  	PADDD      X2, X1
 792  	PXOR       X1, X4
 793  	MOVO       X4, X7
 794  	PSRLL      $0x10, X7
 795  	PSLLL      $0x10, X4
 796  	PXOR       X7, X4
 797  	PADDD      X4, X3
 798  	PXOR       X3, X2
 799  	MOVO       X2, X7
 800  	PSRLL      $0x0c, X7
 801  	PSLLL      $0x14, X2
 802  	PXOR       X7, X2
 803  	PADDD      X8, X1
 804  	PADDD      X2, X1
 805  	PXOR       X1, X4
 806  	MOVO       X4, X7
 807  	PSRLL      $0x08, X7
 808  	PSLLL      $0x18, X4
 809  	PXOR       X7, X4
 810  	PADDD      X4, X3
 811  	PXOR       X3, X2
 812  	MOVO       X2, X7
 813  	PSRLL      $0x07, X7
 814  	PSLLL      $0x19, X2
 815  	PXOR       X7, X2
 816  
 817  	// Round 6 diagonal step part 3: undiagonalize.
 818  	PSHUFD $0x93, X2, X2
 819  	PSHUFD $0x4e, X3, X3
 820  	PSHUFD $0x39, X4, X4
 821  
 822  	// Round 7 column step.
 823  	MOVD       16(SP), X9
 824  	MOVD       56(SP), X7
 825  	MOVOA      X7, X8
 826  	PUNPCKLLQ  X9, X8
 827  	MOVD       4(SP), X7
 828  	MOVD       48(SP), X9
 829  	PUNPCKLLQ  X7, X9
 830  	PUNPCKLQDQ X8, X9
 831  	MOVOU      permuted_blake_consts<>+384(SB), X8
 832  	PXOR       X9, X8
 833  	PADDD      X8, X1
 834  	MOVD       40(SP), X9
 835  	MOVD       52(SP), X7
 836  	MOVOA      X7, X8
 837  	PUNPCKLLQ  X9, X8
 838  	MOVD       60(SP), X7
 839  	MOVD       20(SP), X9
 840  	PUNPCKLLQ  X7, X9
 841  	PUNPCKLQDQ X8, X9
 842  	MOVOU      permuted_blake_consts<>+400(SB), X8
 843  	PXOR       X9, X8
 844  	PADDD      X2, X1
 845  	PXOR       X1, X4
 846  	MOVO       X4, X7
 847  	PSRLL      $0x10, X7
 848  	PSLLL      $0x10, X4
 849  	PXOR       X7, X4
 850  	PADDD      X4, X3
 851  	PXOR       X3, X2
 852  	MOVO       X2, X7
 853  	PSRLL      $0x0c, X7
 854  	PSLLL      $0x14, X2
 855  	PXOR       X7, X2
 856  	PADDD      X8, X1
 857  	PADDD      X2, X1
 858  	PXOR       X1, X4
 859  	MOVO       X4, X7
 860  	PSRLL      $0x08, X7
 861  	PSLLL      $0x18, X4
 862  	PXOR       X7, X4
 863  	PADDD      X4, X3
 864  	PXOR       X3, X2
 865  	MOVO       X2, X7
 866  	PSRLL      $0x07, X7
 867  	PSLLL      $0x19, X2
 868  	PXOR       X7, X2
 869  
 870  	// Round 7 diagonal step part 1: diagonalize.
 871  	PSHUFD $0x39, X2, X2
 872  	PSHUFD $0x4e, X3, X3
 873  	PSHUFD $0x93, X4, X4
 874  
 875  	// Round 7 diagonal step part 2: column step.
 876  	MOVD       32(SP), X9
 877  	MOVD       36(SP), X7
 878  	MOVOA      X7, X8
 879  	PUNPCKLLQ  X9, X8
 880  	MOVD       24(SP), X7
 881  	MOVD       (SP), X9
 882  	PUNPCKLLQ  X7, X9
 883  	PUNPCKLQDQ X8, X9
 884  	MOVOU      permuted_blake_consts<>+416(SB), X8
 885  	PXOR       X9, X8
 886  	PADDD      X8, X1
 887  	MOVD       44(SP), X9
 888  	MOVD       8(SP), X7
 889  	MOVOA      X7, X8
 890  	PUNPCKLLQ  X9, X8
 891  	MOVD       12(SP), X7
 892  	MOVD       28(SP), X9
 893  	PUNPCKLLQ  X7, X9
 894  	PUNPCKLQDQ X8, X9
 895  	MOVOU      permuted_blake_consts<>+432(SB), X8
 896  	PXOR       X9, X8
 897  	PADDD      X2, X1
 898  	PXOR       X1, X4
 899  	MOVO       X4, X7
 900  	PSRLL      $0x10, X7
 901  	PSLLL      $0x10, X4
 902  	PXOR       X7, X4
 903  	PADDD      X4, X3
 904  	PXOR       X3, X2
 905  	MOVO       X2, X7
 906  	PSRLL      $0x0c, X7
 907  	PSLLL      $0x14, X2
 908  	PXOR       X7, X2
 909  	PADDD      X8, X1
 910  	PADDD      X2, X1
 911  	PXOR       X1, X4
 912  	MOVO       X4, X7
 913  	PSRLL      $0x08, X7
 914  	PSLLL      $0x18, X4
 915  	PXOR       X7, X4
 916  	PADDD      X4, X3
 917  	PXOR       X3, X2
 918  	MOVO       X2, X7
 919  	PSRLL      $0x07, X7
 920  	PSLLL      $0x19, X2
 921  	PXOR       X7, X2
 922  
 923  	// Round 7 diagonal step part 3: undiagonalize.
 924  	PSHUFD $0x93, X2, X2
 925  	PSHUFD $0x4e, X3, X3
 926  	PSHUFD $0x39, X4, X4
 927  
 928  	// Round 8 column step.
 929  	MOVD       12(SP), X9
 930  	MOVD       48(SP), X7
 931  	MOVOA      X7, X8
 932  	PUNPCKLLQ  X9, X8
 933  	MOVD       28(SP), X7
 934  	MOVD       52(SP), X9
 935  	PUNPCKLLQ  X7, X9
 936  	PUNPCKLQDQ X8, X9
 937  	MOVOU      permuted_blake_consts<>+448(SB), X8
 938  	PXOR       X9, X8
 939  	PADDD      X8, X1
 940  	MOVD       36(SP), X9
 941  	MOVD       4(SP), X7
 942  	MOVOA      X7, X8
 943  	PUNPCKLLQ  X9, X8
 944  	MOVD       56(SP), X7
 945  	MOVD       44(SP), X9
 946  	PUNPCKLLQ  X7, X9
 947  	PUNPCKLQDQ X8, X9
 948  	MOVOU      permuted_blake_consts<>+464(SB), X8
 949  	PXOR       X9, X8
 950  	PADDD      X2, X1
 951  	PXOR       X1, X4
 952  	MOVO       X4, X7
 953  	PSRLL      $0x10, X7
 954  	PSLLL      $0x10, X4
 955  	PXOR       X7, X4
 956  	PADDD      X4, X3
 957  	PXOR       X3, X2
 958  	MOVO       X2, X7
 959  	PSRLL      $0x0c, X7
 960  	PSLLL      $0x14, X2
 961  	PXOR       X7, X2
 962  	PADDD      X8, X1
 963  	PADDD      X2, X1
 964  	PXOR       X1, X4
 965  	MOVO       X4, X7
 966  	PSRLL      $0x08, X7
 967  	PSLLL      $0x18, X4
 968  	PXOR       X7, X4
 969  	PADDD      X4, X3
 970  	PXOR       X3, X2
 971  	MOVO       X2, X7
 972  	PSRLL      $0x07, X7
 973  	PSLLL      $0x19, X2
 974  	PXOR       X7, X2
 975  
 976  	// Round 8 diagonal step part 1: diagonalize.
 977  	PSHUFD $0x39, X2, X2
 978  	PSHUFD $0x4e, X3, X3
 979  	PSHUFD $0x93, X4, X4
 980  
 981  	// Round 8 diagonal step part 2: column step.
 982  	MOVD       8(SP), X9
 983  	MOVD       32(SP), X7
 984  	MOVOA      X7, X8
 985  	PUNPCKLLQ  X9, X8
 986  	MOVD       60(SP), X7
 987  	MOVD       20(SP), X9
 988  	PUNPCKLLQ  X7, X9
 989  	PUNPCKLQDQ X8, X9
 990  	MOVOU      permuted_blake_consts<>+480(SB), X8
 991  	PXOR       X9, X8
 992  	PADDD      X8, X1
 993  	MOVD       40(SP), X9
 994  	MOVD       24(SP), X7
 995  	MOVOA      X7, X8
 996  	PUNPCKLLQ  X9, X8
 997  	MOVD       16(SP), X7
 998  	MOVD       (SP), X9
 999  	PUNPCKLLQ  X7, X9
1000  	PUNPCKLQDQ X8, X9
1001  	MOVOU      permuted_blake_consts<>+496(SB), X8
1002  	PXOR       X9, X8
1003  	PADDD      X2, X1
1004  	PXOR       X1, X4
1005  	MOVO       X4, X7
1006  	PSRLL      $0x10, X7
1007  	PSLLL      $0x10, X4
1008  	PXOR       X7, X4
1009  	PADDD      X4, X3
1010  	PXOR       X3, X2
1011  	MOVO       X2, X7
1012  	PSRLL      $0x0c, X7
1013  	PSLLL      $0x14, X2
1014  	PXOR       X7, X2
1015  	PADDD      X8, X1
1016  	PADDD      X2, X1
1017  	PXOR       X1, X4
1018  	MOVO       X4, X7
1019  	PSRLL      $0x08, X7
1020  	PSLLL      $0x18, X4
1021  	PXOR       X7, X4
1022  	PADDD      X4, X3
1023  	PXOR       X3, X2
1024  	MOVO       X2, X7
1025  	PSRLL      $0x07, X7
1026  	PSLLL      $0x19, X2
1027  	PXOR       X7, X2
1028  
1029  	// Round 8 diagonal step part 3: undiagonalize.
1030  	PSHUFD $0x93, X2, X2
1031  	PSHUFD $0x4e, X3, X3
1032  	PSHUFD $0x39, X4, X4
1033  
1034  	// Round 9 column step.
1035  	MOVD       (SP), X9
1036  	MOVD       44(SP), X7
1037  	MOVOA      X7, X8
1038  	PUNPCKLLQ  X9, X8
1039  	MOVD       56(SP), X7
1040  	MOVD       24(SP), X9
1041  	PUNPCKLLQ  X7, X9
1042  	PUNPCKLQDQ X8, X9
1043  	MOVOU      permuted_blake_consts<>+512(SB), X8
1044  	PXOR       X9, X8
1045  	PADDD      X8, X1
1046  	MOVD       32(SP), X9
1047  	MOVD       12(SP), X7
1048  	MOVOA      X7, X8
1049  	PUNPCKLLQ  X9, X8
1050  	MOVD       36(SP), X7
1051  	MOVD       60(SP), X9
1052  	PUNPCKLLQ  X7, X9
1053  	PUNPCKLQDQ X8, X9
1054  	MOVOU      permuted_blake_consts<>+528(SB), X8
1055  	PXOR       X9, X8
1056  	PADDD      X2, X1
1057  	PXOR       X1, X4
1058  	MOVO       X4, X7
1059  	PSRLL      $0x10, X7
1060  	PSLLL      $0x10, X4
1061  	PXOR       X7, X4
1062  	PADDD      X4, X3
1063  	PXOR       X3, X2
1064  	MOVO       X2, X7
1065  	PSRLL      $0x0c, X7
1066  	PSLLL      $0x14, X2
1067  	PXOR       X7, X2
1068  	PADDD      X8, X1
1069  	PADDD      X2, X1
1070  	PXOR       X1, X4
1071  	MOVO       X4, X7
1072  	PSRLL      $0x08, X7
1073  	PSLLL      $0x18, X4
1074  	PXOR       X7, X4
1075  	PADDD      X4, X3
1076  	PXOR       X3, X2
1077  	MOVO       X2, X7
1078  	PSRLL      $0x07, X7
1079  	PSLLL      $0x19, X2
1080  	PXOR       X7, X2
1081  
1082  	// Round 9 diagonal step part 1: diagonalize.
1083  	PSHUFD $0x39, X2, X2
1084  	PSHUFD $0x4e, X3, X3
1085  	PSHUFD $0x93, X4, X4
1086  
1087  	// Round 9 diagonal step part 2: column step.
1088  	MOVD       40(SP), X9
1089  	MOVD       4(SP), X7
1090  	MOVOA      X7, X8
1091  	PUNPCKLLQ  X9, X8
1092  	MOVD       52(SP), X7
1093  	MOVD       48(SP), X9
1094  	PUNPCKLLQ  X7, X9
1095  	PUNPCKLQDQ X8, X9
1096  	MOVOU      permuted_blake_consts<>+544(SB), X8
1097  	PXOR       X9, X8
1098  	PADDD      X8, X1
1099  	MOVD       20(SP), X9
1100  	MOVD       16(SP), X7
1101  	MOVOA      X7, X8
1102  	PUNPCKLLQ  X9, X8
1103  	MOVD       28(SP), X7
1104  	MOVD       8(SP), X9
1105  	PUNPCKLLQ  X7, X9
1106  	PUNPCKLQDQ X8, X9
1107  	MOVOU      permuted_blake_consts<>+560(SB), X8
1108  	PXOR       X9, X8
1109  	PADDD      X2, X1
1110  	PXOR       X1, X4
1111  	MOVO       X4, X7
1112  	PSRLL      $0x10, X7
1113  	PSLLL      $0x10, X4
1114  	PXOR       X7, X4
1115  	PADDD      X4, X3
1116  	PXOR       X3, X2
1117  	MOVO       X2, X7
1118  	PSRLL      $0x0c, X7
1119  	PSLLL      $0x14, X2
1120  	PXOR       X7, X2
1121  	PADDD      X8, X1
1122  	PADDD      X2, X1
1123  	PXOR       X1, X4
1124  	MOVO       X4, X7
1125  	PSRLL      $0x08, X7
1126  	PSLLL      $0x18, X4
1127  	PXOR       X7, X4
1128  	PADDD      X4, X3
1129  	PXOR       X3, X2
1130  	MOVO       X2, X7
1131  	PSRLL      $0x07, X7
1132  	PSLLL      $0x19, X2
1133  	PXOR       X7, X2
1134  
1135  	// Round 9 diagonal step part 3: undiagonalize.
1136  	PSHUFD $0x93, X2, X2
1137  	PSHUFD $0x4e, X3, X3
1138  	PSHUFD $0x39, X4, X4
1139  
1140  	// Round 10 column step.
1141  	MOVD       4(SP), X9
1142  	MOVD       28(SP), X7
1143  	MOVOA      X7, X8
1144  	PUNPCKLLQ  X9, X8
1145  	MOVD       32(SP), X7
1146  	MOVD       40(SP), X9
1147  	PUNPCKLLQ  X7, X9
1148  	PUNPCKLQDQ X8, X9
1149  	MOVOU      permuted_blake_consts<>+576(SB), X8
1150  	PXOR       X9, X8
1151  	PADDD      X8, X1
1152  	MOVD       20(SP), X9
1153  	MOVD       24(SP), X7
1154  	MOVOA      X7, X8
1155  	PUNPCKLLQ  X9, X8
1156  	MOVD       16(SP), X7
1157  	MOVD       8(SP), X9
1158  	PUNPCKLLQ  X7, X9
1159  	PUNPCKLQDQ X8, X9
1160  	MOVOU      permuted_blake_consts<>+592(SB), X8
1161  	PXOR       X9, X8
1162  	PADDD      X2, X1
1163  	PXOR       X1, X4
1164  	MOVO       X4, X7
1165  	PSRLL      $0x10, X7
1166  	PSLLL      $0x10, X4
1167  	PXOR       X7, X4
1168  	PADDD      X4, X3
1169  	PXOR       X3, X2
1170  	MOVO       X2, X7
1171  	PSRLL      $0x0c, X7
1172  	PSLLL      $0x14, X2
1173  	PXOR       X7, X2
1174  	PADDD      X8, X1
1175  	PADDD      X2, X1
1176  	PXOR       X1, X4
1177  	MOVO       X4, X7
1178  	PSRLL      $0x08, X7
1179  	PSLLL      $0x18, X4
1180  	PXOR       X7, X4
1181  	PADDD      X4, X3
1182  	PXOR       X3, X2
1183  	MOVO       X2, X7
1184  	PSRLL      $0x07, X7
1185  	PSLLL      $0x19, X2
1186  	PXOR       X7, X2
1187  
1188  	// Round 10 diagonal step part 1: diagonalize.
1189  	PSHUFD $0x39, X2, X2
1190  	PSHUFD $0x4e, X3, X3
1191  	PSHUFD $0x93, X4, X4
1192  
1193  	// Round 10 diagonal step part 2: column step.
1194  	MOVD       52(SP), X9
1195  	MOVD       12(SP), X7
1196  	MOVOA      X7, X8
1197  	PUNPCKLLQ  X9, X8
1198  	MOVD       36(SP), X7
1199  	MOVD       60(SP), X9
1200  	PUNPCKLLQ  X7, X9
1201  	PUNPCKLQDQ X8, X9
1202  	MOVOU      permuted_blake_consts<>+608(SB), X8
1203  	PXOR       X9, X8
1204  	PADDD      X8, X1
1205  	MOVD       (SP), X9
1206  	MOVD       48(SP), X7
1207  	MOVOA      X7, X8
1208  	PUNPCKLLQ  X9, X8
1209  	MOVD       56(SP), X7
1210  	MOVD       44(SP), X9
1211  	PUNPCKLLQ  X7, X9
1212  	PUNPCKLQDQ X8, X9
1213  	MOVOU      permuted_blake_consts<>+624(SB), X8
1214  	PXOR       X9, X8
1215  	PADDD      X2, X1
1216  	PXOR       X1, X4
1217  	MOVO       X4, X7
1218  	PSRLL      $0x10, X7
1219  	PSLLL      $0x10, X4
1220  	PXOR       X7, X4
1221  	PADDD      X4, X3
1222  	PXOR       X3, X2
1223  	MOVO       X2, X7
1224  	PSRLL      $0x0c, X7
1225  	PSLLL      $0x14, X2
1226  	PXOR       X7, X2
1227  	PADDD      X8, X1
1228  	PADDD      X2, X1
1229  	PXOR       X1, X4
1230  	MOVO       X4, X7
1231  	PSRLL      $0x08, X7
1232  	PSLLL      $0x18, X4
1233  	PXOR       X7, X4
1234  	PADDD      X4, X3
1235  	PXOR       X3, X2
1236  	MOVO       X2, X7
1237  	PSRLL      $0x07, X7
1238  	PSLLL      $0x19, X2
1239  	PXOR       X7, X2
1240  
1241  	// Round 10 diagonal step part 3: undiagonalize.
1242  	PSHUFD $0x93, X2, X2
1243  	PSHUFD $0x4e, X3, X3
1244  	PSHUFD $0x39, X4, X4
1245  
1246  	// Round 11 column step.
1247  	MOVD       24(SP), X9
1248  	MOVD       16(SP), X7
1249  	MOVOA      X7, X8
1250  	PUNPCKLLQ  X9, X8
1251  	MOVD       8(SP), X7
1252  	MOVD       (SP), X9
1253  	PUNPCKLLQ  X7, X9
1254  	PUNPCKLQDQ X8, X9
1255  	MOVOU      permuted_blake_consts<>+0(SB), X8
1256  	PXOR       X9, X8
1257  	PADDD      X8, X1
1258  	MOVD       28(SP), X9
1259  	MOVD       20(SP), X7
1260  	MOVOA      X7, X8
1261  	PUNPCKLLQ  X9, X8
1262  	MOVD       12(SP), X7
1263  	MOVD       4(SP), X9
1264  	PUNPCKLLQ  X7, X9
1265  	PUNPCKLQDQ X8, X9
1266  	MOVOU      permuted_blake_consts<>+16(SB), X8
1267  	PXOR       X9, X8
1268  	PADDD      X2, X1
1269  	PXOR       X1, X4
1270  	MOVO       X4, X7
1271  	PSRLL      $0x10, X7
1272  	PSLLL      $0x10, X4
1273  	PXOR       X7, X4
1274  	PADDD      X4, X3
1275  	PXOR       X3, X2
1276  	MOVO       X2, X7
1277  	PSRLL      $0x0c, X7
1278  	PSLLL      $0x14, X2
1279  	PXOR       X7, X2
1280  	PADDD      X8, X1
1281  	PADDD      X2, X1
1282  	PXOR       X1, X4
1283  	MOVO       X4, X7
1284  	PSRLL      $0x08, X7
1285  	PSLLL      $0x18, X4
1286  	PXOR       X7, X4
1287  	PADDD      X4, X3
1288  	PXOR       X3, X2
1289  	MOVO       X2, X7
1290  	PSRLL      $0x07, X7
1291  	PSLLL      $0x19, X2
1292  	PXOR       X7, X2
1293  
1294  	// Round 11 diagonal step part 1: diagonalize.
1295  	PSHUFD $0x39, X2, X2
1296  	PSHUFD $0x4e, X3, X3
1297  	PSHUFD $0x93, X4, X4
1298  
1299  	// Round 11 diagonal step part 2: column step.
1300  	MOVD       56(SP), X9
1301  	MOVD       48(SP), X7
1302  	MOVOA      X7, X8
1303  	PUNPCKLLQ  X9, X8
1304  	MOVD       40(SP), X7
1305  	MOVD       32(SP), X9
1306  	PUNPCKLLQ  X7, X9
1307  	PUNPCKLQDQ X8, X9
1308  	MOVOU      permuted_blake_consts<>+32(SB), X8
1309  	PXOR       X9, X8
1310  	PADDD      X8, X1
1311  	MOVD       60(SP), X9
1312  	MOVD       52(SP), X7
1313  	MOVOA      X7, X8
1314  	PUNPCKLLQ  X9, X8
1315  	MOVD       44(SP), X7
1316  	MOVD       36(SP), X9
1317  	PUNPCKLLQ  X7, X9
1318  	PUNPCKLQDQ X8, X9
1319  	MOVOU      permuted_blake_consts<>+48(SB), X8
1320  	PXOR       X9, X8
1321  	PADDD      X2, X1
1322  	PXOR       X1, X4
1323  	MOVO       X4, X7
1324  	PSRLL      $0x10, X7
1325  	PSLLL      $0x10, X4
1326  	PXOR       X7, X4
1327  	PADDD      X4, X3
1328  	PXOR       X3, X2
1329  	MOVO       X2, X7
1330  	PSRLL      $0x0c, X7
1331  	PSLLL      $0x14, X2
1332  	PXOR       X7, X2
1333  	PADDD      X8, X1
1334  	PADDD      X2, X1
1335  	PXOR       X1, X4
1336  	MOVO       X4, X7
1337  	PSRLL      $0x08, X7
1338  	PSLLL      $0x18, X4
1339  	PXOR       X7, X4
1340  	PADDD      X4, X3
1341  	PXOR       X3, X2
1342  	MOVO       X2, X7
1343  	PSRLL      $0x07, X7
1344  	PSLLL      $0x19, X2
1345  	PXOR       X7, X2
1346  
1347  	// Round 11 diagonal step part 3: undiagonalize.
1348  	PSHUFD $0x93, X2, X2
1349  	PSHUFD $0x4e, X3, X3
1350  	PSHUFD $0x39, X4, X4
1351  
1352  	// Round 12 column step.
1353  	MOVD       52(SP), X9
1354  	MOVD       36(SP), X7
1355  	MOVOA      X7, X8
1356  	PUNPCKLLQ  X9, X8
1357  	MOVD       16(SP), X7
1358  	MOVD       56(SP), X9
1359  	PUNPCKLLQ  X7, X9
1360  	PUNPCKLQDQ X8, X9
1361  	MOVOU      permuted_blake_consts<>+64(SB), X8
1362  	PXOR       X9, X8
1363  	PADDD      X8, X1
1364  	MOVD       24(SP), X9
1365  	MOVD       60(SP), X7
1366  	MOVOA      X7, X8
1367  	PUNPCKLLQ  X9, X8
1368  	MOVD       32(SP), X7
1369  	MOVD       40(SP), X9
1370  	PUNPCKLLQ  X7, X9
1371  	PUNPCKLQDQ X8, X9
1372  	MOVOU      permuted_blake_consts<>+80(SB), X8
1373  	PXOR       X9, X8
1374  	PADDD      X2, X1
1375  	PXOR       X1, X4
1376  	MOVO       X4, X7
1377  	PSRLL      $0x10, X7
1378  	PSLLL      $0x10, X4
1379  	PXOR       X7, X4
1380  	PADDD      X4, X3
1381  	PXOR       X3, X2
1382  	MOVO       X2, X7
1383  	PSRLL      $0x0c, X7
1384  	PSLLL      $0x14, X2
1385  	PXOR       X7, X2
1386  	PADDD      X8, X1
1387  	PADDD      X2, X1
1388  	PXOR       X1, X4
1389  	MOVO       X4, X7
1390  	PSRLL      $0x08, X7
1391  	PSLLL      $0x18, X4
1392  	PXOR       X7, X4
1393  	PADDD      X4, X3
1394  	PXOR       X3, X2
1395  	MOVO       X2, X7
1396  	PSRLL      $0x07, X7
1397  	PSLLL      $0x19, X2
1398  	PXOR       X7, X2
1399  
1400  	// Round 12 diagonal step part 1: diagonalize.
1401  	PSHUFD $0x39, X2, X2
1402  	PSHUFD $0x4e, X3, X3
1403  	PSHUFD $0x93, X4, X4
1404  
1405  	// Round 12 diagonal step part 2: column step.
1406  	MOVD       20(SP), X9
1407  	MOVD       44(SP), X7
1408  	MOVOA      X7, X8
1409  	PUNPCKLLQ  X9, X8
1410  	MOVD       (SP), X7
1411  	MOVD       4(SP), X9
1412  	PUNPCKLLQ  X7, X9
1413  	PUNPCKLQDQ X8, X9
1414  	MOVOU      permuted_blake_consts<>+96(SB), X8
1415  	PXOR       X9, X8
1416  	PADDD      X8, X1
1417  	MOVD       12(SP), X9
1418  	MOVD       28(SP), X7
1419  	MOVOA      X7, X8
1420  	PUNPCKLLQ  X9, X8
1421  	MOVD       8(SP), X7
1422  	MOVD       48(SP), X9
1423  	PUNPCKLLQ  X7, X9
1424  	PUNPCKLQDQ X8, X9
1425  	MOVOU      permuted_blake_consts<>+112(SB), X8
1426  	PXOR       X9, X8
1427  	PADDD      X2, X1
1428  	PXOR       X1, X4
1429  	MOVO       X4, X7
1430  	PSRLL      $0x10, X7
1431  	PSLLL      $0x10, X4
1432  	PXOR       X7, X4
1433  	PADDD      X4, X3
1434  	PXOR       X3, X2
1435  	MOVO       X2, X7
1436  	PSRLL      $0x0c, X7
1437  	PSLLL      $0x14, X2
1438  	PXOR       X7, X2
1439  	PADDD      X8, X1
1440  	PADDD      X2, X1
1441  	PXOR       X1, X4
1442  	MOVO       X4, X7
1443  	PSRLL      $0x08, X7
1444  	PSLLL      $0x18, X4
1445  	PXOR       X7, X4
1446  	PADDD      X4, X3
1447  	PXOR       X3, X2
1448  	MOVO       X2, X7
1449  	PSRLL      $0x07, X7
1450  	PSLLL      $0x19, X2
1451  	PXOR       X7, X2
1452  
1453  	// Round 12 diagonal step part 3: undiagonalize.
1454  	PSHUFD $0x93, X2, X2
1455  	PSHUFD $0x4e, X3, X3
1456  	PSHUFD $0x39, X4, X4
1457  
1458  	// Round 13 column step.
1459  	MOVD       60(SP), X9
1460  	MOVD       20(SP), X7
1461  	MOVOA      X7, X8
1462  	PUNPCKLLQ  X9, X8
1463  	MOVD       48(SP), X7
1464  	MOVD       44(SP), X9
1465  	PUNPCKLLQ  X7, X9
1466  	PUNPCKLQDQ X8, X9
1467  	MOVOU      permuted_blake_consts<>+128(SB), X8
1468  	PXOR       X9, X8
1469  	PADDD      X8, X1
1470  	MOVD       52(SP), X9
1471  	MOVD       8(SP), X7
1472  	MOVOA      X7, X8
1473  	PUNPCKLLQ  X9, X8
1474  	MOVD       (SP), X7
1475  	MOVD       32(SP), X9
1476  	PUNPCKLLQ  X7, X9
1477  	PUNPCKLQDQ X8, X9
1478  	MOVOU      permuted_blake_consts<>+144(SB), X8
1479  	PXOR       X9, X8
1480  	PADDD      X2, X1
1481  	PXOR       X1, X4
1482  	MOVO       X4, X7
1483  	PSRLL      $0x10, X7
1484  	PSLLL      $0x10, X4
1485  	PXOR       X7, X4
1486  	PADDD      X4, X3
1487  	PXOR       X3, X2
1488  	MOVO       X2, X7
1489  	PSRLL      $0x0c, X7
1490  	PSLLL      $0x14, X2
1491  	PXOR       X7, X2
1492  	PADDD      X8, X1
1493  	PADDD      X2, X1
1494  	PXOR       X1, X4
1495  	MOVO       X4, X7
1496  	PSRLL      $0x08, X7
1497  	PSLLL      $0x18, X4
1498  	PXOR       X7, X4
1499  	PADDD      X4, X3
1500  	PXOR       X3, X2
1501  	MOVO       X2, X7
1502  	PSRLL      $0x07, X7
1503  	PSLLL      $0x19, X2
1504  	PXOR       X7, X2
1505  
1506  	// Round 13 diagonal step part 1: diagonalize.
1507  	PSHUFD $0x39, X2, X2
1508  	PSHUFD $0x4e, X3, X3
1509  	PSHUFD $0x93, X4, X4
1510  
1511  	// Round 13 diagonal step part 2: column step.
1512  	MOVD       36(SP), X9
1513  	MOVD       28(SP), X7
1514  	MOVOA      X7, X8
1515  	PUNPCKLLQ  X9, X8
1516  	MOVD       12(SP), X7
1517  	MOVD       40(SP), X9
1518  	PUNPCKLLQ  X7, X9
1519  	PUNPCKLQDQ X8, X9
1520  	MOVOU      permuted_blake_consts<>+160(SB), X8
1521  	PXOR       X9, X8
1522  	PADDD      X8, X1
1523  	MOVD       16(SP), X9
1524  	MOVD       4(SP), X7
1525  	MOVOA      X7, X8
1526  	PUNPCKLLQ  X9, X8
1527  	MOVD       24(SP), X7
1528  	MOVD       56(SP), X9
1529  	PUNPCKLLQ  X7, X9
1530  	PUNPCKLQDQ X8, X9
1531  	MOVOU      permuted_blake_consts<>+176(SB), X8
1532  	PXOR       X9, X8
1533  	PADDD      X2, X1
1534  	PXOR       X1, X4
1535  	MOVO       X4, X7
1536  	PSRLL      $0x10, X7
1537  	PSLLL      $0x10, X4
1538  	PXOR       X7, X4
1539  	PADDD      X4, X3
1540  	PXOR       X3, X2
1541  	MOVO       X2, X7
1542  	PSRLL      $0x0c, X7
1543  	PSLLL      $0x14, X2
1544  	PXOR       X7, X2
1545  	PADDD      X8, X1
1546  	PADDD      X2, X1
1547  	PXOR       X1, X4
1548  	MOVO       X4, X7
1549  	PSRLL      $0x08, X7
1550  	PSLLL      $0x18, X4
1551  	PXOR       X7, X4
1552  	PADDD      X4, X3
1553  	PXOR       X3, X2
1554  	MOVO       X2, X7
1555  	PSRLL      $0x07, X7
1556  	PSLLL      $0x19, X2
1557  	PXOR       X7, X2
1558  
1559  	// Round 13 diagonal step part 3: undiagonalize.
1560  	PSHUFD $0x93, X2, X2
1561  	PSHUFD $0x4e, X3, X3
1562  	PSHUFD $0x39, X4, X4
1563  
1564  	// Round 14 column step.
1565  	MOVD       44(SP), X9
1566  	MOVD       52(SP), X7
1567  	MOVOA      X7, X8
1568  	PUNPCKLLQ  X9, X8
1569  	MOVD       12(SP), X7
1570  	MOVD       28(SP), X9
1571  	PUNPCKLLQ  X7, X9
1572  	PUNPCKLQDQ X8, X9
1573  	MOVOU      permuted_blake_consts<>+192(SB), X8
1574  	PXOR       X9, X8
1575  	PADDD      X8, X1
1576  	MOVD       56(SP), X9
1577  	MOVD       48(SP), X7
1578  	MOVOA      X7, X8
1579  	PUNPCKLLQ  X9, X8
1580  	MOVD       4(SP), X7
1581  	MOVD       36(SP), X9
1582  	PUNPCKLLQ  X7, X9
1583  	PUNPCKLQDQ X8, X9
1584  	MOVOU      permuted_blake_consts<>+208(SB), X8
1585  	PXOR       X9, X8
1586  	PADDD      X2, X1
1587  	PXOR       X1, X4
1588  	MOVO       X4, X7
1589  	PSRLL      $0x10, X7
1590  	PSLLL      $0x10, X4
1591  	PXOR       X7, X4
1592  	PADDD      X4, X3
1593  	PXOR       X3, X2
1594  	MOVO       X2, X7
1595  	PSRLL      $0x0c, X7
1596  	PSLLL      $0x14, X2
1597  	PXOR       X7, X2
1598  	PADDD      X8, X1
1599  	PADDD      X2, X1
1600  	PXOR       X1, X4
1601  	MOVO       X4, X7
1602  	PSRLL      $0x08, X7
1603  	PSLLL      $0x18, X4
1604  	PXOR       X7, X4
1605  	PADDD      X4, X3
1606  	PXOR       X3, X2
1607  	MOVO       X2, X7
1608  	PSRLL      $0x07, X7
1609  	PSLLL      $0x19, X2
1610  	PXOR       X7, X2
1611  
1612  	// Round 14 diagonal step part 1: diagonalize.
1613  	PSHUFD $0x39, X2, X2
1614  	PSHUFD $0x4e, X3, X3
1615  	PSHUFD $0x93, X4, X4
1616  
1617  	// Round 14 diagonal step part 2: column step.
1618  	MOVD       60(SP), X9
1619  	MOVD       16(SP), X7
1620  	MOVOA      X7, X8
1621  	PUNPCKLLQ  X9, X8
1622  	MOVD       20(SP), X7
1623  	MOVD       8(SP), X9
1624  	PUNPCKLLQ  X7, X9
1625  	PUNPCKLQDQ X8, X9
1626  	MOVOU      permuted_blake_consts<>+224(SB), X8
1627  	PXOR       X9, X8
1628  	PADDD      X8, X1
1629  	MOVD       32(SP), X9
1630  	MOVD       (SP), X7
1631  	MOVOA      X7, X8
1632  	PUNPCKLLQ  X9, X8
1633  	MOVD       40(SP), X7
1634  	MOVD       24(SP), X9
1635  	PUNPCKLLQ  X7, X9
1636  	PUNPCKLQDQ X8, X9
1637  	MOVOU      permuted_blake_consts<>+240(SB), X8
1638  	PXOR       X9, X8
1639  	PADDD      X2, X1
1640  	PXOR       X1, X4
1641  	MOVO       X4, X7
1642  	PSRLL      $0x10, X7
1643  	PSLLL      $0x10, X4
1644  	PXOR       X7, X4
1645  	PADDD      X4, X3
1646  	PXOR       X3, X2
1647  	MOVO       X2, X7
1648  	PSRLL      $0x0c, X7
1649  	PSLLL      $0x14, X2
1650  	PXOR       X7, X2
1651  	PADDD      X8, X1
1652  	PADDD      X2, X1
1653  	PXOR       X1, X4
1654  	MOVO       X4, X7
1655  	PSRLL      $0x08, X7
1656  	PSLLL      $0x18, X4
1657  	PXOR       X7, X4
1658  	PADDD      X4, X3
1659  	PXOR       X3, X2
1660  	MOVO       X2, X7
1661  	PSRLL      $0x07, X7
1662  	PSLLL      $0x19, X2
1663  	PXOR       X7, X2
1664  
1665  	// Round 14 diagonal step part 3: undiagonalize.
1666  	PSHUFD $0x93, X2, X2
1667  	PSHUFD $0x4e, X3, X3
1668  	PSHUFD $0x39, X4, X4
1669  
1670  	// Finally the chain value is defined as:
1671  	// h'0 = h0^s0^v0^v8
1672  	// h'1 = h1^s1^v1^v9
1673  	// h'2 = h2^s2^v2^va
1674  	// h'3 = h3^s3^v3^vb
1675  	// h'4 = h4^s0^v4^vc
1676  	// h'5 = h5^s1^v5^vd
1677  	// h'6 = h6^s2^v6^ve
1678  	// h'7 = h7^s3^v7^vf
1679  	PXOR X5, X1
1680  	PXOR X0, X1
1681  	PXOR X3, X1
1682  	PXOR X6, X2
1683  	PXOR X0, X2
1684  	PXOR X4, X2
1685  
1686  	// Either terminate the loop when there are no more full blocks
1687  	// to compress or move the message pointer to the next block of
1688  	// bytes to compress, increment the message bits counter
1689  	// accordingly, and loop back around to compress it.
1690  	DECQ BX
1691  	JZ   done
1692  	LEAQ 64(DX), DX
1693  	ADDQ $0x00000200, CX
1694  	JMP  compressLoop
1695  
1696  done:
1697  	// Output the resulting chain value.
1698  	MOVOU X1, (AX)
1699  	MOVOU X2, 16(AX)
1700  	RET
1701  
1702  // func blocksSSE41(state *State, msg []byte, counter uint64)
1703  // Requires: SSE2, SSE4.1, SSSE3
1704  TEXT ·blocksSSE41(SB), NOSPLIT, $0-40
1705  	MOVQ state+0(FP), AX
1706  	MOVQ counter+32(FP), CX
1707  	MOVQ msg_base+8(FP), DX
1708  	MOVQ msg_len+16(FP), BX
1709  
1710  	// Populate registers for faster right rotations.
1711  	MOVOU shuffle_rotr8_4x32<>+0(SB), X4
1712  	MOVOU shuffle_rotr16_4x32<>+0(SB), X5
1713  
1714  	// Convert message len to number of blocks for loop counter.
1715  	SHRQ $0x06, BX
1716  
1717  	// Initialize state matrix.
1718  	// row0 = |v0  v1  v2  v3|   |  h0     h1     h2     h3 |
1719  	// row1 = |v4  v5  v6  v7|   |  h4     h5     h6     h7 |
1720  	MOVOU 32(AX), X6
1721  	MOVOU (AX), X7
1722  	MOVOU 16(AX), X8
1723  
1724  compressLoop:
1725  	// row2 = |v8  v9  va  vb| = |s0^c0  s1^c1  s2^c2  s3^c3|
1726  	// row3 = |vc  vd  ve  vf|   |t0^c4  t0^c5  t1^c6  t1^c7|
1727  	MOVOU  first_8_blake_consts<>+0(SB), X9
1728  	PXOR   X6, X9
1729  	MOVD   CX, X10
1730  	PSHUFD $0x50, X10, X10
1731  	PXOR   first_8_blake_consts<>+16(SB), X10
1732  	MOVO   X7, X11
1733  	MOVO   X8, X12
1734  
1735  	// Convert message to big endian.
1736  	MOVOU  shuffle_le_to_be_4x32<>+0(SB), X13
1737  	MOVOU  (DX), X0
1738  	PSHUFB X13, X0
1739  	MOVOU  16(DX), X1
1740  	PSHUFB X13, X1
1741  	MOVOU  32(DX), X2
1742  	PSHUFB X13, X2
1743  	MOVOU  48(DX), X3
1744  	PSHUFB X13, X3
1745  
1746  	// Round 1 column step.
1747  	PSHUFD  $0x08, X0, X14
1748  	PSHUFD  $0x80, X1, X13
1749  	PBLENDW $0xf0, X13, X14
1750  	MOVOU   permuted_blake_consts<>+0(SB), X15
1751  	PXOR    X14, X15
1752  	PADDD   X15, X7
1753  	PSHUFD  $0x0d, X0, X14
1754  	PSHUFD  $0xd0, X1, X13
1755  	PBLENDW $0xf0, X13, X14
1756  	MOVOU   permuted_blake_consts<>+16(SB), X15
1757  	PXOR    X14, X15
1758  	PADDD   X8, X7
1759  	PXOR    X7, X10
1760  	PSHUFB  X5, X10
1761  	PADDD   X10, X9
1762  	PXOR    X9, X8
1763  	MOVO    X8, X13
1764  	PSRLL   $0x0c, X13
1765  	PSLLL   $0x14, X8
1766  	PXOR    X13, X8
1767  	PADDD   X15, X7
1768  	PADDD   X8, X7
1769  	PXOR    X7, X10
1770  	PSHUFB  X4, X10
1771  	PADDD   X10, X9
1772  	PXOR    X9, X8
1773  	MOVO    X8, X13
1774  	PSRLL   $0x07, X13
1775  	PSLLL   $0x19, X8
1776  	PXOR    X13, X8
1777  
1778  	// Round 1 diagonal step part 1: diagonalize.
1779  	PSHUFD $0x39, X8, X8
1780  	PSHUFD $0x4e, X9, X9
1781  	PSHUFD $0x93, X10, X10
1782  
1783  	// Round 1 diagonal step part 2: column step.
1784  	PSHUFD  $0x08, X2, X14
1785  	PSHUFD  $0x80, X3, X13
1786  	PBLENDW $0xf0, X13, X14
1787  	MOVOU   permuted_blake_consts<>+32(SB), X15
1788  	PXOR    X14, X15
1789  	PADDD   X15, X7
1790  	PSHUFD  $0x0d, X2, X14
1791  	PSHUFD  $0xd0, X3, X13
1792  	PBLENDW $0xf0, X13, X14
1793  	MOVOU   permuted_blake_consts<>+48(SB), X15
1794  	PXOR    X14, X15
1795  	PADDD   X8, X7
1796  	PXOR    X7, X10
1797  	PSHUFB  X5, X10
1798  	PADDD   X10, X9
1799  	PXOR    X9, X8
1800  	MOVO    X8, X13
1801  	PSRLL   $0x0c, X13
1802  	PSLLL   $0x14, X8
1803  	PXOR    X13, X8
1804  	PADDD   X15, X7
1805  	PADDD   X8, X7
1806  	PXOR    X7, X10
1807  	PSHUFB  X4, X10
1808  	PADDD   X10, X9
1809  	PXOR    X9, X8
1810  	MOVO    X8, X13
1811  	PSRLL   $0x07, X13
1812  	PSLLL   $0x19, X8
1813  	PXOR    X13, X8
1814  
1815  	// Round 1 diagonal step part 3: undiagonalize.
1816  	PSHUFD $0x93, X8, X8
1817  	PSHUFD $0x4e, X9, X9
1818  	PSHUFD $0x39, X10, X10
1819  
1820  	// Round 2 column step.
1821  	PSHUFD  $0x00, X1, X14
1822  	PSHUFD  $0x10, X2, X13
1823  	PBLENDW $0x30, X13, X14
1824  	PSHUFD  $0x42, X3, X13
1825  	PBLENDW $0xc3, X13, X14
1826  	MOVOU   permuted_blake_consts<>+64(SB), X15
1827  	PXOR    X14, X15
1828  	PADDD   X15, X7
1829  	PSHUFD  $0x80, X1, X14
1830  	PSHUFD  $0x02, X2, X13
1831  	PBLENDW $0x0f, X13, X14
1832  	PSHUFD  $0x30, X3, X13
1833  	PBLENDW $0x30, X13, X14
1834  	MOVOU   permuted_blake_consts<>+80(SB), X15
1835  	PXOR    X14, X15
1836  	PADDD   X8, X7
1837  	PXOR    X7, X10
1838  	PSHUFB  X5, X10
1839  	PADDD   X10, X9
1840  	PXOR    X9, X8
1841  	MOVO    X8, X13
1842  	PSRLL   $0x0c, X13
1843  	PSLLL   $0x14, X8
1844  	PXOR    X13, X8
1845  	PADDD   X15, X7
1846  	PADDD   X8, X7
1847  	PXOR    X7, X10
1848  	PSHUFB  X4, X10
1849  	PADDD   X10, X9
1850  	PXOR    X9, X8
1851  	MOVO    X8, X13
1852  	PSRLL   $0x07, X13
1853  	PSLLL   $0x19, X8
1854  	PXOR    X13, X8
1855  
1856  	// Round 2 diagonal step part 1: diagonalize.
1857  	PSHUFD $0x39, X8, X8
1858  	PSHUFD $0x4e, X9, X9
1859  	PSHUFD $0x93, X10, X10
1860  
1861  	// Round 2 diagonal step part 2: column step.
1862  	PSHUFD  $0x01, X0, X14
1863  	PSHUFD  $0x40, X1, X13
1864  	PBLENDW $0xc0, X13, X14
1865  	PSHUFD  $0x30, X2, X13
1866  	PBLENDW $0x30, X13, X14
1867  	MOVOU   permuted_blake_consts<>+96(SB), X15
1868  	PXOR    X14, X15
1869  	PADDD   X15, X7
1870  	PSHUFD  $0xc8, X0, X14
1871  	PSHUFD  $0x30, X1, X13
1872  	PBLENDW $0x30, X13, X14
1873  	PSHUFD  $0x00, X3, X13
1874  	PBLENDW $0x03, X13, X14
1875  	MOVOU   permuted_blake_consts<>+112(SB), X15
1876  	PXOR    X14, X15
1877  	PADDD   X8, X7
1878  	PXOR    X7, X10
1879  	PSHUFB  X5, X10
1880  	PADDD   X10, X9
1881  	PXOR    X9, X8
1882  	MOVO    X8, X13
1883  	PSRLL   $0x0c, X13
1884  	PSLLL   $0x14, X8
1885  	PXOR    X13, X8
1886  	PADDD   X15, X7
1887  	PADDD   X8, X7
1888  	PXOR    X7, X10
1889  	PSHUFB  X4, X10
1890  	PADDD   X10, X9
1891  	PXOR    X9, X8
1892  	MOVO    X8, X13
1893  	PSRLL   $0x07, X13
1894  	PSLLL   $0x19, X8
1895  	PXOR    X13, X8
1896  
1897  	// Round 2 diagonal step part 3: undiagonalize.
1898  	PSHUFD $0x93, X8, X8
1899  	PSHUFD $0x4e, X9, X9
1900  	PSHUFD $0x39, X10, X10
1901  
1902  	// Round 3 column step.
1903  	PSHUFD  $0x10, X1, X14
1904  	PSHUFD  $0x03, X2, X13
1905  	PBLENDW $0x03, X13, X14
1906  	PSHUFD  $0xc0, X3, X13
1907  	PBLENDW $0xcc, X13, X14
1908  	MOVOU   permuted_blake_consts<>+128(SB), X15
1909  	PXOR    X14, X15
1910  	PADDD   X15, X7
1911  	PSHUFD  $0x20, X0, X14
1912  	PSHUFD  $0x00, X2, X13
1913  	PBLENDW $0x03, X13, X14
1914  	PSHUFD  $0x40, X3, X13
1915  	PBLENDW $0xc0, X13, X14
1916  	MOVOU   permuted_blake_consts<>+144(SB), X15
1917  	PXOR    X14, X15
1918  	PADDD   X8, X7
1919  	PXOR    X7, X10
1920  	PSHUFB  X5, X10
1921  	PADDD   X10, X9
1922  	PXOR    X9, X8
1923  	MOVO    X8, X13
1924  	PSRLL   $0x0c, X13
1925  	PSLLL   $0x14, X8
1926  	PXOR    X13, X8
1927  	PADDD   X15, X7
1928  	PADDD   X8, X7
1929  	PXOR    X7, X10
1930  	PSHUFB  X4, X10
1931  	PADDD   X10, X9
1932  	PXOR    X9, X8
1933  	MOVO    X8, X13
1934  	PSRLL   $0x07, X13
1935  	PSLLL   $0x19, X8
1936  	PXOR    X13, X8
1937  
1938  	// Round 3 diagonal step part 1: diagonalize.
1939  	PSHUFD $0x39, X8, X8
1940  	PSHUFD $0x4e, X9, X9
1941  	PSHUFD $0x93, X10, X10
1942  
1943  	// Round 3 diagonal step part 2: column step.
1944  	PSHUFD  $0x0c, X0, X14
1945  	PSHUFD  $0x30, X1, X13
1946  	PBLENDW $0x30, X13, X14
1947  	PSHUFD  $0x42, X2, X13
1948  	PBLENDW $0xc3, X13, X14
1949  	MOVOU   permuted_blake_consts<>+160(SB), X15
1950  	PXOR    X14, X15
1951  	PADDD   X15, X7
1952  	PSHUFD  $0x10, X0, X14
1953  	PSHUFD  $0x08, X1, X13
1954  	PBLENDW $0xcc, X13, X14
1955  	PSHUFD  $0x02, X3, X13
1956  	PBLENDW $0x03, X13, X14
1957  	MOVOU   permuted_blake_consts<>+176(SB), X15
1958  	PXOR    X14, X15
1959  	PADDD   X8, X7
1960  	PXOR    X7, X10
1961  	PSHUFB  X5, X10
1962  	PADDD   X10, X9
1963  	PXOR    X9, X8
1964  	MOVO    X8, X13
1965  	PSRLL   $0x0c, X13
1966  	PSLLL   $0x14, X8
1967  	PXOR    X13, X8
1968  	PADDD   X15, X7
1969  	PADDD   X8, X7
1970  	PXOR    X7, X10
1971  	PSHUFB  X4, X10
1972  	PADDD   X10, X9
1973  	PXOR    X9, X8
1974  	MOVO    X8, X13
1975  	PSRLL   $0x07, X13
1976  	PSLLL   $0x19, X8
1977  	PXOR    X13, X8
1978  
1979  	// Round 3 diagonal step part 3: undiagonalize.
1980  	PSHUFD $0x93, X8, X8
1981  	PSHUFD $0x4e, X9, X9
1982  	PSHUFD $0x39, X10, X10
1983  
1984  	// Round 4 column step.
1985  	PSHUFD  $0x0c, X0, X14
1986  	PSHUFD  $0x03, X1, X13
1987  	PBLENDW $0x03, X13, X14
1988  	PSHUFD  $0xc0, X2, X13
1989  	PBLENDW $0xc0, X13, X14
1990  	PSHUFD  $0x10, X3, X13
1991  	PBLENDW $0x30, X13, X14
1992  	MOVOU   permuted_blake_consts<>+192(SB), X15
1993  	PXOR    X14, X15
1994  	PADDD   X15, X7
1995  	PSHUFD  $0x04, X0, X14
1996  	PSHUFD  $0x01, X2, X13
1997  	PBLENDW $0x03, X13, X14
1998  	PSHUFD  $0x80, X3, X13
1999  	PBLENDW $0xf0, X13, X14
2000  	MOVOU   permuted_blake_consts<>+208(SB), X15
2001  	PXOR    X14, X15
2002  	PADDD   X8, X7
2003  	PXOR    X7, X10
2004  	PSHUFB  X5, X10
2005  	PADDD   X10, X9
2006  	PXOR    X9, X8
2007  	MOVO    X8, X13
2008  	PSRLL   $0x0c, X13
2009  	PSLLL   $0x14, X8
2010  	PXOR    X13, X8
2011  	PADDD   X15, X7
2012  	PADDD   X8, X7
2013  	PXOR    X7, X10
2014  	PSHUFB  X4, X10
2015  	PADDD   X10, X9
2016  	PXOR    X9, X8
2017  	MOVO    X8, X13
2018  	PSRLL   $0x07, X13
2019  	PSLLL   $0x19, X8
2020  	PXOR    X13, X8
2021  
2022  	// Round 4 diagonal step part 1: diagonalize.
2023  	PSHUFD $0x39, X8, X8
2024  	PSHUFD $0x4e, X9, X9
2025  	PSHUFD $0x93, X10, X10
2026  
2027  	// Round 4 diagonal step part 2: column step.
2028  	PSHUFD  $0x02, X0, X14
2029  	PSHUFD  $0x04, X1, X13
2030  	PBLENDW $0x3c, X13, X14
2031  	PSHUFD  $0xc0, X3, X13
2032  	PBLENDW $0xc0, X13, X14
2033  	MOVOU   permuted_blake_consts<>+224(SB), X15
2034  	PXOR    X14, X15
2035  	PADDD   X15, X7
2036  	PSHUFD  $0x00, X0, X14
2037  	PSHUFD  $0x02, X1, X13
2038  	PBLENDW $0x03, X13, X14
2039  	PSHUFD  $0x08, X2, X13
2040  	PBLENDW $0xcc, X13, X14
2041  	MOVOU   permuted_blake_consts<>+240(SB), X15
2042  	PXOR    X14, X15
2043  	PADDD   X8, X7
2044  	PXOR    X7, X10
2045  	PSHUFB  X5, X10
2046  	PADDD   X10, X9
2047  	PXOR    X9, X8
2048  	MOVO    X8, X13
2049  	PSRLL   $0x0c, X13
2050  	PSLLL   $0x14, X8
2051  	PXOR    X13, X8
2052  	PADDD   X15, X7
2053  	PADDD   X8, X7
2054  	PXOR    X7, X10
2055  	PSHUFB  X4, X10
2056  	PADDD   X10, X9
2057  	PXOR    X9, X8
2058  	MOVO    X8, X13
2059  	PSRLL   $0x07, X13
2060  	PSLLL   $0x19, X8
2061  	PXOR    X13, X8
2062  
2063  	// Round 4 diagonal step part 3: undiagonalize.
2064  	PSHUFD $0x93, X8, X8
2065  	PSHUFD $0x4e, X9, X9
2066  	PSHUFD $0x39, X10, X10
2067  
2068  	// Round 5 column step.
2069  	PSHUFD  $0x20, X0, X14
2070  	PSHUFD  $0x04, X1, X13
2071  	PBLENDW $0x0c, X13, X14
2072  	PSHUFD  $0x81, X2, X13
2073  	PBLENDW $0xc3, X13, X14
2074  	MOVOU   permuted_blake_consts<>+256(SB), X15
2075  	PXOR    X14, X15
2076  	PADDD   X15, X7
2077  	PSHUFD  $0x00, X0, X14
2078  	PSHUFD  $0x0c, X1, X13
2079  	PBLENDW $0x3c, X13, X14
2080  	PSHUFD  $0xc0, X3, X13
2081  	PBLENDW $0xc0, X13, X14
2082  	MOVOU   permuted_blake_consts<>+272(SB), X15
2083  	PXOR    X14, X15
2084  	PADDD   X8, X7
2085  	PXOR    X7, X10
2086  	PSHUFB  X5, X10
2087  	PADDD   X10, X9
2088  	PXOR    X9, X8
2089  	MOVO    X8, X13
2090  	PSRLL   $0x0c, X13
2091  	PSLLL   $0x14, X8
2092  	PXOR    X13, X8
2093  	PADDD   X15, X7
2094  	PADDD   X8, X7
2095  	PXOR    X7, X10
2096  	PSHUFB  X4, X10
2097  	PADDD   X10, X9
2098  	PXOR    X9, X8
2099  	MOVO    X8, X13
2100  	PSRLL   $0x07, X13
2101  	PSLLL   $0x19, X8
2102  	PXOR    X13, X8
2103  
2104  	// Round 5 diagonal step part 1: diagonalize.
2105  	PSHUFD $0x39, X8, X8
2106  	PSHUFD $0x4e, X9, X9
2107  	PSHUFD $0x93, X10, X10
2108  
2109  	// Round 5 diagonal step part 2: column step.
2110  	PSHUFD  $0xc0, X0, X14
2111  	PSHUFD  $0x20, X1, X13
2112  	PBLENDW $0x30, X13, X14
2113  	PSHUFD  $0x0c, X2, X13
2114  	PBLENDW $0x0c, X13, X14
2115  	PSHUFD  $0x02, X3, X13
2116  	PBLENDW $0x03, X13, X14
2117  	MOVOU   permuted_blake_consts<>+288(SB), X15
2118  	PXOR    X14, X15
2119  	PADDD   X15, X7
2120  	PSHUFD  $0x01, X0, X14
2121  	PSHUFD  $0x00, X2, X13
2122  	PBLENDW $0x30, X13, X14
2123  	PSHUFD  $0x40, X3, X13
2124  	PBLENDW $0xcc, X13, X14
2125  	MOVOU   permuted_blake_consts<>+304(SB), X15
2126  	PXOR    X14, X15
2127  	PADDD   X8, X7
2128  	PXOR    X7, X10
2129  	PSHUFB  X5, X10
2130  	PADDD   X10, X9
2131  	PXOR    X9, X8
2132  	MOVO    X8, X13
2133  	PSRLL   $0x0c, X13
2134  	PSLLL   $0x14, X8
2135  	PXOR    X13, X8
2136  	PADDD   X15, X7
2137  	PADDD   X8, X7
2138  	PXOR    X7, X10
2139  	PSHUFB  X4, X10
2140  	PADDD   X10, X9
2141  	PXOR    X9, X8
2142  	MOVO    X8, X13
2143  	PSRLL   $0x07, X13
2144  	PSLLL   $0x19, X8
2145  	PXOR    X13, X8
2146  
2147  	// Round 5 diagonal step part 3: undiagonalize.
2148  	PSHUFD $0x93, X8, X8
2149  	PSHUFD $0x4e, X9, X9
2150  	PSHUFD $0x39, X10, X10
2151  
2152  	// Round 6 column step.
2153  	PSHUFD  $0x02, X0, X14
2154  	PSHUFD  $0x08, X1, X13
2155  	PBLENDW $0x0c, X13, X14
2156  	PSHUFD  $0x00, X2, X13
2157  	PBLENDW $0xc0, X13, X14
2158  	MOVOU   permuted_blake_consts<>+320(SB), X15
2159  	PXOR    X14, X15
2160  	PADDD   X15, X7
2161  	PSHUFD  $0xc0, X0, X14
2162  	PSHUFD  $0x38, X2, X13
2163  	PBLENDW $0x3c, X13, X14
2164  	PSHUFD  $0x00, X3, X13
2165  	PBLENDW $0x03, X13, X14
2166  	MOVOU   permuted_blake_consts<>+336(SB), X15
2167  	PXOR    X14, X15
2168  	PADDD   X8, X7
2169  	PXOR    X7, X10
2170  	PSHUFB  X5, X10
2171  	PADDD   X10, X9
2172  	PXOR    X9, X8
2173  	MOVO    X8, X13
2174  	PSRLL   $0x0c, X13
2175  	PSLLL   $0x14, X8
2176  	PXOR    X13, X8
2177  	PADDD   X15, X7
2178  	PADDD   X8, X7
2179  	PXOR    X7, X10
2180  	PSHUFB  X4, X10
2181  	PADDD   X10, X9
2182  	PXOR    X9, X8
2183  	MOVO    X8, X13
2184  	PSRLL   $0x07, X13
2185  	PSLLL   $0x19, X8
2186  	PXOR    X13, X8
2187  
2188  	// Round 6 diagonal step part 1: diagonalize.
2189  	PSHUFD $0x39, X8, X8
2190  	PSHUFD $0x4e, X9, X9
2191  	PSHUFD $0x93, X10, X10
2192  
2193  	// Round 6 diagonal step part 2: column step.
2194  	PSHUFD  $0x40, X0, X14
2195  	PSHUFD  $0x0c, X1, X13
2196  	PBLENDW $0x0f, X13, X14
2197  	PSHUFD  $0x30, X3, X13
2198  	PBLENDW $0x30, X13, X14
2199  	MOVOU   permuted_blake_consts<>+352(SB), X15
2200  	PXOR    X14, X15
2201  	PADDD   X15, X7
2202  	PSHUFD  $0x04, X1, X14
2203  	PSHUFD  $0x40, X2, X13
2204  	PBLENDW $0xc0, X13, X14
2205  	PSHUFD  $0x21, X3, X13
2206  	PBLENDW $0x33, X13, X14
2207  	MOVOU   permuted_blake_consts<>+368(SB), X15
2208  	PXOR    X14, X15
2209  	PADDD   X8, X7
2210  	PXOR    X7, X10
2211  	PSHUFB  X5, X10
2212  	PADDD   X10, X9
2213  	PXOR    X9, X8
2214  	MOVO    X8, X13
2215  	PSRLL   $0x0c, X13
2216  	PSLLL   $0x14, X8
2217  	PXOR    X13, X8
2218  	PADDD   X15, X7
2219  	PADDD   X8, X7
2220  	PXOR    X7, X10
2221  	PSHUFB  X4, X10
2222  	PADDD   X10, X9
2223  	PXOR    X9, X8
2224  	MOVO    X8, X13
2225  	PSRLL   $0x07, X13
2226  	PSLLL   $0x19, X8
2227  	PXOR    X13, X8
2228  
2229  	// Round 6 diagonal step part 3: undiagonalize.
2230  	PSHUFD $0x93, X8, X8
2231  	PSHUFD $0x4e, X9, X9
2232  	PSHUFD $0x39, X10, X10
2233  
2234  	// Round 7 column step.
2235  	PSHUFD  $0x04, X0, X14
2236  	PSHUFD  $0x00, X1, X13
2237  	PBLENDW $0xc0, X13, X14
2238  	PSHUFD  $0x20, X3, X13
2239  	PBLENDW $0x33, X13, X14
2240  	MOVOU   permuted_blake_consts<>+384(SB), X15
2241  	PXOR    X14, X15
2242  	PADDD   X15, X7
2243  	PSHUFD  $0x01, X1, X14
2244  	PSHUFD  $0x80, X2, X13
2245  	PBLENDW $0xc0, X13, X14
2246  	PSHUFD  $0x1c, X3, X13
2247  	PBLENDW $0x3c, X13, X14
2248  	MOVOU   permuted_blake_consts<>+400(SB), X15
2249  	PXOR    X14, X15
2250  	PADDD   X8, X7
2251  	PXOR    X7, X10
2252  	PSHUFB  X5, X10
2253  	PADDD   X10, X9
2254  	PXOR    X9, X8
2255  	MOVO    X8, X13
2256  	PSRLL   $0x0c, X13
2257  	PSLLL   $0x14, X8
2258  	PXOR    X13, X8
2259  	PADDD   X15, X7
2260  	PADDD   X8, X7
2261  	PXOR    X7, X10
2262  	PSHUFB  X4, X10
2263  	PADDD   X10, X9
2264  	PXOR    X9, X8
2265  	MOVO    X8, X13
2266  	PSRLL   $0x07, X13
2267  	PSLLL   $0x19, X8
2268  	PXOR    X13, X8
2269  
2270  	// Round 7 diagonal step part 1: diagonalize.
2271  	PSHUFD $0x39, X8, X8
2272  	PSHUFD $0x4e, X9, X9
2273  	PSHUFD $0x93, X10, X10
2274  
2275  	// Round 7 diagonal step part 2: column step.
2276  	PSHUFD  $0x00, X0, X14
2277  	PSHUFD  $0x08, X1, X13
2278  	PBLENDW $0x0c, X13, X14
2279  	PSHUFD  $0x10, X2, X13
2280  	PBLENDW $0xf0, X13, X14
2281  	MOVOU   permuted_blake_consts<>+416(SB), X15
2282  	PXOR    X14, X15
2283  	PADDD   X15, X7
2284  	PSHUFD  $0x2c, X0, X14
2285  	PSHUFD  $0x03, X1, X13
2286  	PBLENDW $0x03, X13, X14
2287  	PSHUFD  $0xc0, X2, X13
2288  	PBLENDW $0xc0, X13, X14
2289  	MOVOU   permuted_blake_consts<>+432(SB), X15
2290  	PXOR    X14, X15
2291  	PADDD   X8, X7
2292  	PXOR    X7, X10
2293  	PSHUFB  X5, X10
2294  	PADDD   X10, X9
2295  	PXOR    X9, X8
2296  	MOVO    X8, X13
2297  	PSRLL   $0x0c, X13
2298  	PSLLL   $0x14, X8
2299  	PXOR    X13, X8
2300  	PADDD   X15, X7
2301  	PADDD   X8, X7
2302  	PXOR    X7, X10
2303  	PSHUFB  X4, X10
2304  	PADDD   X10, X9
2305  	PXOR    X9, X8
2306  	MOVO    X8, X13
2307  	PSRLL   $0x07, X13
2308  	PSLLL   $0x19, X8
2309  	PXOR    X13, X8
2310  
2311  	// Round 7 diagonal step part 3: undiagonalize.
2312  	PSHUFD $0x93, X8, X8
2313  	PSHUFD $0x4e, X9, X9
2314  	PSHUFD $0x39, X10, X10
2315  
2316  	// Round 8 column step.
2317  	PSHUFD  $0xc0, X0, X14
2318  	PSHUFD  $0x0c, X1, X13
2319  	PBLENDW $0x0c, X13, X14
2320  	PSHUFD  $0x01, X3, X13
2321  	PBLENDW $0x33, X13, X14
2322  	MOVOU   permuted_blake_consts<>+448(SB), X15
2323  	PXOR    X14, X15
2324  	PADDD   X15, X7
2325  	PSHUFD  $0x10, X0, X14
2326  	PSHUFD  $0x43, X2, X13
2327  	PBLENDW $0xc3, X13, X14
2328  	PSHUFD  $0x08, X3, X13
2329  	PBLENDW $0x0c, X13, X14
2330  	MOVOU   permuted_blake_consts<>+464(SB), X15
2331  	PXOR    X14, X15
2332  	PADDD   X8, X7
2333  	PXOR    X7, X10
2334  	PSHUFB  X5, X10
2335  	PADDD   X10, X9
2336  	PXOR    X9, X8
2337  	MOVO    X8, X13
2338  	PSRLL   $0x0c, X13
2339  	PSLLL   $0x14, X8
2340  	PXOR    X13, X8
2341  	PADDD   X15, X7
2342  	PADDD   X8, X7
2343  	PXOR    X7, X10
2344  	PSHUFB  X4, X10
2345  	PADDD   X10, X9
2346  	PXOR    X9, X8
2347  	MOVO    X8, X13
2348  	PSRLL   $0x07, X13
2349  	PSLLL   $0x19, X8
2350  	PXOR    X13, X8
2351  
2352  	// Round 8 diagonal step part 1: diagonalize.
2353  	PSHUFD $0x39, X8, X8
2354  	PSHUFD $0x4e, X9, X9
2355  	PSHUFD $0x93, X10, X10
2356  
2357  	// Round 8 diagonal step part 2: column step.
2358  	PSHUFD  $0x80, X0, X14
2359  	PSHUFD  $0x01, X1, X13
2360  	PBLENDW $0x03, X13, X14
2361  	PSHUFD  $0x00, X2, X13
2362  	PBLENDW $0x30, X13, X14
2363  	PSHUFD  $0x0c, X3, X13
2364  	PBLENDW $0x0c, X13, X14
2365  	MOVOU   permuted_blake_consts<>+480(SB), X15
2366  	PXOR    X14, X15
2367  	PADDD   X15, X7
2368  	PSHUFD  $0x00, X0, X14
2369  	PSHUFD  $0x20, X1, X13
2370  	PBLENDW $0x3c, X13, X14
2371  	PSHUFD  $0x80, X2, X13
2372  	PBLENDW $0xc0, X13, X14
2373  	MOVOU   permuted_blake_consts<>+496(SB), X15
2374  	PXOR    X14, X15
2375  	PADDD   X8, X7
2376  	PXOR    X7, X10
2377  	PSHUFB  X5, X10
2378  	PADDD   X10, X9
2379  	PXOR    X9, X8
2380  	MOVO    X8, X13
2381  	PSRLL   $0x0c, X13
2382  	PSLLL   $0x14, X8
2383  	PXOR    X13, X8
2384  	PADDD   X15, X7
2385  	PADDD   X8, X7
2386  	PXOR    X7, X10
2387  	PSHUFB  X4, X10
2388  	PADDD   X10, X9
2389  	PXOR    X9, X8
2390  	MOVO    X8, X13
2391  	PSRLL   $0x07, X13
2392  	PSLLL   $0x19, X8
2393  	PXOR    X13, X8
2394  
2395  	// Round 8 diagonal step part 3: undiagonalize.
2396  	PSHUFD $0x93, X8, X8
2397  	PSHUFD $0x4e, X9, X9
2398  	PSHUFD $0x39, X10, X10
2399  
2400  	// Round 9 column step.
2401  	PSHUFD  $0x00, X0, X14
2402  	PSHUFD  $0x02, X1, X13
2403  	PBLENDW $0x03, X13, X14
2404  	PSHUFD  $0x30, X2, X13
2405  	PBLENDW $0x30, X13, X14
2406  	PSHUFD  $0x08, X3, X13
2407  	PBLENDW $0x0c, X13, X14
2408  	MOVOU   permuted_blake_consts<>+512(SB), X15
2409  	PXOR    X14, X15
2410  	PADDD   X15, X7
2411  	PSHUFD  $0x30, X0, X14
2412  	PSHUFD  $0x04, X2, X13
2413  	PBLENDW $0xcc, X13, X14
2414  	PSHUFD  $0x03, X3, X13
2415  	PBLENDW $0x03, X13, X14
2416  	MOVOU   permuted_blake_consts<>+528(SB), X15
2417  	PXOR    X14, X15
2418  	PADDD   X8, X7
2419  	PXOR    X7, X10
2420  	PSHUFB  X5, X10
2421  	PADDD   X10, X9
2422  	PXOR    X9, X8
2423  	MOVO    X8, X13
2424  	PSRLL   $0x0c, X13
2425  	PSLLL   $0x14, X8
2426  	PXOR    X13, X8
2427  	PADDD   X15, X7
2428  	PADDD   X8, X7
2429  	PXOR    X7, X10
2430  	PSHUFB  X4, X10
2431  	PADDD   X10, X9
2432  	PXOR    X9, X8
2433  	MOVO    X8, X13
2434  	PSRLL   $0x07, X13
2435  	PSLLL   $0x19, X8
2436  	PXOR    X13, X8
2437  
2438  	// Round 9 diagonal step part 1: diagonalize.
2439  	PSHUFD $0x39, X8, X8
2440  	PSHUFD $0x4e, X9, X9
2441  	PSHUFD $0x93, X10, X10
2442  
2443  	// Round 9 diagonal step part 2: column step.
2444  	PSHUFD  $0x10, X0, X14
2445  	PSHUFD  $0x80, X2, X13
2446  	PBLENDW $0xc0, X13, X14
2447  	PSHUFD  $0x04, X3, X13
2448  	PBLENDW $0x0f, X13, X14
2449  	MOVOU   permuted_blake_consts<>+544(SB), X15
2450  	PXOR    X14, X15
2451  	PADDD   X15, X7
2452  	PSHUFD  $0x02, X0, X14
2453  	PSHUFD  $0x4c, X1, X13
2454  	PBLENDW $0xfc, X13, X14
2455  	MOVOU   permuted_blake_consts<>+560(SB), X15
2456  	PXOR    X14, X15
2457  	PADDD   X8, X7
2458  	PXOR    X7, X10
2459  	PSHUFB  X5, X10
2460  	PADDD   X10, X9
2461  	PXOR    X9, X8
2462  	MOVO    X8, X13
2463  	PSRLL   $0x0c, X13
2464  	PSLLL   $0x14, X8
2465  	PXOR    X13, X8
2466  	PADDD   X15, X7
2467  	PADDD   X8, X7
2468  	PXOR    X7, X10
2469  	PSHUFB  X4, X10
2470  	PADDD   X10, X9
2471  	PXOR    X9, X8
2472  	MOVO    X8, X13
2473  	PSRLL   $0x07, X13
2474  	PSLLL   $0x19, X8
2475  	PXOR    X13, X8
2476  
2477  	// Round 9 diagonal step part 3: undiagonalize.
2478  	PSHUFD $0x93, X8, X8
2479  	PSHUFD $0x4e, X9, X9
2480  	PSHUFD $0x39, X10, X10
2481  
2482  	// Round 10 column step.
2483  	PSHUFD  $0x40, X0, X14
2484  	PSHUFD  $0x30, X1, X13
2485  	PBLENDW $0x30, X13, X14
2486  	PSHUFD  $0x02, X2, X13
2487  	PBLENDW $0x0f, X13, X14
2488  	MOVOU   permuted_blake_consts<>+576(SB), X15
2489  	PXOR    X14, X15
2490  	PADDD   X15, X7
2491  	PSHUFD  $0x02, X0, X14
2492  	PSHUFD  $0x60, X1, X13
2493  	PBLENDW $0xfc, X13, X14
2494  	MOVOU   permuted_blake_consts<>+592(SB), X15
2495  	PXOR    X14, X15
2496  	PADDD   X8, X7
2497  	PXOR    X7, X10
2498  	PSHUFB  X5, X10
2499  	PADDD   X10, X9
2500  	PXOR    X9, X8
2501  	MOVO    X8, X13
2502  	PSRLL   $0x0c, X13
2503  	PSLLL   $0x14, X8
2504  	PXOR    X13, X8
2505  	PADDD   X15, X7
2506  	PADDD   X8, X7
2507  	PXOR    X7, X10
2508  	PSHUFB  X4, X10
2509  	PADDD   X10, X9
2510  	PXOR    X9, X8
2511  	MOVO    X8, X13
2512  	PSRLL   $0x07, X13
2513  	PSLLL   $0x19, X8
2514  	PXOR    X13, X8
2515  
2516  	// Round 10 diagonal step part 1: diagonalize.
2517  	PSHUFD $0x39, X8, X8
2518  	PSHUFD $0x4e, X9, X9
2519  	PSHUFD $0x93, X10, X10
2520  
2521  	// Round 10 diagonal step part 2: column step.
2522  	PSHUFD  $0x30, X0, X14
2523  	PSHUFD  $0x04, X2, X13
2524  	PBLENDW $0x0c, X13, X14
2525  	PSHUFD  $0x43, X3, X13
2526  	PBLENDW $0xc3, X13, X14
2527  	MOVOU   permuted_blake_consts<>+608(SB), X15
2528  	PXOR    X14, X15
2529  	PADDD   X15, X7
2530  	PSHUFD  $0x00, X0, X14
2531  	PSHUFD  $0x03, X2, X13
2532  	PBLENDW $0x03, X13, X14
2533  	PSHUFD  $0x08, X3, X13
2534  	PBLENDW $0x3c, X13, X14
2535  	MOVOU   permuted_blake_consts<>+624(SB), X15
2536  	PXOR    X14, X15
2537  	PADDD   X8, X7
2538  	PXOR    X7, X10
2539  	PSHUFB  X5, X10
2540  	PADDD   X10, X9
2541  	PXOR    X9, X8
2542  	MOVO    X8, X13
2543  	PSRLL   $0x0c, X13
2544  	PSLLL   $0x14, X8
2545  	PXOR    X13, X8
2546  	PADDD   X15, X7
2547  	PADDD   X8, X7
2548  	PXOR    X7, X10
2549  	PSHUFB  X4, X10
2550  	PADDD   X10, X9
2551  	PXOR    X9, X8
2552  	MOVO    X8, X13
2553  	PSRLL   $0x07, X13
2554  	PSLLL   $0x19, X8
2555  	PXOR    X13, X8
2556  
2557  	// Round 10 diagonal step part 3: undiagonalize.
2558  	PSHUFD $0x93, X8, X8
2559  	PSHUFD $0x4e, X9, X9
2560  	PSHUFD $0x39, X10, X10
2561  
2562  	// Round 11 column step.
2563  	PSHUFD  $0x08, X0, X14
2564  	PSHUFD  $0x80, X1, X13
2565  	PBLENDW $0xf0, X13, X14
2566  	MOVOU   permuted_blake_consts<>+0(SB), X15
2567  	PXOR    X14, X15
2568  	PADDD   X15, X7
2569  	PSHUFD  $0x0d, X0, X14
2570  	PSHUFD  $0xd0, X1, X13
2571  	PBLENDW $0xf0, X13, X14
2572  	MOVOU   permuted_blake_consts<>+16(SB), X15
2573  	PXOR    X14, X15
2574  	PADDD   X8, X7
2575  	PXOR    X7, X10
2576  	PSHUFB  X5, X10
2577  	PADDD   X10, X9
2578  	PXOR    X9, X8
2579  	MOVO    X8, X13
2580  	PSRLL   $0x0c, X13
2581  	PSLLL   $0x14, X8
2582  	PXOR    X13, X8
2583  	PADDD   X15, X7
2584  	PADDD   X8, X7
2585  	PXOR    X7, X10
2586  	PSHUFB  X4, X10
2587  	PADDD   X10, X9
2588  	PXOR    X9, X8
2589  	MOVO    X8, X13
2590  	PSRLL   $0x07, X13
2591  	PSLLL   $0x19, X8
2592  	PXOR    X13, X8
2593  
2594  	// Round 11 diagonal step part 1: diagonalize.
2595  	PSHUFD $0x39, X8, X8
2596  	PSHUFD $0x4e, X9, X9
2597  	PSHUFD $0x93, X10, X10
2598  
2599  	// Round 11 diagonal step part 2: column step.
2600  	PSHUFD  $0x08, X2, X14
2601  	PSHUFD  $0x80, X3, X13
2602  	PBLENDW $0xf0, X13, X14
2603  	MOVOU   permuted_blake_consts<>+32(SB), X15
2604  	PXOR    X14, X15
2605  	PADDD   X15, X7
2606  	PSHUFD  $0x0d, X2, X14
2607  	PSHUFD  $0xd0, X3, X13
2608  	PBLENDW $0xf0, X13, X14
2609  	MOVOU   permuted_blake_consts<>+48(SB), X15
2610  	PXOR    X14, X15
2611  	PADDD   X8, X7
2612  	PXOR    X7, X10
2613  	PSHUFB  X5, X10
2614  	PADDD   X10, X9
2615  	PXOR    X9, X8
2616  	MOVO    X8, X13
2617  	PSRLL   $0x0c, X13
2618  	PSLLL   $0x14, X8
2619  	PXOR    X13, X8
2620  	PADDD   X15, X7
2621  	PADDD   X8, X7
2622  	PXOR    X7, X10
2623  	PSHUFB  X4, X10
2624  	PADDD   X10, X9
2625  	PXOR    X9, X8
2626  	MOVO    X8, X13
2627  	PSRLL   $0x07, X13
2628  	PSLLL   $0x19, X8
2629  	PXOR    X13, X8
2630  
2631  	// Round 11 diagonal step part 3: undiagonalize.
2632  	PSHUFD $0x93, X8, X8
2633  	PSHUFD $0x4e, X9, X9
2634  	PSHUFD $0x39, X10, X10
2635  
2636  	// Round 12 column step.
2637  	PSHUFD  $0x00, X1, X14
2638  	PSHUFD  $0x10, X2, X13
2639  	PBLENDW $0x30, X13, X14
2640  	PSHUFD  $0x42, X3, X13
2641  	PBLENDW $0xc3, X13, X14
2642  	MOVOU   permuted_blake_consts<>+64(SB), X15
2643  	PXOR    X14, X15
2644  	PADDD   X15, X7
2645  	PSHUFD  $0x80, X1, X14
2646  	PSHUFD  $0x02, X2, X13
2647  	PBLENDW $0x0f, X13, X14
2648  	PSHUFD  $0x30, X3, X13
2649  	PBLENDW $0x30, X13, X14
2650  	MOVOU   permuted_blake_consts<>+80(SB), X15
2651  	PXOR    X14, X15
2652  	PADDD   X8, X7
2653  	PXOR    X7, X10
2654  	PSHUFB  X5, X10
2655  	PADDD   X10, X9
2656  	PXOR    X9, X8
2657  	MOVO    X8, X13
2658  	PSRLL   $0x0c, X13
2659  	PSLLL   $0x14, X8
2660  	PXOR    X13, X8
2661  	PADDD   X15, X7
2662  	PADDD   X8, X7
2663  	PXOR    X7, X10
2664  	PSHUFB  X4, X10
2665  	PADDD   X10, X9
2666  	PXOR    X9, X8
2667  	MOVO    X8, X13
2668  	PSRLL   $0x07, X13
2669  	PSLLL   $0x19, X8
2670  	PXOR    X13, X8
2671  
2672  	// Round 12 diagonal step part 1: diagonalize.
2673  	PSHUFD $0x39, X8, X8
2674  	PSHUFD $0x4e, X9, X9
2675  	PSHUFD $0x93, X10, X10
2676  
2677  	// Round 12 diagonal step part 2: column step.
2678  	PSHUFD  $0x01, X0, X14
2679  	PSHUFD  $0x40, X1, X13
2680  	PBLENDW $0xc0, X13, X14
2681  	PSHUFD  $0x30, X2, X13
2682  	PBLENDW $0x30, X13, X14
2683  	MOVOU   permuted_blake_consts<>+96(SB), X15
2684  	PXOR    X14, X15
2685  	PADDD   X15, X7
2686  	PSHUFD  $0xc8, X0, X14
2687  	PSHUFD  $0x30, X1, X13
2688  	PBLENDW $0x30, X13, X14
2689  	PSHUFD  $0x00, X3, X13
2690  	PBLENDW $0x03, X13, X14
2691  	MOVOU   permuted_blake_consts<>+112(SB), X15
2692  	PXOR    X14, X15
2693  	PADDD   X8, X7
2694  	PXOR    X7, X10
2695  	PSHUFB  X5, X10
2696  	PADDD   X10, X9
2697  	PXOR    X9, X8
2698  	MOVO    X8, X13
2699  	PSRLL   $0x0c, X13
2700  	PSLLL   $0x14, X8
2701  	PXOR    X13, X8
2702  	PADDD   X15, X7
2703  	PADDD   X8, X7
2704  	PXOR    X7, X10
2705  	PSHUFB  X4, X10
2706  	PADDD   X10, X9
2707  	PXOR    X9, X8
2708  	MOVO    X8, X13
2709  	PSRLL   $0x07, X13
2710  	PSLLL   $0x19, X8
2711  	PXOR    X13, X8
2712  
2713  	// Round 12 diagonal step part 3: undiagonalize.
2714  	PSHUFD $0x93, X8, X8
2715  	PSHUFD $0x4e, X9, X9
2716  	PSHUFD $0x39, X10, X10
2717  
2718  	// Round 13 column step.
2719  	PSHUFD  $0x10, X1, X14
2720  	PSHUFD  $0x03, X2, X13
2721  	PBLENDW $0x03, X13, X14
2722  	PSHUFD  $0xc0, X3, X13
2723  	PBLENDW $0xcc, X13, X14
2724  	MOVOU   permuted_blake_consts<>+128(SB), X15
2725  	PXOR    X14, X15
2726  	PADDD   X15, X7
2727  	PSHUFD  $0x20, X0, X14
2728  	PSHUFD  $0x00, X2, X13
2729  	PBLENDW $0x03, X13, X14
2730  	PSHUFD  $0x40, X3, X13
2731  	PBLENDW $0xc0, X13, X14
2732  	MOVOU   permuted_blake_consts<>+144(SB), X15
2733  	PXOR    X14, X15
2734  	PADDD   X8, X7
2735  	PXOR    X7, X10
2736  	PSHUFB  X5, X10
2737  	PADDD   X10, X9
2738  	PXOR    X9, X8
2739  	MOVO    X8, X13
2740  	PSRLL   $0x0c, X13
2741  	PSLLL   $0x14, X8
2742  	PXOR    X13, X8
2743  	PADDD   X15, X7
2744  	PADDD   X8, X7
2745  	PXOR    X7, X10
2746  	PSHUFB  X4, X10
2747  	PADDD   X10, X9
2748  	PXOR    X9, X8
2749  	MOVO    X8, X13
2750  	PSRLL   $0x07, X13
2751  	PSLLL   $0x19, X8
2752  	PXOR    X13, X8
2753  
2754  	// Round 13 diagonal step part 1: diagonalize.
2755  	PSHUFD $0x39, X8, X8
2756  	PSHUFD $0x4e, X9, X9
2757  	PSHUFD $0x93, X10, X10
2758  
2759  	// Round 13 diagonal step part 2: column step.
2760  	PSHUFD  $0x0c, X0, X14
2761  	PSHUFD  $0x30, X1, X13
2762  	PBLENDW $0x30, X13, X14
2763  	PSHUFD  $0x42, X2, X13
2764  	PBLENDW $0xc3, X13, X14
2765  	MOVOU   permuted_blake_consts<>+160(SB), X15
2766  	PXOR    X14, X15
2767  	PADDD   X15, X7
2768  	PSHUFD  $0x10, X0, X14
2769  	PSHUFD  $0x08, X1, X13
2770  	PBLENDW $0xcc, X13, X14
2771  	PSHUFD  $0x02, X3, X13
2772  	PBLENDW $0x03, X13, X14
2773  	MOVOU   permuted_blake_consts<>+176(SB), X15
2774  	PXOR    X14, X15
2775  	PADDD   X8, X7
2776  	PXOR    X7, X10
2777  	PSHUFB  X5, X10
2778  	PADDD   X10, X9
2779  	PXOR    X9, X8
2780  	MOVO    X8, X13
2781  	PSRLL   $0x0c, X13
2782  	PSLLL   $0x14, X8
2783  	PXOR    X13, X8
2784  	PADDD   X15, X7
2785  	PADDD   X8, X7
2786  	PXOR    X7, X10
2787  	PSHUFB  X4, X10
2788  	PADDD   X10, X9
2789  	PXOR    X9, X8
2790  	MOVO    X8, X13
2791  	PSRLL   $0x07, X13
2792  	PSLLL   $0x19, X8
2793  	PXOR    X13, X8
2794  
2795  	// Round 13 diagonal step part 3: undiagonalize.
2796  	PSHUFD $0x93, X8, X8
2797  	PSHUFD $0x4e, X9, X9
2798  	PSHUFD $0x39, X10, X10
2799  
2800  	// Round 14 column step.
2801  	PSHUFD  $0x0c, X0, X14
2802  	PSHUFD  $0x03, X1, X13
2803  	PBLENDW $0x03, X13, X14
2804  	PSHUFD  $0xc0, X2, X13
2805  	PBLENDW $0xc0, X13, X14
2806  	PSHUFD  $0x10, X3, X13
2807  	PBLENDW $0x30, X13, X14
2808  	MOVOU   permuted_blake_consts<>+192(SB), X15
2809  	PXOR    X14, X15
2810  	PADDD   X15, X7
2811  	PSHUFD  $0x04, X0, X14
2812  	PSHUFD  $0x01, X2, X13
2813  	PBLENDW $0x03, X13, X14
2814  	PSHUFD  $0x80, X3, X13
2815  	PBLENDW $0xf0, X13, X14
2816  	MOVOU   permuted_blake_consts<>+208(SB), X15
2817  	PXOR    X14, X15
2818  	PADDD   X8, X7
2819  	PXOR    X7, X10
2820  	PSHUFB  X5, X10
2821  	PADDD   X10, X9
2822  	PXOR    X9, X8
2823  	MOVO    X8, X13
2824  	PSRLL   $0x0c, X13
2825  	PSLLL   $0x14, X8
2826  	PXOR    X13, X8
2827  	PADDD   X15, X7
2828  	PADDD   X8, X7
2829  	PXOR    X7, X10
2830  	PSHUFB  X4, X10
2831  	PADDD   X10, X9
2832  	PXOR    X9, X8
2833  	MOVO    X8, X13
2834  	PSRLL   $0x07, X13
2835  	PSLLL   $0x19, X8
2836  	PXOR    X13, X8
2837  
2838  	// Round 14 diagonal step part 1: diagonalize.
2839  	PSHUFD $0x39, X8, X8
2840  	PSHUFD $0x4e, X9, X9
2841  	PSHUFD $0x93, X10, X10
2842  
2843  	// Round 14 diagonal step part 2: column step.
2844  	PSHUFD  $0x02, X0, X14
2845  	PSHUFD  $0x04, X1, X13
2846  	PBLENDW $0x3c, X13, X14
2847  	PSHUFD  $0xc0, X3, X13
2848  	PBLENDW $0xc0, X13, X14
2849  	MOVOU   permuted_blake_consts<>+224(SB), X15
2850  	PXOR    X14, X15
2851  	PADDD   X15, X7
2852  	PSHUFD  $0x00, X0, X14
2853  	PSHUFD  $0x02, X1, X13
2854  	PBLENDW $0x03, X13, X14
2855  	PSHUFD  $0x08, X2, X13
2856  	PBLENDW $0xcc, X13, X14
2857  	MOVOU   permuted_blake_consts<>+240(SB), X15
2858  	PXOR    X14, X15
2859  	PADDD   X8, X7
2860  	PXOR    X7, X10
2861  	PSHUFB  X5, X10
2862  	PADDD   X10, X9
2863  	PXOR    X9, X8
2864  	MOVO    X8, X13
2865  	PSRLL   $0x0c, X13
2866  	PSLLL   $0x14, X8
2867  	PXOR    X13, X8
2868  	PADDD   X15, X7
2869  	PADDD   X8, X7
2870  	PXOR    X7, X10
2871  	PSHUFB  X4, X10
2872  	PADDD   X10, X9
2873  	PXOR    X9, X8
2874  	MOVO    X8, X13
2875  	PSRLL   $0x07, X13
2876  	PSLLL   $0x19, X8
2877  	PXOR    X13, X8
2878  
2879  	// Round 14 diagonal step part 3: undiagonalize.
2880  	PSHUFD $0x93, X8, X8
2881  	PSHUFD $0x4e, X9, X9
2882  	PSHUFD $0x39, X10, X10
2883  
2884  	// Finally the chain value is defined as:
2885  	// h'0 = h0^s0^v0^v8
2886  	// h'1 = h1^s1^v1^v9
2887  	// h'2 = h2^s2^v2^va
2888  	// h'3 = h3^s3^v3^vb
2889  	// h'4 = h4^s0^v4^vc
2890  	// h'5 = h5^s1^v5^vd
2891  	// h'6 = h6^s2^v6^ve
2892  	// h'7 = h7^s3^v7^vf
2893  	PXOR X11, X7
2894  	PXOR X6, X7
2895  	PXOR X9, X7
2896  	PXOR X12, X8
2897  	PXOR X6, X8
2898  	PXOR X10, X8
2899  
2900  	// Either terminate the loop when there are no more full blocks
2901  	// to compress or move the message pointer to the next block of
2902  	// bytes to compress, increment the message bits counter
2903  	// accordingly, and loop back around to compress it.
2904  	DECQ BX
2905  	JZ   done
2906  	LEAQ 64(DX), DX
2907  	ADDQ $0x00000200, CX
2908  	JMP  compressLoop
2909  
2910  done:
2911  	// Output the resulting chain value.
2912  	MOVOU X7, (AX)
2913  	MOVOU X8, 16(AX)
2914  	RET
2915  
2916  // func blocksAVX(state *State, msg []byte, counter uint64)
2917  // Requires: AVX
2918  TEXT ·blocksAVX(SB), NOSPLIT, $0-40
2919  	MOVQ state+0(FP), AX
2920  	MOVQ counter+32(FP), CX
2921  	MOVQ msg_base+8(FP), DX
2922  	MOVQ msg_len+16(FP), BX
2923  
2924  	// Populate registers for fast right rotations.
2925  	VMOVDQU shuffle_rotr8_4x32<>+0(SB), X4
2926  	VMOVDQU shuffle_rotr16_4x32<>+0(SB), X5
2927  
2928  	// Convert message len to number of blocks for loop counter.
2929  	SHRQ $0x06, BX
2930  
2931  	// Initialize state matrix.
2932  	// row0 = |v0  v1  v2  v3|   |  h0     h1     h2     h3 |
2933  	// row1 = |v4  v5  v6  v7|   |  h4     h5     h6     h7 |
2934  	VMOVDQU 32(AX), X6
2935  	VMOVDQU (AX), X7
2936  	VMOVDQU 16(AX), X8
2937  
2938  compressLoop:
2939  	// row2 = |v8  v9  va  vb| = |s0^c0  s1^c1  s2^c2  s3^c3|
2940  	// row3 = |vc  vd  ve  vf|   |t0^c4  t0^c5  t1^c6  t1^c7|
2941  	VMOVDQU first_8_blake_consts<>+0(SB), X9
2942  	VPXOR   X6, X9, X9
2943  	VMOVQ   CX, X10
2944  	VPSHUFD $0x50, X10, X10
2945  	VPXOR   first_8_blake_consts<>+16(SB), X10, X10
2946  	VMOVDQA X7, X11
2947  	VMOVDQA X8, X12
2948  
2949  	// Convert message to big endian.
2950  	VMOVDQU shuffle_le_to_be_4x32<>+0(SB), X13
2951  	VMOVDQU (DX), X0
2952  	VPSHUFB X13, X0, X0
2953  	VMOVDQU 16(DX), X1
2954  	VPSHUFB X13, X1, X1
2955  	VMOVDQU 32(DX), X2
2956  	VPSHUFB X13, X2, X2
2957  	VMOVDQU 48(DX), X3
2958  	VPSHUFB X13, X3, X3
2959  
2960  	// Round 1 column step.
2961  	VPSHUFD  $0x08, X0, X14
2962  	VPSHUFD  $0x80, X1, X13
2963  	VPBLENDW $0xf0, X13, X14, X14
2964  	VMOVDQU  permuted_blake_consts<>+0(SB), X15
2965  	VPXOR    X14, X15, X15
2966  	VPADDD   X15, X7, X7
2967  	VPSHUFD  $0x0d, X0, X14
2968  	VPSHUFD  $0xd0, X1, X13
2969  	VPBLENDW $0xf0, X13, X14, X14
2970  	VMOVDQU  permuted_blake_consts<>+16(SB), X15
2971  	VPXOR    X14, X15, X15
2972  	VPADDD   X8, X7, X7
2973  	VPXOR    X7, X10, X10
2974  	VPSHUFB  X5, X10, X10
2975  	VPADDD   X10, X9, X9
2976  	VPXOR    X9, X8, X8
2977  	VPSRLD   $0x0c, X8, X13
2978  	VPSLLD   $0x14, X8, X8
2979  	VPXOR    X13, X8, X8
2980  	VPADDD   X15, X7, X7
2981  	VPADDD   X8, X7, X7
2982  	VPXOR    X7, X10, X10
2983  	VPSHUFB  X4, X10, X10
2984  	VPADDD   X10, X9, X9
2985  	VPXOR    X9, X8, X8
2986  	VPSRLD   $0x07, X8, X13
2987  	VPSLLD   $0x19, X8, X8
2988  	VPXOR    X13, X8, X8
2989  
2990  	// Round 1 diagonal step part 1: diagonalize.
2991  	VPSHUFD $0x39, X8, X8
2992  	VPSHUFD $0x4e, X9, X9
2993  	VPSHUFD $0x93, X10, X10
2994  
2995  	// Round 1 diagonal step part 2: column step.
2996  	VPSHUFD  $0x08, X2, X14
2997  	VPSHUFD  $0x80, X3, X13
2998  	VPBLENDW $0xf0, X13, X14, X14
2999  	VMOVDQU  permuted_blake_consts<>+32(SB), X15
3000  	VPXOR    X14, X15, X15
3001  	VPADDD   X15, X7, X7
3002  	VPSHUFD  $0x0d, X2, X14
3003  	VPSHUFD  $0xd0, X3, X13
3004  	VPBLENDW $0xf0, X13, X14, X14
3005  	VMOVDQU  permuted_blake_consts<>+48(SB), X15
3006  	VPXOR    X14, X15, X15
3007  	VPADDD   X8, X7, X7
3008  	VPXOR    X7, X10, X10
3009  	VPSHUFB  X5, X10, X10
3010  	VPADDD   X10, X9, X9
3011  	VPXOR    X9, X8, X8
3012  	VPSRLD   $0x0c, X8, X13
3013  	VPSLLD   $0x14, X8, X8
3014  	VPXOR    X13, X8, X8
3015  	VPADDD   X15, X7, X7
3016  	VPADDD   X8, X7, X7
3017  	VPXOR    X7, X10, X10
3018  	VPSHUFB  X4, X10, X10
3019  	VPADDD   X10, X9, X9
3020  	VPXOR    X9, X8, X8
3021  	VPSRLD   $0x07, X8, X13
3022  	VPSLLD   $0x19, X8, X8
3023  	VPXOR    X13, X8, X8
3024  
3025  	// Round 1 diagonal step part 3: undiagonalize.
3026  	VPSHUFD $0x93, X8, X8
3027  	VPSHUFD $0x4e, X9, X9
3028  	VPSHUFD $0x39, X10, X10
3029  
3030  	// Round 2 column step.
3031  	VPSHUFD  $0x00, X1, X14
3032  	VPSHUFD  $0x10, X2, X13
3033  	VPBLENDW $0x30, X13, X14, X14
3034  	VPSHUFD  $0x42, X3, X13
3035  	VPBLENDW $0xc3, X13, X14, X14
3036  	VMOVDQU  permuted_blake_consts<>+64(SB), X15
3037  	VPXOR    X14, X15, X15
3038  	VPADDD   X15, X7, X7
3039  	VPSHUFD  $0x80, X1, X14
3040  	VPSHUFD  $0x02, X2, X13
3041  	VPBLENDW $0x0f, X13, X14, X14
3042  	VPSHUFD  $0x30, X3, X13
3043  	VPBLENDW $0x30, X13, X14, X14
3044  	VMOVDQU  permuted_blake_consts<>+80(SB), X15
3045  	VPXOR    X14, X15, X15
3046  	VPADDD   X8, X7, X7
3047  	VPXOR    X7, X10, X10
3048  	VPSHUFB  X5, X10, X10
3049  	VPADDD   X10, X9, X9
3050  	VPXOR    X9, X8, X8
3051  	VPSRLD   $0x0c, X8, X13
3052  	VPSLLD   $0x14, X8, X8
3053  	VPXOR    X13, X8, X8
3054  	VPADDD   X15, X7, X7
3055  	VPADDD   X8, X7, X7
3056  	VPXOR    X7, X10, X10
3057  	VPSHUFB  X4, X10, X10
3058  	VPADDD   X10, X9, X9
3059  	VPXOR    X9, X8, X8
3060  	VPSRLD   $0x07, X8, X13
3061  	VPSLLD   $0x19, X8, X8
3062  	VPXOR    X13, X8, X8
3063  
3064  	// Round 2 diagonal step part 1: diagonalize.
3065  	VPSHUFD $0x39, X8, X8
3066  	VPSHUFD $0x4e, X9, X9
3067  	VPSHUFD $0x93, X10, X10
3068  
3069  	// Round 2 diagonal step part 2: column step.
3070  	VPSHUFD  $0x01, X0, X14
3071  	VPSHUFD  $0x40, X1, X13
3072  	VPBLENDW $0xc0, X13, X14, X14
3073  	VPSHUFD  $0x30, X2, X13
3074  	VPBLENDW $0x30, X13, X14, X14
3075  	VMOVDQU  permuted_blake_consts<>+96(SB), X15
3076  	VPXOR    X14, X15, X15
3077  	VPADDD   X15, X7, X7
3078  	VPSHUFD  $0xc8, X0, X14
3079  	VPSHUFD  $0x30, X1, X13
3080  	VPBLENDW $0x30, X13, X14, X14
3081  	VPSHUFD  $0x00, X3, X13
3082  	VPBLENDW $0x03, X13, X14, X14
3083  	VMOVDQU  permuted_blake_consts<>+112(SB), X15
3084  	VPXOR    X14, X15, X15
3085  	VPADDD   X8, X7, X7
3086  	VPXOR    X7, X10, X10
3087  	VPSHUFB  X5, X10, X10
3088  	VPADDD   X10, X9, X9
3089  	VPXOR    X9, X8, X8
3090  	VPSRLD   $0x0c, X8, X13
3091  	VPSLLD   $0x14, X8, X8
3092  	VPXOR    X13, X8, X8
3093  	VPADDD   X15, X7, X7
3094  	VPADDD   X8, X7, X7
3095  	VPXOR    X7, X10, X10
3096  	VPSHUFB  X4, X10, X10
3097  	VPADDD   X10, X9, X9
3098  	VPXOR    X9, X8, X8
3099  	VPSRLD   $0x07, X8, X13
3100  	VPSLLD   $0x19, X8, X8
3101  	VPXOR    X13, X8, X8
3102  
3103  	// Round 2 diagonal step part 3: undiagonalize.
3104  	VPSHUFD $0x93, X8, X8
3105  	VPSHUFD $0x4e, X9, X9
3106  	VPSHUFD $0x39, X10, X10
3107  
3108  	// Round 3 column step.
3109  	VPSHUFD  $0x10, X1, X14
3110  	VPSHUFD  $0x03, X2, X13
3111  	VPBLENDW $0x03, X13, X14, X14
3112  	VPSHUFD  $0xc0, X3, X13
3113  	VPBLENDW $0xcc, X13, X14, X14
3114  	VMOVDQU  permuted_blake_consts<>+128(SB), X15
3115  	VPXOR    X14, X15, X15
3116  	VPADDD   X15, X7, X7
3117  	VPSHUFD  $0x20, X0, X14
3118  	VPSHUFD  $0x00, X2, X13
3119  	VPBLENDW $0x03, X13, X14, X14
3120  	VPSHUFD  $0x40, X3, X13
3121  	VPBLENDW $0xc0, X13, X14, X14
3122  	VMOVDQU  permuted_blake_consts<>+144(SB), X15
3123  	VPXOR    X14, X15, X15
3124  	VPADDD   X8, X7, X7
3125  	VPXOR    X7, X10, X10
3126  	VPSHUFB  X5, X10, X10
3127  	VPADDD   X10, X9, X9
3128  	VPXOR    X9, X8, X8
3129  	VPSRLD   $0x0c, X8, X13
3130  	VPSLLD   $0x14, X8, X8
3131  	VPXOR    X13, X8, X8
3132  	VPADDD   X15, X7, X7
3133  	VPADDD   X8, X7, X7
3134  	VPXOR    X7, X10, X10
3135  	VPSHUFB  X4, X10, X10
3136  	VPADDD   X10, X9, X9
3137  	VPXOR    X9, X8, X8
3138  	VPSRLD   $0x07, X8, X13
3139  	VPSLLD   $0x19, X8, X8
3140  	VPXOR    X13, X8, X8
3141  
3142  	// Round 3 diagonal step part 1: diagonalize.
3143  	VPSHUFD $0x39, X8, X8
3144  	VPSHUFD $0x4e, X9, X9
3145  	VPSHUFD $0x93, X10, X10
3146  
3147  	// Round 3 diagonal step part 2: column step.
3148  	VPSHUFD  $0x0c, X0, X14
3149  	VPSHUFD  $0x30, X1, X13
3150  	VPBLENDW $0x30, X13, X14, X14
3151  	VPSHUFD  $0x42, X2, X13
3152  	VPBLENDW $0xc3, X13, X14, X14
3153  	VMOVDQU  permuted_blake_consts<>+160(SB), X15
3154  	VPXOR    X14, X15, X15
3155  	VPADDD   X15, X7, X7
3156  	VPSHUFD  $0x10, X0, X14
3157  	VPSHUFD  $0x08, X1, X13
3158  	VPBLENDW $0xcc, X13, X14, X14
3159  	VPSHUFD  $0x02, X3, X13
3160  	VPBLENDW $0x03, X13, X14, X14
3161  	VMOVDQU  permuted_blake_consts<>+176(SB), X15
3162  	VPXOR    X14, X15, X15
3163  	VPADDD   X8, X7, X7
3164  	VPXOR    X7, X10, X10
3165  	VPSHUFB  X5, X10, X10
3166  	VPADDD   X10, X9, X9
3167  	VPXOR    X9, X8, X8
3168  	VPSRLD   $0x0c, X8, X13
3169  	VPSLLD   $0x14, X8, X8
3170  	VPXOR    X13, X8, X8
3171  	VPADDD   X15, X7, X7
3172  	VPADDD   X8, X7, X7
3173  	VPXOR    X7, X10, X10
3174  	VPSHUFB  X4, X10, X10
3175  	VPADDD   X10, X9, X9
3176  	VPXOR    X9, X8, X8
3177  	VPSRLD   $0x07, X8, X13
3178  	VPSLLD   $0x19, X8, X8
3179  	VPXOR    X13, X8, X8
3180  
3181  	// Round 3 diagonal step part 3: undiagonalize.
3182  	VPSHUFD $0x93, X8, X8
3183  	VPSHUFD $0x4e, X9, X9
3184  	VPSHUFD $0x39, X10, X10
3185  
3186  	// Round 4 column step.
3187  	VPSHUFD  $0x0c, X0, X14
3188  	VPSHUFD  $0x03, X1, X13
3189  	VPBLENDW $0x03, X13, X14, X14
3190  	VPSHUFD  $0xc0, X2, X13
3191  	VPBLENDW $0xc0, X13, X14, X14
3192  	VPSHUFD  $0x10, X3, X13
3193  	VPBLENDW $0x30, X13, X14, X14
3194  	VMOVDQU  permuted_blake_consts<>+192(SB), X15
3195  	VPXOR    X14, X15, X15
3196  	VPADDD   X15, X7, X7
3197  	VPSHUFD  $0x04, X0, X14
3198  	VPSHUFD  $0x01, X2, X13
3199  	VPBLENDW $0x03, X13, X14, X14
3200  	VPSHUFD  $0x80, X3, X13
3201  	VPBLENDW $0xf0, X13, X14, X14
3202  	VMOVDQU  permuted_blake_consts<>+208(SB), X15
3203  	VPXOR    X14, X15, X15
3204  	VPADDD   X8, X7, X7
3205  	VPXOR    X7, X10, X10
3206  	VPSHUFB  X5, X10, X10
3207  	VPADDD   X10, X9, X9
3208  	VPXOR    X9, X8, X8
3209  	VPSRLD   $0x0c, X8, X13
3210  	VPSLLD   $0x14, X8, X8
3211  	VPXOR    X13, X8, X8
3212  	VPADDD   X15, X7, X7
3213  	VPADDD   X8, X7, X7
3214  	VPXOR    X7, X10, X10
3215  	VPSHUFB  X4, X10, X10
3216  	VPADDD   X10, X9, X9
3217  	VPXOR    X9, X8, X8
3218  	VPSRLD   $0x07, X8, X13
3219  	VPSLLD   $0x19, X8, X8
3220  	VPXOR    X13, X8, X8
3221  
3222  	// Round 4 diagonal step part 1: diagonalize.
3223  	VPSHUFD $0x39, X8, X8
3224  	VPSHUFD $0x4e, X9, X9
3225  	VPSHUFD $0x93, X10, X10
3226  
3227  	// Round 4 diagonal step part 2: column step.
3228  	VPSHUFD  $0x02, X0, X14
3229  	VPSHUFD  $0x04, X1, X13
3230  	VPBLENDW $0x3c, X13, X14, X14
3231  	VPSHUFD  $0xc0, X3, X13
3232  	VPBLENDW $0xc0, X13, X14, X14
3233  	VMOVDQU  permuted_blake_consts<>+224(SB), X15
3234  	VPXOR    X14, X15, X15
3235  	VPADDD   X15, X7, X7
3236  	VPSHUFD  $0x00, X0, X14
3237  	VPSHUFD  $0x02, X1, X13
3238  	VPBLENDW $0x03, X13, X14, X14
3239  	VPSHUFD  $0x08, X2, X13
3240  	VPBLENDW $0xcc, X13, X14, X14
3241  	VMOVDQU  permuted_blake_consts<>+240(SB), X15
3242  	VPXOR    X14, X15, X15
3243  	VPADDD   X8, X7, X7
3244  	VPXOR    X7, X10, X10
3245  	VPSHUFB  X5, X10, X10
3246  	VPADDD   X10, X9, X9
3247  	VPXOR    X9, X8, X8
3248  	VPSRLD   $0x0c, X8, X13
3249  	VPSLLD   $0x14, X8, X8
3250  	VPXOR    X13, X8, X8
3251  	VPADDD   X15, X7, X7
3252  	VPADDD   X8, X7, X7
3253  	VPXOR    X7, X10, X10
3254  	VPSHUFB  X4, X10, X10
3255  	VPADDD   X10, X9, X9
3256  	VPXOR    X9, X8, X8
3257  	VPSRLD   $0x07, X8, X13
3258  	VPSLLD   $0x19, X8, X8
3259  	VPXOR    X13, X8, X8
3260  
3261  	// Round 4 diagonal step part 3: undiagonalize.
3262  	VPSHUFD $0x93, X8, X8
3263  	VPSHUFD $0x4e, X9, X9
3264  	VPSHUFD $0x39, X10, X10
3265  
3266  	// Round 5 column step.
3267  	VPSHUFD  $0x20, X0, X14
3268  	VPSHUFD  $0x04, X1, X13
3269  	VPBLENDW $0x0c, X13, X14, X14
3270  	VPSHUFD  $0x81, X2, X13
3271  	VPBLENDW $0xc3, X13, X14, X14
3272  	VMOVDQU  permuted_blake_consts<>+256(SB), X15
3273  	VPXOR    X14, X15, X15
3274  	VPADDD   X15, X7, X7
3275  	VPSHUFD  $0x00, X0, X14
3276  	VPSHUFD  $0x0c, X1, X13
3277  	VPBLENDW $0x3c, X13, X14, X14
3278  	VPSHUFD  $0xc0, X3, X13
3279  	VPBLENDW $0xc0, X13, X14, X14
3280  	VMOVDQU  permuted_blake_consts<>+272(SB), X15
3281  	VPXOR    X14, X15, X15
3282  	VPADDD   X8, X7, X7
3283  	VPXOR    X7, X10, X10
3284  	VPSHUFB  X5, X10, X10
3285  	VPADDD   X10, X9, X9
3286  	VPXOR    X9, X8, X8
3287  	VPSRLD   $0x0c, X8, X13
3288  	VPSLLD   $0x14, X8, X8
3289  	VPXOR    X13, X8, X8
3290  	VPADDD   X15, X7, X7
3291  	VPADDD   X8, X7, X7
3292  	VPXOR    X7, X10, X10
3293  	VPSHUFB  X4, X10, X10
3294  	VPADDD   X10, X9, X9
3295  	VPXOR    X9, X8, X8
3296  	VPSRLD   $0x07, X8, X13
3297  	VPSLLD   $0x19, X8, X8
3298  	VPXOR    X13, X8, X8
3299  
3300  	// Round 5 diagonal step part 1: diagonalize.
3301  	VPSHUFD $0x39, X8, X8
3302  	VPSHUFD $0x4e, X9, X9
3303  	VPSHUFD $0x93, X10, X10
3304  
3305  	// Round 5 diagonal step part 2: column step.
3306  	VPSHUFD  $0xc0, X0, X14
3307  	VPSHUFD  $0x20, X1, X13
3308  	VPBLENDW $0x30, X13, X14, X14
3309  	VPSHUFD  $0x0c, X2, X13
3310  	VPBLENDW $0x0c, X13, X14, X14
3311  	VPSHUFD  $0x02, X3, X13
3312  	VPBLENDW $0x03, X13, X14, X14
3313  	VMOVDQU  permuted_blake_consts<>+288(SB), X15
3314  	VPXOR    X14, X15, X15
3315  	VPADDD   X15, X7, X7
3316  	VPSHUFD  $0x01, X0, X14
3317  	VPSHUFD  $0x00, X2, X13
3318  	VPBLENDW $0x30, X13, X14, X14
3319  	VPSHUFD  $0x40, X3, X13
3320  	VPBLENDW $0xcc, X13, X14, X14
3321  	VMOVDQU  permuted_blake_consts<>+304(SB), X15
3322  	VPXOR    X14, X15, X15
3323  	VPADDD   X8, X7, X7
3324  	VPXOR    X7, X10, X10
3325  	VPSHUFB  X5, X10, X10
3326  	VPADDD   X10, X9, X9
3327  	VPXOR    X9, X8, X8
3328  	VPSRLD   $0x0c, X8, X13
3329  	VPSLLD   $0x14, X8, X8
3330  	VPXOR    X13, X8, X8
3331  	VPADDD   X15, X7, X7
3332  	VPADDD   X8, X7, X7
3333  	VPXOR    X7, X10, X10
3334  	VPSHUFB  X4, X10, X10
3335  	VPADDD   X10, X9, X9
3336  	VPXOR    X9, X8, X8
3337  	VPSRLD   $0x07, X8, X13
3338  	VPSLLD   $0x19, X8, X8
3339  	VPXOR    X13, X8, X8
3340  
3341  	// Round 5 diagonal step part 3: undiagonalize.
3342  	VPSHUFD $0x93, X8, X8
3343  	VPSHUFD $0x4e, X9, X9
3344  	VPSHUFD $0x39, X10, X10
3345  
3346  	// Round 6 column step.
3347  	VPSHUFD  $0x02, X0, X14
3348  	VPSHUFD  $0x08, X1, X13
3349  	VPBLENDW $0x0c, X13, X14, X14
3350  	VPSHUFD  $0x00, X2, X13
3351  	VPBLENDW $0xc0, X13, X14, X14
3352  	VMOVDQU  permuted_blake_consts<>+320(SB), X15
3353  	VPXOR    X14, X15, X15
3354  	VPADDD   X15, X7, X7
3355  	VPSHUFD  $0xc0, X0, X14
3356  	VPSHUFD  $0x38, X2, X13
3357  	VPBLENDW $0x3c, X13, X14, X14
3358  	VPSHUFD  $0x00, X3, X13
3359  	VPBLENDW $0x03, X13, X14, X14
3360  	VMOVDQU  permuted_blake_consts<>+336(SB), X15
3361  	VPXOR    X14, X15, X15
3362  	VPADDD   X8, X7, X7
3363  	VPXOR    X7, X10, X10
3364  	VPSHUFB  X5, X10, X10
3365  	VPADDD   X10, X9, X9
3366  	VPXOR    X9, X8, X8
3367  	VPSRLD   $0x0c, X8, X13
3368  	VPSLLD   $0x14, X8, X8
3369  	VPXOR    X13, X8, X8
3370  	VPADDD   X15, X7, X7
3371  	VPADDD   X8, X7, X7
3372  	VPXOR    X7, X10, X10
3373  	VPSHUFB  X4, X10, X10
3374  	VPADDD   X10, X9, X9
3375  	VPXOR    X9, X8, X8
3376  	VPSRLD   $0x07, X8, X13
3377  	VPSLLD   $0x19, X8, X8
3378  	VPXOR    X13, X8, X8
3379  
3380  	// Round 6 diagonal step part 1: diagonalize.
3381  	VPSHUFD $0x39, X8, X8
3382  	VPSHUFD $0x4e, X9, X9
3383  	VPSHUFD $0x93, X10, X10
3384  
3385  	// Round 6 diagonal step part 2: column step.
3386  	VPSHUFD  $0x40, X0, X14
3387  	VPSHUFD  $0x0c, X1, X13
3388  	VPBLENDW $0x0f, X13, X14, X14
3389  	VPSHUFD  $0x30, X3, X13
3390  	VPBLENDW $0x30, X13, X14, X14
3391  	VMOVDQU  permuted_blake_consts<>+352(SB), X15
3392  	VPXOR    X14, X15, X15
3393  	VPADDD   X15, X7, X7
3394  	VPSHUFD  $0x04, X1, X14
3395  	VPSHUFD  $0x40, X2, X13
3396  	VPBLENDW $0xc0, X13, X14, X14
3397  	VPSHUFD  $0x21, X3, X13
3398  	VPBLENDW $0x33, X13, X14, X14
3399  	VMOVDQU  permuted_blake_consts<>+368(SB), X15
3400  	VPXOR    X14, X15, X15
3401  	VPADDD   X8, X7, X7
3402  	VPXOR    X7, X10, X10
3403  	VPSHUFB  X5, X10, X10
3404  	VPADDD   X10, X9, X9
3405  	VPXOR    X9, X8, X8
3406  	VPSRLD   $0x0c, X8, X13
3407  	VPSLLD   $0x14, X8, X8
3408  	VPXOR    X13, X8, X8
3409  	VPADDD   X15, X7, X7
3410  	VPADDD   X8, X7, X7
3411  	VPXOR    X7, X10, X10
3412  	VPSHUFB  X4, X10, X10
3413  	VPADDD   X10, X9, X9
3414  	VPXOR    X9, X8, X8
3415  	VPSRLD   $0x07, X8, X13
3416  	VPSLLD   $0x19, X8, X8
3417  	VPXOR    X13, X8, X8
3418  
3419  	// Round 6 diagonal step part 3: undiagonalize.
3420  	VPSHUFD $0x93, X8, X8
3421  	VPSHUFD $0x4e, X9, X9
3422  	VPSHUFD $0x39, X10, X10
3423  
3424  	// Round 7 column step.
3425  	VPSHUFD  $0x04, X0, X14
3426  	VPSHUFD  $0x00, X1, X13
3427  	VPBLENDW $0xc0, X13, X14, X14
3428  	VPSHUFD  $0x20, X3, X13
3429  	VPBLENDW $0x33, X13, X14, X14
3430  	VMOVDQU  permuted_blake_consts<>+384(SB), X15
3431  	VPXOR    X14, X15, X15
3432  	VPADDD   X15, X7, X7
3433  	VPSHUFD  $0x01, X1, X14
3434  	VPSHUFD  $0x80, X2, X13
3435  	VPBLENDW $0xc0, X13, X14, X14
3436  	VPSHUFD  $0x1c, X3, X13
3437  	VPBLENDW $0x3c, X13, X14, X14
3438  	VMOVDQU  permuted_blake_consts<>+400(SB), X15
3439  	VPXOR    X14, X15, X15
3440  	VPADDD   X8, X7, X7
3441  	VPXOR    X7, X10, X10
3442  	VPSHUFB  X5, X10, X10
3443  	VPADDD   X10, X9, X9
3444  	VPXOR    X9, X8, X8
3445  	VPSRLD   $0x0c, X8, X13
3446  	VPSLLD   $0x14, X8, X8
3447  	VPXOR    X13, X8, X8
3448  	VPADDD   X15, X7, X7
3449  	VPADDD   X8, X7, X7
3450  	VPXOR    X7, X10, X10
3451  	VPSHUFB  X4, X10, X10
3452  	VPADDD   X10, X9, X9
3453  	VPXOR    X9, X8, X8
3454  	VPSRLD   $0x07, X8, X13
3455  	VPSLLD   $0x19, X8, X8
3456  	VPXOR    X13, X8, X8
3457  
3458  	// Round 7 diagonal step part 1: diagonalize.
3459  	VPSHUFD $0x39, X8, X8
3460  	VPSHUFD $0x4e, X9, X9
3461  	VPSHUFD $0x93, X10, X10
3462  
3463  	// Round 7 diagonal step part 2: column step.
3464  	VPSHUFD  $0x00, X0, X14
3465  	VPSHUFD  $0x08, X1, X13
3466  	VPBLENDW $0x0c, X13, X14, X14
3467  	VPSHUFD  $0x10, X2, X13
3468  	VPBLENDW $0xf0, X13, X14, X14
3469  	VMOVDQU  permuted_blake_consts<>+416(SB), X15
3470  	VPXOR    X14, X15, X15
3471  	VPADDD   X15, X7, X7
3472  	VPSHUFD  $0x2c, X0, X14
3473  	VPSHUFD  $0x03, X1, X13
3474  	VPBLENDW $0x03, X13, X14, X14
3475  	VPSHUFD  $0xc0, X2, X13
3476  	VPBLENDW $0xc0, X13, X14, X14
3477  	VMOVDQU  permuted_blake_consts<>+432(SB), X15
3478  	VPXOR    X14, X15, X15
3479  	VPADDD   X8, X7, X7
3480  	VPXOR    X7, X10, X10
3481  	VPSHUFB  X5, X10, X10
3482  	VPADDD   X10, X9, X9
3483  	VPXOR    X9, X8, X8
3484  	VPSRLD   $0x0c, X8, X13
3485  	VPSLLD   $0x14, X8, X8
3486  	VPXOR    X13, X8, X8
3487  	VPADDD   X15, X7, X7
3488  	VPADDD   X8, X7, X7
3489  	VPXOR    X7, X10, X10
3490  	VPSHUFB  X4, X10, X10
3491  	VPADDD   X10, X9, X9
3492  	VPXOR    X9, X8, X8
3493  	VPSRLD   $0x07, X8, X13
3494  	VPSLLD   $0x19, X8, X8
3495  	VPXOR    X13, X8, X8
3496  
3497  	// Round 7 diagonal step part 3: undiagonalize.
3498  	VPSHUFD $0x93, X8, X8
3499  	VPSHUFD $0x4e, X9, X9
3500  	VPSHUFD $0x39, X10, X10
3501  
3502  	// Round 8 column step.
3503  	VPSHUFD  $0xc0, X0, X14
3504  	VPSHUFD  $0x0c, X1, X13
3505  	VPBLENDW $0x0c, X13, X14, X14
3506  	VPSHUFD  $0x01, X3, X13
3507  	VPBLENDW $0x33, X13, X14, X14
3508  	VMOVDQU  permuted_blake_consts<>+448(SB), X15
3509  	VPXOR    X14, X15, X15
3510  	VPADDD   X15, X7, X7
3511  	VPSHUFD  $0x10, X0, X14
3512  	VPSHUFD  $0x43, X2, X13
3513  	VPBLENDW $0xc3, X13, X14, X14
3514  	VPSHUFD  $0x08, X3, X13
3515  	VPBLENDW $0x0c, X13, X14, X14
3516  	VMOVDQU  permuted_blake_consts<>+464(SB), X15
3517  	VPXOR    X14, X15, X15
3518  	VPADDD   X8, X7, X7
3519  	VPXOR    X7, X10, X10
3520  	VPSHUFB  X5, X10, X10
3521  	VPADDD   X10, X9, X9
3522  	VPXOR    X9, X8, X8
3523  	VPSRLD   $0x0c, X8, X13
3524  	VPSLLD   $0x14, X8, X8
3525  	VPXOR    X13, X8, X8
3526  	VPADDD   X15, X7, X7
3527  	VPADDD   X8, X7, X7
3528  	VPXOR    X7, X10, X10
3529  	VPSHUFB  X4, X10, X10
3530  	VPADDD   X10, X9, X9
3531  	VPXOR    X9, X8, X8
3532  	VPSRLD   $0x07, X8, X13
3533  	VPSLLD   $0x19, X8, X8
3534  	VPXOR    X13, X8, X8
3535  
3536  	// Round 8 diagonal step part 1: diagonalize.
3537  	VPSHUFD $0x39, X8, X8
3538  	VPSHUFD $0x4e, X9, X9
3539  	VPSHUFD $0x93, X10, X10
3540  
3541  	// Round 8 diagonal step part 2: column step.
3542  	VPSHUFD  $0x80, X0, X14
3543  	VPSHUFD  $0x01, X1, X13
3544  	VPBLENDW $0x03, X13, X14, X14
3545  	VPSHUFD  $0x00, X2, X13
3546  	VPBLENDW $0x30, X13, X14, X14
3547  	VPSHUFD  $0x0c, X3, X13
3548  	VPBLENDW $0x0c, X13, X14, X14
3549  	VMOVDQU  permuted_blake_consts<>+480(SB), X15
3550  	VPXOR    X14, X15, X15
3551  	VPADDD   X15, X7, X7
3552  	VPSHUFD  $0x00, X0, X14
3553  	VPSHUFD  $0x20, X1, X13
3554  	VPBLENDW $0x3c, X13, X14, X14
3555  	VPSHUFD  $0x80, X2, X13
3556  	VPBLENDW $0xc0, X13, X14, X14
3557  	VMOVDQU  permuted_blake_consts<>+496(SB), X15
3558  	VPXOR    X14, X15, X15
3559  	VPADDD   X8, X7, X7
3560  	VPXOR    X7, X10, X10
3561  	VPSHUFB  X5, X10, X10
3562  	VPADDD   X10, X9, X9
3563  	VPXOR    X9, X8, X8
3564  	VPSRLD   $0x0c, X8, X13
3565  	VPSLLD   $0x14, X8, X8
3566  	VPXOR    X13, X8, X8
3567  	VPADDD   X15, X7, X7
3568  	VPADDD   X8, X7, X7
3569  	VPXOR    X7, X10, X10
3570  	VPSHUFB  X4, X10, X10
3571  	VPADDD   X10, X9, X9
3572  	VPXOR    X9, X8, X8
3573  	VPSRLD   $0x07, X8, X13
3574  	VPSLLD   $0x19, X8, X8
3575  	VPXOR    X13, X8, X8
3576  
3577  	// Round 8 diagonal step part 3: undiagonalize.
3578  	VPSHUFD $0x93, X8, X8
3579  	VPSHUFD $0x4e, X9, X9
3580  	VPSHUFD $0x39, X10, X10
3581  
3582  	// Round 9 column step.
3583  	VPSHUFD  $0x00, X0, X14
3584  	VPSHUFD  $0x02, X1, X13
3585  	VPBLENDW $0x03, X13, X14, X14
3586  	VPSHUFD  $0x30, X2, X13
3587  	VPBLENDW $0x30, X13, X14, X14
3588  	VPSHUFD  $0x08, X3, X13
3589  	VPBLENDW $0x0c, X13, X14, X14
3590  	VMOVDQU  permuted_blake_consts<>+512(SB), X15
3591  	VPXOR    X14, X15, X15
3592  	VPADDD   X15, X7, X7
3593  	VPSHUFD  $0x30, X0, X14
3594  	VPSHUFD  $0x04, X2, X13
3595  	VPBLENDW $0xcc, X13, X14, X14
3596  	VPSHUFD  $0x03, X3, X13
3597  	VPBLENDW $0x03, X13, X14, X14
3598  	VMOVDQU  permuted_blake_consts<>+528(SB), X15
3599  	VPXOR    X14, X15, X15
3600  	VPADDD   X8, X7, X7
3601  	VPXOR    X7, X10, X10
3602  	VPSHUFB  X5, X10, X10
3603  	VPADDD   X10, X9, X9
3604  	VPXOR    X9, X8, X8
3605  	VPSRLD   $0x0c, X8, X13
3606  	VPSLLD   $0x14, X8, X8
3607  	VPXOR    X13, X8, X8
3608  	VPADDD   X15, X7, X7
3609  	VPADDD   X8, X7, X7
3610  	VPXOR    X7, X10, X10
3611  	VPSHUFB  X4, X10, X10
3612  	VPADDD   X10, X9, X9
3613  	VPXOR    X9, X8, X8
3614  	VPSRLD   $0x07, X8, X13
3615  	VPSLLD   $0x19, X8, X8
3616  	VPXOR    X13, X8, X8
3617  
3618  	// Round 9 diagonal step part 1: diagonalize.
3619  	VPSHUFD $0x39, X8, X8
3620  	VPSHUFD $0x4e, X9, X9
3621  	VPSHUFD $0x93, X10, X10
3622  
3623  	// Round 9 diagonal step part 2: column step.
3624  	VPSHUFD  $0x10, X0, X14
3625  	VPSHUFD  $0x80, X2, X13
3626  	VPBLENDW $0xc0, X13, X14, X14
3627  	VPSHUFD  $0x04, X3, X13
3628  	VPBLENDW $0x0f, X13, X14, X14
3629  	VMOVDQU  permuted_blake_consts<>+544(SB), X15
3630  	VPXOR    X14, X15, X15
3631  	VPADDD   X15, X7, X7
3632  	VPSHUFD  $0x02, X0, X14
3633  	VPSHUFD  $0x4c, X1, X13
3634  	VPBLENDW $0xfc, X13, X14, X14
3635  	VMOVDQU  permuted_blake_consts<>+560(SB), X15
3636  	VPXOR    X14, X15, X15
3637  	VPADDD   X8, X7, X7
3638  	VPXOR    X7, X10, X10
3639  	VPSHUFB  X5, X10, X10
3640  	VPADDD   X10, X9, X9
3641  	VPXOR    X9, X8, X8
3642  	VPSRLD   $0x0c, X8, X13
3643  	VPSLLD   $0x14, X8, X8
3644  	VPXOR    X13, X8, X8
3645  	VPADDD   X15, X7, X7
3646  	VPADDD   X8, X7, X7
3647  	VPXOR    X7, X10, X10
3648  	VPSHUFB  X4, X10, X10
3649  	VPADDD   X10, X9, X9
3650  	VPXOR    X9, X8, X8
3651  	VPSRLD   $0x07, X8, X13
3652  	VPSLLD   $0x19, X8, X8
3653  	VPXOR    X13, X8, X8
3654  
3655  	// Round 9 diagonal step part 3: undiagonalize.
3656  	VPSHUFD $0x93, X8, X8
3657  	VPSHUFD $0x4e, X9, X9
3658  	VPSHUFD $0x39, X10, X10
3659  
3660  	// Round 10 column step.
3661  	VPSHUFD  $0x40, X0, X14
3662  	VPSHUFD  $0x30, X1, X13
3663  	VPBLENDW $0x30, X13, X14, X14
3664  	VPSHUFD  $0x02, X2, X13
3665  	VPBLENDW $0x0f, X13, X14, X14
3666  	VMOVDQU  permuted_blake_consts<>+576(SB), X15
3667  	VPXOR    X14, X15, X15
3668  	VPADDD   X15, X7, X7
3669  	VPSHUFD  $0x02, X0, X14
3670  	VPSHUFD  $0x60, X1, X13
3671  	VPBLENDW $0xfc, X13, X14, X14
3672  	VMOVDQU  permuted_blake_consts<>+592(SB), X15
3673  	VPXOR    X14, X15, X15
3674  	VPADDD   X8, X7, X7
3675  	VPXOR    X7, X10, X10
3676  	VPSHUFB  X5, X10, X10
3677  	VPADDD   X10, X9, X9
3678  	VPXOR    X9, X8, X8
3679  	VPSRLD   $0x0c, X8, X13
3680  	VPSLLD   $0x14, X8, X8
3681  	VPXOR    X13, X8, X8
3682  	VPADDD   X15, X7, X7
3683  	VPADDD   X8, X7, X7
3684  	VPXOR    X7, X10, X10
3685  	VPSHUFB  X4, X10, X10
3686  	VPADDD   X10, X9, X9
3687  	VPXOR    X9, X8, X8
3688  	VPSRLD   $0x07, X8, X13
3689  	VPSLLD   $0x19, X8, X8
3690  	VPXOR    X13, X8, X8
3691  
3692  	// Round 10 diagonal step part 1: diagonalize.
3693  	VPSHUFD $0x39, X8, X8
3694  	VPSHUFD $0x4e, X9, X9
3695  	VPSHUFD $0x93, X10, X10
3696  
3697  	// Round 10 diagonal step part 2: column step.
3698  	VPSHUFD  $0x30, X0, X14
3699  	VPSHUFD  $0x04, X2, X13
3700  	VPBLENDW $0x0c, X13, X14, X14
3701  	VPSHUFD  $0x43, X3, X13
3702  	VPBLENDW $0xc3, X13, X14, X14
3703  	VMOVDQU  permuted_blake_consts<>+608(SB), X15
3704  	VPXOR    X14, X15, X15
3705  	VPADDD   X15, X7, X7
3706  	VPSHUFD  $0x00, X0, X14
3707  	VPSHUFD  $0x03, X2, X13
3708  	VPBLENDW $0x03, X13, X14, X14
3709  	VPSHUFD  $0x08, X3, X13
3710  	VPBLENDW $0x3c, X13, X14, X14
3711  	VMOVDQU  permuted_blake_consts<>+624(SB), X15
3712  	VPXOR    X14, X15, X15
3713  	VPADDD   X8, X7, X7
3714  	VPXOR    X7, X10, X10
3715  	VPSHUFB  X5, X10, X10
3716  	VPADDD   X10, X9, X9
3717  	VPXOR    X9, X8, X8
3718  	VPSRLD   $0x0c, X8, X13
3719  	VPSLLD   $0x14, X8, X8
3720  	VPXOR    X13, X8, X8
3721  	VPADDD   X15, X7, X7
3722  	VPADDD   X8, X7, X7
3723  	VPXOR    X7, X10, X10
3724  	VPSHUFB  X4, X10, X10
3725  	VPADDD   X10, X9, X9
3726  	VPXOR    X9, X8, X8
3727  	VPSRLD   $0x07, X8, X13
3728  	VPSLLD   $0x19, X8, X8
3729  	VPXOR    X13, X8, X8
3730  
3731  	// Round 10 diagonal step part 3: undiagonalize.
3732  	VPSHUFD $0x93, X8, X8
3733  	VPSHUFD $0x4e, X9, X9
3734  	VPSHUFD $0x39, X10, X10
3735  
3736  	// Round 11 column step.
3737  	VPSHUFD  $0x08, X0, X14
3738  	VPSHUFD  $0x80, X1, X13
3739  	VPBLENDW $0xf0, X13, X14, X14
3740  	VMOVDQU  permuted_blake_consts<>+0(SB), X15
3741  	VPXOR    X14, X15, X15
3742  	VPADDD   X15, X7, X7
3743  	VPSHUFD  $0x0d, X0, X14
3744  	VPSHUFD  $0xd0, X1, X13
3745  	VPBLENDW $0xf0, X13, X14, X14
3746  	VMOVDQU  permuted_blake_consts<>+16(SB), X15
3747  	VPXOR    X14, X15, X15
3748  	VPADDD   X8, X7, X7
3749  	VPXOR    X7, X10, X10
3750  	VPSHUFB  X5, X10, X10
3751  	VPADDD   X10, X9, X9
3752  	VPXOR    X9, X8, X8
3753  	VPSRLD   $0x0c, X8, X13
3754  	VPSLLD   $0x14, X8, X8
3755  	VPXOR    X13, X8, X8
3756  	VPADDD   X15, X7, X7
3757  	VPADDD   X8, X7, X7
3758  	VPXOR    X7, X10, X10
3759  	VPSHUFB  X4, X10, X10
3760  	VPADDD   X10, X9, X9
3761  	VPXOR    X9, X8, X8
3762  	VPSRLD   $0x07, X8, X13
3763  	VPSLLD   $0x19, X8, X8
3764  	VPXOR    X13, X8, X8
3765  
3766  	// Round 11 diagonal step part 1: diagonalize.
3767  	VPSHUFD $0x39, X8, X8
3768  	VPSHUFD $0x4e, X9, X9
3769  	VPSHUFD $0x93, X10, X10
3770  
3771  	// Round 11 diagonal step part 2: column step.
3772  	VPSHUFD  $0x08, X2, X14
3773  	VPSHUFD  $0x80, X3, X13
3774  	VPBLENDW $0xf0, X13, X14, X14
3775  	VMOVDQU  permuted_blake_consts<>+32(SB), X15
3776  	VPXOR    X14, X15, X15
3777  	VPADDD   X15, X7, X7
3778  	VPSHUFD  $0x0d, X2, X14
3779  	VPSHUFD  $0xd0, X3, X13
3780  	VPBLENDW $0xf0, X13, X14, X14
3781  	VMOVDQU  permuted_blake_consts<>+48(SB), X15
3782  	VPXOR    X14, X15, X15
3783  	VPADDD   X8, X7, X7
3784  	VPXOR    X7, X10, X10
3785  	VPSHUFB  X5, X10, X10
3786  	VPADDD   X10, X9, X9
3787  	VPXOR    X9, X8, X8
3788  	VPSRLD   $0x0c, X8, X13
3789  	VPSLLD   $0x14, X8, X8
3790  	VPXOR    X13, X8, X8
3791  	VPADDD   X15, X7, X7
3792  	VPADDD   X8, X7, X7
3793  	VPXOR    X7, X10, X10
3794  	VPSHUFB  X4, X10, X10
3795  	VPADDD   X10, X9, X9
3796  	VPXOR    X9, X8, X8
3797  	VPSRLD   $0x07, X8, X13
3798  	VPSLLD   $0x19, X8, X8
3799  	VPXOR    X13, X8, X8
3800  
3801  	// Round 11 diagonal step part 3: undiagonalize.
3802  	VPSHUFD $0x93, X8, X8
3803  	VPSHUFD $0x4e, X9, X9
3804  	VPSHUFD $0x39, X10, X10
3805  
3806  	// Round 12 column step.
3807  	VPSHUFD  $0x00, X1, X14
3808  	VPSHUFD  $0x10, X2, X13
3809  	VPBLENDW $0x30, X13, X14, X14
3810  	VPSHUFD  $0x42, X3, X13
3811  	VPBLENDW $0xc3, X13, X14, X14
3812  	VMOVDQU  permuted_blake_consts<>+64(SB), X15
3813  	VPXOR    X14, X15, X15
3814  	VPADDD   X15, X7, X7
3815  	VPSHUFD  $0x80, X1, X14
3816  	VPSHUFD  $0x02, X2, X13
3817  	VPBLENDW $0x0f, X13, X14, X14
3818  	VPSHUFD  $0x30, X3, X13
3819  	VPBLENDW $0x30, X13, X14, X14
3820  	VMOVDQU  permuted_blake_consts<>+80(SB), X15
3821  	VPXOR    X14, X15, X15
3822  	VPADDD   X8, X7, X7
3823  	VPXOR    X7, X10, X10
3824  	VPSHUFB  X5, X10, X10
3825  	VPADDD   X10, X9, X9
3826  	VPXOR    X9, X8, X8
3827  	VPSRLD   $0x0c, X8, X13
3828  	VPSLLD   $0x14, X8, X8
3829  	VPXOR    X13, X8, X8
3830  	VPADDD   X15, X7, X7
3831  	VPADDD   X8, X7, X7
3832  	VPXOR    X7, X10, X10
3833  	VPSHUFB  X4, X10, X10
3834  	VPADDD   X10, X9, X9
3835  	VPXOR    X9, X8, X8
3836  	VPSRLD   $0x07, X8, X13
3837  	VPSLLD   $0x19, X8, X8
3838  	VPXOR    X13, X8, X8
3839  
3840  	// Round 12 diagonal step part 1: diagonalize.
3841  	VPSHUFD $0x39, X8, X8
3842  	VPSHUFD $0x4e, X9, X9
3843  	VPSHUFD $0x93, X10, X10
3844  
3845  	// Round 12 diagonal step part 2: column step.
3846  	VPSHUFD  $0x01, X0, X14
3847  	VPSHUFD  $0x40, X1, X13
3848  	VPBLENDW $0xc0, X13, X14, X14
3849  	VPSHUFD  $0x30, X2, X13
3850  	VPBLENDW $0x30, X13, X14, X14
3851  	VMOVDQU  permuted_blake_consts<>+96(SB), X15
3852  	VPXOR    X14, X15, X15
3853  	VPADDD   X15, X7, X7
3854  	VPSHUFD  $0xc8, X0, X14
3855  	VPSHUFD  $0x30, X1, X13
3856  	VPBLENDW $0x30, X13, X14, X14
3857  	VPSHUFD  $0x00, X3, X13
3858  	VPBLENDW $0x03, X13, X14, X14
3859  	VMOVDQU  permuted_blake_consts<>+112(SB), X15
3860  	VPXOR    X14, X15, X15
3861  	VPADDD   X8, X7, X7
3862  	VPXOR    X7, X10, X10
3863  	VPSHUFB  X5, X10, X10
3864  	VPADDD   X10, X9, X9
3865  	VPXOR    X9, X8, X8
3866  	VPSRLD   $0x0c, X8, X13
3867  	VPSLLD   $0x14, X8, X8
3868  	VPXOR    X13, X8, X8
3869  	VPADDD   X15, X7, X7
3870  	VPADDD   X8, X7, X7
3871  	VPXOR    X7, X10, X10
3872  	VPSHUFB  X4, X10, X10
3873  	VPADDD   X10, X9, X9
3874  	VPXOR    X9, X8, X8
3875  	VPSRLD   $0x07, X8, X13
3876  	VPSLLD   $0x19, X8, X8
3877  	VPXOR    X13, X8, X8
3878  
3879  	// Round 12 diagonal step part 3: undiagonalize.
3880  	VPSHUFD $0x93, X8, X8
3881  	VPSHUFD $0x4e, X9, X9
3882  	VPSHUFD $0x39, X10, X10
3883  
3884  	// Round 13 column step.
3885  	VPSHUFD  $0x10, X1, X14
3886  	VPSHUFD  $0x03, X2, X13
3887  	VPBLENDW $0x03, X13, X14, X14
3888  	VPSHUFD  $0xc0, X3, X13
3889  	VPBLENDW $0xcc, X13, X14, X14
3890  	VMOVDQU  permuted_blake_consts<>+128(SB), X15
3891  	VPXOR    X14, X15, X15
3892  	VPADDD   X15, X7, X7
3893  	VPSHUFD  $0x20, X0, X14
3894  	VPSHUFD  $0x00, X2, X13
3895  	VPBLENDW $0x03, X13, X14, X14
3896  	VPSHUFD  $0x40, X3, X13
3897  	VPBLENDW $0xc0, X13, X14, X14
3898  	VMOVDQU  permuted_blake_consts<>+144(SB), X15
3899  	VPXOR    X14, X15, X15
3900  	VPADDD   X8, X7, X7
3901  	VPXOR    X7, X10, X10
3902  	VPSHUFB  X5, X10, X10
3903  	VPADDD   X10, X9, X9
3904  	VPXOR    X9, X8, X8
3905  	VPSRLD   $0x0c, X8, X13
3906  	VPSLLD   $0x14, X8, X8
3907  	VPXOR    X13, X8, X8
3908  	VPADDD   X15, X7, X7
3909  	VPADDD   X8, X7, X7
3910  	VPXOR    X7, X10, X10
3911  	VPSHUFB  X4, X10, X10
3912  	VPADDD   X10, X9, X9
3913  	VPXOR    X9, X8, X8
3914  	VPSRLD   $0x07, X8, X13
3915  	VPSLLD   $0x19, X8, X8
3916  	VPXOR    X13, X8, X8
3917  
3918  	// Round 13 diagonal step part 1: diagonalize.
3919  	VPSHUFD $0x39, X8, X8
3920  	VPSHUFD $0x4e, X9, X9
3921  	VPSHUFD $0x93, X10, X10
3922  
3923  	// Round 13 diagonal step part 2: column step.
3924  	VPSHUFD  $0x0c, X0, X14
3925  	VPSHUFD  $0x30, X1, X13
3926  	VPBLENDW $0x30, X13, X14, X14
3927  	VPSHUFD  $0x42, X2, X13
3928  	VPBLENDW $0xc3, X13, X14, X14
3929  	VMOVDQU  permuted_blake_consts<>+160(SB), X15
3930  	VPXOR    X14, X15, X15
3931  	VPADDD   X15, X7, X7
3932  	VPSHUFD  $0x10, X0, X14
3933  	VPSHUFD  $0x08, X1, X13
3934  	VPBLENDW $0xcc, X13, X14, X14
3935  	VPSHUFD  $0x02, X3, X13
3936  	VPBLENDW $0x03, X13, X14, X14
3937  	VMOVDQU  permuted_blake_consts<>+176(SB), X15
3938  	VPXOR    X14, X15, X15
3939  	VPADDD   X8, X7, X7
3940  	VPXOR    X7, X10, X10
3941  	VPSHUFB  X5, X10, X10
3942  	VPADDD   X10, X9, X9
3943  	VPXOR    X9, X8, X8
3944  	VPSRLD   $0x0c, X8, X13
3945  	VPSLLD   $0x14, X8, X8
3946  	VPXOR    X13, X8, X8
3947  	VPADDD   X15, X7, X7
3948  	VPADDD   X8, X7, X7
3949  	VPXOR    X7, X10, X10
3950  	VPSHUFB  X4, X10, X10
3951  	VPADDD   X10, X9, X9
3952  	VPXOR    X9, X8, X8
3953  	VPSRLD   $0x07, X8, X13
3954  	VPSLLD   $0x19, X8, X8
3955  	VPXOR    X13, X8, X8
3956  
3957  	// Round 13 diagonal step part 3: undiagonalize.
3958  	VPSHUFD $0x93, X8, X8
3959  	VPSHUFD $0x4e, X9, X9
3960  	VPSHUFD $0x39, X10, X10
3961  
3962  	// Round 14 column step.
3963  	VPSHUFD  $0x0c, X0, X14
3964  	VPSHUFD  $0x03, X1, X13
3965  	VPBLENDW $0x03, X13, X14, X14
3966  	VPSHUFD  $0xc0, X2, X13
3967  	VPBLENDW $0xc0, X13, X14, X14
3968  	VPSHUFD  $0x10, X3, X13
3969  	VPBLENDW $0x30, X13, X14, X14
3970  	VMOVDQU  permuted_blake_consts<>+192(SB), X15
3971  	VPXOR    X14, X15, X15
3972  	VPADDD   X15, X7, X7
3973  	VPSHUFD  $0x04, X0, X14
3974  	VPSHUFD  $0x01, X2, X13
3975  	VPBLENDW $0x03, X13, X14, X14
3976  	VPSHUFD  $0x80, X3, X13
3977  	VPBLENDW $0xf0, X13, X14, X14
3978  	VMOVDQU  permuted_blake_consts<>+208(SB), X15
3979  	VPXOR    X14, X15, X15
3980  	VPADDD   X8, X7, X7
3981  	VPXOR    X7, X10, X10
3982  	VPSHUFB  X5, X10, X10
3983  	VPADDD   X10, X9, X9
3984  	VPXOR    X9, X8, X8
3985  	VPSRLD   $0x0c, X8, X13
3986  	VPSLLD   $0x14, X8, X8
3987  	VPXOR    X13, X8, X8
3988  	VPADDD   X15, X7, X7
3989  	VPADDD   X8, X7, X7
3990  	VPXOR    X7, X10, X10
3991  	VPSHUFB  X4, X10, X10
3992  	VPADDD   X10, X9, X9
3993  	VPXOR    X9, X8, X8
3994  	VPSRLD   $0x07, X8, X13
3995  	VPSLLD   $0x19, X8, X8
3996  	VPXOR    X13, X8, X8
3997  
3998  	// Round 14 diagonal step part 1: diagonalize.
3999  	VPSHUFD $0x39, X8, X8
4000  	VPSHUFD $0x4e, X9, X9
4001  	VPSHUFD $0x93, X10, X10
4002  
4003  	// Round 14 diagonal step part 2: column step.
4004  	VPSHUFD  $0x02, X0, X14
4005  	VPSHUFD  $0x04, X1, X13
4006  	VPBLENDW $0x3c, X13, X14, X14
4007  	VPSHUFD  $0xc0, X3, X13
4008  	VPBLENDW $0xc0, X13, X14, X14
4009  	VMOVDQU  permuted_blake_consts<>+224(SB), X15
4010  	VPXOR    X14, X15, X15
4011  	VPADDD   X15, X7, X7
4012  	VPSHUFD  $0x00, X0, X14
4013  	VPSHUFD  $0x02, X1, X13
4014  	VPBLENDW $0x03, X13, X14, X14
4015  	VPSHUFD  $0x08, X2, X13
4016  	VPBLENDW $0xcc, X13, X14, X14
4017  	VMOVDQU  permuted_blake_consts<>+240(SB), X15
4018  	VPXOR    X14, X15, X15
4019  	VPADDD   X8, X7, X7
4020  	VPXOR    X7, X10, X10
4021  	VPSHUFB  X5, X10, X10
4022  	VPADDD   X10, X9, X9
4023  	VPXOR    X9, X8, X8
4024  	VPSRLD   $0x0c, X8, X13
4025  	VPSLLD   $0x14, X8, X8
4026  	VPXOR    X13, X8, X8
4027  	VPADDD   X15, X7, X7
4028  	VPADDD   X8, X7, X7
4029  	VPXOR    X7, X10, X10
4030  	VPSHUFB  X4, X10, X10
4031  	VPADDD   X10, X9, X9
4032  	VPXOR    X9, X8, X8
4033  	VPSRLD   $0x07, X8, X13
4034  	VPSLLD   $0x19, X8, X8
4035  	VPXOR    X13, X8, X8
4036  
4037  	// Round 14 diagonal step part 3: undiagonalize.
4038  	VPSHUFD $0x93, X8, X8
4039  	VPSHUFD $0x4e, X9, X9
4040  	VPSHUFD $0x39, X10, X10
4041  
4042  	// Finally the chain value is defined as:
4043  	// h'0 = h0^s0^v0^v8
4044  	// h'1 = h1^s1^v1^v9
4045  	// h'2 = h2^s2^v2^va
4046  	// h'3 = h3^s3^v3^vb
4047  	// h'4 = h4^s0^v4^vc
4048  	// h'5 = h5^s1^v5^vd
4049  	// h'6 = h6^s2^v6^ve
4050  	// h'7 = h7^s3^v7^vf
4051  	VPXOR X11, X7, X7
4052  	VPXOR X6, X7, X7
4053  	VPXOR X9, X7, X7
4054  	VPXOR X12, X8, X8
4055  	VPXOR X6, X8, X8
4056  	VPXOR X10, X8, X8
4057  
4058  	// Either terminate the loop when there are no more full blocks
4059  	// to compress or move the message pointer to the next block of
4060  	// bytes to compress, increment the message bits counter
4061  	// accordingly, and loop back around to compress it.
4062  	DECQ BX
4063  	JZ   done
4064  	LEAQ 64(DX), DX
4065  	ADDQ $0x00000200, CX
4066  	JMP  compressLoop
4067  
4068  done:
4069  	// Output the resulting chain value.
4070  	VMOVDQU X7, (AX)
4071  	VMOVDQU X8, 16(AX)
4072  	RET
4073