chacha20poly1305_amd64.s raw

   1  // Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT.
   2  
   3  //go:build gc && !purego
   4  
   5  #include "textflag.h"
   6  
   7  // func polyHashADInternal<>()
      //
      // polyHashADInternal absorbs a byte string into a 130-bit polynomial
      // accumulator. It takes no Go arguments and has no stack frame ($0);
      // everything is passed in registers:
      //
      //   In:   CX    = address of the data to absorb
      //         R9    = number of bytes at CX
      //         (BP)  = low 64 bits of the multiplier r
      //         8(BP) = high 64 bits of the multiplier r
      //               (the callers visible in this file store the clamped
      //               Poly1305 key there before calling)
      //   Out:  R10:R11:R12 = accumulator, low / middle / high limb. The
      //         accumulator is zeroed on entry, so any previous value is lost.
      //   May clobber: AX, DX, R8, R9, R13, R14, R15, CX and the flags.
      //
      // Each 16-byte block m updates the accumulator as
      //   acc = (acc + m + 2^128) * r   reduced modulo 2^130 - 5.
      // A trailing partial block is zero-extended to 16 bytes and absorbed the
      // same way. The result is only partially reduced; the finalize code later
      // in this file performs the final conditional subtraction.
   8  TEXT polyHashADInternal<>(SB), NOSPLIT, $0
   9  	// Hack: Must declare #define macros inside of a function due to Avo constraints
      	// The ROL, ROL8 and ROL16 macros defined here are not used by this
      	// function's own instructions; later code in this file invokes them.
  10  	// ROL rotates the uint32s in register R left by N bits, using temporary T.
  11  	#define ROL(N, R, T) \
  12  		MOVO R, T; \
  13  		PSLLL $(N), T; \
  14  		PSRLL $(32-(N)), R; \
  15  		PXOR T, R
  16  
  17  	// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed.
  18  	#ifdef GOAMD64_v2
  19  		#define ROL8(R, T) PSHUFB ·rol8<>(SB), R
  20  	#else
  21  		#define ROL8(R, T) ROL(8, R, T)
  22  	#endif
  23  
  24  	// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed.
  25  	#ifdef GOAMD64_v2
  26  		#define ROL16(R, T) PSHUFB ·rol16<>(SB), R
  27  	#else
  28  		#define ROL16(R, T) ROL(16, R, T)
  29  	#endif
      	// Start from a zero accumulator: R10 = low, R11 = middle, R12 = high limb.
  30  	XORQ  R10, R10
  31  	XORQ  R11, R11
  32  	XORQ  R12, R12
      	// Exactly 13 bytes take the straight-line path below; every other length
      	// goes through the generic loop at hashADLoop.
      	// NOTE(review): presumably this targets the 13-byte additional data of a
      	// TLS record - confirm against the generator source.
  33  	CMPQ  R9, $0x0d
  34  	JNE   hashADLoop
      	// Bytes 0..7 become the low limb.
  35  	MOVQ  (CX), R10
      	// Read bytes 5..12 (an 8-byte load that stays inside the 13-byte buffer),
      	// then shift out bytes 5..7 so that bytes 8..12 sit in the low 40 bits.
  36  	MOVQ  5(CX), R11
  37  	SHRQ  $0x18, R11
      	// High limb = 1: the 2^128 bit that is added on top of every block.
  38  	MOVQ  $0x00000001, R12
      	// Multiply acc (R10, R11, R12) by r ((BP), 8(BP)). The product is built in
      	// four limbs t0..t3 held in R13, R14, R15, R8.
      	// First row: r.lo * acc. R15 keeps a copy of r.lo for the IMULQ with the
      	// high accumulator limb, because MULQ overwrites AX and DX.
  39  	MOVQ  (BP), AX
  40  	MOVQ  AX, R15
  41  	MULQ  R10
  42  	MOVQ  AX, R13
  43  	MOVQ  DX, R14
  44  	MOVQ  (BP), AX
  45  	MULQ  R11
  46  	IMULQ R12, R15
  47  	ADDQ  AX, R14
  48  	ADCQ  DX, R15
      	// Second row: r.hi * acc, accumulated one limb higher. R8 keeps a copy of
      	// r.hi for the IMULQ; R10 is reused as a temporary for the carried-out
      	// high half of r.hi * acc.lo, since acc.lo is no longer needed.
  49  	MOVQ  8(BP), AX
  50  	MOVQ  AX, R8
  51  	MULQ  R10
  52  	ADDQ  AX, R14
  53  	ADCQ  $0x00, DX
  54  	MOVQ  DX, R10
  55  	MOVQ  8(BP), AX
  56  	MULQ  R11
  57  	ADDQ  AX, R15
  58  	ADCQ  $0x00, DX
  59  	IMULQ R12, R8
  60  	ADDQ  R10, R15
  61  	ADCQ  DX, R8
      	// Reduce modulo 2^130 - 5. The new accumulator starts as t0, t1 and the
      	// low two bits of t2 (everything below bit 130).
  62  	MOVQ  R13, R10
  63  	MOVQ  R14, R11
  64  	MOVQ  R15, R12
  65  	ANDQ  $0x03, R12
      	// Let c = (t3:t2) >> 2, i.e. everything from bit 130 upward. Since
      	// 2^130 = 5 (mod 2^130 - 5), 5*c is added back in two pieces:
      	//   R13:R14 = t2 with its low two bits cleared : t3   (= 4*c)
      	//   R15:R8  = (t3:t2) >> 2                            (= c)
      	// The three-operand SHRQ shifts R15 right, filling from the low bits of R8.
  66  	MOVQ  R15, R13
  67  	ANDQ  $-4, R13
  68  	MOVQ  R8, R14
  69  	SHRQ  $0x02, R8, R15
  70  	SHRQ  $0x02, R8
  71  	ADDQ  R13, R10
  72  	ADCQ  R14, R11
  73  	ADCQ  $0x00, R12
  74  	ADDQ  R15, R10
  75  	ADCQ  R8, R11
  76  	ADCQ  $0x00, R12
  77  	RET
  78  
  79  hashADLoop:
  80  	// Hash in 16 byte chunks
      	// Fewer than 16 bytes left (unsigned compare): go handle the tail.
  81  	CMPQ  R9, $0x10
  82  	JB    hashADTail
      	// acc += block + 2^128. The ADCQ $1 adds the 2^128 bit together with the
      	// carry out of the two 64-bit additions.
  83  	ADDQ  (CX), R10
  84  	ADCQ  8(CX), R11
  85  	ADCQ  $0x01, R12
      	// Advance the data pointer and the remaining length past this block.
  86  	LEAQ  16(CX), CX
  87  	SUBQ  $0x10, R9
      	// acc = acc * r, reduced modulo 2^130 - 5: the same multiply-and-reduce
      	// sequence as in the 13-byte path above (t0..t3 in R13, R14, R15, R8).
  88  	MOVQ  (BP), AX
  89  	MOVQ  AX, R15
  90  	MULQ  R10
  91  	MOVQ  AX, R13
  92  	MOVQ  DX, R14
  93  	MOVQ  (BP), AX
  94  	MULQ  R11
  95  	IMULQ R12, R15
  96  	ADDQ  AX, R14
  97  	ADCQ  DX, R15
  98  	MOVQ  8(BP), AX
  99  	MOVQ  AX, R8
 100  	MULQ  R10
 101  	ADDQ  AX, R14
 102  	ADCQ  $0x00, DX
 103  	MOVQ  DX, R10
 104  	MOVQ  8(BP), AX
 105  	MULQ  R11
 106  	ADDQ  AX, R15
 107  	ADCQ  $0x00, DX
 108  	IMULQ R12, R8
 109  	ADDQ  R10, R15
 110  	ADCQ  DX, R8
      	// Reduction: keep the bits below 2^130, then add 4*c + c for the part c
      	// at and above bit 130.
 111  	MOVQ  R13, R10
 112  	MOVQ  R14, R11
 113  	MOVQ  R15, R12
 114  	ANDQ  $0x03, R12
 115  	MOVQ  R15, R13
 116  	ANDQ  $-4, R13
 117  	MOVQ  R8, R14
 118  	SHRQ  $0x02, R8, R15
 119  	SHRQ  $0x02, R8
 120  	ADDQ  R13, R10
 121  	ADCQ  R14, R11
 122  	ADCQ  $0x00, R12
 123  	ADDQ  R15, R10
 124  	ADCQ  R8, R11
 125  	ADCQ  $0x00, R12
 126  	JMP   hashADLoop
 127  
 128  hashADTail:
      	// Nothing left over: return with the accumulator as it stands.
 129  	CMPQ R9, $0x00
 130  	JE   hashADDone
 131  
 132  	// Hash last < 16 byte tail
      	// R13:R14 will hold the tail as a zero-extended 128-bit little-endian
      	// value; R15 is the byte staging register. CX is moved one past the last
      	// byte because the bytes are read back to front.
 133  	XORQ R13, R13
 134  	XORQ R14, R14
 135  	XORQ R15, R15
 136  	ADDQ R9, CX
 137  
 138  hashADTailLoop:
      	// R14:R13 <<= 8. The three-operand SHLQ shifts R14 left, filling from the
      	// top bits of R13; the plain SHLQ then shifts R13 itself.
 139  	SHLQ  $0x08, R13, R14
 140  	SHLQ  $0x08, R13
      	// Insert the next byte (walking backwards) as the new lowest byte. R15 was
      	// zeroed above and MOVB only replaces its low byte, so the XOR touches
      	// only the byte lane that the shift just vacated.
 141  	MOVB  -1(CX), R15
 142  	XORQ  R15, R13
 143  	DECQ  CX
      	// The JNE tests the flags of DECQ R9: loop until every tail byte is read.
 144  	DECQ  R9
 145  	JNE   hashADTailLoop
      	// acc += padded tail + 2^128, exactly as for a full block.
 146  	ADDQ  R13, R10
 147  	ADCQ  R14, R11
 148  	ADCQ  $0x01, R12
      	// acc = acc * r, reduced modulo 2^130 - 5 (same sequence as above).
 149  	MOVQ  (BP), AX
 150  	MOVQ  AX, R15
 151  	MULQ  R10
 152  	MOVQ  AX, R13
 153  	MOVQ  DX, R14
 154  	MOVQ  (BP), AX
 155  	MULQ  R11
 156  	IMULQ R12, R15
 157  	ADDQ  AX, R14
 158  	ADCQ  DX, R15
 159  	MOVQ  8(BP), AX
 160  	MOVQ  AX, R8
 161  	MULQ  R10
 162  	ADDQ  AX, R14
 163  	ADCQ  $0x00, DX
 164  	MOVQ  DX, R10
 165  	MOVQ  8(BP), AX
 166  	MULQ  R11
 167  	ADDQ  AX, R15
 168  	ADCQ  $0x00, DX
 169  	IMULQ R12, R8
 170  	ADDQ  R10, R15
 171  	ADCQ  DX, R8
      	// Reduction: keep the bits below 2^130, then add 4*c + c for the part c
      	// at and above bit 130.
 172  	MOVQ  R13, R10
 173  	MOVQ  R14, R11
 174  	MOVQ  R15, R12
 175  	ANDQ  $0x03, R12
 176  	MOVQ  R15, R13
 177  	ANDQ  $-4, R13
 178  	MOVQ  R8, R14
 179  	SHRQ  $0x02, R8, R15
 180  	SHRQ  $0x02, R8
 181  	ADDQ  R13, R10
 182  	ADCQ  R14, R11
 183  	ADCQ  $0x00, R12
 184  	ADDQ  R15, R10
 185  	ADCQ  R8, R11
 186  	ADCQ  $0x00, R12
 187  
 188  hashADDone:
      	// Result is left in R10:R11:R12 for the caller.
 189  	RET
 190  
 191  // func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool
 192  // Requires: AVX, AVX2, BMI2, CMOV, SSE2
 193  TEXT ·chacha20Poly1305Open(SB), $288-97
 194  	// For aligned stack access
 195  	MOVQ SP, BP
 196  	ADDQ $0x20, BP
 197  	ANDQ $-32, BP
 198  	MOVQ dst_base+0(FP), DI
 199  	MOVQ key_base+24(FP), R8
 200  	MOVQ src_base+48(FP), SI
 201  	MOVQ src_len+56(FP), BX
 202  	MOVQ ad_base+72(FP), CX
 203  
 204  	// Check for AVX2 support
 205  	CMPB ·useAVX2+0(SB), $0x01
 206  	JE   chacha20Poly1305Open_AVX2
 207  
 208  	// Special optimization, for very short buffers
 209  	CMPQ BX, $0x80
 210  	JBE  openSSE128
 211  
 212  	// For long buffers, prepare the poly key first
 213  	MOVOU ·chacha20Constants<>+0(SB), X0
 214  	MOVOU 16(R8), X3
 215  	MOVOU 32(R8), X6
 216  	MOVOU 48(R8), X9
 217  	MOVO  X9, X13
 218  
 219  	// Store state on stack for future use
 220  	MOVO X3, 32(BP)
 221  	MOVO X6, 48(BP)
 222  	MOVO X9, 128(BP)
 223  	MOVQ $0x0000000a, R9
 224  
 225  openSSEPreparePolyKey:
 226  	PADDD X3, X0
 227  	PXOR  X0, X9
 228  	ROL16(X9, X12)
 229  	PADDD X9, X6
 230  	PXOR  X6, X3
 231  	MOVO  X3, X12
 232  	PSLLL $0x0c, X12
 233  	PSRLL $0x14, X3
 234  	PXOR  X12, X3
 235  	PADDD X3, X0
 236  	PXOR  X0, X9
 237  	ROL8(X9, X12)
 238  	PADDD X9, X6
 239  	PXOR  X6, X3
 240  	MOVO  X3, X12
 241  	PSLLL $0x07, X12
 242  	PSRLL $0x19, X3
 243  	PXOR  X12, X3
 244  	BYTE  $0x66
 245  	BYTE  $0x0f
 246  	BYTE  $0x3a
 247  	BYTE  $0x0f
 248  	BYTE  $0xdb
 249  	BYTE  $0x04
 250  	BYTE  $0x66
 251  	BYTE  $0x0f
 252  	BYTE  $0x3a
 253  	BYTE  $0x0f
 254  	BYTE  $0xf6
 255  	BYTE  $0x08
 256  	BYTE  $0x66
 257  	BYTE  $0x45
 258  	BYTE  $0x0f
 259  	BYTE  $0x3a
 260  	BYTE  $0x0f
 261  	BYTE  $0xc9
 262  	BYTE  $0x0c
 263  	PADDD X3, X0
 264  	PXOR  X0, X9
 265  	ROL16(X9, X12)
 266  	PADDD X9, X6
 267  	PXOR  X6, X3
 268  	MOVO  X3, X12
 269  	PSLLL $0x0c, X12
 270  	PSRLL $0x14, X3
 271  	PXOR  X12, X3
 272  	PADDD X3, X0
 273  	PXOR  X0, X9
 274  	ROL8(X9, X12)
 275  	PADDD X9, X6
 276  	PXOR  X6, X3
 277  	MOVO  X3, X12
 278  	PSLLL $0x07, X12
 279  	PSRLL $0x19, X3
 280  	PXOR  X12, X3
 281  	BYTE  $0x66
 282  	BYTE  $0x0f
 283  	BYTE  $0x3a
 284  	BYTE  $0x0f
 285  	BYTE  $0xdb
 286  	BYTE  $0x0c
 287  	BYTE  $0x66
 288  	BYTE  $0x0f
 289  	BYTE  $0x3a
 290  	BYTE  $0x0f
 291  	BYTE  $0xf6
 292  	BYTE  $0x08
 293  	BYTE  $0x66
 294  	BYTE  $0x45
 295  	BYTE  $0x0f
 296  	BYTE  $0x3a
 297  	BYTE  $0x0f
 298  	BYTE  $0xc9
 299  	BYTE  $0x04
 300  	DECQ  R9
 301  	JNE   openSSEPreparePolyKey
 302  
 303  	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
 304  	PADDL ·chacha20Constants<>+0(SB), X0
 305  	PADDL 32(BP), X3
 306  
 307  	// Clamp and store the key
 308  	PAND ·polyClampMask<>+0(SB), X0
 309  	MOVO X0, (BP)
 310  	MOVO X3, 16(BP)
 311  
 312  	// Hash AAD
 313  	MOVQ ad_len+80(FP), R9
 314  	CALL polyHashADInternal<>(SB)
 315  
 316  openSSEMainLoop:
 317  	CMPQ BX, $0x00000100
 318  	JB   openSSEMainLoopDone
 319  
 320  	// Load state, increment counter blocks
 321  	MOVO  ·chacha20Constants<>+0(SB), X0
 322  	MOVO  32(BP), X3
 323  	MOVO  48(BP), X6
 324  	MOVO  128(BP), X9
 325  	PADDL ·sseIncMask<>+0(SB), X9
 326  	MOVO  X0, X1
 327  	MOVO  X3, X4
 328  	MOVO  X6, X7
 329  	MOVO  X9, X10
 330  	PADDL ·sseIncMask<>+0(SB), X10
 331  	MOVO  X1, X2
 332  	MOVO  X4, X5
 333  	MOVO  X7, X8
 334  	MOVO  X10, X11
 335  	PADDL ·sseIncMask<>+0(SB), X11
 336  	MOVO  X2, X12
 337  	MOVO  X5, X13
 338  	MOVO  X8, X14
 339  	MOVO  X11, X15
 340  	PADDL ·sseIncMask<>+0(SB), X15
 341  
 342  	// Store counters
 343  	MOVO X9, 80(BP)
 344  	MOVO X10, 96(BP)
 345  	MOVO X11, 112(BP)
 346  	MOVO X15, 128(BP)
 347  
 348  	// There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash
 349  	// 2 blocks, and for the remaining 4 only 1 block - for a total of 16
 350  	MOVQ $0x00000004, CX
 351  	MOVQ SI, R9
 352  
 353  openSSEInternalLoop:
 354  	MOVO  X14, 64(BP)
 355  	PADDD X3, X0
 356  	PXOR  X0, X9
 357  	ROL16(X9, X14)
 358  	PADDD X9, X6
 359  	PXOR  X6, X3
 360  	MOVO  X3, X14
 361  	PSLLL $0x0c, X14
 362  	PSRLL $0x14, X3
 363  	PXOR  X14, X3
 364  	PADDD X3, X0
 365  	PXOR  X0, X9
 366  	ROL8(X9, X14)
 367  	PADDD X9, X6
 368  	PXOR  X6, X3
 369  	MOVO  X3, X14
 370  	PSLLL $0x07, X14
 371  	PSRLL $0x19, X3
 372  	PXOR  X14, X3
 373  	PADDD X4, X1
 374  	PXOR  X1, X10
 375  	ROL16(X10, X14)
 376  	PADDD X10, X7
 377  	PXOR  X7, X4
 378  	MOVO  X4, X14
 379  	PSLLL $0x0c, X14
 380  	PSRLL $0x14, X4
 381  	PXOR  X14, X4
 382  	PADDD X4, X1
 383  	PXOR  X1, X10
 384  	ROL8(X10, X14)
 385  	PADDD X10, X7
 386  	PXOR  X7, X4
 387  	MOVO  X4, X14
 388  	PSLLL $0x07, X14
 389  	PSRLL $0x19, X4
 390  	PXOR  X14, X4
 391  	PADDD X5, X2
 392  	PXOR  X2, X11
 393  	ROL16(X11, X14)
 394  	PADDD X11, X8
 395  	PXOR  X8, X5
 396  	MOVO  X5, X14
 397  	PSLLL $0x0c, X14
 398  	PSRLL $0x14, X5
 399  	PXOR  X14, X5
 400  	PADDD X5, X2
 401  	PXOR  X2, X11
 402  	ROL8(X11, X14)
 403  	PADDD X11, X8
 404  	PXOR  X8, X5
 405  	MOVO  X5, X14
 406  	PSLLL $0x07, X14
 407  	PSRLL $0x19, X5
 408  	PXOR  X14, X5
 409  	MOVO  64(BP), X14
 410  	MOVO  X7, 64(BP)
 411  	PADDD X13, X12
 412  	PXOR  X12, X15
 413  	ROL16(X15, X7)
 414  	PADDD X15, X14
 415  	PXOR  X14, X13
 416  	MOVO  X13, X7
 417  	PSLLL $0x0c, X7
 418  	PSRLL $0x14, X13
 419  	PXOR  X7, X13
 420  	PADDD X13, X12
 421  	PXOR  X12, X15
 422  	ROL8(X15, X7)
 423  	PADDD X15, X14
 424  	PXOR  X14, X13
 425  	MOVO  X13, X7
 426  	PSLLL $0x07, X7
 427  	PSRLL $0x19, X13
 428  	PXOR  X7, X13
 429  	MOVO  64(BP), X7
 430  	ADDQ  (R9), R10
 431  	ADCQ  8(R9), R11
 432  	ADCQ  $0x01, R12
 433  	BYTE  $0x66
 434  	BYTE  $0x0f
 435  	BYTE  $0x3a
 436  	BYTE  $0x0f
 437  	BYTE  $0xdb
 438  	BYTE  $0x04
 439  	BYTE  $0x66
 440  	BYTE  $0x0f
 441  	BYTE  $0x3a
 442  	BYTE  $0x0f
 443  	BYTE  $0xe4
 444  	BYTE  $0x04
 445  	BYTE  $0x66
 446  	BYTE  $0x0f
 447  	BYTE  $0x3a
 448  	BYTE  $0x0f
 449  	BYTE  $0xed
 450  	BYTE  $0x04
 451  	BYTE  $0x66
 452  	BYTE  $0x45
 453  	BYTE  $0x0f
 454  	BYTE  $0x3a
 455  	BYTE  $0x0f
 456  	BYTE  $0xed
 457  	BYTE  $0x04
 458  	BYTE  $0x66
 459  	BYTE  $0x0f
 460  	BYTE  $0x3a
 461  	BYTE  $0x0f
 462  	BYTE  $0xf6
 463  	BYTE  $0x08
 464  	BYTE  $0x66
 465  	BYTE  $0x0f
 466  	BYTE  $0x3a
 467  	BYTE  $0x0f
 468  	BYTE  $0xff
 469  	BYTE  $0x08
 470  	BYTE  $0x66
 471  	BYTE  $0x45
 472  	BYTE  $0x0f
 473  	BYTE  $0x3a
 474  	BYTE  $0x0f
 475  	BYTE  $0xc0
 476  	BYTE  $0x08
 477  	BYTE  $0x66
 478  	BYTE  $0x45
 479  	BYTE  $0x0f
 480  	BYTE  $0x3a
 481  	BYTE  $0x0f
 482  	BYTE  $0xf6
 483  	BYTE  $0x08
 484  	BYTE  $0x66
 485  	BYTE  $0x45
 486  	BYTE  $0x0f
 487  	BYTE  $0x3a
 488  	BYTE  $0x0f
 489  	BYTE  $0xc9
 490  	BYTE  $0x0c
 491  	BYTE  $0x66
 492  	BYTE  $0x45
 493  	BYTE  $0x0f
 494  	BYTE  $0x3a
 495  	BYTE  $0x0f
 496  	BYTE  $0xd2
 497  	BYTE  $0x0c
 498  	BYTE  $0x66
 499  	BYTE  $0x45
 500  	BYTE  $0x0f
 501  	BYTE  $0x3a
 502  	BYTE  $0x0f
 503  	BYTE  $0xdb
 504  	BYTE  $0x0c
 505  	BYTE  $0x66
 506  	BYTE  $0x45
 507  	BYTE  $0x0f
 508  	BYTE  $0x3a
 509  	BYTE  $0x0f
 510  	BYTE  $0xff
 511  	BYTE  $0x0c
 512  	MOVQ  (BP), AX
 513  	MOVQ  AX, R15
 514  	MULQ  R10
 515  	MOVQ  AX, R13
 516  	MOVQ  DX, R14
 517  	MOVQ  (BP), AX
 518  	MULQ  R11
 519  	IMULQ R12, R15
 520  	ADDQ  AX, R14
 521  	ADCQ  DX, R15
 522  	MOVQ  8(BP), AX
 523  	MOVQ  AX, R8
 524  	MULQ  R10
 525  	ADDQ  AX, R14
 526  	ADCQ  $0x00, DX
 527  	MOVQ  DX, R10
 528  	MOVQ  8(BP), AX
 529  	MULQ  R11
 530  	ADDQ  AX, R15
 531  	ADCQ  $0x00, DX
 532  	LEAQ  16(R9), R9
 533  	MOVO  X14, 64(BP)
 534  	PADDD X3, X0
 535  	PXOR  X0, X9
 536  	ROL16(X9, X14)
 537  	PADDD X9, X6
 538  	PXOR  X6, X3
 539  	MOVO  X3, X14
 540  	PSLLL $0x0c, X14
 541  	PSRLL $0x14, X3
 542  	PXOR  X14, X3
 543  	PADDD X3, X0
 544  	PXOR  X0, X9
 545  	ROL8(X9, X14)
 546  	PADDD X9, X6
 547  	PXOR  X6, X3
 548  	MOVO  X3, X14
 549  	PSLLL $0x07, X14
 550  	PSRLL $0x19, X3
 551  	PXOR  X14, X3
 552  	PADDD X4, X1
 553  	PXOR  X1, X10
 554  	ROL16(X10, X14)
 555  	PADDD X10, X7
 556  	PXOR  X7, X4
 557  	MOVO  X4, X14
 558  	PSLLL $0x0c, X14
 559  	PSRLL $0x14, X4
 560  	PXOR  X14, X4
 561  	PADDD X4, X1
 562  	PXOR  X1, X10
 563  	ROL8(X10, X14)
 564  	PADDD X10, X7
 565  	PXOR  X7, X4
 566  	MOVO  X4, X14
 567  	PSLLL $0x07, X14
 568  	PSRLL $0x19, X4
 569  	PXOR  X14, X4
 570  	PADDD X5, X2
 571  	PXOR  X2, X11
 572  	ROL16(X11, X14)
 573  	PADDD X11, X8
 574  	PXOR  X8, X5
 575  	MOVO  X5, X14
 576  	PSLLL $0x0c, X14
 577  	PSRLL $0x14, X5
 578  	PXOR  X14, X5
 579  	PADDD X5, X2
 580  	PXOR  X2, X11
 581  	ROL8(X11, X14)
 582  	PADDD X11, X8
 583  	PXOR  X8, X5
 584  	MOVO  X5, X14
 585  	PSLLL $0x07, X14
 586  	PSRLL $0x19, X5
 587  	PXOR  X14, X5
 588  	MOVO  64(BP), X14
 589  	MOVO  X7, 64(BP)
 590  	IMULQ R12, R8
 591  	ADDQ  R10, R15
 592  	ADCQ  DX, R8
 593  	PADDD X13, X12
 594  	PXOR  X12, X15
 595  	ROL16(X15, X7)
 596  	PADDD X15, X14
 597  	PXOR  X14, X13
 598  	MOVO  X13, X7
 599  	PSLLL $0x0c, X7
 600  	PSRLL $0x14, X13
 601  	PXOR  X7, X13
 602  	PADDD X13, X12
 603  	PXOR  X12, X15
 604  	ROL8(X15, X7)
 605  	PADDD X15, X14
 606  	PXOR  X14, X13
 607  	MOVO  X13, X7
 608  	PSLLL $0x07, X7
 609  	PSRLL $0x19, X13
 610  	PXOR  X7, X13
 611  	MOVO  64(BP), X7
 612  	MOVQ  R13, R10
 613  	MOVQ  R14, R11
 614  	MOVQ  R15, R12
 615  	ANDQ  $0x03, R12
 616  	MOVQ  R15, R13
 617  	ANDQ  $-4, R13
 618  	MOVQ  R8, R14
 619  	SHRQ  $0x02, R8, R15
 620  	SHRQ  $0x02, R8
 621  	ADDQ  R13, R10
 622  	ADCQ  R14, R11
 623  	ADCQ  $0x00, R12
 624  	ADDQ  R15, R10
 625  	ADCQ  R8, R11
 626  	ADCQ  $0x00, R12
 627  	BYTE  $0x66
 628  	BYTE  $0x0f
 629  	BYTE  $0x3a
 630  	BYTE  $0x0f
 631  	BYTE  $0xdb
 632  	BYTE  $0x0c
 633  	BYTE  $0x66
 634  	BYTE  $0x0f
 635  	BYTE  $0x3a
 636  	BYTE  $0x0f
 637  	BYTE  $0xe4
 638  	BYTE  $0x0c
 639  	BYTE  $0x66
 640  	BYTE  $0x0f
 641  	BYTE  $0x3a
 642  	BYTE  $0x0f
 643  	BYTE  $0xed
 644  	BYTE  $0x0c
 645  	BYTE  $0x66
 646  	BYTE  $0x45
 647  	BYTE  $0x0f
 648  	BYTE  $0x3a
 649  	BYTE  $0x0f
 650  	BYTE  $0xed
 651  	BYTE  $0x0c
 652  	BYTE  $0x66
 653  	BYTE  $0x0f
 654  	BYTE  $0x3a
 655  	BYTE  $0x0f
 656  	BYTE  $0xf6
 657  	BYTE  $0x08
 658  	BYTE  $0x66
 659  	BYTE  $0x0f
 660  	BYTE  $0x3a
 661  	BYTE  $0x0f
 662  	BYTE  $0xff
 663  	BYTE  $0x08
 664  	BYTE  $0x66
 665  	BYTE  $0x45
 666  	BYTE  $0x0f
 667  	BYTE  $0x3a
 668  	BYTE  $0x0f
 669  	BYTE  $0xc0
 670  	BYTE  $0x08
 671  	BYTE  $0x66
 672  	BYTE  $0x45
 673  	BYTE  $0x0f
 674  	BYTE  $0x3a
 675  	BYTE  $0x0f
 676  	BYTE  $0xf6
 677  	BYTE  $0x08
 678  	BYTE  $0x66
 679  	BYTE  $0x45
 680  	BYTE  $0x0f
 681  	BYTE  $0x3a
 682  	BYTE  $0x0f
 683  	BYTE  $0xc9
 684  	BYTE  $0x04
 685  	BYTE  $0x66
 686  	BYTE  $0x45
 687  	BYTE  $0x0f
 688  	BYTE  $0x3a
 689  	BYTE  $0x0f
 690  	BYTE  $0xd2
 691  	BYTE  $0x04
 692  	BYTE  $0x66
 693  	BYTE  $0x45
 694  	BYTE  $0x0f
 695  	BYTE  $0x3a
 696  	BYTE  $0x0f
 697  	BYTE  $0xdb
 698  	BYTE  $0x04
 699  	BYTE  $0x66
 700  	BYTE  $0x45
 701  	BYTE  $0x0f
 702  	BYTE  $0x3a
 703  	BYTE  $0x0f
 704  	BYTE  $0xff
 705  	BYTE  $0x04
 706  	DECQ  CX
 707  	JGE   openSSEInternalLoop
 708  	ADDQ  (R9), R10
 709  	ADCQ  8(R9), R11
 710  	ADCQ  $0x01, R12
 711  	MOVQ  (BP), AX
 712  	MOVQ  AX, R15
 713  	MULQ  R10
 714  	MOVQ  AX, R13
 715  	MOVQ  DX, R14
 716  	MOVQ  (BP), AX
 717  	MULQ  R11
 718  	IMULQ R12, R15
 719  	ADDQ  AX, R14
 720  	ADCQ  DX, R15
 721  	MOVQ  8(BP), AX
 722  	MOVQ  AX, R8
 723  	MULQ  R10
 724  	ADDQ  AX, R14
 725  	ADCQ  $0x00, DX
 726  	MOVQ  DX, R10
 727  	MOVQ  8(BP), AX
 728  	MULQ  R11
 729  	ADDQ  AX, R15
 730  	ADCQ  $0x00, DX
 731  	IMULQ R12, R8
 732  	ADDQ  R10, R15
 733  	ADCQ  DX, R8
 734  	MOVQ  R13, R10
 735  	MOVQ  R14, R11
 736  	MOVQ  R15, R12
 737  	ANDQ  $0x03, R12
 738  	MOVQ  R15, R13
 739  	ANDQ  $-4, R13
 740  	MOVQ  R8, R14
 741  	SHRQ  $0x02, R8, R15
 742  	SHRQ  $0x02, R8
 743  	ADDQ  R13, R10
 744  	ADCQ  R14, R11
 745  	ADCQ  $0x00, R12
 746  	ADDQ  R15, R10
 747  	ADCQ  R8, R11
 748  	ADCQ  $0x00, R12
 749  	LEAQ  16(R9), R9
 750  	CMPQ  CX, $-6
 751  	JG    openSSEInternalLoop
 752  
 753  	// Add in the state
 754  	PADDD ·chacha20Constants<>+0(SB), X0
 755  	PADDD ·chacha20Constants<>+0(SB), X1
 756  	PADDD ·chacha20Constants<>+0(SB), X2
 757  	PADDD ·chacha20Constants<>+0(SB), X12
 758  	PADDD 32(BP), X3
 759  	PADDD 32(BP), X4
 760  	PADDD 32(BP), X5
 761  	PADDD 32(BP), X13
 762  	PADDD 48(BP), X6
 763  	PADDD 48(BP), X7
 764  	PADDD 48(BP), X8
 765  	PADDD 48(BP), X14
 766  	PADDD 80(BP), X9
 767  	PADDD 96(BP), X10
 768  	PADDD 112(BP), X11
 769  	PADDD 128(BP), X15
 770  
 771  	// Load - xor - store
 772  	MOVO  X15, 64(BP)
 773  	MOVOU (SI), X15
 774  	PXOR  X15, X0
 775  	MOVOU X0, (DI)
 776  	MOVOU 16(SI), X15
 777  	PXOR  X15, X3
 778  	MOVOU X3, 16(DI)
 779  	MOVOU 32(SI), X15
 780  	PXOR  X15, X6
 781  	MOVOU X6, 32(DI)
 782  	MOVOU 48(SI), X15
 783  	PXOR  X15, X9
 784  	MOVOU X9, 48(DI)
 785  	MOVOU 64(SI), X9
 786  	PXOR  X9, X1
 787  	MOVOU X1, 64(DI)
 788  	MOVOU 80(SI), X9
 789  	PXOR  X9, X4
 790  	MOVOU X4, 80(DI)
 791  	MOVOU 96(SI), X9
 792  	PXOR  X9, X7
 793  	MOVOU X7, 96(DI)
 794  	MOVOU 112(SI), X9
 795  	PXOR  X9, X10
 796  	MOVOU X10, 112(DI)
 797  	MOVOU 128(SI), X9
 798  	PXOR  X9, X2
 799  	MOVOU X2, 128(DI)
 800  	MOVOU 144(SI), X9
 801  	PXOR  X9, X5
 802  	MOVOU X5, 144(DI)
 803  	MOVOU 160(SI), X9
 804  	PXOR  X9, X8
 805  	MOVOU X8, 160(DI)
 806  	MOVOU 176(SI), X9
 807  	PXOR  X9, X11
 808  	MOVOU X11, 176(DI)
 809  	MOVOU 192(SI), X9
 810  	PXOR  X9, X12
 811  	MOVOU X12, 192(DI)
 812  	MOVOU 208(SI), X9
 813  	PXOR  X9, X13
 814  	MOVOU X13, 208(DI)
 815  	MOVOU 224(SI), X9
 816  	PXOR  X9, X14
 817  	MOVOU X14, 224(DI)
 818  	MOVOU 240(SI), X9
 819  	PXOR  64(BP), X9
 820  	MOVOU X9, 240(DI)
 821  	LEAQ  256(SI), SI
 822  	LEAQ  256(DI), DI
 823  	SUBQ  $0x00000100, BX
 824  	JMP   openSSEMainLoop
 825  
 826  openSSEMainLoopDone:
 827  	// Handle the various tail sizes efficiently
 828  	TESTQ BX, BX
 829  	JE    openSSEFinalize
 830  	CMPQ  BX, $0x40
 831  	JBE   openSSETail64
 832  	CMPQ  BX, $0x80
 833  	JBE   openSSETail128
 834  	CMPQ  BX, $0xc0
 835  	JBE   openSSETail192
 836  	JMP   openSSETail256
 837  
 838  openSSEFinalize:
 839  	// Hash in the PT, AAD lengths
 840  	ADDQ  ad_len+80(FP), R10
 841  	ADCQ  src_len+56(FP), R11
 842  	ADCQ  $0x01, R12
 843  	MOVQ  (BP), AX
 844  	MOVQ  AX, R15
 845  	MULQ  R10
 846  	MOVQ  AX, R13
 847  	MOVQ  DX, R14
 848  	MOVQ  (BP), AX
 849  	MULQ  R11
 850  	IMULQ R12, R15
 851  	ADDQ  AX, R14
 852  	ADCQ  DX, R15
 853  	MOVQ  8(BP), AX
 854  	MOVQ  AX, R8
 855  	MULQ  R10
 856  	ADDQ  AX, R14
 857  	ADCQ  $0x00, DX
 858  	MOVQ  DX, R10
 859  	MOVQ  8(BP), AX
 860  	MULQ  R11
 861  	ADDQ  AX, R15
 862  	ADCQ  $0x00, DX
 863  	IMULQ R12, R8
 864  	ADDQ  R10, R15
 865  	ADCQ  DX, R8
 866  	MOVQ  R13, R10
 867  	MOVQ  R14, R11
 868  	MOVQ  R15, R12
 869  	ANDQ  $0x03, R12
 870  	MOVQ  R15, R13
 871  	ANDQ  $-4, R13
 872  	MOVQ  R8, R14
 873  	SHRQ  $0x02, R8, R15
 874  	SHRQ  $0x02, R8
 875  	ADDQ  R13, R10
 876  	ADCQ  R14, R11
 877  	ADCQ  $0x00, R12
 878  	ADDQ  R15, R10
 879  	ADCQ  R8, R11
 880  	ADCQ  $0x00, R12
 881  
 882  	// Final reduce
 883  	MOVQ    R10, R13
 884  	MOVQ    R11, R14
 885  	MOVQ    R12, R15
 886  	SUBQ    $-5, R10
 887  	SBBQ    $-1, R11
 888  	SBBQ    $0x03, R12
 889  	CMOVQCS R13, R10
 890  	CMOVQCS R14, R11
 891  	CMOVQCS R15, R12
 892  
 893  	// Add in the "s" part of the key
 894  	ADDQ 16(BP), R10
 895  	ADCQ 24(BP), R11
 896  
 897  	// Finally, constant time compare to the tag at the end of the message
 898  	XORQ    AX, AX
 899  	MOVQ    $0x00000001, DX
 900  	XORQ    (SI), R10
 901  	XORQ    8(SI), R11
 902  	ORQ     R11, R10
 903  	CMOVQEQ DX, AX
 904  
 905  	// Return true iff tags are equal
 906  	MOVB AX, ret+96(FP)
 907  	RET
 908  
 909  openSSE128:
 910  	MOVOU ·chacha20Constants<>+0(SB), X0
 911  	MOVOU 16(R8), X3
 912  	MOVOU 32(R8), X6
 913  	MOVOU 48(R8), X9
 914  	MOVO  X0, X1
 915  	MOVO  X3, X4
 916  	MOVO  X6, X7
 917  	MOVO  X9, X10
 918  	PADDL ·sseIncMask<>+0(SB), X10
 919  	MOVO  X1, X2
 920  	MOVO  X4, X5
 921  	MOVO  X7, X8
 922  	MOVO  X10, X11
 923  	PADDL ·sseIncMask<>+0(SB), X11
 924  	MOVO  X3, X13
 925  	MOVO  X6, X14
 926  	MOVO  X10, X15
 927  	MOVQ  $0x0000000a, R9
 928  
 929  openSSE128InnerCipherLoop:
 930  	PADDD X3, X0
 931  	PXOR  X0, X9
 932  	ROL16(X9, X12)
 933  	PADDD X9, X6
 934  	PXOR  X6, X3
 935  	MOVO  X3, X12
 936  	PSLLL $0x0c, X12
 937  	PSRLL $0x14, X3
 938  	PXOR  X12, X3
 939  	PADDD X3, X0
 940  	PXOR  X0, X9
 941  	ROL8(X9, X12)
 942  	PADDD X9, X6
 943  	PXOR  X6, X3
 944  	MOVO  X3, X12
 945  	PSLLL $0x07, X12
 946  	PSRLL $0x19, X3
 947  	PXOR  X12, X3
 948  	PADDD X4, X1
 949  	PXOR  X1, X10
 950  	ROL16(X10, X12)
 951  	PADDD X10, X7
 952  	PXOR  X7, X4
 953  	MOVO  X4, X12
 954  	PSLLL $0x0c, X12
 955  	PSRLL $0x14, X4
 956  	PXOR  X12, X4
 957  	PADDD X4, X1
 958  	PXOR  X1, X10
 959  	ROL8(X10, X12)
 960  	PADDD X10, X7
 961  	PXOR  X7, X4
 962  	MOVO  X4, X12
 963  	PSLLL $0x07, X12
 964  	PSRLL $0x19, X4
 965  	PXOR  X12, X4
 966  	PADDD X5, X2
 967  	PXOR  X2, X11
 968  	ROL16(X11, X12)
 969  	PADDD X11, X8
 970  	PXOR  X8, X5
 971  	MOVO  X5, X12
 972  	PSLLL $0x0c, X12
 973  	PSRLL $0x14, X5
 974  	PXOR  X12, X5
 975  	PADDD X5, X2
 976  	PXOR  X2, X11
 977  	ROL8(X11, X12)
 978  	PADDD X11, X8
 979  	PXOR  X8, X5
 980  	MOVO  X5, X12
 981  	PSLLL $0x07, X12
 982  	PSRLL $0x19, X5
 983  	PXOR  X12, X5
 984  	BYTE  $0x66
 985  	BYTE  $0x0f
 986  	BYTE  $0x3a
 987  	BYTE  $0x0f
 988  	BYTE  $0xdb
 989  	BYTE  $0x04
 990  	BYTE  $0x66
 991  	BYTE  $0x0f
 992  	BYTE  $0x3a
 993  	BYTE  $0x0f
 994  	BYTE  $0xe4
 995  	BYTE  $0x04
 996  	BYTE  $0x66
 997  	BYTE  $0x0f
 998  	BYTE  $0x3a
 999  	BYTE  $0x0f
1000  	BYTE  $0xed
1001  	BYTE  $0x04
1002  	BYTE  $0x66
1003  	BYTE  $0x0f
1004  	BYTE  $0x3a
1005  	BYTE  $0x0f
1006  	BYTE  $0xf6
1007  	BYTE  $0x08
1008  	BYTE  $0x66
1009  	BYTE  $0x0f
1010  	BYTE  $0x3a
1011  	BYTE  $0x0f
1012  	BYTE  $0xff
1013  	BYTE  $0x08
1014  	BYTE  $0x66
1015  	BYTE  $0x45
1016  	BYTE  $0x0f
1017  	BYTE  $0x3a
1018  	BYTE  $0x0f
1019  	BYTE  $0xc0
1020  	BYTE  $0x08
1021  	BYTE  $0x66
1022  	BYTE  $0x45
1023  	BYTE  $0x0f
1024  	BYTE  $0x3a
1025  	BYTE  $0x0f
1026  	BYTE  $0xc9
1027  	BYTE  $0x0c
1028  	BYTE  $0x66
1029  	BYTE  $0x45
1030  	BYTE  $0x0f
1031  	BYTE  $0x3a
1032  	BYTE  $0x0f
1033  	BYTE  $0xd2
1034  	BYTE  $0x0c
1035  	BYTE  $0x66
1036  	BYTE  $0x45
1037  	BYTE  $0x0f
1038  	BYTE  $0x3a
1039  	BYTE  $0x0f
1040  	BYTE  $0xdb
1041  	BYTE  $0x0c
1042  	PADDD X3, X0
1043  	PXOR  X0, X9
1044  	ROL16(X9, X12)
1045  	PADDD X9, X6
1046  	PXOR  X6, X3
1047  	MOVO  X3, X12
1048  	PSLLL $0x0c, X12
1049  	PSRLL $0x14, X3
1050  	PXOR  X12, X3
1051  	PADDD X3, X0
1052  	PXOR  X0, X9
1053  	ROL8(X9, X12)
1054  	PADDD X9, X6
1055  	PXOR  X6, X3
1056  	MOVO  X3, X12
1057  	PSLLL $0x07, X12
1058  	PSRLL $0x19, X3
1059  	PXOR  X12, X3
1060  	PADDD X4, X1
1061  	PXOR  X1, X10
1062  	ROL16(X10, X12)
1063  	PADDD X10, X7
1064  	PXOR  X7, X4
1065  	MOVO  X4, X12
1066  	PSLLL $0x0c, X12
1067  	PSRLL $0x14, X4
1068  	PXOR  X12, X4
1069  	PADDD X4, X1
1070  	PXOR  X1, X10
1071  	ROL8(X10, X12)
1072  	PADDD X10, X7
1073  	PXOR  X7, X4
1074  	MOVO  X4, X12
1075  	PSLLL $0x07, X12
1076  	PSRLL $0x19, X4
1077  	PXOR  X12, X4
1078  	PADDD X5, X2
1079  	PXOR  X2, X11
1080  	ROL16(X11, X12)
1081  	PADDD X11, X8
1082  	PXOR  X8, X5
1083  	MOVO  X5, X12
1084  	PSLLL $0x0c, X12
1085  	PSRLL $0x14, X5
1086  	PXOR  X12, X5
1087  	PADDD X5, X2
1088  	PXOR  X2, X11
1089  	ROL8(X11, X12)
1090  	PADDD X11, X8
1091  	PXOR  X8, X5
1092  	MOVO  X5, X12
1093  	PSLLL $0x07, X12
1094  	PSRLL $0x19, X5
1095  	PXOR  X12, X5
1096  	BYTE  $0x66
1097  	BYTE  $0x0f
1098  	BYTE  $0x3a
1099  	BYTE  $0x0f
1100  	BYTE  $0xdb
1101  	BYTE  $0x0c
1102  	BYTE  $0x66
1103  	BYTE  $0x0f
1104  	BYTE  $0x3a
1105  	BYTE  $0x0f
1106  	BYTE  $0xe4
1107  	BYTE  $0x0c
1108  	BYTE  $0x66
1109  	BYTE  $0x0f
1110  	BYTE  $0x3a
1111  	BYTE  $0x0f
1112  	BYTE  $0xed
1113  	BYTE  $0x0c
1114  	BYTE  $0x66
1115  	BYTE  $0x0f
1116  	BYTE  $0x3a
1117  	BYTE  $0x0f
1118  	BYTE  $0xf6
1119  	BYTE  $0x08
1120  	BYTE  $0x66
1121  	BYTE  $0x0f
1122  	BYTE  $0x3a
1123  	BYTE  $0x0f
1124  	BYTE  $0xff
1125  	BYTE  $0x08
1126  	BYTE  $0x66
1127  	BYTE  $0x45
1128  	BYTE  $0x0f
1129  	BYTE  $0x3a
1130  	BYTE  $0x0f
1131  	BYTE  $0xc0
1132  	BYTE  $0x08
1133  	BYTE  $0x66
1134  	BYTE  $0x45
1135  	BYTE  $0x0f
1136  	BYTE  $0x3a
1137  	BYTE  $0x0f
1138  	BYTE  $0xc9
1139  	BYTE  $0x04
1140  	BYTE  $0x66
1141  	BYTE  $0x45
1142  	BYTE  $0x0f
1143  	BYTE  $0x3a
1144  	BYTE  $0x0f
1145  	BYTE  $0xd2
1146  	BYTE  $0x04
1147  	BYTE  $0x66
1148  	BYTE  $0x45
1149  	BYTE  $0x0f
1150  	BYTE  $0x3a
1151  	BYTE  $0x0f
1152  	BYTE  $0xdb
1153  	BYTE  $0x04
1154  	DECQ  R9
1155  	JNE   openSSE128InnerCipherLoop
1156  
1157  	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
1158  	PADDL ·chacha20Constants<>+0(SB), X0
1159  	PADDL ·chacha20Constants<>+0(SB), X1
1160  	PADDL ·chacha20Constants<>+0(SB), X2
1161  	PADDL X13, X3
1162  	PADDL X13, X4
1163  	PADDL X13, X5
1164  	PADDL X14, X7
1165  	PADDL X14, X8
1166  	PADDL X15, X10
1167  	PADDL ·sseIncMask<>+0(SB), X15
1168  	PADDL X15, X11
1169  
1170  	// Clamp and store the key
1171  	PAND  ·polyClampMask<>+0(SB), X0
1172  	MOVOU X0, (BP)
1173  	MOVOU X3, 16(BP)
1174  
1175  	// Hash
1176  	MOVQ ad_len+80(FP), R9
1177  	CALL polyHashADInternal<>(SB)
1178  
1179  openSSE128Open:
1180  	CMPQ BX, $0x10
1181  	JB   openSSETail16
1182  	SUBQ $0x10, BX
1183  
1184  	// Load for hashing
1185  	ADDQ (SI), R10
1186  	ADCQ 8(SI), R11
1187  	ADCQ $0x01, R12
1188  
1189  	// Load for decryption
1190  	MOVOU (SI), X12
1191  	PXOR  X12, X1
1192  	MOVOU X1, (DI)
1193  	LEAQ  16(SI), SI
1194  	LEAQ  16(DI), DI
1195  	MOVQ  (BP), AX
1196  	MOVQ  AX, R15
1197  	MULQ  R10
1198  	MOVQ  AX, R13
1199  	MOVQ  DX, R14
1200  	MOVQ  (BP), AX
1201  	MULQ  R11
1202  	IMULQ R12, R15
1203  	ADDQ  AX, R14
1204  	ADCQ  DX, R15
1205  	MOVQ  8(BP), AX
1206  	MOVQ  AX, R8
1207  	MULQ  R10
1208  	ADDQ  AX, R14
1209  	ADCQ  $0x00, DX
1210  	MOVQ  DX, R10
1211  	MOVQ  8(BP), AX
1212  	MULQ  R11
1213  	ADDQ  AX, R15
1214  	ADCQ  $0x00, DX
1215  	IMULQ R12, R8
1216  	ADDQ  R10, R15
1217  	ADCQ  DX, R8
1218  	MOVQ  R13, R10
1219  	MOVQ  R14, R11
1220  	MOVQ  R15, R12
1221  	ANDQ  $0x03, R12
1222  	MOVQ  R15, R13
1223  	ANDQ  $-4, R13
1224  	MOVQ  R8, R14
1225  	SHRQ  $0x02, R8, R15
1226  	SHRQ  $0x02, R8
1227  	ADDQ  R13, R10
1228  	ADCQ  R14, R11
1229  	ADCQ  $0x00, R12
1230  	ADDQ  R15, R10
1231  	ADCQ  R8, R11
1232  	ADCQ  $0x00, R12
1233  
1234  	// Shift the stream "left"
1235  	MOVO X4, X1
1236  	MOVO X7, X4
1237  	MOVO X10, X7
1238  	MOVO X2, X10
1239  	MOVO X5, X2
1240  	MOVO X8, X5
1241  	MOVO X11, X8
1242  	JMP  openSSE128Open
1243  
1244  openSSETail16:
1245  	TESTQ BX, BX
1246  	JE    openSSEFinalize
1247  
1248  	// We can safely load the CT from the end, because it is padded with the MAC
1249  	MOVQ  BX, R9
1250  	SHLQ  $0x04, R9
1251  	LEAQ  ·andMask<>+0(SB), R13
1252  	MOVOU (SI), X12
1253  	ADDQ  BX, SI
1254  	PAND  -16(R13)(R9*1), X12
1255  	MOVO  X12, 64(BP)
1256  	MOVQ  X12, R13
1257  	MOVQ  72(BP), R14
1258  	PXOR  X1, X12
1259  
1260  	// We can only store one byte at a time, since plaintext can be shorter than 16 bytes
1261  openSSETail16Store:
1262  	MOVQ   X12, R8
1263  	MOVB   R8, (DI)
1264  	PSRLDQ $0x01, X12
1265  	INCQ   DI
1266  	DECQ   BX
1267  	JNE    openSSETail16Store
1268  	ADDQ   R13, R10
1269  	ADCQ   R14, R11
1270  	ADCQ   $0x01, R12
1271  	MOVQ   (BP), AX
1272  	MOVQ   AX, R15
1273  	MULQ   R10
1274  	MOVQ   AX, R13
1275  	MOVQ   DX, R14
1276  	MOVQ   (BP), AX
1277  	MULQ   R11
1278  	IMULQ  R12, R15
1279  	ADDQ   AX, R14
1280  	ADCQ   DX, R15
1281  	MOVQ   8(BP), AX
1282  	MOVQ   AX, R8
1283  	MULQ   R10
1284  	ADDQ   AX, R14
1285  	ADCQ   $0x00, DX
1286  	MOVQ   DX, R10
1287  	MOVQ   8(BP), AX
1288  	MULQ   R11
1289  	ADDQ   AX, R15
1290  	ADCQ   $0x00, DX
1291  	IMULQ  R12, R8
1292  	ADDQ   R10, R15
1293  	ADCQ   DX, R8
1294  	MOVQ   R13, R10
1295  	MOVQ   R14, R11
1296  	MOVQ   R15, R12
1297  	ANDQ   $0x03, R12
1298  	MOVQ   R15, R13
1299  	ANDQ   $-4, R13
1300  	MOVQ   R8, R14
1301  	SHRQ   $0x02, R8, R15
1302  	SHRQ   $0x02, R8
1303  	ADDQ   R13, R10
1304  	ADCQ   R14, R11
1305  	ADCQ   $0x00, R12
1306  	ADDQ   R15, R10
1307  	ADCQ   R8, R11
1308  	ADCQ   $0x00, R12
1309  	JMP    openSSEFinalize
1310  
1311  openSSETail64:
1312  	MOVO  ·chacha20Constants<>+0(SB), X0
1313  	MOVO  32(BP), X3
1314  	MOVO  48(BP), X6
1315  	MOVO  128(BP), X9
1316  	PADDL ·sseIncMask<>+0(SB), X9
1317  	MOVO  X9, 80(BP)
1318  	XORQ  R9, R9
1319  	MOVQ  BX, CX
1320  	CMPQ  CX, $0x10
1321  	JB    openSSETail64LoopB
1322  
1323  openSSETail64LoopA:
1324  	ADDQ  (SI)(R9*1), R10
1325  	ADCQ  8(SI)(R9*1), R11
1326  	ADCQ  $0x01, R12
1327  	MOVQ  (BP), AX
1328  	MOVQ  AX, R15
1329  	MULQ  R10
1330  	MOVQ  AX, R13
1331  	MOVQ  DX, R14
1332  	MOVQ  (BP), AX
1333  	MULQ  R11
1334  	IMULQ R12, R15
1335  	ADDQ  AX, R14
1336  	ADCQ  DX, R15
1337  	MOVQ  8(BP), AX
1338  	MOVQ  AX, R8
1339  	MULQ  R10
1340  	ADDQ  AX, R14
1341  	ADCQ  $0x00, DX
1342  	MOVQ  DX, R10
1343  	MOVQ  8(BP), AX
1344  	MULQ  R11
1345  	ADDQ  AX, R15
1346  	ADCQ  $0x00, DX
1347  	IMULQ R12, R8
1348  	ADDQ  R10, R15
1349  	ADCQ  DX, R8
1350  	MOVQ  R13, R10
1351  	MOVQ  R14, R11
1352  	MOVQ  R15, R12
1353  	ANDQ  $0x03, R12
1354  	MOVQ  R15, R13
1355  	ANDQ  $-4, R13
1356  	MOVQ  R8, R14
1357  	SHRQ  $0x02, R8, R15
1358  	SHRQ  $0x02, R8
1359  	ADDQ  R13, R10
1360  	ADCQ  R14, R11
1361  	ADCQ  $0x00, R12
1362  	ADDQ  R15, R10
1363  	ADCQ  R8, R11
1364  	ADCQ  $0x00, R12
1365  	SUBQ  $0x10, CX
1366  
1367  openSSETail64LoopB:
      	// One ChaCha double round on the single block held in rows X0, X3, X6, X9
      	// (X12 is scratch). R9 grows by 16 per pass; the loop exits when R9 reaches
      	// 0xa0 (10 passes if R9 started at 0 - the setup is above this section).
1368  	ADDQ  $0x10, R9
      	// First half: quarter round with rotations 16, 12, 8, 7.
1369  	PADDD X3, X0
1370  	PXOR  X0, X9
1371  	ROL16(X9, X12)
1372  	PADDD X9, X6
1373  	PXOR  X6, X3
1374  	MOVO  X3, X12
1375  	PSLLL $0x0c, X12
1376  	PSRLL $0x14, X3
1377  	PXOR  X12, X3
1378  	PADDD X3, X0
1379  	PXOR  X0, X9
1380  	ROL8(X9, X12)
1381  	PADDD X9, X6
1382  	PXOR  X6, X3
1383  	MOVO  X3, X12
1384  	PSLLL $0x07, X12
1385  	PSRLL $0x19, X3
1386  	PXOR  X12, X3
      	// Hand-encoded PALIGNR $4, X3, X3 / PALIGNR $8, X6, X6 / PALIGNR $12, X9, X9:
      	// rotate rows X3, X6, X9 by 4, 8 and 12 bytes before the second half.
1387  	BYTE  $0x66
1388  	BYTE  $0x0f
1389  	BYTE  $0x3a
1390  	BYTE  $0x0f
1391  	BYTE  $0xdb
1392  	BYTE  $0x04
1393  	BYTE  $0x66
1394  	BYTE  $0x0f
1395  	BYTE  $0x3a
1396  	BYTE  $0x0f
1397  	BYTE  $0xf6
1398  	BYTE  $0x08
1399  	BYTE  $0x66
1400  	BYTE  $0x45
1401  	BYTE  $0x0f
1402  	BYTE  $0x3a
1403  	BYTE  $0x0f
1404  	BYTE  $0xc9
1405  	BYTE  $0x0c
      	// Second half: the same quarter round on the rotated rows.
1406  	PADDD X3, X0
1407  	PXOR  X0, X9
1408  	ROL16(X9, X12)
1409  	PADDD X9, X6
1410  	PXOR  X6, X3
1411  	MOVO  X3, X12
1412  	PSLLL $0x0c, X12
1413  	PSRLL $0x14, X3
1414  	PXOR  X12, X3
1415  	PADDD X3, X0
1416  	PXOR  X0, X9
1417  	ROL8(X9, X12)
1418  	PADDD X9, X6
1419  	PXOR  X6, X3
1420  	MOVO  X3, X12
1421  	PSLLL $0x07, X12
1422  	PSRLL $0x19, X3
1423  	PXOR  X12, X3
      	// Hand-encoded PALIGNR $12, X3, X3 / PALIGNR $8, X6, X6 / PALIGNR $4, X9, X9:
      	// undo the row rotation applied before the second half.
1424  	BYTE  $0x66
1425  	BYTE  $0x0f
1426  	BYTE  $0x3a
1427  	BYTE  $0x0f
1428  	BYTE  $0xdb
1429  	BYTE  $0x0c
1430  	BYTE  $0x66
1431  	BYTE  $0x0f
1432  	BYTE  $0x3a
1433  	BYTE  $0x0f
1434  	BYTE  $0xf6
1435  	BYTE  $0x08
1436  	BYTE  $0x66
1437  	BYTE  $0x45
1438  	BYTE  $0x0f
1439  	BYTE  $0x3a
1440  	BYTE  $0x0f
1441  	BYTE  $0xc9
1442  	BYTE  $0x04
      	// While CX is still at least 16 (unsigned), go back through
      	// openSSETail64LoopA (defined above this section) before the next round.
1443  	CMPQ  CX, $0x10
1444  	JAE   openSSETail64LoopA
      	// Otherwise keep running rounds only, until R9 hits 0xa0.
1445  	CMPQ  R9, $0xa0
1446  	JNE   openSSETail64LoopB
      	// Add the constants and the values saved at 32(BP), 48(BP) and 80(BP) to the
      	// four rows (presumably the state the rounds started from - TODO confirm
      	// against the setup above this section).
1447  	PADDL ·chacha20Constants<>+0(SB), X0
1448  	PADDL 32(BP), X3
1449  	PADDL 48(BP), X6
1450  	PADDL 80(BP), X9
1451  
1452  openSSETail64DecLoop:
      	// XOR the remaining input with the rows X0, X3, X6, X9 in 16-byte steps.
      	// BX is the number of bytes left, SI the input pointer, DI the output pointer.
      	// Leaves the loop as soon as fewer than 16 bytes remain (unsigned compare).
1453  	CMPQ  BX, $0x10
1454  	JB    openSSETail64DecLoopDone
1455  	SUBQ  $0x10, BX
      	// out[0:16] = in[0:16] XOR X0 (unaligned load and store)
1456  	MOVOU (SI), X12
1457  	PXOR  X12, X0
1458  	MOVOU X0, (DI)
1459  	LEAQ  16(SI), SI
1460  	LEAQ  16(DI), DI
      	// Shift the next rows down so that X0 always holds the next 16 bytes to use.
1461  	MOVO  X3, X0
1462  	MOVO  X6, X3
1463  	MOVO  X9, X6
1464  	JMP   openSSETail64DecLoop
1465  
1466  openSSETail64DecLoopDone:
      	// Fewer than 16 bytes are left. Copy the next unused 16-byte row from X0 into
      	// X1 and continue at openSSETail16 (defined outside this section).
1467  	MOVO X0, X1
1468  	JMP  openSSETail16
1469  
1470  openSSETail128:
      	// Set up two ChaCha blocks for the tail path.
      	// Block in X1, X4, X7, X10: constants, rows from 32(BP) and 48(BP), and the
      	// row at 128(BP) advanced once by sseIncMask. That row is saved at 80(BP).
1471  	MOVO  ·chacha20Constants<>+0(SB), X1
1472  	MOVO  32(BP), X4
1473  	MOVO  48(BP), X7
1474  	MOVO  128(BP), X10
1475  	PADDL ·sseIncMask<>+0(SB), X10
1476  	MOVO  X10, 80(BP)
      	// Block in X0, X3, X6, X9: a copy of the first with the last row advanced by
      	// sseIncMask once more. That row is saved at 96(BP).
1477  	MOVO  X1, X0
1478  	MOVO  X4, X3
1479  	MOVO  X7, X6
1480  	MOVO  X10, X9
1481  	PADDL ·sseIncMask<>+0(SB), X9
1482  	MOVO  X9, 96(BP)
      	// R9 = 0 is the running byte offset / round counter.
      	// CX = BX rounded down to a multiple of 16.
1483  	XORQ  R9, R9
1484  	MOVQ  BX, CX
1485  	ANDQ  $-16, CX
1486  
1487  openSSETail128LoopA:
      	// Absorb 16 bytes of input at SI+R9 into the accumulator R10:R11:R12.
      	// The ADCQ $0x01 also adds 1 to the third limb (weight 2^128).
1488  	ADDQ  (SI)(R9*1), R10
1489  	ADCQ  8(SI)(R9*1), R11
1490  	ADCQ  $0x01, R12
      	// Multiply the accumulator by the two 64-bit words at (BP) and 8(BP)
      	// (presumably the Poly1305 r value - TODO confirm where it is stored).
      	// The product limbs end up in R13, R14, R15, R8, lowest first.
1491  	MOVQ  (BP), AX
1492  	MOVQ  AX, R15
1493  	MULQ  R10
1494  	MOVQ  AX, R13
1495  	MOVQ  DX, R14
1496  	MOVQ  (BP), AX
1497  	MULQ  R11
1498  	IMULQ R12, R15
1499  	ADDQ  AX, R14
1500  	ADCQ  DX, R15
1501  	MOVQ  8(BP), AX
1502  	MOVQ  AX, R8
1503  	MULQ  R10
1504  	ADDQ  AX, R14
1505  	ADCQ  $0x00, DX
1506  	MOVQ  DX, R10
1507  	MOVQ  8(BP), AX
1508  	MULQ  R11
1509  	ADDQ  AX, R15
1510  	ADCQ  $0x00, DX
1511  	IMULQ R12, R8
1512  	ADDQ  R10, R15
1513  	ADCQ  DX, R8
      	// Reduce: keep the low 130 bits (R13, R14, R15 & 3) and add the bits above
      	// them twice, once as (R15 & ~3, R8) and once as (R8:R15 >> 2), which
      	// together equal 5 times the part above bit 130.
1514  	MOVQ  R13, R10
1515  	MOVQ  R14, R11
1516  	MOVQ  R15, R12
1517  	ANDQ  $0x03, R12
1518  	MOVQ  R15, R13
1519  	ANDQ  $-4, R13
1520  	MOVQ  R8, R14
      	// Three-operand SHRQ is a double shift: R15 is shifted right by 2 with bits
      	// filled in from R8.
1521  	SHRQ  $0x02, R8, R15
1522  	SHRQ  $0x02, R8
1523  	ADDQ  R13, R10
1524  	ADCQ  R14, R11
1525  	ADCQ  $0x00, R12
1526  	ADDQ  R15, R10
1527  	ADCQ  R8, R11
1528  	ADCQ  $0x00, R12
1529  
1530  openSSETail128LoopB:
      	// One ChaCha double round on two blocks, (X0, X3, X6, X9) and
      	// (X1, X4, X7, X10), with X12 as scratch. R9 grows by 16 per pass and is
      	// used both as the hashing offset and as the round counter.
1531  	ADDQ  $0x10, R9
      	// First half, block X0/X3/X6/X9: rotations 16, 12, 8, 7.
1532  	PADDD X3, X0
1533  	PXOR  X0, X9
1534  	ROL16(X9, X12)
1535  	PADDD X9, X6
1536  	PXOR  X6, X3
1537  	MOVO  X3, X12
1538  	PSLLL $0x0c, X12
1539  	PSRLL $0x14, X3
1540  	PXOR  X12, X3
1541  	PADDD X3, X0
1542  	PXOR  X0, X9
1543  	ROL8(X9, X12)
1544  	PADDD X9, X6
1545  	PXOR  X6, X3
1546  	MOVO  X3, X12
1547  	PSLLL $0x07, X12
1548  	PSRLL $0x19, X3
1549  	PXOR  X12, X3
      	// First half, block X1/X4/X7/X10.
1550  	PADDD X4, X1
1551  	PXOR  X1, X10
1552  	ROL16(X10, X12)
1553  	PADDD X10, X7
1554  	PXOR  X7, X4
1555  	MOVO  X4, X12
1556  	PSLLL $0x0c, X12
1557  	PSRLL $0x14, X4
1558  	PXOR  X12, X4
1559  	PADDD X4, X1
1560  	PXOR  X1, X10
1561  	ROL8(X10, X12)
1562  	PADDD X10, X7
1563  	PXOR  X7, X4
1564  	MOVO  X4, X12
1565  	PSLLL $0x07, X12
1566  	PSRLL $0x19, X4
1567  	PXOR  X12, X4
      	// Hand-encoded PALIGNR on both blocks: X3 and X4 by 4 bytes, X6 and X7 by
      	// 8 bytes, X9 and X10 by 12 bytes (row rotation before the second half).
1568  	BYTE  $0x66
1569  	BYTE  $0x0f
1570  	BYTE  $0x3a
1571  	BYTE  $0x0f
1572  	BYTE  $0xdb
1573  	BYTE  $0x04
1574  	BYTE  $0x66
1575  	BYTE  $0x0f
1576  	BYTE  $0x3a
1577  	BYTE  $0x0f
1578  	BYTE  $0xf6
1579  	BYTE  $0x08
1580  	BYTE  $0x66
1581  	BYTE  $0x45
1582  	BYTE  $0x0f
1583  	BYTE  $0x3a
1584  	BYTE  $0x0f
1585  	BYTE  $0xc9
1586  	BYTE  $0x0c
1587  	BYTE  $0x66
1588  	BYTE  $0x0f
1589  	BYTE  $0x3a
1590  	BYTE  $0x0f
1591  	BYTE  $0xe4
1592  	BYTE  $0x04
1593  	BYTE  $0x66
1594  	BYTE  $0x0f
1595  	BYTE  $0x3a
1596  	BYTE  $0x0f
1597  	BYTE  $0xff
1598  	BYTE  $0x08
1599  	BYTE  $0x66
1600  	BYTE  $0x45
1601  	BYTE  $0x0f
1602  	BYTE  $0x3a
1603  	BYTE  $0x0f
1604  	BYTE  $0xd2
1605  	BYTE  $0x0c
      	// Second half, block X0/X3/X6/X9.
1606  	PADDD X3, X0
1607  	PXOR  X0, X9
1608  	ROL16(X9, X12)
1609  	PADDD X9, X6
1610  	PXOR  X6, X3
1611  	MOVO  X3, X12
1612  	PSLLL $0x0c, X12
1613  	PSRLL $0x14, X3
1614  	PXOR  X12, X3
1615  	PADDD X3, X0
1616  	PXOR  X0, X9
1617  	ROL8(X9, X12)
1618  	PADDD X9, X6
1619  	PXOR  X6, X3
1620  	MOVO  X3, X12
1621  	PSLLL $0x07, X12
1622  	PSRLL $0x19, X3
1623  	PXOR  X12, X3
      	// Second half, block X1/X4/X7/X10.
1624  	PADDD X4, X1
1625  	PXOR  X1, X10
1626  	ROL16(X10, X12)
1627  	PADDD X10, X7
1628  	PXOR  X7, X4
1629  	MOVO  X4, X12
1630  	PSLLL $0x0c, X12
1631  	PSRLL $0x14, X4
1632  	PXOR  X12, X4
1633  	PADDD X4, X1
1634  	PXOR  X1, X10
1635  	ROL8(X10, X12)
1636  	PADDD X10, X7
1637  	PXOR  X7, X4
1638  	MOVO  X4, X12
1639  	PSLLL $0x07, X12
1640  	PSRLL $0x19, X4
1641  	PXOR  X12, X4
      	// Hand-encoded PALIGNR undoing the row rotation: X3 and X4 by 12 bytes,
      	// X6 and X7 by 8 bytes, X9 and X10 by 4 bytes.
1642  	BYTE  $0x66
1643  	BYTE  $0x0f
1644  	BYTE  $0x3a
1645  	BYTE  $0x0f
1646  	BYTE  $0xdb
1647  	BYTE  $0x0c
1648  	BYTE  $0x66
1649  	BYTE  $0x0f
1650  	BYTE  $0x3a
1651  	BYTE  $0x0f
1652  	BYTE  $0xf6
1653  	BYTE  $0x08
1654  	BYTE  $0x66
1655  	BYTE  $0x45
1656  	BYTE  $0x0f
1657  	BYTE  $0x3a
1658  	BYTE  $0x0f
1659  	BYTE  $0xc9
1660  	BYTE  $0x04
1661  	BYTE  $0x66
1662  	BYTE  $0x0f
1663  	BYTE  $0x3a
1664  	BYTE  $0x0f
1665  	BYTE  $0xe4
1666  	BYTE  $0x0c
1667  	BYTE  $0x66
1668  	BYTE  $0x0f
1669  	BYTE  $0x3a
1670  	BYTE  $0x0f
1671  	BYTE  $0xff
1672  	BYTE  $0x08
1673  	BYTE  $0x66
1674  	BYTE  $0x45
1675  	BYTE  $0x0f
1676  	BYTE  $0x3a
1677  	BYTE  $0x0f
1678  	BYTE  $0xd2
1679  	BYTE  $0x04
      	// While the offset R9 is below CX (unsigned), hash another 16 bytes in
      	// openSSETail128LoopA before the next double round.
1680  	CMPQ  R9, CX
1681  	JB    openSSETail128LoopA
      	// Otherwise run rounds only, until R9 reaches 0xa0 (10 passes from R9 = 0).
1682  	CMPQ  R9, $0xa0
1683  	JNE   openSSETail128LoopB
      	// Add the starting state back in: constants, the rows at 32(BP) and 48(BP),
      	// and the per-block last rows saved at 96(BP) (for X9) and 80(BP) (for X10).
1684  	PADDL ·chacha20Constants<>+0(SB), X0
1685  	PADDL ·chacha20Constants<>+0(SB), X1
1686  	PADDL 32(BP), X3
1687  	PADDL 32(BP), X4
1688  	PADDL 48(BP), X6
1689  	PADDL 48(BP), X7
1690  	PADDL 96(BP), X9
1691  	PADDL 80(BP), X10
      	// XOR the first 64 input bytes with block X1/X4/X7/X10 and store them.
1692  	MOVOU (SI), X12
1693  	MOVOU 16(SI), X13
1694  	MOVOU 32(SI), X14
1695  	MOVOU 48(SI), X15
1696  	PXOR  X12, X1
1697  	PXOR  X13, X4
1698  	PXOR  X14, X7
1699  	PXOR  X15, X10
1700  	MOVOU X1, (DI)
1701  	MOVOU X4, 16(DI)
1702  	MOVOU X7, 32(DI)
1703  	MOVOU X10, 48(DI)
      	// 64 bytes consumed. The rest is handled 16 bytes at a time by
      	// openSSETail64DecLoop using the block left in X0, X3, X6, X9.
1704  	SUBQ  $0x40, BX
1705  	LEAQ  64(SI), SI
1706  	LEAQ  64(DI), DI
1707  	JMP   openSSETail64DecLoop
1708  
1709  openSSETail192:
      	// Set up three ChaCha blocks for the tail path. Each block uses the constants,
      	// the rows at 32(BP) and 48(BP), and a last row derived from 128(BP) by
      	// repeated addition of sseIncMask.
      	// Block X2/X5/X8/X11: last row advanced once, saved at 80(BP).
1710  	MOVO    ·chacha20Constants<>+0(SB), X2
1711  	MOVO    32(BP), X5
1712  	MOVO    48(BP), X8
1713  	MOVO    128(BP), X11
1714  	PADDL   ·sseIncMask<>+0(SB), X11
1715  	MOVO    X11, 80(BP)
      	// Block X1/X4/X7/X10: last row advanced twice, saved at 96(BP).
1716  	MOVO    X2, X1
1717  	MOVO    X5, X4
1718  	MOVO    X8, X7
1719  	MOVO    X11, X10
1720  	PADDL   ·sseIncMask<>+0(SB), X10
1721  	MOVO    X10, 96(BP)
      	// Block X0/X3/X6/X9: last row advanced three times, saved at 112(BP).
1722  	MOVO    X1, X0
1723  	MOVO    X4, X3
1724  	MOVO    X7, X6
1725  	MOVO    X10, X9
1726  	PADDL   ·sseIncMask<>+0(SB), X9
1727  	MOVO    X9, 112(BP)
      	// CX = BX capped at 0xa0 (CMOVQGT is a signed compare), then rounded down to
      	// a multiple of 16. R9 is only a temporary for the cap here and is then
      	// cleared to serve as the running offset / round counter.
1728  	MOVQ    BX, CX
1729  	MOVQ    $0x000000a0, R9
1730  	CMPQ    CX, $0xa0
1731  	CMOVQGT R9, CX
1732  	ANDQ    $-16, CX
1733  	XORQ    R9, R9
1734  
1735  openSSLTail192LoopA:
      	// NOTE(review): this label is spelled "SSL" while neighbouring labels use
      	// "SSE". The spelling is used consistently by the jumps that target it.
      	// Absorb 16 bytes of input at SI+R9 into the accumulator R10:R11:R12.
      	// The ADCQ $0x01 also adds 1 to the third limb (weight 2^128).
1736  	ADDQ  (SI)(R9*1), R10
1737  	ADCQ  8(SI)(R9*1), R11
1738  	ADCQ  $0x01, R12
      	// Multiply the accumulator by the two 64-bit words at (BP) and 8(BP)
      	// (presumably the Poly1305 r value - TODO confirm where it is stored).
      	// The product limbs end up in R13, R14, R15, R8, lowest first.
1739  	MOVQ  (BP), AX
1740  	MOVQ  AX, R15
1741  	MULQ  R10
1742  	MOVQ  AX, R13
1743  	MOVQ  DX, R14
1744  	MOVQ  (BP), AX
1745  	MULQ  R11
1746  	IMULQ R12, R15
1747  	ADDQ  AX, R14
1748  	ADCQ  DX, R15
1749  	MOVQ  8(BP), AX
1750  	MOVQ  AX, R8
1751  	MULQ  R10
1752  	ADDQ  AX, R14
1753  	ADCQ  $0x00, DX
1754  	MOVQ  DX, R10
1755  	MOVQ  8(BP), AX
1756  	MULQ  R11
1757  	ADDQ  AX, R15
1758  	ADCQ  $0x00, DX
1759  	IMULQ R12, R8
1760  	ADDQ  R10, R15
1761  	ADCQ  DX, R8
      	// Reduce: keep the low 130 bits (R13, R14, R15 & 3) and add the bits above
      	// them twice, once as (R15 & ~3, R8) and once as (R8:R15 >> 2), which
      	// together equal 5 times the part above bit 130.
1762  	MOVQ  R13, R10
1763  	MOVQ  R14, R11
1764  	MOVQ  R15, R12
1765  	ANDQ  $0x03, R12
1766  	MOVQ  R15, R13
1767  	ANDQ  $-4, R13
1768  	MOVQ  R8, R14
      	// Three-operand SHRQ is a double shift: R15 is shifted right by 2 with bits
      	// filled in from R8.
1769  	SHRQ  $0x02, R8, R15
1770  	SHRQ  $0x02, R8
1771  	ADDQ  R13, R10
1772  	ADCQ  R14, R11
1773  	ADCQ  $0x00, R12
1774  	ADDQ  R15, R10
1775  	ADCQ  R8, R11
1776  	ADCQ  $0x00, R12
1777  
1778  openSSLTail192LoopB:
      	// One ChaCha double round on three blocks, (X0, X3, X6, X9),
      	// (X1, X4, X7, X10) and (X2, X5, X8, X11), with X12 as scratch.
      	// R9 grows by 16 per pass: it is the hashing offset and the round counter.
1779  	ADDQ  $0x10, R9
      	// First half, block X0/X3/X6/X9: rotations 16, 12, 8, 7.
1780  	PADDD X3, X0
1781  	PXOR  X0, X9
1782  	ROL16(X9, X12)
1783  	PADDD X9, X6
1784  	PXOR  X6, X3
1785  	MOVO  X3, X12
1786  	PSLLL $0x0c, X12
1787  	PSRLL $0x14, X3
1788  	PXOR  X12, X3
1789  	PADDD X3, X0
1790  	PXOR  X0, X9
1791  	ROL8(X9, X12)
1792  	PADDD X9, X6
1793  	PXOR  X6, X3
1794  	MOVO  X3, X12
1795  	PSLLL $0x07, X12
1796  	PSRLL $0x19, X3
1797  	PXOR  X12, X3
      	// First half, block X1/X4/X7/X10.
1798  	PADDD X4, X1
1799  	PXOR  X1, X10
1800  	ROL16(X10, X12)
1801  	PADDD X10, X7
1802  	PXOR  X7, X4
1803  	MOVO  X4, X12
1804  	PSLLL $0x0c, X12
1805  	PSRLL $0x14, X4
1806  	PXOR  X12, X4
1807  	PADDD X4, X1
1808  	PXOR  X1, X10
1809  	ROL8(X10, X12)
1810  	PADDD X10, X7
1811  	PXOR  X7, X4
1812  	MOVO  X4, X12
1813  	PSLLL $0x07, X12
1814  	PSRLL $0x19, X4
1815  	PXOR  X12, X4
      	// First half, block X2/X5/X8/X11.
1816  	PADDD X5, X2
1817  	PXOR  X2, X11
1818  	ROL16(X11, X12)
1819  	PADDD X11, X8
1820  	PXOR  X8, X5
1821  	MOVO  X5, X12
1822  	PSLLL $0x0c, X12
1823  	PSRLL $0x14, X5
1824  	PXOR  X12, X5
1825  	PADDD X5, X2
1826  	PXOR  X2, X11
1827  	ROL8(X11, X12)
1828  	PADDD X11, X8
1829  	PXOR  X8, X5
1830  	MOVO  X5, X12
1831  	PSLLL $0x07, X12
1832  	PSRLL $0x19, X5
1833  	PXOR  X12, X5
      	// Hand-encoded PALIGNR on all three blocks: X3, X4, X5 by 4 bytes,
      	// X6, X7, X8 by 8 bytes, X9, X10, X11 by 12 bytes.
1834  	BYTE  $0x66
1835  	BYTE  $0x0f
1836  	BYTE  $0x3a
1837  	BYTE  $0x0f
1838  	BYTE  $0xdb
1839  	BYTE  $0x04
1840  	BYTE  $0x66
1841  	BYTE  $0x0f
1842  	BYTE  $0x3a
1843  	BYTE  $0x0f
1844  	BYTE  $0xf6
1845  	BYTE  $0x08
1846  	BYTE  $0x66
1847  	BYTE  $0x45
1848  	BYTE  $0x0f
1849  	BYTE  $0x3a
1850  	BYTE  $0x0f
1851  	BYTE  $0xc9
1852  	BYTE  $0x0c
1853  	BYTE  $0x66
1854  	BYTE  $0x0f
1855  	BYTE  $0x3a
1856  	BYTE  $0x0f
1857  	BYTE  $0xe4
1858  	BYTE  $0x04
1859  	BYTE  $0x66
1860  	BYTE  $0x0f
1861  	BYTE  $0x3a
1862  	BYTE  $0x0f
1863  	BYTE  $0xff
1864  	BYTE  $0x08
1865  	BYTE  $0x66
1866  	BYTE  $0x45
1867  	BYTE  $0x0f
1868  	BYTE  $0x3a
1869  	BYTE  $0x0f
1870  	BYTE  $0xd2
1871  	BYTE  $0x0c
1872  	BYTE  $0x66
1873  	BYTE  $0x0f
1874  	BYTE  $0x3a
1875  	BYTE  $0x0f
1876  	BYTE  $0xed
1877  	BYTE  $0x04
1878  	BYTE  $0x66
1879  	BYTE  $0x45
1880  	BYTE  $0x0f
1881  	BYTE  $0x3a
1882  	BYTE  $0x0f
1883  	BYTE  $0xc0
1884  	BYTE  $0x08
1885  	BYTE  $0x66
1886  	BYTE  $0x45
1887  	BYTE  $0x0f
1888  	BYTE  $0x3a
1889  	BYTE  $0x0f
1890  	BYTE  $0xdb
1891  	BYTE  $0x0c
      	// Second half, block X0/X3/X6/X9.
1892  	PADDD X3, X0
1893  	PXOR  X0, X9
1894  	ROL16(X9, X12)
1895  	PADDD X9, X6
1896  	PXOR  X6, X3
1897  	MOVO  X3, X12
1898  	PSLLL $0x0c, X12
1899  	PSRLL $0x14, X3
1900  	PXOR  X12, X3
1901  	PADDD X3, X0
1902  	PXOR  X0, X9
1903  	ROL8(X9, X12)
1904  	PADDD X9, X6
1905  	PXOR  X6, X3
1906  	MOVO  X3, X12
1907  	PSLLL $0x07, X12
1908  	PSRLL $0x19, X3
1909  	PXOR  X12, X3
      	// Second half, block X1/X4/X7/X10.
1910  	PADDD X4, X1
1911  	PXOR  X1, X10
1912  	ROL16(X10, X12)
1913  	PADDD X10, X7
1914  	PXOR  X7, X4
1915  	MOVO  X4, X12
1916  	PSLLL $0x0c, X12
1917  	PSRLL $0x14, X4
1918  	PXOR  X12, X4
1919  	PADDD X4, X1
1920  	PXOR  X1, X10
1921  	ROL8(X10, X12)
1922  	PADDD X10, X7
1923  	PXOR  X7, X4
1924  	MOVO  X4, X12
1925  	PSLLL $0x07, X12
1926  	PSRLL $0x19, X4
1927  	PXOR  X12, X4
      	// Second half, block X2/X5/X8/X11.
1928  	PADDD X5, X2
1929  	PXOR  X2, X11
1930  	ROL16(X11, X12)
1931  	PADDD X11, X8
1932  	PXOR  X8, X5
1933  	MOVO  X5, X12
1934  	PSLLL $0x0c, X12
1935  	PSRLL $0x14, X5
1936  	PXOR  X12, X5
1937  	PADDD X5, X2
1938  	PXOR  X2, X11
1939  	ROL8(X11, X12)
1940  	PADDD X11, X8
1941  	PXOR  X8, X5
1942  	MOVO  X5, X12
1943  	PSLLL $0x07, X12
1944  	PSRLL $0x19, X5
1945  	PXOR  X12, X5
      	// Hand-encoded PALIGNR undoing the row rotation: X3, X4, X5 by 12 bytes,
      	// X6, X7, X8 by 8 bytes, X9, X10, X11 by 4 bytes.
1946  	BYTE  $0x66
1947  	BYTE  $0x0f
1948  	BYTE  $0x3a
1949  	BYTE  $0x0f
1950  	BYTE  $0xdb
1951  	BYTE  $0x0c
1952  	BYTE  $0x66
1953  	BYTE  $0x0f
1954  	BYTE  $0x3a
1955  	BYTE  $0x0f
1956  	BYTE  $0xf6
1957  	BYTE  $0x08
1958  	BYTE  $0x66
1959  	BYTE  $0x45
1960  	BYTE  $0x0f
1961  	BYTE  $0x3a
1962  	BYTE  $0x0f
1963  	BYTE  $0xc9
1964  	BYTE  $0x04
1965  	BYTE  $0x66
1966  	BYTE  $0x0f
1967  	BYTE  $0x3a
1968  	BYTE  $0x0f
1969  	BYTE  $0xe4
1970  	BYTE  $0x0c
1971  	BYTE  $0x66
1972  	BYTE  $0x0f
1973  	BYTE  $0x3a
1974  	BYTE  $0x0f
1975  	BYTE  $0xff
1976  	BYTE  $0x08
1977  	BYTE  $0x66
1978  	BYTE  $0x45
1979  	BYTE  $0x0f
1980  	BYTE  $0x3a
1981  	BYTE  $0x0f
1982  	BYTE  $0xd2
1983  	BYTE  $0x04
1984  	BYTE  $0x66
1985  	BYTE  $0x0f
1986  	BYTE  $0x3a
1987  	BYTE  $0x0f
1988  	BYTE  $0xed
1989  	BYTE  $0x0c
1990  	BYTE  $0x66
1991  	BYTE  $0x45
1992  	BYTE  $0x0f
1993  	BYTE  $0x3a
1994  	BYTE  $0x0f
1995  	BYTE  $0xc0
1996  	BYTE  $0x08
1997  	BYTE  $0x66
1998  	BYTE  $0x45
1999  	BYTE  $0x0f
2000  	BYTE  $0x3a
2001  	BYTE  $0x0f
2002  	BYTE  $0xdb
2003  	BYTE  $0x04
      	// While the offset R9 is below CX (unsigned), hash another 16 bytes in
      	// openSSLTail192LoopA before the next double round.
2004  	CMPQ  R9, CX
2005  	JB    openSSLTail192LoopA
      	// Otherwise run rounds only, until R9 reaches 0xa0 (10 passes from R9 = 0).
2006  	CMPQ  R9, $0xa0
2007  	JNE   openSSLTail192LoopB
      	// The round loop hashes at offsets below 0xa0 only. If at least 0xb0 bytes
      	// remain, also hash the 16 bytes at offset 160.
2008  	CMPQ  BX, $0xb0
2009  	JB    openSSLTail192Store
      	// acc (R10:R11:R12) += 16 bytes at 160(SI), plus 1 in the third limb.
2010  	ADDQ  160(SI), R10
2011  	ADCQ  168(SI), R11
2012  	ADCQ  $0x01, R12
      	// acc *= the words at (BP) and 8(BP); product limbs in R13, R14, R15, R8.
2013  	MOVQ  (BP), AX
2014  	MOVQ  AX, R15
2015  	MULQ  R10
2016  	MOVQ  AX, R13
2017  	MOVQ  DX, R14
2018  	MOVQ  (BP), AX
2019  	MULQ  R11
2020  	IMULQ R12, R15
2021  	ADDQ  AX, R14
2022  	ADCQ  DX, R15
2023  	MOVQ  8(BP), AX
2024  	MOVQ  AX, R8
2025  	MULQ  R10
2026  	ADDQ  AX, R14
2027  	ADCQ  $0x00, DX
2028  	MOVQ  DX, R10
2029  	MOVQ  8(BP), AX
2030  	MULQ  R11
2031  	ADDQ  AX, R15
2032  	ADCQ  $0x00, DX
2033  	IMULQ R12, R8
2034  	ADDQ  R10, R15
2035  	ADCQ  DX, R8
      	// Reduce: low 130 bits plus (R15 & ~3, R8) plus (R8:R15 >> 2).
2036  	MOVQ  R13, R10
2037  	MOVQ  R14, R11
2038  	MOVQ  R15, R12
2039  	ANDQ  $0x03, R12
2040  	MOVQ  R15, R13
2041  	ANDQ  $-4, R13
2042  	MOVQ  R8, R14
2043  	SHRQ  $0x02, R8, R15
2044  	SHRQ  $0x02, R8
2045  	ADDQ  R13, R10
2046  	ADCQ  R14, R11
2047  	ADCQ  $0x00, R12
2048  	ADDQ  R15, R10
2049  	ADCQ  R8, R11
2050  	ADCQ  $0x00, R12
      	// If at least 0xc0 bytes remain, also hash the 16 bytes at offset 176.
2051  	CMPQ  BX, $0xc0
2052  	JB    openSSLTail192Store
      	// acc (R10:R11:R12) += 16 bytes at 176(SI), plus 1 in the third limb.
2053  	ADDQ  176(SI), R10
2054  	ADCQ  184(SI), R11
2055  	ADCQ  $0x01, R12
      	// acc *= the words at (BP) and 8(BP); product limbs in R13, R14, R15, R8.
2056  	MOVQ  (BP), AX
2057  	MOVQ  AX, R15
2058  	MULQ  R10
2059  	MOVQ  AX, R13
2060  	MOVQ  DX, R14
2061  	MOVQ  (BP), AX
2062  	MULQ  R11
2063  	IMULQ R12, R15
2064  	ADDQ  AX, R14
2065  	ADCQ  DX, R15
2066  	MOVQ  8(BP), AX
2067  	MOVQ  AX, R8
2068  	MULQ  R10
2069  	ADDQ  AX, R14
2070  	ADCQ  $0x00, DX
2071  	MOVQ  DX, R10
2072  	MOVQ  8(BP), AX
2073  	MULQ  R11
2074  	ADDQ  AX, R15
2075  	ADCQ  $0x00, DX
2076  	IMULQ R12, R8
2077  	ADDQ  R10, R15
2078  	ADCQ  DX, R8
      	// Reduce: low 130 bits plus (R15 & ~3, R8) plus (R8:R15 >> 2).
2079  	MOVQ  R13, R10
2080  	MOVQ  R14, R11
2081  	MOVQ  R15, R12
2082  	ANDQ  $0x03, R12
2083  	MOVQ  R15, R13
2084  	ANDQ  $-4, R13
2085  	MOVQ  R8, R14
2086  	SHRQ  $0x02, R8, R15
2087  	SHRQ  $0x02, R8
2088  	ADDQ  R13, R10
2089  	ADCQ  R14, R11
2090  	ADCQ  $0x00, R12
2091  	ADDQ  R15, R10
2092  	ADCQ  R8, R11
2093  	ADCQ  $0x00, R12
2094  
2095  openSSLTail192Store:
      	// Finish the three blocks: add the constants, the rows at 32(BP) and 48(BP),
      	// and the per-block last rows saved at 112(BP), 96(BP) and 80(BP).
2096  	PADDL ·chacha20Constants<>+0(SB), X0
2097  	PADDL ·chacha20Constants<>+0(SB), X1
2098  	PADDL ·chacha20Constants<>+0(SB), X2
2099  	PADDL 32(BP), X3
2100  	PADDL 32(BP), X4
2101  	PADDL 32(BP), X5
2102  	PADDL 48(BP), X6
2103  	PADDL 48(BP), X7
2104  	PADDL 48(BP), X8
2105  	PADDL 112(BP), X9
2106  	PADDL 96(BP), X10
2107  	PADDL 80(BP), X11
      	// Input bytes 0..63 are XORed with block X2/X5/X8/X11 and stored.
2108  	MOVOU (SI), X12
2109  	MOVOU 16(SI), X13
2110  	MOVOU 32(SI), X14
2111  	MOVOU 48(SI), X15
2112  	PXOR  X12, X2
2113  	PXOR  X13, X5
2114  	PXOR  X14, X8
2115  	PXOR  X15, X11
2116  	MOVOU X2, (DI)
2117  	MOVOU X5, 16(DI)
2118  	MOVOU X8, 32(DI)
2119  	MOVOU X11, 48(DI)
      	// Input bytes 64..127 are XORed with block X1/X4/X7/X10 and stored.
2120  	MOVOU 64(SI), X12
2121  	MOVOU 80(SI), X13
2122  	MOVOU 96(SI), X14
2123  	MOVOU 112(SI), X15
2124  	PXOR  X12, X1
2125  	PXOR  X13, X4
2126  	PXOR  X14, X7
2127  	PXOR  X15, X10
2128  	MOVOU X1, 64(DI)
2129  	MOVOU X4, 80(DI)
2130  	MOVOU X7, 96(DI)
2131  	MOVOU X10, 112(DI)
      	// 128 bytes consumed. The rest is handled 16 bytes at a time by
      	// openSSETail64DecLoop using the block left in X0, X3, X6, X9.
2132  	SUBQ  $0x80, BX
2133  	LEAQ  128(SI), SI
2134  	LEAQ  128(DI), DI
2135  	JMP   openSSETail64DecLoop
2136  
2137  openSSETail256:
      	// Set up four ChaCha blocks for the tail path. Every block uses the constants
      	// and the rows at 32(BP) and 48(BP). The last rows are derived from 128(BP)
      	// by adding sseIncMask one to four times.
      	// Block X0/X3/X6/X9: last row advanced once.
2138  	MOVO  ·chacha20Constants<>+0(SB), X0
2139  	MOVO  32(BP), X3
2140  	MOVO  48(BP), X6
2141  	MOVO  128(BP), X9
2142  	PADDL ·sseIncMask<>+0(SB), X9
      	// Block X1/X4/X7/X10: last row advanced twice.
2143  	MOVO  X0, X1
2144  	MOVO  X3, X4
2145  	MOVO  X6, X7
2146  	MOVO  X9, X10
2147  	PADDL ·sseIncMask<>+0(SB), X10
      	// Block X2/X5/X8/X11: last row advanced three times.
2148  	MOVO  X1, X2
2149  	MOVO  X4, X5
2150  	MOVO  X7, X8
2151  	MOVO  X10, X11
2152  	PADDL ·sseIncMask<>+0(SB), X11
      	// Block X12/X13/X14/X15: last row advanced four times.
2153  	MOVO  X2, X12
2154  	MOVO  X5, X13
2155  	MOVO  X8, X14
2156  	MOVO  X11, X15
2157  	PADDL ·sseIncMask<>+0(SB), X15
2158  
2159  	// Store counters
      	// The four last rows go to 80(BP), 96(BP), 112(BP) and 128(BP). The slot at
      	// 128(BP) is overwritten with the most advanced row.
2160  	MOVO X9, 80(BP)
2161  	MOVO X10, 96(BP)
2162  	MOVO X11, 112(BP)
2163  	MOVO X15, 128(BP)
      	// R9 = 0: running byte offset for hashing and round counter.
2164  	XORQ R9, R9
2165  
2166  openSSETail256Loop:
      	// Each pass hashes 16 input bytes at SI+R9 and runs one ChaCha double round
      	// on four blocks. The scalar hash instructions are interleaved with the
      	// vector rounds. All sixteen XMM registers hold block state, so one row at a
      	// time is spilled to 64(BP) to free a scratch register.
      	// acc (R10:R11:R12) += 16 bytes at SI+R9, plus 1 in the third limb (2^128).
2167  	ADDQ  (SI)(R9*1), R10
2168  	ADCQ  8(SI)(R9*1), R11
2169  	ADCQ  $0x01, R12
      	// Spill X14 so it can serve as scratch for the first three blocks.
2170  	MOVO  X14, 64(BP)
      	// First half, block X0/X3/X6/X9: rotations 16, 12, 8, 7.
2171  	PADDD X3, X0
2172  	PXOR  X0, X9
2173  	ROL16(X9, X14)
2174  	PADDD X9, X6
2175  	PXOR  X6, X3
2176  	MOVO  X3, X14
2177  	PSLLL $0x0c, X14
2178  	PSRLL $0x14, X3
2179  	PXOR  X14, X3
2180  	PADDD X3, X0
2181  	PXOR  X0, X9
2182  	ROL8(X9, X14)
2183  	PADDD X9, X6
2184  	PXOR  X6, X3
2185  	MOVO  X3, X14
2186  	PSLLL $0x07, X14
2187  	PSRLL $0x19, X3
2188  	PXOR  X14, X3
      	// First half, block X1/X4/X7/X10.
2189  	PADDD X4, X1
2190  	PXOR  X1, X10
2191  	ROL16(X10, X14)
2192  	PADDD X10, X7
2193  	PXOR  X7, X4
2194  	MOVO  X4, X14
2195  	PSLLL $0x0c, X14
2196  	PSRLL $0x14, X4
2197  	PXOR  X14, X4
2198  	PADDD X4, X1
2199  	PXOR  X1, X10
2200  	ROL8(X10, X14)
2201  	PADDD X10, X7
2202  	PXOR  X7, X4
2203  	MOVO  X4, X14
2204  	PSLLL $0x07, X14
2205  	PSRLL $0x19, X4
2206  	PXOR  X14, X4
      	// First half, block X2/X5/X8/X11.
2207  	PADDD X5, X2
2208  	PXOR  X2, X11
2209  	ROL16(X11, X14)
2210  	PADDD X11, X8
2211  	PXOR  X8, X5
2212  	MOVO  X5, X14
2213  	PSLLL $0x0c, X14
2214  	PSRLL $0x14, X5
2215  	PXOR  X14, X5
2216  	PADDD X5, X2
2217  	PXOR  X2, X11
2218  	ROL8(X11, X14)
2219  	PADDD X11, X8
2220  	PXOR  X8, X5
2221  	MOVO  X5, X14
2222  	PSLLL $0x07, X14
2223  	PSRLL $0x19, X5
2224  	PXOR  X14, X5
      	// Restore X14 and spill X7 instead: the fourth block (X12/X13/X14/X15) needs
      	// X14 as a row, so X7 becomes the scratch register.
2225  	MOVO  64(BP), X14
2226  	MOVO  X7, 64(BP)
      	// First half, block X12/X13/X14/X15.
2227  	PADDD X13, X12
2228  	PXOR  X12, X15
2229  	ROL16(X15, X7)
2230  	PADDD X15, X14
2231  	PXOR  X14, X13
2232  	MOVO  X13, X7
2233  	PSLLL $0x0c, X7
2234  	PSRLL $0x14, X13
2235  	PXOR  X7, X13
2236  	PADDD X13, X12
2237  	PXOR  X12, X15
2238  	ROL8(X15, X7)
2239  	PADDD X15, X14
2240  	PXOR  X14, X13
2241  	MOVO  X13, X7
2242  	PSLLL $0x07, X7
2243  	PSRLL $0x19, X13
2244  	PXOR  X7, X13
2245  	MOVO  64(BP), X7
      	// Hand-encoded PALIGNR on all four blocks: X3, X4, X5, X13 by 4 bytes,
      	// X6, X7, X8, X14 by 8 bytes, X9, X10, X11, X15 by 12 bytes.
2246  	BYTE  $0x66
2247  	BYTE  $0x0f
2248  	BYTE  $0x3a
2249  	BYTE  $0x0f
2250  	BYTE  $0xdb
2251  	BYTE  $0x04
2252  	BYTE  $0x66
2253  	BYTE  $0x0f
2254  	BYTE  $0x3a
2255  	BYTE  $0x0f
2256  	BYTE  $0xe4
2257  	BYTE  $0x04
2258  	BYTE  $0x66
2259  	BYTE  $0x0f
2260  	BYTE  $0x3a
2261  	BYTE  $0x0f
2262  	BYTE  $0xed
2263  	BYTE  $0x04
2264  	BYTE  $0x66
2265  	BYTE  $0x45
2266  	BYTE  $0x0f
2267  	BYTE  $0x3a
2268  	BYTE  $0x0f
2269  	BYTE  $0xed
2270  	BYTE  $0x04
2271  	BYTE  $0x66
2272  	BYTE  $0x0f
2273  	BYTE  $0x3a
2274  	BYTE  $0x0f
2275  	BYTE  $0xf6
2276  	BYTE  $0x08
2277  	BYTE  $0x66
2278  	BYTE  $0x0f
2279  	BYTE  $0x3a
2280  	BYTE  $0x0f
2281  	BYTE  $0xff
2282  	BYTE  $0x08
2283  	BYTE  $0x66
2284  	BYTE  $0x45
2285  	BYTE  $0x0f
2286  	BYTE  $0x3a
2287  	BYTE  $0x0f
2288  	BYTE  $0xc0
2289  	BYTE  $0x08
2290  	BYTE  $0x66
2291  	BYTE  $0x45
2292  	BYTE  $0x0f
2293  	BYTE  $0x3a
2294  	BYTE  $0x0f
2295  	BYTE  $0xf6
2296  	BYTE  $0x08
2297  	BYTE  $0x66
2298  	BYTE  $0x45
2299  	BYTE  $0x0f
2300  	BYTE  $0x3a
2301  	BYTE  $0x0f
2302  	BYTE  $0xc9
2303  	BYTE  $0x0c
2304  	BYTE  $0x66
2305  	BYTE  $0x45
2306  	BYTE  $0x0f
2307  	BYTE  $0x3a
2308  	BYTE  $0x0f
2309  	BYTE  $0xd2
2310  	BYTE  $0x0c
2311  	BYTE  $0x66
2312  	BYTE  $0x45
2313  	BYTE  $0x0f
2314  	BYTE  $0x3a
2315  	BYTE  $0x0f
2316  	BYTE  $0xdb
2317  	BYTE  $0x0c
2318  	BYTE  $0x66
2319  	BYTE  $0x45
2320  	BYTE  $0x0f
2321  	BYTE  $0x3a
2322  	BYTE  $0x0f
2323  	BYTE  $0xff
2324  	BYTE  $0x0c
      	// Hash multiply, first part: acc times the words at (BP) and 8(BP). Partial
      	// products go to R13, R14, R15. R8, R10 and DX hold pieces that are combined
      	// after the vector code below, which does not touch them.
2325  	MOVQ  (BP), AX
2326  	MOVQ  AX, R15
2327  	MULQ  R10
2328  	MOVQ  AX, R13
2329  	MOVQ  DX, R14
2330  	MOVQ  (BP), AX
2331  	MULQ  R11
2332  	IMULQ R12, R15
2333  	ADDQ  AX, R14
2334  	ADCQ  DX, R15
2335  	MOVQ  8(BP), AX
2336  	MOVQ  AX, R8
2337  	MULQ  R10
2338  	ADDQ  AX, R14
2339  	ADCQ  $0x00, DX
2340  	MOVQ  DX, R10
2341  	MOVQ  8(BP), AX
2342  	MULQ  R11
2343  	ADDQ  AX, R15
2344  	ADCQ  $0x00, DX
      	// Second half of the double round. X14 is spilled again as scratch.
2345  	MOVO  X14, 64(BP)
      	// Second half, block X0/X3/X6/X9.
2346  	PADDD X3, X0
2347  	PXOR  X0, X9
2348  	ROL16(X9, X14)
2349  	PADDD X9, X6
2350  	PXOR  X6, X3
2351  	MOVO  X3, X14
2352  	PSLLL $0x0c, X14
2353  	PSRLL $0x14, X3
2354  	PXOR  X14, X3
2355  	PADDD X3, X0
2356  	PXOR  X0, X9
2357  	ROL8(X9, X14)
2358  	PADDD X9, X6
2359  	PXOR  X6, X3
2360  	MOVO  X3, X14
2361  	PSLLL $0x07, X14
2362  	PSRLL $0x19, X3
2363  	PXOR  X14, X3
      	// Second half, block X1/X4/X7/X10.
2364  	PADDD X4, X1
2365  	PXOR  X1, X10
2366  	ROL16(X10, X14)
2367  	PADDD X10, X7
2368  	PXOR  X7, X4
2369  	MOVO  X4, X14
2370  	PSLLL $0x0c, X14
2371  	PSRLL $0x14, X4
2372  	PXOR  X14, X4
2373  	PADDD X4, X1
2374  	PXOR  X1, X10
2375  	ROL8(X10, X14)
2376  	PADDD X10, X7
2377  	PXOR  X7, X4
2378  	MOVO  X4, X14
2379  	PSLLL $0x07, X14
2380  	PSRLL $0x19, X4
2381  	PXOR  X14, X4
      	// Second half, block X2/X5/X8/X11.
2382  	PADDD X5, X2
2383  	PXOR  X2, X11
2384  	ROL16(X11, X14)
2385  	PADDD X11, X8
2386  	PXOR  X8, X5
2387  	MOVO  X5, X14
2388  	PSLLL $0x0c, X14
2389  	PSRLL $0x14, X5
2390  	PXOR  X14, X5
2391  	PADDD X5, X2
2392  	PXOR  X2, X11
2393  	ROL8(X11, X14)
2394  	PADDD X11, X8
2395  	PXOR  X8, X5
2396  	MOVO  X5, X14
2397  	PSLLL $0x07, X14
2398  	PSRLL $0x19, X5
2399  	PXOR  X14, X5
      	// Swap the spill again: restore X14, spill X7 as scratch for the fourth block.
2400  	MOVO  64(BP), X14
2401  	MOVO  X7, 64(BP)
      	// Second half, block X12/X13/X14/X15.
2402  	PADDD X13, X12
2403  	PXOR  X12, X15
2404  	ROL16(X15, X7)
2405  	PADDD X15, X14
2406  	PXOR  X14, X13
2407  	MOVO  X13, X7
2408  	PSLLL $0x0c, X7
2409  	PSRLL $0x14, X13
2410  	PXOR  X7, X13
2411  	PADDD X13, X12
2412  	PXOR  X12, X15
2413  	ROL8(X15, X7)
2414  	PADDD X15, X14
2415  	PXOR  X14, X13
2416  	MOVO  X13, X7
2417  	PSLLL $0x07, X7
2418  	PSRLL $0x19, X13
2419  	PXOR  X7, X13
2420  	MOVO  64(BP), X7
      	// Hash multiply, final part: combine the deferred pieces into R15 and R8.
2421  	IMULQ R12, R8
2422  	ADDQ  R10, R15
2423  	ADCQ  DX, R8
      	// Reduce: keep the low 130 bits (R13, R14, R15 & 3) and add the bits above
      	// them as (R15 & ~3, R8) plus (R8:R15 >> 2).
2424  	MOVQ  R13, R10
2425  	MOVQ  R14, R11
2426  	MOVQ  R15, R12
2427  	ANDQ  $0x03, R12
2428  	MOVQ  R15, R13
2429  	ANDQ  $-4, R13
2430  	MOVQ  R8, R14
2431  	SHRQ  $0x02, R8, R15
2432  	SHRQ  $0x02, R8
2433  	ADDQ  R13, R10
2434  	ADCQ  R14, R11
2435  	ADCQ  $0x00, R12
2436  	ADDQ  R15, R10
2437  	ADCQ  R8, R11
2438  	ADCQ  $0x00, R12
      	// Hand-encoded PALIGNR undoing the row rotation: X3, X4, X5, X13 by 12 bytes,
      	// X6, X7, X8, X14 by 8 bytes, X9, X10, X11, X15 by 4 bytes.
2439  	BYTE  $0x66
2440  	BYTE  $0x0f
2441  	BYTE  $0x3a
2442  	BYTE  $0x0f
2443  	BYTE  $0xdb
2444  	BYTE  $0x0c
2445  	BYTE  $0x66
2446  	BYTE  $0x0f
2447  	BYTE  $0x3a
2448  	BYTE  $0x0f
2449  	BYTE  $0xe4
2450  	BYTE  $0x0c
2451  	BYTE  $0x66
2452  	BYTE  $0x0f
2453  	BYTE  $0x3a
2454  	BYTE  $0x0f
2455  	BYTE  $0xed
2456  	BYTE  $0x0c
2457  	BYTE  $0x66
2458  	BYTE  $0x45
2459  	BYTE  $0x0f
2460  	BYTE  $0x3a
2461  	BYTE  $0x0f
2462  	BYTE  $0xed
2463  	BYTE  $0x0c
2464  	BYTE  $0x66
2465  	BYTE  $0x0f
2466  	BYTE  $0x3a
2467  	BYTE  $0x0f
2468  	BYTE  $0xf6
2469  	BYTE  $0x08
2470  	BYTE  $0x66
2471  	BYTE  $0x0f
2472  	BYTE  $0x3a
2473  	BYTE  $0x0f
2474  	BYTE  $0xff
2475  	BYTE  $0x08
2476  	BYTE  $0x66
2477  	BYTE  $0x45
2478  	BYTE  $0x0f
2479  	BYTE  $0x3a
2480  	BYTE  $0x0f
2481  	BYTE  $0xc0
2482  	BYTE  $0x08
2483  	BYTE  $0x66
2484  	BYTE  $0x45
2485  	BYTE  $0x0f
2486  	BYTE  $0x3a
2487  	BYTE  $0x0f
2488  	BYTE  $0xf6
2489  	BYTE  $0x08
2490  	BYTE  $0x66
2491  	BYTE  $0x45
2492  	BYTE  $0x0f
2493  	BYTE  $0x3a
2494  	BYTE  $0x0f
2495  	BYTE  $0xc9
2496  	BYTE  $0x04
2497  	BYTE  $0x66
2498  	BYTE  $0x45
2499  	BYTE  $0x0f
2500  	BYTE  $0x3a
2501  	BYTE  $0x0f
2502  	BYTE  $0xd2
2503  	BYTE  $0x04
2504  	BYTE  $0x66
2505  	BYTE  $0x45
2506  	BYTE  $0x0f
2507  	BYTE  $0x3a
2508  	BYTE  $0x0f
2509  	BYTE  $0xdb
2510  	BYTE  $0x04
2511  	BYTE  $0x66
2512  	BYTE  $0x45
2513  	BYTE  $0x0f
2514  	BYTE  $0x3a
2515  	BYTE  $0x0f
2516  	BYTE  $0xff
2517  	BYTE  $0x04
      	// Next 16 bytes / next double round. Ten passes in total (R9 runs 0 to 0xa0).
2518  	ADDQ  $0x10, R9
2519  	CMPQ  R9, $0xa0
2520  	JB    openSSETail256Loop
      	// CX = BX rounded down to a multiple of 16: the limit for the hash-only loop
      	// that follows.
2521  	MOVQ  BX, CX
2522  	ANDQ  $-16, CX
2523  
2524  openSSETail256HashLoop:
      	// Hash-only loop: absorbs 16 input bytes at SI+R9 per pass and repeats while
      	// R9 is below CX (unsigned). The test is at the bottom, so the body runs at
      	// least once.
      	// acc (R10:R11:R12) += 16 bytes at SI+R9, plus 1 in the third limb (2^128).
2525  	ADDQ  (SI)(R9*1), R10
2526  	ADCQ  8(SI)(R9*1), R11
2527  	ADCQ  $0x01, R12
      	// acc *= the words at (BP) and 8(BP); product limbs in R13, R14, R15, R8.
2528  	MOVQ  (BP), AX
2529  	MOVQ  AX, R15
2530  	MULQ  R10
2531  	MOVQ  AX, R13
2532  	MOVQ  DX, R14
2533  	MOVQ  (BP), AX
2534  	MULQ  R11
2535  	IMULQ R12, R15
2536  	ADDQ  AX, R14
2537  	ADCQ  DX, R15
2538  	MOVQ  8(BP), AX
2539  	MOVQ  AX, R8
2540  	MULQ  R10
2541  	ADDQ  AX, R14
2542  	ADCQ  $0x00, DX
2543  	MOVQ  DX, R10
2544  	MOVQ  8(BP), AX
2545  	MULQ  R11
2546  	ADDQ  AX, R15
2547  	ADCQ  $0x00, DX
2548  	IMULQ R12, R8
2549  	ADDQ  R10, R15
2550  	ADCQ  DX, R8
      	// Reduce: low 130 bits plus (R15 & ~3, R8) plus (R8:R15 >> 2).
2551  	MOVQ  R13, R10
2552  	MOVQ  R14, R11
2553  	MOVQ  R15, R12
2554  	ANDQ  $0x03, R12
2555  	MOVQ  R15, R13
2556  	ANDQ  $-4, R13
2557  	MOVQ  R8, R14
2558  	SHRQ  $0x02, R8, R15
2559  	SHRQ  $0x02, R8
2560  	ADDQ  R13, R10
2561  	ADCQ  R14, R11
2562  	ADCQ  $0x00, R12
2563  	ADDQ  R15, R10
2564  	ADCQ  R8, R11
2565  	ADCQ  $0x00, R12
2566  	ADDQ  $0x10, R9
2567  	CMPQ  R9, CX
2568  	JB    openSSETail256HashLoop
2569  
2570  	// Add in the state
      	// Constants, the rows at 32(BP) and 48(BP), and the per-block last rows saved
      	// at 80(BP), 96(BP), 112(BP) and 128(BP).
2571  	PADDD ·chacha20Constants<>+0(SB), X0
2572  	PADDD ·chacha20Constants<>+0(SB), X1
2573  	PADDD ·chacha20Constants<>+0(SB), X2
2574  	PADDD ·chacha20Constants<>+0(SB), X12
2575  	PADDD 32(BP), X3
2576  	PADDD 32(BP), X4
2577  	PADDD 32(BP), X5
2578  	PADDD 32(BP), X13
2579  	PADDD 48(BP), X6
2580  	PADDD 48(BP), X7
2581  	PADDD 48(BP), X8
2582  	PADDD 48(BP), X14
2583  	PADDD 80(BP), X9
2584  	PADDD 96(BP), X10
2585  	PADDD 112(BP), X11
2586  	PADDD 128(BP), X15
      	// Park X15 at 64(BP) so it can be used as a load temporary below.
2587  	MOVO  X15, 64(BP)
2588  
2589  	// Load - xor - store
      	// Input bytes 0..63 with block X0/X3/X6/X9 (X15 is the temporary).
2590  	MOVOU (SI), X15
2591  	PXOR  X15, X0
2592  	MOVOU 16(SI), X15
2593  	PXOR  X15, X3
2594  	MOVOU 32(SI), X15
2595  	PXOR  X15, X6
2596  	MOVOU 48(SI), X15
2597  	PXOR  X15, X9
2598  	MOVOU X0, (DI)
2599  	MOVOU X3, 16(DI)
2600  	MOVOU X6, 32(DI)
2601  	MOVOU X9, 48(DI)
      	// Input bytes 64..127 with block X1/X4/X7/X10. X0, X3, X6, X9 are free now
      	// and are reused as load temporaries.
2602  	MOVOU 64(SI), X0
2603  	MOVOU 80(SI), X3
2604  	MOVOU 96(SI), X6
2605  	MOVOU 112(SI), X9
2606  	PXOR  X0, X1
2607  	PXOR  X3, X4
2608  	PXOR  X6, X7
2609  	PXOR  X9, X10
2610  	MOVOU X1, 64(DI)
2611  	MOVOU X4, 80(DI)
2612  	MOVOU X7, 96(DI)
2613  	MOVOU X10, 112(DI)
      	// Input bytes 128..191 with block X2/X5/X8/X11.
2614  	MOVOU 128(SI), X0
2615  	MOVOU 144(SI), X3
2616  	MOVOU 160(SI), X6
2617  	MOVOU 176(SI), X9
2618  	PXOR  X0, X2
2619  	PXOR  X3, X5
2620  	PXOR  X6, X8
2621  	PXOR  X9, X11
2622  	MOVOU X2, 128(DI)
2623  	MOVOU X5, 144(DI)
2624  	MOVOU X8, 160(DI)
2625  	MOVOU X11, 176(DI)
      	// 192 bytes consumed.
2626  	LEAQ  192(SI), SI
2627  	LEAQ  192(DI), DI
2628  	SUBQ  $0xc0, BX
      	// Move the fourth block (X12, X13, X14 and the row parked at 64(BP)) into
      	// X0, X3, X6, X9, the registers openSSETail64DecLoop consumes.
2629  	MOVO  X12, X0
2630  	MOVO  X13, X3
2631  	MOVO  X14, X6
2632  	MOVO  64(BP), X9
2633  	JMP   openSSETail64DecLoop
2634  
2635  chacha20Poly1305Open_AVX2:
      	// AVX2 entry of the open path: builds the initial ChaCha state in YMM
      	// registers and dispatches on the input length in BX.
2636  	VZEROUPPER
2637  	VMOVDQU ·chacha20Constants<>+0(SB), Y0
      	// Hand-encoded VBROADCASTI128 16(R8), Y14 / 32(R8), Y12 / 48(R8), Y4:
      	// each 16-byte row at R8+16, R8+32, R8+48 is copied into both 128-bit lanes.
      	// (R8 presumably points at the state passed by the caller - TODO confirm.)
2638  	BYTE    $0xc4
2639  	BYTE    $0x42
2640  	BYTE    $0x7d
2641  	BYTE    $0x5a
2642  	BYTE    $0x70
2643  	BYTE    $0x10
2644  	BYTE    $0xc4
2645  	BYTE    $0x42
2646  	BYTE    $0x7d
2647  	BYTE    $0x5a
2648  	BYTE    $0x60
2649  	BYTE    $0x20
2650  	BYTE    $0xc4
2651  	BYTE    $0xc2
2652  	BYTE    $0x7d
2653  	BYTE    $0x5a
2654  	BYTE    $0x60
2655  	BYTE    $0x30
      	// Adjust the last row by avx2InitMask (presumably so the two lanes carry
      	// different block counters - the mask contents are not visible here).
2656  	VPADDD  ·avx2InitMask<>+0(SB), Y4, Y4
2657  
2658  	// Special optimization, for very short buffers
      	// Up to 0xc0 (192) bytes go to openAVX2192, up to 0x140 (320) bytes go to
      	// openAVX2320. Both comparisons are unsigned.
2659  	CMPQ BX, $0xc0
2660  	JBE  openAVX2192
2661  	CMPQ BX, $0x00000140
2662  	JBE  openAVX2320
2663  
2664  	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
      	// Save the starting rows at 32(BP), 64(BP) and 192(BP) so they can be added
      	// back after the rounds. R9 = 10 is the double-round count.
2665  	VMOVDQA Y14, 32(BP)
2666  	VMOVDQA Y12, 64(BP)
2667  	VMOVDQA Y4, 192(BP)
2668  	MOVQ    $0x0000000a, R9
2669  
2670  openAVX2PreparePolyKey:
      	// One ChaCha double round per pass on the state in Y0, Y14, Y12, Y4, with Y3
      	// as scratch. R9 counts the passes down to zero.
      	// First half: rotations by 16 and 8 use byte shuffles, rotations by 12 and 7
      	// use a shift pair combined with XOR.
2671  	VPADDD     Y14, Y0, Y0
2672  	VPXOR      Y0, Y4, Y4
2673  	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
2674  	VPADDD     Y4, Y12, Y12
2675  	VPXOR      Y12, Y14, Y14
2676  	VPSLLD     $0x0c, Y14, Y3
2677  	VPSRLD     $0x14, Y14, Y14
2678  	VPXOR      Y3, Y14, Y14
2679  	VPADDD     Y14, Y0, Y0
2680  	VPXOR      Y0, Y4, Y4
2681  	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
2682  	VPADDD     Y4, Y12, Y12
2683  	VPXOR      Y12, Y14, Y14
2684  	VPSLLD     $0x07, Y14, Y3
2685  	VPSRLD     $0x19, Y14, Y14
2686  	VPXOR      Y3, Y14, Y14
      	// Rotate rows Y14, Y12, Y4 by 4, 8 and 12 bytes within each 128-bit lane.
2687  	VPALIGNR   $0x04, Y14, Y14, Y14
2688  	VPALIGNR   $0x08, Y12, Y12, Y12
2689  	VPALIGNR   $0x0c, Y4, Y4, Y4
      	// Second half: the same quarter round on the rotated rows.
2690  	VPADDD     Y14, Y0, Y0
2691  	VPXOR      Y0, Y4, Y4
2692  	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
2693  	VPADDD     Y4, Y12, Y12
2694  	VPXOR      Y12, Y14, Y14
2695  	VPSLLD     $0x0c, Y14, Y3
2696  	VPSRLD     $0x14, Y14, Y14
2697  	VPXOR      Y3, Y14, Y14
2698  	VPADDD     Y14, Y0, Y0
2699  	VPXOR      Y0, Y4, Y4
2700  	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
2701  	VPADDD     Y4, Y12, Y12
2702  	VPXOR      Y12, Y14, Y14
2703  	VPSLLD     $0x07, Y14, Y3
2704  	VPSRLD     $0x19, Y14, Y14
2705  	VPXOR      Y3, Y14, Y14
      	// Undo the row rotation (12, 8 and 4 bytes).
2706  	VPALIGNR   $0x0c, Y14, Y14, Y14
2707  	VPALIGNR   $0x08, Y12, Y12, Y12
2708  	VPALIGNR   $0x04, Y4, Y4, Y4
2709  	DECQ       R9
2710  	JNE        openAVX2PreparePolyKey
      	// Add the starting state back in (constants and the rows saved at 32(BP),
      	// 64(BP), 192(BP)).
2711  	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
2712  	VPADDD     32(BP), Y14, Y14
2713  	VPADDD     64(BP), Y12, Y12
2714  	VPADDD     192(BP), Y4, Y4
      	// Y3 = low 128-bit lane of Y0 followed by low 128-bit lane of Y14.
2715  	VPERM2I128 $0x02, Y0, Y14, Y3
2716  
2717  	// Clamp and store poly key
      	// The clamped 32 bytes are stored at (BP), where the hashing code reads its
      	// multiplier words from (BP) and 8(BP).
2718  	VPAND   ·polyClampMask<>+0(SB), Y3, Y3
2719  	VMOVDQA Y3, (BP)
2720  
2721  	// Stream for the first 64 bytes
      	// Y0 = high lanes of Y0 and Y14, Y14 = high lanes of Y12 and Y4.
2722  	VPERM2I128 $0x13, Y0, Y14, Y0
2723  	VPERM2I128 $0x13, Y12, Y4, Y14
2724  
2725  	// Hash AD + first 64 bytes
      	// R9 carries the additional-data length into polyHashADInternal.
      	// CX = 0 is the byte offset for the hashing loop that follows.
2726  	MOVQ ad_len+80(FP), R9
2727  	CALL polyHashADInternal<>(SB)
2728  	XORQ CX, CX
2729  
2730  openAVX2InitialHash64:
      	// Hash the first 64 input bytes, 16 bytes per pass at SI+CX, using the
      	// flag-free MULXQ form of the multiply. Then decrypt those 64 bytes.
      	// acc (R10:R11:R12) += 16 bytes at SI+CX, plus 1 in the third limb (2^128).
2731  	ADDQ  (SI)(CX*1), R10
2732  	ADCQ  8(SI)(CX*1), R11
2733  	ADCQ  $0x01, R12
      	// acc *= the key words at (BP) and 8(BP). MULXQ multiplies by DX and writes
      	// the low and high halves to its two destination operands. Product limbs end
      	// up in R13, R14, R15, R8.
2734  	MOVQ  (BP), DX
2735  	MOVQ  DX, R15
2736  	MULXQ R10, R13, R14
2737  	IMULQ R12, R15
2738  	MULXQ R11, AX, DX
2739  	ADDQ  AX, R14
2740  	ADCQ  DX, R15
2741  	MOVQ  8(BP), DX
2742  	MULXQ R10, R10, AX
2743  	ADDQ  R10, R14
2744  	MULXQ R11, R11, R8
2745  	ADCQ  R11, R15
2746  	ADCQ  $0x00, R8
2747  	IMULQ R12, DX
2748  	ADDQ  AX, R15
2749  	ADCQ  DX, R8
      	// Reduce: keep the low 130 bits (R13, R14, R15 & 3) and add the bits above
      	// them as (R15 & ~3, R8) plus (R8:R15 >> 2).
2750  	MOVQ  R13, R10
2751  	MOVQ  R14, R11
2752  	MOVQ  R15, R12
2753  	ANDQ  $0x03, R12
2754  	MOVQ  R15, R13
2755  	ANDQ  $-4, R13
2756  	MOVQ  R8, R14
2757  	SHRQ  $0x02, R8, R15
2758  	SHRQ  $0x02, R8
2759  	ADDQ  R13, R10
2760  	ADCQ  R14, R11
2761  	ADCQ  $0x00, R12
2762  	ADDQ  R15, R10
2763  	ADCQ  R8, R11
2764  	ADCQ  $0x00, R12
      	// Four passes: CX runs 0, 16, 32, 48.
2765  	ADDQ  $0x10, CX
2766  	CMPQ  CX, $0x40
2767  	JNE   openAVX2InitialHash64
2768  
2769  	// Decrypt the first 64 bytes
      	// XOR with the 64 stream bytes in Y0 and Y14, store, then advance SI and DI
      	// and reduce the remaining length BX by 64.
2770  	VPXOR   (SI), Y0, Y0
2771  	VPXOR   32(SI), Y14, Y14
2772  	VMOVDQU Y0, (DI)
2773  	VMOVDQU Y14, 32(DI)
2774  	LEAQ    64(SI), SI
2775  	LEAQ    64(DI), DI
2776  	SUBQ    $0x40, BX
2777  
2778  openAVX2MainLoop:
      	// Top of the bulk loop: each pass needs at least 0x200 (512) bytes of input.
      	// With less remaining (unsigned compare) control leaves to
      	// openAVX2MainLoopDone.
2779  	CMPQ BX, $0x00000200
2780  	JB   openAVX2MainLoopDone
2781  
2782  	// Load state, increment counter blocks, store the incremented counters
      	// Four register sets are prepared:
      	//   first rows  Y0, Y5, Y6, Y7      (constants)
      	//   second rows Y14, Y9, Y10, Y11   (from 32(BP))
      	//   third rows  Y12, Y13, Y8, Y15   (from 64(BP))
      	//   last rows   Y4, Y1, Y2, Y3      (from 192(BP), each one avx2IncMask
      	//                                    further than the previous)
2783  	VMOVDQU ·chacha20Constants<>+0(SB), Y0
2784  	VMOVDQA Y0, Y5
2785  	VMOVDQA Y0, Y6
2786  	VMOVDQA Y0, Y7
2787  	VMOVDQA 32(BP), Y14
2788  	VMOVDQA Y14, Y9
2789  	VMOVDQA Y14, Y10
2790  	VMOVDQA Y14, Y11
2791  	VMOVDQA 64(BP), Y12
2792  	VMOVDQA Y12, Y13
2793  	VMOVDQA Y12, Y8
2794  	VMOVDQA Y12, Y15
2795  	VMOVDQA 192(BP), Y4
2796  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
2797  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
2798  	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
2799  	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
      	// Save the four last rows at 96(BP), 128(BP), 160(BP) and 192(BP). The slot
      	// at 192(BP) now holds the most advanced one and seeds the next pass.
2800  	VMOVDQA Y4, 96(BP)
2801  	VMOVDQA Y1, 128(BP)
2802  	VMOVDQA Y2, 160(BP)
2803  	VMOVDQA Y3, 192(BP)
      	// CX = 0: byte offset into the input for the interleaved hashing.
2804  	XORQ    CX, CX
2805  
2806  openAVX2InternalLoop:
	// Each pass performs one ChaCha20 double round (column round, then
	// diagonal round) on four register sets - rows A: Y0/Y5/Y6/Y7,
	// B: Y14/Y9/Y10/Y11, C: Y12/Y13/Y8/Y15, D: Y4/Y1/Y2/Y3 - interleaved with
	// three Poly1305 block updates of the accumulator R10:R11:R12 over the
	// 48 input bytes at (SI)(CX). CX advances by 48 per pass and the loop
	// exits once CX reaches 480, i.e. after 10 passes.
	// Poly block 1: accumulator += 16 input bytes, plus the 2^128 pad bit.
2807  	ADDQ     (SI)(CX*1), R10
2808  	ADCQ     8(SI)(CX*1), R11
2809  	ADCQ     $0x01, R12
	// Column round: a += b.
2810  	VPADDD   Y14, Y0, Y0
2811  	VPADDD   Y9, Y5, Y5
2812  	VPADDD   Y10, Y6, Y6
2813  	VPADDD   Y11, Y7, Y7
	// Poly multiply, part 1: accumulator times the key word at (BP);
	// the partial product is left in R13:R14:R15.
2814  	MOVQ     (BP), DX
2815  	MOVQ     DX, R15
2816  	MULXQ    R10, R13, R14
2817  	IMULQ    R12, R15
2818  	MULXQ    R11, AX, DX
2819  	ADDQ     AX, R14
2820  	ADCQ     DX, R15
	// d ^= a; d <<<= 16 (byte shuffle).
2821  	VPXOR    Y0, Y4, Y4
2822  	VPXOR    Y5, Y1, Y1
2823  	VPXOR    Y6, Y2, Y2
2824  	VPXOR    Y7, Y3, Y3
2825  	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
2826  	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
2827  	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
2828  	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
	// Poly multiply, part 2: accumulator times the key word at 8(BP),
	// accumulated into R14:R15:R8.
2829  	MOVQ     8(BP), DX
2830  	MULXQ    R10, R10, AX
2831  	ADDQ     R10, R14
2832  	MULXQ    R11, R11, R8
2833  	ADCQ     R11, R15
2834  	ADCQ     $0x00, R8
	// c += d; b ^= c.
2835  	VPADDD   Y4, Y12, Y12
2836  	VPADDD   Y1, Y13, Y13
2837  	VPADDD   Y2, Y8, Y8
2838  	VPADDD   Y3, Y15, Y15
2839  	VPXOR    Y12, Y14, Y14
2840  	VPXOR    Y13, Y9, Y9
2841  	VPXOR    Y8, Y10, Y10
2842  	VPXOR    Y15, Y11, Y11
	// Poly multiply, part 3: top accumulator limb (R12) times the key word
	// still held in DX.
2843  	IMULQ    R12, DX
2844  	ADDQ     AX, R15
2845  	ADCQ     DX, R8
	// b <<<= 12 via shift-left/shift-right/xor. All 16 YMM registers are in
	// use, so Y15 is spilled to 224(BP) and borrowed as the temporary.
2846  	VMOVDQA  Y15, 224(BP)
2847  	VPSLLD   $0x0c, Y14, Y15
2848  	VPSRLD   $0x14, Y14, Y14
2849  	VPXOR    Y15, Y14, Y14
2850  	VPSLLD   $0x0c, Y9, Y15
2851  	VPSRLD   $0x14, Y9, Y9
2852  	VPXOR    Y15, Y9, Y9
2853  	VPSLLD   $0x0c, Y10, Y15
2854  	VPSRLD   $0x14, Y10, Y10
2855  	VPXOR    Y15, Y10, Y10
2856  	VPSLLD   $0x0c, Y11, Y15
2857  	VPSRLD   $0x14, Y11, Y11
2858  	VPXOR    Y15, Y11, Y11
2859  	VMOVDQA  224(BP), Y15
	// Poly reduction modulo 2^130 - 5: keep the low 130 bits of the product
	// and add back 5*c, where c is everything above bit 130 (added as 4c,
	// then c).
2860  	MOVQ     R13, R10
2861  	MOVQ     R14, R11
2862  	MOVQ     R15, R12
2863  	ANDQ     $0x03, R12
2864  	MOVQ     R15, R13
2865  	ANDQ     $-4, R13
2866  	MOVQ     R8, R14
2867  	SHRQ     $0x02, R8, R15
2868  	SHRQ     $0x02, R8
2869  	ADDQ     R13, R10
2870  	ADCQ     R14, R11
2871  	ADCQ     $0x00, R12
2872  	ADDQ     R15, R10
2873  	ADCQ     R8, R11
2874  	ADCQ     $0x00, R12
	// a += b; d ^= a; d <<<= 8 (byte shuffle).
2875  	VPADDD   Y14, Y0, Y0
2876  	VPADDD   Y9, Y5, Y5
2877  	VPADDD   Y10, Y6, Y6
2878  	VPADDD   Y11, Y7, Y7
2879  	VPXOR    Y0, Y4, Y4
2880  	VPXOR    Y5, Y1, Y1
2881  	VPXOR    Y6, Y2, Y2
2882  	VPXOR    Y7, Y3, Y3
2883  	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
2884  	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
2885  	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
2886  	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
	// Poly block 2: accumulator += next 16 input bytes, plus the pad bit.
2887  	ADDQ     16(SI)(CX*1), R10
2888  	ADCQ     24(SI)(CX*1), R11
2889  	ADCQ     $0x01, R12
	// c += d.
2890  	VPADDD   Y4, Y12, Y12
2891  	VPADDD   Y1, Y13, Y13
2892  	VPADDD   Y2, Y8, Y8
2893  	VPADDD   Y3, Y15, Y15
	// Poly multiply, part 1 (key word at (BP)).
2894  	MOVQ     (BP), DX
2895  	MOVQ     DX, R15
2896  	MULXQ    R10, R13, R14
2897  	IMULQ    R12, R15
2898  	MULXQ    R11, AX, DX
2899  	ADDQ     AX, R14
2900  	ADCQ     DX, R15
	// b ^= c; b <<<= 7, again borrowing Y15 as the shift temporary.
2901  	VPXOR    Y12, Y14, Y14
2902  	VPXOR    Y13, Y9, Y9
2903  	VPXOR    Y8, Y10, Y10
2904  	VPXOR    Y15, Y11, Y11
2905  	VMOVDQA  Y15, 224(BP)
2906  	VPSLLD   $0x07, Y14, Y15
2907  	VPSRLD   $0x19, Y14, Y14
2908  	VPXOR    Y15, Y14, Y14
2909  	VPSLLD   $0x07, Y9, Y15
2910  	VPSRLD   $0x19, Y9, Y9
2911  	VPXOR    Y15, Y9, Y9
2912  	VPSLLD   $0x07, Y10, Y15
2913  	VPSRLD   $0x19, Y10, Y10
2914  	VPXOR    Y15, Y10, Y10
2915  	VPSLLD   $0x07, Y11, Y15
2916  	VPSRLD   $0x19, Y11, Y11
2917  	VPXOR    Y15, Y11, Y11
2918  	VMOVDQA  224(BP), Y15
	// Poly multiply, part 2 (key word at 8(BP)).
2919  	MOVQ     8(BP), DX
2920  	MULXQ    R10, R10, AX
2921  	ADDQ     R10, R14
2922  	MULXQ    R11, R11, R8
2923  	ADCQ     R11, R15
2924  	ADCQ     $0x00, R8
	// Rotate rows B, C, D by 1, 2, 3 dwords within each 128-bit lane so the
	// next four quarter rounds operate on the diagonals.
2925  	VPALIGNR $0x04, Y14, Y14, Y14
2926  	VPALIGNR $0x04, Y9, Y9, Y9
2927  	VPALIGNR $0x04, Y10, Y10, Y10
2928  	VPALIGNR $0x04, Y11, Y11, Y11
2929  	VPALIGNR $0x08, Y12, Y12, Y12
2930  	VPALIGNR $0x08, Y13, Y13, Y13
2931  	VPALIGNR $0x08, Y8, Y8, Y8
2932  	VPALIGNR $0x08, Y15, Y15, Y15
2933  	VPALIGNR $0x0c, Y4, Y4, Y4
2934  	VPALIGNR $0x0c, Y1, Y1, Y1
2935  	VPALIGNR $0x0c, Y2, Y2, Y2
2936  	VPALIGNR $0x0c, Y3, Y3, Y3
	// Diagonal round: a += b.
2937  	VPADDD   Y14, Y0, Y0
2938  	VPADDD   Y9, Y5, Y5
2939  	VPADDD   Y10, Y6, Y6
2940  	VPADDD   Y11, Y7, Y7
	// Poly multiply, part 3 (top limb times the key word in DX).
2941  	IMULQ    R12, DX
2942  	ADDQ     AX, R15
2943  	ADCQ     DX, R8
	// d ^= a; d <<<= 16.
2944  	VPXOR    Y0, Y4, Y4
2945  	VPXOR    Y5, Y1, Y1
2946  	VPXOR    Y6, Y2, Y2
2947  	VPXOR    Y7, Y3, Y3
2948  	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
2949  	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
2950  	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
2951  	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
	// Poly reduction modulo 2^130 - 5 for block 2.
2952  	MOVQ     R13, R10
2953  	MOVQ     R14, R11
2954  	MOVQ     R15, R12
2955  	ANDQ     $0x03, R12
2956  	MOVQ     R15, R13
2957  	ANDQ     $-4, R13
2958  	MOVQ     R8, R14
2959  	SHRQ     $0x02, R8, R15
2960  	SHRQ     $0x02, R8
2961  	ADDQ     R13, R10
2962  	ADCQ     R14, R11
2963  	ADCQ     $0x00, R12
2964  	ADDQ     R15, R10
2965  	ADCQ     R8, R11
2966  	ADCQ     $0x00, R12
	// c += d; b ^= c.
2967  	VPADDD   Y4, Y12, Y12
2968  	VPADDD   Y1, Y13, Y13
2969  	VPADDD   Y2, Y8, Y8
2970  	VPADDD   Y3, Y15, Y15
2971  	VPXOR    Y12, Y14, Y14
2972  	VPXOR    Y13, Y9, Y9
2973  	VPXOR    Y8, Y10, Y10
2974  	VPXOR    Y15, Y11, Y11
	// Poly block 3: accumulator += next 16 input bytes, plus the pad bit.
	// CX is then advanced with LEAQ, which leaves the flags untouched.
2975  	ADDQ     32(SI)(CX*1), R10
2976  	ADCQ     40(SI)(CX*1), R11
2977  	ADCQ     $0x01, R12
2978  	LEAQ     48(CX), CX
	// b <<<= 12 (Y15 spilled and used as the temporary).
2979  	VMOVDQA  Y15, 224(BP)
2980  	VPSLLD   $0x0c, Y14, Y15
2981  	VPSRLD   $0x14, Y14, Y14
2982  	VPXOR    Y15, Y14, Y14
2983  	VPSLLD   $0x0c, Y9, Y15
2984  	VPSRLD   $0x14, Y9, Y9
2985  	VPXOR    Y15, Y9, Y9
2986  	VPSLLD   $0x0c, Y10, Y15
2987  	VPSRLD   $0x14, Y10, Y10
2988  	VPXOR    Y15, Y10, Y10
2989  	VPSLLD   $0x0c, Y11, Y15
2990  	VPSRLD   $0x14, Y11, Y11
2991  	VPXOR    Y15, Y11, Y11
2992  	VMOVDQA  224(BP), Y15
	// Poly multiply, part 1 (key word at (BP)).
2993  	MOVQ     (BP), DX
2994  	MOVQ     DX, R15
2995  	MULXQ    R10, R13, R14
2996  	IMULQ    R12, R15
2997  	MULXQ    R11, AX, DX
2998  	ADDQ     AX, R14
2999  	ADCQ     DX, R15
	// a += b; d ^= a.
3000  	VPADDD   Y14, Y0, Y0
3001  	VPADDD   Y9, Y5, Y5
3002  	VPADDD   Y10, Y6, Y6
3003  	VPADDD   Y11, Y7, Y7
3004  	VPXOR    Y0, Y4, Y4
3005  	VPXOR    Y5, Y1, Y1
3006  	VPXOR    Y6, Y2, Y2
3007  	VPXOR    Y7, Y3, Y3
	// Poly multiply, part 2 (key word at 8(BP)).
3008  	MOVQ     8(BP), DX
3009  	MULXQ    R10, R10, AX
3010  	ADDQ     R10, R14
3011  	MULXQ    R11, R11, R8
3012  	ADCQ     R11, R15
3013  	ADCQ     $0x00, R8
	// d <<<= 8; c += d.
3014  	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
3015  	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
3016  	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
3017  	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
3018  	VPADDD   Y4, Y12, Y12
3019  	VPADDD   Y1, Y13, Y13
3020  	VPADDD   Y2, Y8, Y8
3021  	VPADDD   Y3, Y15, Y15
	// Poly multiply, part 3 (top limb times the key word in DX).
3022  	IMULQ    R12, DX
3023  	ADDQ     AX, R15
3024  	ADCQ     DX, R8
	// b ^= c; b <<<= 7 (Y15 spilled and used as the temporary).
3025  	VPXOR    Y12, Y14, Y14
3026  	VPXOR    Y13, Y9, Y9
3027  	VPXOR    Y8, Y10, Y10
3028  	VPXOR    Y15, Y11, Y11
3029  	VMOVDQA  Y15, 224(BP)
3030  	VPSLLD   $0x07, Y14, Y15
3031  	VPSRLD   $0x19, Y14, Y14
3032  	VPXOR    Y15, Y14, Y14
3033  	VPSLLD   $0x07, Y9, Y15
3034  	VPSRLD   $0x19, Y9, Y9
3035  	VPXOR    Y15, Y9, Y9
3036  	VPSLLD   $0x07, Y10, Y15
3037  	VPSRLD   $0x19, Y10, Y10
3038  	VPXOR    Y15, Y10, Y10
3039  	VPSLLD   $0x07, Y11, Y15
3040  	VPSRLD   $0x19, Y11, Y11
3041  	VPXOR    Y15, Y11, Y11
3042  	VMOVDQA  224(BP), Y15
	// Poly reduction modulo 2^130 - 5 for block 3.
3043  	MOVQ     R13, R10
3044  	MOVQ     R14, R11
3045  	MOVQ     R15, R12
3046  	ANDQ     $0x03, R12
3047  	MOVQ     R15, R13
3048  	ANDQ     $-4, R13
3049  	MOVQ     R8, R14
3050  	SHRQ     $0x02, R8, R15
3051  	SHRQ     $0x02, R8
3052  	ADDQ     R13, R10
3053  	ADCQ     R14, R11
3054  	ADCQ     $0x00, R12
3055  	ADDQ     R15, R10
3056  	ADCQ     R8, R11
3057  	ADCQ     $0x00, R12
	// Undo the diagonal rotation (3, 2, 1 dwords) so the rows are back in
	// column order for the next pass.
3058  	VPALIGNR $0x0c, Y14, Y14, Y14
3059  	VPALIGNR $0x0c, Y9, Y9, Y9
3060  	VPALIGNR $0x0c, Y10, Y10, Y10
3061  	VPALIGNR $0x0c, Y11, Y11, Y11
3062  	VPALIGNR $0x08, Y12, Y12, Y12
3063  	VPALIGNR $0x08, Y13, Y13, Y13
3064  	VPALIGNR $0x08, Y8, Y8, Y8
3065  	VPALIGNR $0x08, Y15, Y15, Y15
3066  	VPALIGNR $0x04, Y4, Y4, Y4
3067  	VPALIGNR $0x04, Y1, Y1, Y1
3068  	VPALIGNR $0x04, Y2, Y2, Y2
3069  	VPALIGNR $0x04, Y3, Y3, Y3
	// 480 = 10 passes * 48 bytes hashed per pass.
3070  	CMPQ     CX, $0x000001e0
3071  	JNE      openAVX2InternalLoop
3072  	VPADDD   ·chacha20Constants<>+0(SB), Y0, Y0
	// Feed-forward after the rounds: the add above and the ones below put the
	// initial state back in - the constants into the A rows, the rows saved at
	// 32(BP) and 64(BP) into the B and C rows, and the per-set counter rows
	// saved at 96/128/160/192(BP) into the D rows.
3073  	VPADDD   ·chacha20Constants<>+0(SB), Y5, Y5
3074  	VPADDD   ·chacha20Constants<>+0(SB), Y6, Y6
3075  	VPADDD   ·chacha20Constants<>+0(SB), Y7, Y7
3076  	VPADDD   32(BP), Y14, Y14
3077  	VPADDD   32(BP), Y9, Y9
3078  	VPADDD   32(BP), Y10, Y10
3079  	VPADDD   32(BP), Y11, Y11
3080  	VPADDD   64(BP), Y12, Y12
3081  	VPADDD   64(BP), Y13, Y13
3082  	VPADDD   64(BP), Y8, Y8
3083  	VPADDD   64(BP), Y15, Y15
3084  	VPADDD   96(BP), Y4, Y4
3085  	VPADDD   128(BP), Y1, Y1
3086  	VPADDD   160(BP), Y2, Y2
3087  	VPADDD   192(BP), Y3, Y3
	// Park Y15 at 224(BP): the register is reused as a destination below and
	// its value is read back from memory for the last 128 bytes of output.
3088  	VMOVDQA  Y15, 224(BP)
3089  
3090  	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
3091  	ADDQ       480(SI), R10
3092  	ADCQ       488(SI), R11
3093  	ADCQ       $0x01, R12
	// Multiply the accumulator by the key words at (BP) and 8(BP).
3094  	MOVQ       (BP), DX
3095  	MOVQ       DX, R15
3096  	MULXQ      R10, R13, R14
3097  	IMULQ      R12, R15
3098  	MULXQ      R11, AX, DX
3099  	ADDQ       AX, R14
3100  	ADCQ       DX, R15
3101  	MOVQ       8(BP), DX
3102  	MULXQ      R10, R10, AX
3103  	ADDQ       R10, R14
3104  	MULXQ      R11, R11, R8
3105  	ADCQ       R11, R15
3106  	ADCQ       $0x00, R8
3107  	IMULQ      R12, DX
3108  	ADDQ       AX, R15
3109  	ADCQ       DX, R8
	// Reduce modulo 2^130 - 5 (low 130 bits + 5 * the bits above them).
3110  	MOVQ       R13, R10
3111  	MOVQ       R14, R11
3112  	MOVQ       R15, R12
3113  	ANDQ       $0x03, R12
3114  	MOVQ       R15, R13
3115  	ANDQ       $-4, R13
3116  	MOVQ       R8, R14
3117  	SHRQ       $0x02, R8, R15
3118  	SHRQ       $0x02, R8
3119  	ADDQ       R13, R10
3120  	ADCQ       R14, R11
3121  	ADCQ       $0x00, R12
3122  	ADDQ       R15, R10
3123  	ADCQ       R8, R11
3124  	ADCQ       $0x00, R12
	// Regroup 128-bit lanes into contiguous keystream ($0x02 pairs the low
	// lanes of the two sources, $0x13 the high lanes), XOR with the input at
	// (SI) and store the result at (DI): bytes 0..127.
3125  	VPERM2I128 $0x02, Y0, Y14, Y15
3126  	VPERM2I128 $0x13, Y0, Y14, Y14
3127  	VPERM2I128 $0x02, Y12, Y4, Y0
3128  	VPERM2I128 $0x13, Y12, Y4, Y12
3129  	VPXOR      (SI), Y15, Y15
3130  	VPXOR      32(SI), Y0, Y0
3131  	VPXOR      64(SI), Y14, Y14
3132  	VPXOR      96(SI), Y12, Y12
3133  	VMOVDQU    Y15, (DI)
3134  	VMOVDQU    Y0, 32(DI)
3135  	VMOVDQU    Y14, 64(DI)
3136  	VMOVDQU    Y12, 96(DI)
	// Bytes 128..255 from the second register set.
3137  	VPERM2I128 $0x02, Y5, Y9, Y0
3138  	VPERM2I128 $0x02, Y13, Y1, Y14
3139  	VPERM2I128 $0x13, Y5, Y9, Y12
3140  	VPERM2I128 $0x13, Y13, Y1, Y4
3141  	VPXOR      128(SI), Y0, Y0
3142  	VPXOR      160(SI), Y14, Y14
3143  	VPXOR      192(SI), Y12, Y12
3144  	VPXOR      224(SI), Y4, Y4
3145  	VMOVDQU    Y0, 128(DI)
3146  	VMOVDQU    Y14, 160(DI)
3147  	VMOVDQU    Y12, 192(DI)
3148  	VMOVDQU    Y4, 224(DI)
3149  
3150  	// and here
3151  	ADDQ       496(SI), R10
3152  	ADCQ       504(SI), R11
3153  	ADCQ       $0x01, R12
	// Multiply the accumulator by the key words at (BP) and 8(BP).
3154  	MOVQ       (BP), DX
3155  	MOVQ       DX, R15
3156  	MULXQ      R10, R13, R14
3157  	IMULQ      R12, R15
3158  	MULXQ      R11, AX, DX
3159  	ADDQ       AX, R14
3160  	ADCQ       DX, R15
3161  	MOVQ       8(BP), DX
3162  	MULXQ      R10, R10, AX
3163  	ADDQ       R10, R14
3164  	MULXQ      R11, R11, R8
3165  	ADCQ       R11, R15
3166  	ADCQ       $0x00, R8
3167  	IMULQ      R12, DX
3168  	ADDQ       AX, R15
3169  	ADCQ       DX, R8
	// Reduce modulo 2^130 - 5.
3170  	MOVQ       R13, R10
3171  	MOVQ       R14, R11
3172  	MOVQ       R15, R12
3173  	ANDQ       $0x03, R12
3174  	MOVQ       R15, R13
3175  	ANDQ       $-4, R13
3176  	MOVQ       R8, R14
3177  	SHRQ       $0x02, R8, R15
3178  	SHRQ       $0x02, R8
3179  	ADDQ       R13, R10
3180  	ADCQ       R14, R11
3181  	ADCQ       $0x00, R12
3182  	ADDQ       R15, R10
3183  	ADCQ       R8, R11
3184  	ADCQ       $0x00, R12
	// Bytes 256..383 from the third register set.
3185  	VPERM2I128 $0x02, Y6, Y10, Y0
3186  	VPERM2I128 $0x02, Y8, Y2, Y14
3187  	VPERM2I128 $0x13, Y6, Y10, Y12
3188  	VPERM2I128 $0x13, Y8, Y2, Y4
3189  	VPXOR      256(SI), Y0, Y0
3190  	VPXOR      288(SI), Y14, Y14
3191  	VPXOR      320(SI), Y12, Y12
3192  	VPXOR      352(SI), Y4, Y4
3193  	VMOVDQU    Y0, 256(DI)
3194  	VMOVDQU    Y14, 288(DI)
3195  	VMOVDQU    Y12, 320(DI)
3196  	VMOVDQU    Y4, 352(DI)
	// Bytes 384..511 from the fourth register set; its C row is the value
	// parked at 224(BP) earlier.
3197  	VPERM2I128 $0x02, Y7, Y11, Y0
3198  	VPERM2I128 $0x02, 224(BP), Y3, Y14
3199  	VPERM2I128 $0x13, Y7, Y11, Y12
3200  	VPERM2I128 $0x13, 224(BP), Y3, Y4
3201  	VPXOR      384(SI), Y0, Y0
3202  	VPXOR      416(SI), Y14, Y14
3203  	VPXOR      448(SI), Y12, Y12
3204  	VPXOR      480(SI), Y4, Y4
3205  	VMOVDQU    Y0, 384(DI)
3206  	VMOVDQU    Y14, 416(DI)
3207  	VMOVDQU    Y12, 448(DI)
3208  	VMOVDQU    Y4, 480(DI)
	// Advance input and output by 512 bytes, shrink the remaining length in BX
	// by the same amount, and go back to the top of the main loop.
3209  	LEAQ       512(SI), SI
3210  	LEAQ       512(DI), DI
3211  	SUBQ       $0x00000200, BX
3212  	JMP        openAVX2MainLoop
3213  
3214  openAVX2MainLoopDone:
3215  	// Handle the various tail sizes efficiently
	// Dispatch on the remaining byte count in BX: 0 goes straight to
	// openSSEFinalize; 1..128, 129..256 and 257..384 go to the matching
	// Tail128/Tail256/Tail384 path; anything larger goes to Tail512.
3216  	TESTQ BX, BX
3217  	JE    openSSEFinalize
3218  	CMPQ  BX, $0x80
3219  	JBE   openAVX2Tail128
3220  	CMPQ  BX, $0x00000100
3221  	JBE   openAVX2Tail256
3222  	CMPQ  BX, $0x00000180
3223  	JBE   openAVX2Tail384
3224  	JMP   openAVX2Tail512
3225  
3226  openAVX2192:
	// Short-input path using two register sets. The second set
	// (Y5/Y9/Y13/Y1) is a copy of the first (Y0/Y14/Y12/Y4) with
	// avx2IncMask added to its D row. Y6, Y10, Y8, Y2 and Y15 keep
	// the pre-round rows for the feed-forward additions after the rounds.
	// NOTE(review): Y0/Y14/Y12/Y4 are presumably the initial state set up
	// before this view - confirm against the caller.
3227  	VMOVDQA Y0, Y5
3228  	VMOVDQA Y14, Y9
3229  	VMOVDQA Y12, Y13
3230  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
3231  	VMOVDQA Y0, Y6
3232  	VMOVDQA Y14, Y10
3233  	VMOVDQA Y12, Y8
3234  	VMOVDQA Y4, Y2
3235  	VMOVDQA Y1, Y15
	// R9 counts down the 10 double rounds of the loop that follows.
3236  	MOVQ    $0x0000000a, R9
3237  
3238  openAVX2192InnerCipherLoop:
	// One ChaCha20 double round per pass on two register sets
	// (Y0/Y14/Y12/Y4 and Y5/Y9/Y13/Y1), with Y3 as the shift temporary for
	// the 12- and 7-bit rotates. Runs until R9 counts down to zero.
	// Column round, first set.
3239  	VPADDD     Y14, Y0, Y0
3240  	VPXOR      Y0, Y4, Y4
3241  	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
3242  	VPADDD     Y4, Y12, Y12
3243  	VPXOR      Y12, Y14, Y14
3244  	VPSLLD     $0x0c, Y14, Y3
3245  	VPSRLD     $0x14, Y14, Y14
3246  	VPXOR      Y3, Y14, Y14
3247  	VPADDD     Y14, Y0, Y0
3248  	VPXOR      Y0, Y4, Y4
3249  	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
3250  	VPADDD     Y4, Y12, Y12
3251  	VPXOR      Y12, Y14, Y14
3252  	VPSLLD     $0x07, Y14, Y3
3253  	VPSRLD     $0x19, Y14, Y14
3254  	VPXOR      Y3, Y14, Y14
	// Column round, second set.
3255  	VPADDD     Y9, Y5, Y5
3256  	VPXOR      Y5, Y1, Y1
3257  	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
3258  	VPADDD     Y1, Y13, Y13
3259  	VPXOR      Y13, Y9, Y9
3260  	VPSLLD     $0x0c, Y9, Y3
3261  	VPSRLD     $0x14, Y9, Y9
3262  	VPXOR      Y3, Y9, Y9
3263  	VPADDD     Y9, Y5, Y5
3264  	VPXOR      Y5, Y1, Y1
3265  	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
3266  	VPADDD     Y1, Y13, Y13
3267  	VPXOR      Y13, Y9, Y9
3268  	VPSLLD     $0x07, Y9, Y3
3269  	VPSRLD     $0x19, Y9, Y9
3270  	VPXOR      Y3, Y9, Y9
	// Rotate rows B/C/D by 1/2/3 dwords per lane to line up the diagonals.
3271  	VPALIGNR   $0x04, Y14, Y14, Y14
3272  	VPALIGNR   $0x04, Y9, Y9, Y9
3273  	VPALIGNR   $0x08, Y12, Y12, Y12
3274  	VPALIGNR   $0x08, Y13, Y13, Y13
3275  	VPALIGNR   $0x0c, Y4, Y4, Y4
3276  	VPALIGNR   $0x0c, Y1, Y1, Y1
	// Diagonal round, first set.
3277  	VPADDD     Y14, Y0, Y0
3278  	VPXOR      Y0, Y4, Y4
3279  	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
3280  	VPADDD     Y4, Y12, Y12
3281  	VPXOR      Y12, Y14, Y14
3282  	VPSLLD     $0x0c, Y14, Y3
3283  	VPSRLD     $0x14, Y14, Y14
3284  	VPXOR      Y3, Y14, Y14
3285  	VPADDD     Y14, Y0, Y0
3286  	VPXOR      Y0, Y4, Y4
3287  	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
3288  	VPADDD     Y4, Y12, Y12
3289  	VPXOR      Y12, Y14, Y14
3290  	VPSLLD     $0x07, Y14, Y3
3291  	VPSRLD     $0x19, Y14, Y14
3292  	VPXOR      Y3, Y14, Y14
	// Diagonal round, second set.
3293  	VPADDD     Y9, Y5, Y5
3294  	VPXOR      Y5, Y1, Y1
3295  	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
3296  	VPADDD     Y1, Y13, Y13
3297  	VPXOR      Y13, Y9, Y9
3298  	VPSLLD     $0x0c, Y9, Y3
3299  	VPSRLD     $0x14, Y9, Y9
3300  	VPXOR      Y3, Y9, Y9
3301  	VPADDD     Y9, Y5, Y5
3302  	VPXOR      Y5, Y1, Y1
3303  	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
3304  	VPADDD     Y1, Y13, Y13
3305  	VPXOR      Y13, Y9, Y9
3306  	VPSLLD     $0x07, Y9, Y3
3307  	VPSRLD     $0x19, Y9, Y9
3308  	VPXOR      Y3, Y9, Y9
	// Undo the diagonal rotation.
3309  	VPALIGNR   $0x0c, Y14, Y14, Y14
3310  	VPALIGNR   $0x0c, Y9, Y9, Y9
3311  	VPALIGNR   $0x08, Y12, Y12, Y12
3312  	VPALIGNR   $0x08, Y13, Y13, Y13
3313  	VPALIGNR   $0x04, Y4, Y4, Y4
3314  	VPALIGNR   $0x04, Y1, Y1, Y1
3315  	DECQ       R9
3316  	JNE        openAVX2192InnerCipherLoop
	// Feed-forward: add the saved pre-round rows (Y6, Y10, Y8, and the two
	// counter rows Y2 / Y15) back into both sets.
3317  	VPADDD     Y6, Y0, Y0
3318  	VPADDD     Y6, Y5, Y5
3319  	VPADDD     Y10, Y14, Y14
3320  	VPADDD     Y10, Y9, Y9
3321  	VPADDD     Y8, Y12, Y12
3322  	VPADDD     Y8, Y13, Y13
3323  	VPADDD     Y2, Y4, Y4
3324  	VPADDD     Y15, Y1, Y1
	// Y3 = low 128-bit lanes of the first set's A and B rows (32 bytes),
	// which become the Poly1305 key material below.
3325  	VPERM2I128 $0x02, Y0, Y14, Y3
3326  
3327  	// Clamp and store poly key
3328  	VPAND   ·polyClampMask<>+0(SB), Y3, Y3
3329  	VMOVDQA Y3, (BP)
3330  
3331  	// Stream for up to 192 bytes
	// The remaining lanes are regrouped into a keystream queue held in
	// Y0, Y14, Y12, Y4, Y5, Y9 (in that order), 32 bytes each. Execution then
	// falls through into openAVX2ShortOpen.
3332  	VPERM2I128 $0x13, Y0, Y14, Y0
3333  	VPERM2I128 $0x13, Y12, Y4, Y14
3334  	VPERM2I128 $0x02, Y5, Y9, Y12
3335  	VPERM2I128 $0x02, Y13, Y1, Y4
3336  	VPERM2I128 $0x13, Y5, Y9, Y5
3337  	VPERM2I128 $0x13, Y13, Y1, Y9
3338  
3339  openAVX2ShortOpen:
3340  	// Hash
	// Load the additional-data length into R9 and hash the additional data
	// with polyHashADInternal, which starts by zeroing R10:R11:R12.
	// NOTE(review): polyHashADInternal reads the data through CX; CX is
	// presumably pointed at the additional data before this view - confirm.
3341  	MOVQ ad_len+80(FP), R9
3342  	CALL polyHashADInternal<>(SB)
3343  
3344  openAVX2ShortOpenLoop:
	// While at least 32 bytes remain (BX): hash the next two 16-byte input
	// blocks at (SI), XOR the same 32 bytes with the keystream in Y0, store
	// them at (DI), then shift the keystream queue down by one register.
3345  	CMPQ BX, $0x20
3346  	JB   openAVX2ShortTail32
3347  	SUBQ $0x20, BX
3348  
3349  	// Load for hashing
	// First 16-byte block: add it (plus the 2^128 pad bit) to the accumulator
	// R10:R11:R12, multiply by the key words at (BP)/8(BP), then reduce.
3350  	ADDQ  (SI), R10
3351  	ADCQ  8(SI), R11
3352  	ADCQ  $0x01, R12
3353  	MOVQ  (BP), DX
3354  	MOVQ  DX, R15
3355  	MULXQ R10, R13, R14
3356  	IMULQ R12, R15
3357  	MULXQ R11, AX, DX
3358  	ADDQ  AX, R14
3359  	ADCQ  DX, R15
3360  	MOVQ  8(BP), DX
3361  	MULXQ R10, R10, AX
3362  	ADDQ  R10, R14
3363  	MULXQ R11, R11, R8
3364  	ADCQ  R11, R15
3365  	ADCQ  $0x00, R8
3366  	IMULQ R12, DX
3367  	ADDQ  AX, R15
3368  	ADCQ  DX, R8
	// Reduce modulo 2^130 - 5 (low 130 bits + 5 * the bits above them).
3369  	MOVQ  R13, R10
3370  	MOVQ  R14, R11
3371  	MOVQ  R15, R12
3372  	ANDQ  $0x03, R12
3373  	MOVQ  R15, R13
3374  	ANDQ  $-4, R13
3375  	MOVQ  R8, R14
3376  	SHRQ  $0x02, R8, R15
3377  	SHRQ  $0x02, R8
3378  	ADDQ  R13, R10
3379  	ADCQ  R14, R11
3380  	ADCQ  $0x00, R12
3381  	ADDQ  R15, R10
3382  	ADCQ  R8, R11
3383  	ADCQ  $0x00, R12
	// Second 16-byte block: same add / multiply / reduce sequence.
3384  	ADDQ  16(SI), R10
3385  	ADCQ  24(SI), R11
3386  	ADCQ  $0x01, R12
3387  	MOVQ  (BP), DX
3388  	MOVQ  DX, R15
3389  	MULXQ R10, R13, R14
3390  	IMULQ R12, R15
3391  	MULXQ R11, AX, DX
3392  	ADDQ  AX, R14
3393  	ADCQ  DX, R15
3394  	MOVQ  8(BP), DX
3395  	MULXQ R10, R10, AX
3396  	ADDQ  R10, R14
3397  	MULXQ R11, R11, R8
3398  	ADCQ  R11, R15
3399  	ADCQ  $0x00, R8
3400  	IMULQ R12, DX
3401  	ADDQ  AX, R15
3402  	ADCQ  DX, R8
3403  	MOVQ  R13, R10
3404  	MOVQ  R14, R11
3405  	MOVQ  R15, R12
3406  	ANDQ  $0x03, R12
3407  	MOVQ  R15, R13
3408  	ANDQ  $-4, R13
3409  	MOVQ  R8, R14
3410  	SHRQ  $0x02, R8, R15
3411  	SHRQ  $0x02, R8
3412  	ADDQ  R13, R10
3413  	ADCQ  R14, R11
3414  	ADCQ  $0x00, R12
3415  	ADDQ  R15, R10
3416  	ADCQ  R8, R11
3417  	ADCQ  $0x00, R12
3418  
3419  	// Load for decryption
3420  	VPXOR   (SI), Y0, Y0
3421  	VMOVDQU Y0, (DI)
3422  	LEAQ    32(SI), SI
3423  	LEAQ    32(DI), DI
3424  
3425  	// Shift stream left
	// Queue order: Y0 <- Y14 <- Y12 <- Y4 <- Y5 <- Y9 <- Y13 <- Y1 <- Y6 <- Y10.
3426  	VMOVDQA Y14, Y0
3427  	VMOVDQA Y12, Y14
3428  	VMOVDQA Y4, Y12
3429  	VMOVDQA Y5, Y4
3430  	VMOVDQA Y9, Y5
3431  	VMOVDQA Y13, Y9
3432  	VMOVDQA Y1, Y13
3433  	VMOVDQA Y6, Y1
3434  	VMOVDQA Y10, Y6
3435  	JMP     openAVX2ShortOpenLoop
3436  
3437  openAVX2ShortTail32:
	// Fewer than 32 bytes remain. X1 receives the low 128 bits of the current
	// keystream register; the vector move does not touch the flags, so the JB
	// below still tests the CMPQ result. With fewer than 16 bytes left, skip
	// straight to openAVX2ShortDone.
3438  	CMPQ    BX, $0x10
3439  	VMOVDQA X0, X1
3440  	JB      openAVX2ShortDone
3441  	SUBQ    $0x10, BX
3442  
3443  	// Load for hashing
	// One 16-byte block: add (plus pad bit), multiply by the key words at
	// (BP)/8(BP), reduce modulo 2^130 - 5.
3444  	ADDQ  (SI), R10
3445  	ADCQ  8(SI), R11
3446  	ADCQ  $0x01, R12
3447  	MOVQ  (BP), DX
3448  	MOVQ  DX, R15
3449  	MULXQ R10, R13, R14
3450  	IMULQ R12, R15
3451  	MULXQ R11, AX, DX
3452  	ADDQ  AX, R14
3453  	ADCQ  DX, R15
3454  	MOVQ  8(BP), DX
3455  	MULXQ R10, R10, AX
3456  	ADDQ  R10, R14
3457  	MULXQ R11, R11, R8
3458  	ADCQ  R11, R15
3459  	ADCQ  $0x00, R8
3460  	IMULQ R12, DX
3461  	ADDQ  AX, R15
3462  	ADCQ  DX, R8
3463  	MOVQ  R13, R10
3464  	MOVQ  R14, R11
3465  	MOVQ  R15, R12
3466  	ANDQ  $0x03, R12
3467  	MOVQ  R15, R13
3468  	ANDQ  $-4, R13
3469  	MOVQ  R8, R14
3470  	SHRQ  $0x02, R8, R15
3471  	SHRQ  $0x02, R8
3472  	ADDQ  R13, R10
3473  	ADCQ  R14, R11
3474  	ADCQ  $0x00, R12
3475  	ADDQ  R15, R10
3476  	ADCQ  R8, R11
3477  	ADCQ  $0x00, R12
3478  
3479  	// Load for decryption
	// XOR 16 input bytes with the low lane of the keystream and store them,
	// then move the high lane of Y0 into the low lane and refresh X1 so it
	// holds the next 16 unused keystream bytes.
3480  	VPXOR      (SI), X0, X12
3481  	VMOVDQU    X12, (DI)
3482  	LEAQ       16(SI), SI
3483  	LEAQ       16(DI), DI
3484  	VPERM2I128 $0x11, Y0, Y0, Y0
3485  	VMOVDQA    X0, X1
3486  
3487  openAVX2ShortDone:
	// Clear the upper YMM halves before handing off to the SSE tail code.
	// NOTE(review): openSSETail16 presumably consumes the keystream left in
	// X1 for the final partial block - confirm at that label.
3488  	VZEROUPPER
3489  	JMP openSSETail16
3490  
3491  openAVX2320:
	// Short-input path using three register sets: Y0/Y14/Y12/Y4,
	// Y5/Y9/Y13/Y1 and Y6/Y10/Y8/Y2. Each later set's D row is the
	// previous one plus avx2IncMask. Y7, Y11 and Y15 keep the pre-round B, C
	// and first D rows for the feed-forward additions after the rounds.
3492  	VMOVDQA Y0, Y5
3493  	VMOVDQA Y14, Y9
3494  	VMOVDQA Y12, Y13
3495  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
3496  	VMOVDQA Y0, Y6
3497  	VMOVDQA Y14, Y10
3498  	VMOVDQA Y12, Y8
3499  	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
3500  	VMOVDQA Y14, Y7
3501  	VMOVDQA Y12, Y11
3502  	VMOVDQA Y4, Y15
	// R9 counts down the 10 double rounds of the loop that follows.
3503  	MOVQ    $0x0000000a, R9
3504  
3505  openAVX2320InnerCipherLoop:
	// One ChaCha20 double round per pass on three register sets, with Y3 as
	// the shift temporary. Runs until R9 counts down to zero.
	// Column round, first set.
3506  	VPADDD   Y14, Y0, Y0
3507  	VPXOR    Y0, Y4, Y4
3508  	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
3509  	VPADDD   Y4, Y12, Y12
3510  	VPXOR    Y12, Y14, Y14
3511  	VPSLLD   $0x0c, Y14, Y3
3512  	VPSRLD   $0x14, Y14, Y14
3513  	VPXOR    Y3, Y14, Y14
3514  	VPADDD   Y14, Y0, Y0
3515  	VPXOR    Y0, Y4, Y4
3516  	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
3517  	VPADDD   Y4, Y12, Y12
3518  	VPXOR    Y12, Y14, Y14
3519  	VPSLLD   $0x07, Y14, Y3
3520  	VPSRLD   $0x19, Y14, Y14
3521  	VPXOR    Y3, Y14, Y14
	// Column round, second set.
3522  	VPADDD   Y9, Y5, Y5
3523  	VPXOR    Y5, Y1, Y1
3524  	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
3525  	VPADDD   Y1, Y13, Y13
3526  	VPXOR    Y13, Y9, Y9
3527  	VPSLLD   $0x0c, Y9, Y3
3528  	VPSRLD   $0x14, Y9, Y9
3529  	VPXOR    Y3, Y9, Y9
3530  	VPADDD   Y9, Y5, Y5
3531  	VPXOR    Y5, Y1, Y1
3532  	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
3533  	VPADDD   Y1, Y13, Y13
3534  	VPXOR    Y13, Y9, Y9
3535  	VPSLLD   $0x07, Y9, Y3
3536  	VPSRLD   $0x19, Y9, Y9
3537  	VPXOR    Y3, Y9, Y9
	// Column round, third set.
3538  	VPADDD   Y10, Y6, Y6
3539  	VPXOR    Y6, Y2, Y2
3540  	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
3541  	VPADDD   Y2, Y8, Y8
3542  	VPXOR    Y8, Y10, Y10
3543  	VPSLLD   $0x0c, Y10, Y3
3544  	VPSRLD   $0x14, Y10, Y10
3545  	VPXOR    Y3, Y10, Y10
3546  	VPADDD   Y10, Y6, Y6
3547  	VPXOR    Y6, Y2, Y2
3548  	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
3549  	VPADDD   Y2, Y8, Y8
3550  	VPXOR    Y8, Y10, Y10
3551  	VPSLLD   $0x07, Y10, Y3
3552  	VPSRLD   $0x19, Y10, Y10
3553  	VPXOR    Y3, Y10, Y10
	// Rotate rows B/C/D by 1/2/3 dwords per lane to line up the diagonals.
3554  	VPALIGNR $0x04, Y14, Y14, Y14
3555  	VPALIGNR $0x04, Y9, Y9, Y9
3556  	VPALIGNR $0x04, Y10, Y10, Y10
3557  	VPALIGNR $0x08, Y12, Y12, Y12
3558  	VPALIGNR $0x08, Y13, Y13, Y13
3559  	VPALIGNR $0x08, Y8, Y8, Y8
3560  	VPALIGNR $0x0c, Y4, Y4, Y4
3561  	VPALIGNR $0x0c, Y1, Y1, Y1
3562  	VPALIGNR $0x0c, Y2, Y2, Y2
	// Diagonal round, first set.
3563  	VPADDD   Y14, Y0, Y0
3564  	VPXOR    Y0, Y4, Y4
3565  	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
3566  	VPADDD   Y4, Y12, Y12
3567  	VPXOR    Y12, Y14, Y14
3568  	VPSLLD   $0x0c, Y14, Y3
3569  	VPSRLD   $0x14, Y14, Y14
3570  	VPXOR    Y3, Y14, Y14
3571  	VPADDD   Y14, Y0, Y0
3572  	VPXOR    Y0, Y4, Y4
3573  	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
3574  	VPADDD   Y4, Y12, Y12
3575  	VPXOR    Y12, Y14, Y14
3576  	VPSLLD   $0x07, Y14, Y3
3577  	VPSRLD   $0x19, Y14, Y14
3578  	VPXOR    Y3, Y14, Y14
	// Diagonal round, second set.
3579  	VPADDD   Y9, Y5, Y5
3580  	VPXOR    Y5, Y1, Y1
3581  	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
3582  	VPADDD   Y1, Y13, Y13
3583  	VPXOR    Y13, Y9, Y9
3584  	VPSLLD   $0x0c, Y9, Y3
3585  	VPSRLD   $0x14, Y9, Y9
3586  	VPXOR    Y3, Y9, Y9
3587  	VPADDD   Y9, Y5, Y5
3588  	VPXOR    Y5, Y1, Y1
3589  	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
3590  	VPADDD   Y1, Y13, Y13
3591  	VPXOR    Y13, Y9, Y9
3592  	VPSLLD   $0x07, Y9, Y3
3593  	VPSRLD   $0x19, Y9, Y9
3594  	VPXOR    Y3, Y9, Y9
	// Diagonal round, third set.
3595  	VPADDD   Y10, Y6, Y6
3596  	VPXOR    Y6, Y2, Y2
3597  	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
3598  	VPADDD   Y2, Y8, Y8
3599  	VPXOR    Y8, Y10, Y10
3600  	VPSLLD   $0x0c, Y10, Y3
3601  	VPSRLD   $0x14, Y10, Y10
3602  	VPXOR    Y3, Y10, Y10
3603  	VPADDD   Y10, Y6, Y6
3604  	VPXOR    Y6, Y2, Y2
3605  	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
3606  	VPADDD   Y2, Y8, Y8
3607  	VPXOR    Y8, Y10, Y10
3608  	VPSLLD   $0x07, Y10, Y3
3609  	VPSRLD   $0x19, Y10, Y10
3610  	VPXOR    Y3, Y10, Y10
	// Undo the diagonal rotation.
3611  	VPALIGNR $0x0c, Y14, Y14, Y14
3612  	VPALIGNR $0x0c, Y9, Y9, Y9
3613  	VPALIGNR $0x0c, Y10, Y10, Y10
3614  	VPALIGNR $0x08, Y12, Y12, Y12
3615  	VPALIGNR $0x08, Y13, Y13, Y13
3616  	VPALIGNR $0x08, Y8, Y8, Y8
3617  	VPALIGNR $0x04, Y4, Y4, Y4
3618  	VPALIGNR $0x04, Y1, Y1, Y1
3619  	VPALIGNR $0x04, Y2, Y2, Y2
3620  	DECQ     R9
3621  	JNE      openAVX2320InnerCipherLoop
	// Feed-forward: constants into the A rows, saved B row (Y7) and C row
	// (Y11) into all three sets.
3622  	VMOVDQA  ·chacha20Constants<>+0(SB), Y3
3623  	VPADDD   Y3, Y0, Y0
3624  	VPADDD   Y3, Y5, Y5
3625  	VPADDD   Y3, Y6, Y6
3626  	VPADDD   Y7, Y14, Y14
3627  	VPADDD   Y7, Y9, Y9
3628  	VPADDD   Y7, Y10, Y10
3629  	VPADDD   Y11, Y12, Y12
3630  	VPADDD   Y11, Y13, Y13
3631  	VPADDD   Y11, Y8, Y8
	// D rows: Y15 holds the first set's original counter row and is stepped
	// by avx2IncMask to regenerate the second and third sets' counter rows.
3632  	VMOVDQA  ·avx2IncMask<>+0(SB), Y3
3633  	VPADDD   Y15, Y4, Y4
3634  	VPADDD   Y3, Y15, Y15
3635  	VPADDD   Y15, Y1, Y1
3636  	VPADDD   Y3, Y15, Y15
3637  	VPADDD   Y15, Y2, Y2
3638  
3639  	// Clamp and store poly key
3640  	VPERM2I128 $0x02, Y0, Y14, Y3
3641  	VPAND      ·polyClampMask<>+0(SB), Y3, Y3
3642  	VMOVDQA    Y3, (BP)
3643  
3644  	// Stream for up to 320 bytes
	// Keystream queue order after regrouping: Y0, Y14, Y12, Y4, Y5, Y9, Y13,
	// Y1, Y6, Y10 (32 bytes each), consumed by openAVX2ShortOpenLoop.
3645  	VPERM2I128 $0x13, Y0, Y14, Y0
3646  	VPERM2I128 $0x13, Y12, Y4, Y14
3647  	VPERM2I128 $0x02, Y5, Y9, Y12
3648  	VPERM2I128 $0x02, Y13, Y1, Y4
3649  	VPERM2I128 $0x13, Y5, Y9, Y5
3650  	VPERM2I128 $0x13, Y13, Y1, Y9
3651  	VPERM2I128 $0x02, Y6, Y10, Y13
3652  	VPERM2I128 $0x02, Y8, Y2, Y1
3653  	VPERM2I128 $0x13, Y6, Y10, Y6
3654  	VPERM2I128 $0x13, Y8, Y2, Y10
3655  	JMP        openAVX2ShortOpen
3656  
3657  openAVX2Tail128:
3658  	// Need to decrypt up to 128 bytes - prepare two blocks
	// One register set (Y5/Y9/Y13/Y1) is loaded from the constants and the
	// rows saved at 32(BP)/64(BP); the counter row is 192(BP) plus
	// avx2IncMask, with a copy kept in Y4 for the feed-forward.
3659  	VMOVDQA ·chacha20Constants<>+0(SB), Y5
3660  	VMOVDQA 32(BP), Y9
3661  	VMOVDQA 64(BP), Y13
3662  	VMOVDQA 192(BP), Y1
3663  	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y1
3664  	VMOVDQA Y1, Y4
	// R9 = byte offset / round counter (steps of 16). CX = remaining length
	// rounded down to a multiple of 16, i.e. the bytes to hash in LoopA.
	// If that is zero, skip hashing and start directly with the rounds.
3665  	XORQ    R9, R9
3666  	MOVQ    BX, CX
3667  	ANDQ    $-16, CX
3668  	TESTQ   CX, CX
3669  	JE      openAVX2Tail128LoopB
3670  
3671  openAVX2Tail128LoopA:
	// Hash the 16-byte input block at offset R9: add it (plus the 2^128 pad
	// bit) to the accumulator R10:R11:R12, multiply by the key words at
	// (BP)/8(BP), and reduce modulo 2^130 - 5. Falls through into LoopB.
3672  	ADDQ  (SI)(R9*1), R10
3673  	ADCQ  8(SI)(R9*1), R11
3674  	ADCQ  $0x01, R12
3675  	MOVQ  (BP), DX
3676  	MOVQ  DX, R15
3677  	MULXQ R10, R13, R14
3678  	IMULQ R12, R15
3679  	MULXQ R11, AX, DX
3680  	ADDQ  AX, R14
3681  	ADCQ  DX, R15
3682  	MOVQ  8(BP), DX
3683  	MULXQ R10, R10, AX
3684  	ADDQ  R10, R14
3685  	MULXQ R11, R11, R8
3686  	ADCQ  R11, R15
3687  	ADCQ  $0x00, R8
3688  	IMULQ R12, DX
3689  	ADDQ  AX, R15
3690  	ADCQ  DX, R8
3691  	MOVQ  R13, R10
3692  	MOVQ  R14, R11
3693  	MOVQ  R15, R12
3694  	ANDQ  $0x03, R12
3695  	MOVQ  R15, R13
3696  	ANDQ  $-4, R13
3697  	MOVQ  R8, R14
3698  	SHRQ  $0x02, R8, R15
3699  	SHRQ  $0x02, R8
3700  	ADDQ  R13, R10
3701  	ADCQ  R14, R11
3702  	ADCQ  $0x00, R12
3703  	ADDQ  R15, R10
3704  	ADCQ  R8, R11
3705  	ADCQ  $0x00, R12
3706  
3707  openAVX2Tail128LoopB:
	// One ChaCha20 double round on the single register set. R9 advances by
	// 16 per pass: while it is below CX the next pass goes through LoopA to
	// hash another block; the rounds stop when R9 reaches 160 (10 passes).
3708  	ADDQ       $0x10, R9
	// Column round.
3709  	VPADDD     Y9, Y5, Y5
3710  	VPXOR      Y5, Y1, Y1
3711  	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
3712  	VPADDD     Y1, Y13, Y13
3713  	VPXOR      Y13, Y9, Y9
3714  	VPSLLD     $0x0c, Y9, Y3
3715  	VPSRLD     $0x14, Y9, Y9
3716  	VPXOR      Y3, Y9, Y9
3717  	VPADDD     Y9, Y5, Y5
3718  	VPXOR      Y5, Y1, Y1
3719  	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
3720  	VPADDD     Y1, Y13, Y13
3721  	VPXOR      Y13, Y9, Y9
3722  	VPSLLD     $0x07, Y9, Y3
3723  	VPSRLD     $0x19, Y9, Y9
3724  	VPXOR      Y3, Y9, Y9
	// Line up the diagonals.
3725  	VPALIGNR   $0x04, Y9, Y9, Y9
3726  	VPALIGNR   $0x08, Y13, Y13, Y13
3727  	VPALIGNR   $0x0c, Y1, Y1, Y1
	// Diagonal round.
3728  	VPADDD     Y9, Y5, Y5
3729  	VPXOR      Y5, Y1, Y1
3730  	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
3731  	VPADDD     Y1, Y13, Y13
3732  	VPXOR      Y13, Y9, Y9
3733  	VPSLLD     $0x0c, Y9, Y3
3734  	VPSRLD     $0x14, Y9, Y9
3735  	VPXOR      Y3, Y9, Y9
3736  	VPADDD     Y9, Y5, Y5
3737  	VPXOR      Y5, Y1, Y1
3738  	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
3739  	VPADDD     Y1, Y13, Y13
3740  	VPXOR      Y13, Y9, Y9
3741  	VPSLLD     $0x07, Y9, Y3
3742  	VPSRLD     $0x19, Y9, Y9
3743  	VPXOR      Y3, Y9, Y9
	// Undo the diagonal rotation.
3744  	VPALIGNR   $0x0c, Y9, Y9, Y9
3745  	VPALIGNR   $0x08, Y13, Y13, Y13
3746  	VPALIGNR   $0x04, Y1, Y1, Y1
3747  	CMPQ       R9, CX
3748  	JB         openAVX2Tail128LoopA
3749  	CMPQ       R9, $0xa0
3750  	JNE        openAVX2Tail128LoopB
	// Feed-forward (constants, saved rows, counter copy in Y4), then regroup
	// the lanes into the keystream queue Y0, Y14, Y12, Y4 used by
	// openAVX2TailLoop, which this falls through into.
3751  	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
3752  	VPADDD     32(BP), Y9, Y9
3753  	VPADDD     64(BP), Y13, Y13
3754  	VPADDD     Y4, Y1, Y1
3755  	VPERM2I128 $0x02, Y5, Y9, Y0
3756  	VPERM2I128 $0x02, Y13, Y1, Y14
3757  	VPERM2I128 $0x13, Y5, Y9, Y12
3758  	VPERM2I128 $0x13, Y13, Y1, Y4
3759  
3760  openAVX2TailLoop:
	// While at least 32 bytes remain (BX): XOR 32 input bytes with the
	// keystream in Y0, store them, and shift the queue Y0 <- Y14 <- Y12 <- Y4.
	// No hashing happens in this loop.
3761  	CMPQ BX, $0x20
3762  	JB   openAVX2Tail
3763  	SUBQ $0x20, BX
3764  
3765  	// Load for decryption
3766  	VPXOR   (SI), Y0, Y0
3767  	VMOVDQU Y0, (DI)
3768  	LEAQ    32(SI), SI
3769  	LEAQ    32(DI), DI
3770  	VMOVDQA Y14, Y0
3771  	VMOVDQA Y12, Y14
3772  	VMOVDQA Y4, Y12
3773  	JMP     openAVX2TailLoop
3774  
3775  openAVX2Tail:
	// Fewer than 32 bytes remain. X1 receives the low 128 bits of the current
	// keystream register (the vector move leaves the CMPQ flags intact for
	// the JB). With fewer than 16 bytes left, go straight to openAVX2TailDone.
3776  	CMPQ    BX, $0x10
3777  	VMOVDQA X0, X1
3778  	JB      openAVX2TailDone
3779  	SUBQ    $0x10, BX
3780  
3781  	// Load for decryption
	// XOR and store 16 bytes using the low lane, then move the high lane of
	// Y0 down and refresh X1 with the next 16 unused keystream bytes.
3782  	VPXOR      (SI), X0, X12
3783  	VMOVDQU    X12, (DI)
3784  	LEAQ       16(SI), SI
3785  	LEAQ       16(DI), DI
3786  	VPERM2I128 $0x11, Y0, Y0, Y0
3787  	VMOVDQA    X0, X1
3788  
3789  openAVX2TailDone:
	// Clear the upper YMM halves before handing off to the SSE tail code.
	// NOTE(review): openSSETail16 presumably consumes the keystream left in
	// X1 for the final partial block - confirm at that label.
3790  	VZEROUPPER
3791  	JMP openSSETail16
3792  
3793  openAVX2Tail256:
	// Tail of 129..256 bytes: prepare two register sets (Y0/Y14/Y12/Y4 and
	// Y5/Y9/Y13/Y1) from the constants and the rows saved at 32(BP)/64(BP).
	// The counter rows are 192(BP) plus one and two steps of avx2IncMask,
	// with copies kept in Y7 and Y11 for the feed-forward.
3794  	VMOVDQA ·chacha20Constants<>+0(SB), Y0
3795  	VMOVDQA Y0, Y5
3796  	VMOVDQA 32(BP), Y14
3797  	VMOVDQA Y14, Y9
3798  	VMOVDQA 64(BP), Y12
3799  	VMOVDQA Y12, Y13
3800  	VMOVDQA 192(BP), Y4
3801  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
3802  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
3803  	VMOVDQA Y4, Y7
3804  	VMOVDQA Y1, Y11
3805  
3806  	// Compute the number of iterations that will hash data
	// The remaining length is parked at 224(BP) so BX can serve as the hash
	// read pointer. CX = min((length - 128) / 16, 10); R9 counts double rounds.
3807  	MOVQ    BX, 224(BP)
3808  	MOVQ    BX, CX
3809  	SUBQ    $0x80, CX
3810  	SHRQ    $0x04, CX
3811  	MOVQ    $0x0000000a, R9
3812  	CMPQ    CX, $0x0a
3813  	CMOVQGT R9, CX
3814  	MOVQ    SI, BX
3815  	XORQ    R9, R9
3816  
3817  openAVX2Tail256LoopA:
	// Hash the 16-byte input block at (BX): add it (plus the 2^128 pad bit)
	// to the accumulator R10:R11:R12, multiply by the key words at
	// (BP)/8(BP), reduce modulo 2^130 - 5, then advance BX by 16.
	// Falls through into LoopB.
3818  	ADDQ  (BX), R10
3819  	ADCQ  8(BX), R11
3820  	ADCQ  $0x01, R12
3821  	MOVQ  (BP), DX
3822  	MOVQ  DX, R15
3823  	MULXQ R10, R13, R14
3824  	IMULQ R12, R15
3825  	MULXQ R11, AX, DX
3826  	ADDQ  AX, R14
3827  	ADCQ  DX, R15
3828  	MOVQ  8(BP), DX
3829  	MULXQ R10, R10, AX
3830  	ADDQ  R10, R14
3831  	MULXQ R11, R11, R8
3832  	ADCQ  R11, R15
3833  	ADCQ  $0x00, R8
3834  	IMULQ R12, DX
3835  	ADDQ  AX, R15
3836  	ADCQ  DX, R8
3837  	MOVQ  R13, R10
3838  	MOVQ  R14, R11
3839  	MOVQ  R15, R12
3840  	ANDQ  $0x03, R12
3841  	MOVQ  R15, R13
3842  	ANDQ  $-4, R13
3843  	MOVQ  R8, R14
3844  	SHRQ  $0x02, R8, R15
3845  	SHRQ  $0x02, R8
3846  	ADDQ  R13, R10
3847  	ADCQ  R14, R11
3848  	ADCQ  $0x00, R12
3849  	ADDQ  R15, R10
3850  	ADCQ  R8, R11
3851  	ADCQ  $0x00, R12
3852  	LEAQ  16(BX), BX
3853  
3854  openAVX2Tail256LoopB:
	// One ChaCha20 double round on both register sets, Y3 as shift temporary.
	// R9 counts passes: while it is below CX the next pass goes through LoopA
	// to hash another block; the rounds stop after 10 passes.
	// Column round, first set.
3855  	VPADDD   Y14, Y0, Y0
3856  	VPXOR    Y0, Y4, Y4
3857  	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
3858  	VPADDD   Y4, Y12, Y12
3859  	VPXOR    Y12, Y14, Y14
3860  	VPSLLD   $0x0c, Y14, Y3
3861  	VPSRLD   $0x14, Y14, Y14
3862  	VPXOR    Y3, Y14, Y14
3863  	VPADDD   Y14, Y0, Y0
3864  	VPXOR    Y0, Y4, Y4
3865  	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
3866  	VPADDD   Y4, Y12, Y12
3867  	VPXOR    Y12, Y14, Y14
3868  	VPSLLD   $0x07, Y14, Y3
3869  	VPSRLD   $0x19, Y14, Y14
3870  	VPXOR    Y3, Y14, Y14
	// Column round, second set.
3871  	VPADDD   Y9, Y5, Y5
3872  	VPXOR    Y5, Y1, Y1
3873  	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
3874  	VPADDD   Y1, Y13, Y13
3875  	VPXOR    Y13, Y9, Y9
3876  	VPSLLD   $0x0c, Y9, Y3
3877  	VPSRLD   $0x14, Y9, Y9
3878  	VPXOR    Y3, Y9, Y9
3879  	VPADDD   Y9, Y5, Y5
3880  	VPXOR    Y5, Y1, Y1
3881  	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
3882  	VPADDD   Y1, Y13, Y13
3883  	VPXOR    Y13, Y9, Y9
3884  	VPSLLD   $0x07, Y9, Y3
3885  	VPSRLD   $0x19, Y9, Y9
3886  	VPXOR    Y3, Y9, Y9
	// Line up the diagonals.
3887  	VPALIGNR $0x04, Y14, Y14, Y14
3888  	VPALIGNR $0x04, Y9, Y9, Y9
3889  	VPALIGNR $0x08, Y12, Y12, Y12
3890  	VPALIGNR $0x08, Y13, Y13, Y13
3891  	VPALIGNR $0x0c, Y4, Y4, Y4
3892  	VPALIGNR $0x0c, Y1, Y1, Y1
3893  	INCQ     R9
	// Diagonal round, first set.
3894  	VPADDD   Y14, Y0, Y0
3895  	VPXOR    Y0, Y4, Y4
3896  	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
3897  	VPADDD   Y4, Y12, Y12
3898  	VPXOR    Y12, Y14, Y14
3899  	VPSLLD   $0x0c, Y14, Y3
3900  	VPSRLD   $0x14, Y14, Y14
3901  	VPXOR    Y3, Y14, Y14
3902  	VPADDD   Y14, Y0, Y0
3903  	VPXOR    Y0, Y4, Y4
3904  	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
3905  	VPADDD   Y4, Y12, Y12
3906  	VPXOR    Y12, Y14, Y14
3907  	VPSLLD   $0x07, Y14, Y3
3908  	VPSRLD   $0x19, Y14, Y14
3909  	VPXOR    Y3, Y14, Y14
	// Diagonal round, second set.
3910  	VPADDD   Y9, Y5, Y5
3911  	VPXOR    Y5, Y1, Y1
3912  	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
3913  	VPADDD   Y1, Y13, Y13
3914  	VPXOR    Y13, Y9, Y9
3915  	VPSLLD   $0x0c, Y9, Y3
3916  	VPSRLD   $0x14, Y9, Y9
3917  	VPXOR    Y3, Y9, Y9
3918  	VPADDD   Y9, Y5, Y5
3919  	VPXOR    Y5, Y1, Y1
3920  	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
3921  	VPADDD   Y1, Y13, Y13
3922  	VPXOR    Y13, Y9, Y9
3923  	VPSLLD   $0x07, Y9, Y3
3924  	VPSRLD   $0x19, Y9, Y9
3925  	VPXOR    Y3, Y9, Y9
	// Undo the diagonal rotation.
3926  	VPALIGNR $0x0c, Y14, Y14, Y14
3927  	VPALIGNR $0x0c, Y9, Y9, Y9
3928  	VPALIGNR $0x08, Y12, Y12, Y12
3929  	VPALIGNR $0x08, Y13, Y13, Y13
3930  	VPALIGNR $0x04, Y4, Y4, Y4
3931  	VPALIGNR $0x04, Y1, Y1, Y1
3932  	CMPQ     R9, CX
3933  	JB       openAVX2Tail256LoopA
3934  	CMPQ     R9, $0x0a
3935  	JNE      openAVX2Tail256LoopB
	// Rounds done. R9 = current hash read pointer, CX = bytes hashed so far
	// (pointer minus SI), and BX gets the remaining length back from 224(BP).
3936  	MOVQ     BX, R9
3937  	SUBQ     SI, BX
3938  	MOVQ     BX, CX
3939  	MOVQ     224(BP), BX
3940  
3941  openAVX2Tail256Hash:
	// Hash the whole 16-byte blocks not yet covered during the rounds: CX
	// (bytes hashed) is stepped by 16 and the loop ends once it would exceed
	// the remaining length in BX. Each pass hashes the block at (R9) and
	// advances R9 by 16.
3942  	ADDQ  $0x10, CX
3943  	CMPQ  CX, BX
3944  	JGT   openAVX2Tail256HashEnd
3945  	ADDQ  (R9), R10
3946  	ADCQ  8(R9), R11
3947  	ADCQ  $0x01, R12
	// Multiply the accumulator by the key words at (BP) and 8(BP).
3948  	MOVQ  (BP), DX
3949  	MOVQ  DX, R15
3950  	MULXQ R10, R13, R14
3951  	IMULQ R12, R15
3952  	MULXQ R11, AX, DX
3953  	ADDQ  AX, R14
3954  	ADCQ  DX, R15
3955  	MOVQ  8(BP), DX
3956  	MULXQ R10, R10, AX
3957  	ADDQ  R10, R14
3958  	MULXQ R11, R11, R8
3959  	ADCQ  R11, R15
3960  	ADCQ  $0x00, R8
3961  	IMULQ R12, DX
3962  	ADDQ  AX, R15
3963  	ADCQ  DX, R8
	// Reduce modulo 2^130 - 5 (low 130 bits + 5 * the bits above them).
3964  	MOVQ  R13, R10
3965  	MOVQ  R14, R11
3966  	MOVQ  R15, R12
3967  	ANDQ  $0x03, R12
3968  	MOVQ  R15, R13
3969  	ANDQ  $-4, R13
3970  	MOVQ  R8, R14
3971  	SHRQ  $0x02, R8, R15
3972  	SHRQ  $0x02, R8
3973  	ADDQ  R13, R10
3974  	ADCQ  R14, R11
3975  	ADCQ  $0x00, R12
3976  	ADDQ  R15, R10
3977  	ADCQ  R8, R11
3978  	ADCQ  $0x00, R12
3979  	LEAQ  16(R9), R9
3980  	JMP   openAVX2Tail256Hash
3981  
3982  openAVX2Tail256HashEnd:
	// Feed-forward: constants, the rows saved at 32(BP)/64(BP), and the
	// counter copies kept in Y7 / Y11.
3983  	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
3984  	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
3985  	VPADDD     32(BP), Y14, Y14
3986  	VPADDD     32(BP), Y9, Y9
3987  	VPADDD     64(BP), Y12, Y12
3988  	VPADDD     64(BP), Y13, Y13
3989  	VPADDD     Y7, Y4, Y4
3990  	VPADDD     Y11, Y1, Y1
	// First set -> 128 keystream bytes in Y6, Y10, Y8, Y2 (used right away);
	// second set -> keystream queue Y0, Y14, Y12, Y4 for openAVX2TailLoop.
3991  	VPERM2I128 $0x02, Y0, Y14, Y6
3992  	VPERM2I128 $0x02, Y12, Y4, Y10
3993  	VPERM2I128 $0x13, Y0, Y14, Y8
3994  	VPERM2I128 $0x13, Y12, Y4, Y2
3995  	VPERM2I128 $0x02, Y5, Y9, Y0
3996  	VPERM2I128 $0x02, Y13, Y1, Y14
3997  	VPERM2I128 $0x13, Y5, Y9, Y12
3998  	VPERM2I128 $0x13, Y13, Y1, Y4
	// XOR and store the first 128 bytes, advance the pointers, and let
	// openAVX2TailLoop handle what is left.
3999  	VPXOR      (SI), Y6, Y6
4000  	VPXOR      32(SI), Y10, Y10
4001  	VPXOR      64(SI), Y8, Y8
4002  	VPXOR      96(SI), Y2, Y2
4003  	VMOVDQU    Y6, (DI)
4004  	VMOVDQU    Y10, 32(DI)
4005  	VMOVDQU    Y8, 64(DI)
4006  	VMOVDQU    Y2, 96(DI)
4007  	LEAQ       128(SI), SI
4008  	LEAQ       128(DI), DI
4009  	SUBQ       $0x80, BX
4010  	JMP        openAVX2TailLoop
4011  
4012  openAVX2Tail384:
4013  	// Need to decrypt up to 384 bytes - prepare six blocks
4014  	VMOVDQA ·chacha20Constants<>+0(SB), Y0
4015  	VMOVDQA Y0, Y5
4016  	VMOVDQA Y0, Y6
4017  	VMOVDQA 32(BP), Y14
4018  	VMOVDQA Y14, Y9
4019  	VMOVDQA Y14, Y10
4020  	VMOVDQA 64(BP), Y12
4021  	VMOVDQA Y12, Y13
4022  	VMOVDQA Y12, Y8
4023  	VMOVDQA 192(BP), Y4
4024  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
4025  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
4026  	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
4027  	VMOVDQA Y4, 96(BP)
4028  	VMOVDQA Y1, 128(BP)
4029  	VMOVDQA Y2, 160(BP)
4030  
4031  	// Compute the number of iterations that will hash two blocks of data
4032  	MOVQ    BX, 224(BP)
4033  	MOVQ    BX, CX
4034  	SUBQ    $0x00000100, CX
4035  	SHRQ    $0x04, CX
4036  	ADDQ    $0x06, CX
4037  	MOVQ    $0x0000000a, R9
4038  	CMPQ    CX, $0x0a
4039  	CMOVQGT R9, CX
4040  	MOVQ    SI, BX
4041  	XORQ    R9, R9
4042  
4043  openAVX2Tail384LoopB:
4044  	ADDQ  (BX), R10
4045  	ADCQ  8(BX), R11
4046  	ADCQ  $0x01, R12
4047  	MOVQ  (BP), DX
4048  	MOVQ  DX, R15
4049  	MULXQ R10, R13, R14
4050  	IMULQ R12, R15
4051  	MULXQ R11, AX, DX
4052  	ADDQ  AX, R14
4053  	ADCQ  DX, R15
4054  	MOVQ  8(BP), DX
4055  	MULXQ R10, R10, AX
4056  	ADDQ  R10, R14
4057  	MULXQ R11, R11, R8
4058  	ADCQ  R11, R15
4059  	ADCQ  $0x00, R8
4060  	IMULQ R12, DX
4061  	ADDQ  AX, R15
4062  	ADCQ  DX, R8
4063  	MOVQ  R13, R10
4064  	MOVQ  R14, R11
4065  	MOVQ  R15, R12
4066  	ANDQ  $0x03, R12
4067  	MOVQ  R15, R13
4068  	ANDQ  $-4, R13
4069  	MOVQ  R8, R14
4070  	SHRQ  $0x02, R8, R15
4071  	SHRQ  $0x02, R8
4072  	ADDQ  R13, R10
4073  	ADCQ  R14, R11
4074  	ADCQ  $0x00, R12
4075  	ADDQ  R15, R10
4076  	ADCQ  R8, R11
4077  	ADCQ  $0x00, R12
4078  	LEAQ  16(BX), BX
4079  
4080  openAVX2Tail384LoopA:
4081  	VPADDD   Y14, Y0, Y0
4082  	VPXOR    Y0, Y4, Y4
4083  	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
4084  	VPADDD   Y4, Y12, Y12
4085  	VPXOR    Y12, Y14, Y14
4086  	VPSLLD   $0x0c, Y14, Y3
4087  	VPSRLD   $0x14, Y14, Y14
4088  	VPXOR    Y3, Y14, Y14
4089  	VPADDD   Y14, Y0, Y0
4090  	VPXOR    Y0, Y4, Y4
4091  	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
4092  	VPADDD   Y4, Y12, Y12
4093  	VPXOR    Y12, Y14, Y14
4094  	VPSLLD   $0x07, Y14, Y3
4095  	VPSRLD   $0x19, Y14, Y14
4096  	VPXOR    Y3, Y14, Y14
4097  	VPADDD   Y9, Y5, Y5
4098  	VPXOR    Y5, Y1, Y1
4099  	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
4100  	VPADDD   Y1, Y13, Y13
4101  	VPXOR    Y13, Y9, Y9
4102  	VPSLLD   $0x0c, Y9, Y3
4103  	VPSRLD   $0x14, Y9, Y9
4104  	VPXOR    Y3, Y9, Y9
4105  	VPADDD   Y9, Y5, Y5
4106  	VPXOR    Y5, Y1, Y1
4107  	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
4108  	VPADDD   Y1, Y13, Y13
4109  	VPXOR    Y13, Y9, Y9
4110  	VPSLLD   $0x07, Y9, Y3
4111  	VPSRLD   $0x19, Y9, Y9
4112  	VPXOR    Y3, Y9, Y9
4113  	VPADDD   Y10, Y6, Y6
4114  	VPXOR    Y6, Y2, Y2
4115  	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
4116  	VPADDD   Y2, Y8, Y8
4117  	VPXOR    Y8, Y10, Y10
4118  	VPSLLD   $0x0c, Y10, Y3
4119  	VPSRLD   $0x14, Y10, Y10
4120  	VPXOR    Y3, Y10, Y10
4121  	VPADDD   Y10, Y6, Y6
4122  	VPXOR    Y6, Y2, Y2
4123  	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
4124  	VPADDD   Y2, Y8, Y8
4125  	VPXOR    Y8, Y10, Y10
4126  	VPSLLD   $0x07, Y10, Y3
4127  	VPSRLD   $0x19, Y10, Y10
4128  	VPXOR    Y3, Y10, Y10
4129  	VPALIGNR $0x04, Y14, Y14, Y14
4130  	VPALIGNR $0x04, Y9, Y9, Y9
4131  	VPALIGNR $0x04, Y10, Y10, Y10
4132  	VPALIGNR $0x08, Y12, Y12, Y12
4133  	VPALIGNR $0x08, Y13, Y13, Y13
4134  	VPALIGNR $0x08, Y8, Y8, Y8
4135  	VPALIGNR $0x0c, Y4, Y4, Y4
4136  	VPALIGNR $0x0c, Y1, Y1, Y1
4137  	VPALIGNR $0x0c, Y2, Y2, Y2
4138  	ADDQ     (BX), R10
4139  	ADCQ     8(BX), R11
4140  	ADCQ     $0x01, R12
4141  	MOVQ     (BP), DX
4142  	MOVQ     DX, R15
4143  	MULXQ    R10, R13, R14
4144  	IMULQ    R12, R15
4145  	MULXQ    R11, AX, DX
4146  	ADDQ     AX, R14
4147  	ADCQ     DX, R15
4148  	MOVQ     8(BP), DX
4149  	MULXQ    R10, R10, AX
4150  	ADDQ     R10, R14
4151  	MULXQ    R11, R11, R8
4152  	ADCQ     R11, R15
4153  	ADCQ     $0x00, R8
4154  	IMULQ    R12, DX
4155  	ADDQ     AX, R15
4156  	ADCQ     DX, R8
4157  	MOVQ     R13, R10
4158  	MOVQ     R14, R11
4159  	MOVQ     R15, R12
4160  	ANDQ     $0x03, R12
4161  	MOVQ     R15, R13
4162  	ANDQ     $-4, R13
4163  	MOVQ     R8, R14
4164  	SHRQ     $0x02, R8, R15
4165  	SHRQ     $0x02, R8
4166  	ADDQ     R13, R10
4167  	ADCQ     R14, R11
4168  	ADCQ     $0x00, R12
4169  	ADDQ     R15, R10
4170  	ADCQ     R8, R11
4171  	ADCQ     $0x00, R12
4172  	LEAQ     16(BX), BX
4173  	INCQ     R9
4174  	VPADDD   Y14, Y0, Y0
4175  	VPXOR    Y0, Y4, Y4
4176  	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
4177  	VPADDD   Y4, Y12, Y12
4178  	VPXOR    Y12, Y14, Y14
4179  	VPSLLD   $0x0c, Y14, Y3
4180  	VPSRLD   $0x14, Y14, Y14
4181  	VPXOR    Y3, Y14, Y14
4182  	VPADDD   Y14, Y0, Y0
4183  	VPXOR    Y0, Y4, Y4
4184  	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
4185  	VPADDD   Y4, Y12, Y12
4186  	VPXOR    Y12, Y14, Y14
4187  	VPSLLD   $0x07, Y14, Y3
4188  	VPSRLD   $0x19, Y14, Y14
4189  	VPXOR    Y3, Y14, Y14
4190  	VPADDD   Y9, Y5, Y5
4191  	VPXOR    Y5, Y1, Y1
4192  	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
4193  	VPADDD   Y1, Y13, Y13
4194  	VPXOR    Y13, Y9, Y9
4195  	VPSLLD   $0x0c, Y9, Y3
4196  	VPSRLD   $0x14, Y9, Y9
4197  	VPXOR    Y3, Y9, Y9
4198  	VPADDD   Y9, Y5, Y5
4199  	VPXOR    Y5, Y1, Y1
4200  	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
4201  	VPADDD   Y1, Y13, Y13
4202  	VPXOR    Y13, Y9, Y9
4203  	VPSLLD   $0x07, Y9, Y3
4204  	VPSRLD   $0x19, Y9, Y9
4205  	VPXOR    Y3, Y9, Y9
4206  	VPADDD   Y10, Y6, Y6
4207  	VPXOR    Y6, Y2, Y2
4208  	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
4209  	VPADDD   Y2, Y8, Y8
4210  	VPXOR    Y8, Y10, Y10
4211  	VPSLLD   $0x0c, Y10, Y3
4212  	VPSRLD   $0x14, Y10, Y10
4213  	VPXOR    Y3, Y10, Y10
4214  	VPADDD   Y10, Y6, Y6
4215  	VPXOR    Y6, Y2, Y2
4216  	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
4217  	VPADDD   Y2, Y8, Y8
4218  	VPXOR    Y8, Y10, Y10
4219  	VPSLLD   $0x07, Y10, Y3
4220  	VPSRLD   $0x19, Y10, Y10
4221  	VPXOR    Y3, Y10, Y10
4222  	VPALIGNR $0x0c, Y14, Y14, Y14
4223  	VPALIGNR $0x0c, Y9, Y9, Y9
4224  	VPALIGNR $0x0c, Y10, Y10, Y10
4225  	VPALIGNR $0x08, Y12, Y12, Y12
4226  	VPALIGNR $0x08, Y13, Y13, Y13
4227  	VPALIGNR $0x08, Y8, Y8, Y8
4228  	VPALIGNR $0x04, Y4, Y4, Y4
4229  	VPALIGNR $0x04, Y1, Y1, Y1
4230  	VPALIGNR $0x04, Y2, Y2, Y2
4231  	CMPQ     R9, CX
4232  	JB       openAVX2Tail384LoopB
4233  	CMPQ     R9, $0x0a
4234  	JNE      openAVX2Tail384LoopA
4235  	MOVQ     BX, R9
4236  	SUBQ     SI, BX
4237  	MOVQ     BX, CX
4238  	MOVQ     224(BP), BX
4239  
4240  openAVX2Tail384Hash:
4241  	ADDQ  $0x10, CX
4242  	CMPQ  CX, BX
4243  	JGT   openAVX2Tail384HashEnd
4244  	ADDQ  (R9), R10
4245  	ADCQ  8(R9), R11
4246  	ADCQ  $0x01, R12
4247  	MOVQ  (BP), DX
4248  	MOVQ  DX, R15
4249  	MULXQ R10, R13, R14
4250  	IMULQ R12, R15
4251  	MULXQ R11, AX, DX
4252  	ADDQ  AX, R14
4253  	ADCQ  DX, R15
4254  	MOVQ  8(BP), DX
4255  	MULXQ R10, R10, AX
4256  	ADDQ  R10, R14
4257  	MULXQ R11, R11, R8
4258  	ADCQ  R11, R15
4259  	ADCQ  $0x00, R8
4260  	IMULQ R12, DX
4261  	ADDQ  AX, R15
4262  	ADCQ  DX, R8
4263  	MOVQ  R13, R10
4264  	MOVQ  R14, R11
4265  	MOVQ  R15, R12
4266  	ANDQ  $0x03, R12
4267  	MOVQ  R15, R13
4268  	ANDQ  $-4, R13
4269  	MOVQ  R8, R14
4270  	SHRQ  $0x02, R8, R15
4271  	SHRQ  $0x02, R8
4272  	ADDQ  R13, R10
4273  	ADCQ  R14, R11
4274  	ADCQ  $0x00, R12
4275  	ADDQ  R15, R10
4276  	ADCQ  R8, R11
4277  	ADCQ  $0x00, R12
4278  	LEAQ  16(R9), R9
4279  	JMP   openAVX2Tail384Hash
4280  
4281  openAVX2Tail384HashEnd:
4282  	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
4283  	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
4284  	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
4285  	VPADDD     32(BP), Y14, Y14
4286  	VPADDD     32(BP), Y9, Y9
4287  	VPADDD     32(BP), Y10, Y10
4288  	VPADDD     64(BP), Y12, Y12
4289  	VPADDD     64(BP), Y13, Y13
4290  	VPADDD     64(BP), Y8, Y8
4291  	VPADDD     96(BP), Y4, Y4
4292  	VPADDD     128(BP), Y1, Y1
4293  	VPADDD     160(BP), Y2, Y2
4294  	VPERM2I128 $0x02, Y0, Y14, Y3
4295  	VPERM2I128 $0x02, Y12, Y4, Y7
4296  	VPERM2I128 $0x13, Y0, Y14, Y11
4297  	VPERM2I128 $0x13, Y12, Y4, Y15
4298  	VPXOR      (SI), Y3, Y3
4299  	VPXOR      32(SI), Y7, Y7
4300  	VPXOR      64(SI), Y11, Y11
4301  	VPXOR      96(SI), Y15, Y15
4302  	VMOVDQU    Y3, (DI)
4303  	VMOVDQU    Y7, 32(DI)
4304  	VMOVDQU    Y11, 64(DI)
4305  	VMOVDQU    Y15, 96(DI)
4306  	VPERM2I128 $0x02, Y5, Y9, Y3
4307  	VPERM2I128 $0x02, Y13, Y1, Y7
4308  	VPERM2I128 $0x13, Y5, Y9, Y11
4309  	VPERM2I128 $0x13, Y13, Y1, Y15
4310  	VPXOR      128(SI), Y3, Y3
4311  	VPXOR      160(SI), Y7, Y7
4312  	VPXOR      192(SI), Y11, Y11
4313  	VPXOR      224(SI), Y15, Y15
4314  	VMOVDQU    Y3, 128(DI)
4315  	VMOVDQU    Y7, 160(DI)
4316  	VMOVDQU    Y11, 192(DI)
4317  	VMOVDQU    Y15, 224(DI)
4318  	VPERM2I128 $0x02, Y6, Y10, Y0
4319  	VPERM2I128 $0x02, Y8, Y2, Y14
4320  	VPERM2I128 $0x13, Y6, Y10, Y12
4321  	VPERM2I128 $0x13, Y8, Y2, Y4
4322  	LEAQ       256(SI), SI
4323  	LEAQ       256(DI), DI
4324  	SUBQ       $0x00000100, BX
4325  	JMP        openAVX2TailLoop
4326  
4327  openAVX2Tail512:
4328  	VMOVDQU ·chacha20Constants<>+0(SB), Y0
4329  	VMOVDQA Y0, Y5
4330  	VMOVDQA Y0, Y6
4331  	VMOVDQA Y0, Y7
4332  	VMOVDQA 32(BP), Y14
4333  	VMOVDQA Y14, Y9
4334  	VMOVDQA Y14, Y10
4335  	VMOVDQA Y14, Y11
4336  	VMOVDQA 64(BP), Y12
4337  	VMOVDQA Y12, Y13
4338  	VMOVDQA Y12, Y8
4339  	VMOVDQA Y12, Y15
4340  	VMOVDQA 192(BP), Y4
4341  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
4342  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
4343  	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
4344  	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
4345  	VMOVDQA Y4, 96(BP)
4346  	VMOVDQA Y1, 128(BP)
4347  	VMOVDQA Y2, 160(BP)
4348  	VMOVDQA Y3, 192(BP)
4349  	XORQ    CX, CX
4350  	MOVQ    SI, R9
4351  
4352  openAVX2Tail512LoopB:
4353  	ADDQ  (R9), R10
4354  	ADCQ  8(R9), R11
4355  	ADCQ  $0x01, R12
4356  	MOVQ  (BP), DX
4357  	MOVQ  DX, R15
4358  	MULXQ R10, R13, R14
4359  	IMULQ R12, R15
4360  	MULXQ R11, AX, DX
4361  	ADDQ  AX, R14
4362  	ADCQ  DX, R15
4363  	MOVQ  8(BP), DX
4364  	MULXQ R10, R10, AX
4365  	ADDQ  R10, R14
4366  	MULXQ R11, R11, R8
4367  	ADCQ  R11, R15
4368  	ADCQ  $0x00, R8
4369  	IMULQ R12, DX
4370  	ADDQ  AX, R15
4371  	ADCQ  DX, R8
4372  	MOVQ  R13, R10
4373  	MOVQ  R14, R11
4374  	MOVQ  R15, R12
4375  	ANDQ  $0x03, R12
4376  	MOVQ  R15, R13
4377  	ANDQ  $-4, R13
4378  	MOVQ  R8, R14
4379  	SHRQ  $0x02, R8, R15
4380  	SHRQ  $0x02, R8
4381  	ADDQ  R13, R10
4382  	ADCQ  R14, R11
4383  	ADCQ  $0x00, R12
4384  	ADDQ  R15, R10
4385  	ADCQ  R8, R11
4386  	ADCQ  $0x00, R12
4387  	LEAQ  16(R9), R9
4388  
4389  openAVX2Tail512LoopA:
4390  	VPADDD   Y14, Y0, Y0
4391  	VPADDD   Y9, Y5, Y5
4392  	VPADDD   Y10, Y6, Y6
4393  	VPADDD   Y11, Y7, Y7
4394  	VPXOR    Y0, Y4, Y4
4395  	VPXOR    Y5, Y1, Y1
4396  	VPXOR    Y6, Y2, Y2
4397  	VPXOR    Y7, Y3, Y3
4398  	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
4399  	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
4400  	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
4401  	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
4402  	VPADDD   Y4, Y12, Y12
4403  	VPADDD   Y1, Y13, Y13
4404  	VPADDD   Y2, Y8, Y8
4405  	VPADDD   Y3, Y15, Y15
4406  	VPXOR    Y12, Y14, Y14
4407  	VPXOR    Y13, Y9, Y9
4408  	VPXOR    Y8, Y10, Y10
4409  	VPXOR    Y15, Y11, Y11
4410  	VMOVDQA  Y15, 224(BP)
4411  	VPSLLD   $0x0c, Y14, Y15
4412  	VPSRLD   $0x14, Y14, Y14
4413  	VPXOR    Y15, Y14, Y14
4414  	VPSLLD   $0x0c, Y9, Y15
4415  	VPSRLD   $0x14, Y9, Y9
4416  	VPXOR    Y15, Y9, Y9
4417  	VPSLLD   $0x0c, Y10, Y15
4418  	VPSRLD   $0x14, Y10, Y10
4419  	VPXOR    Y15, Y10, Y10
4420  	VPSLLD   $0x0c, Y11, Y15
4421  	VPSRLD   $0x14, Y11, Y11
4422  	VPXOR    Y15, Y11, Y11
4423  	VMOVDQA  224(BP), Y15
4424  	ADDQ     (R9), R10
4425  	ADCQ     8(R9), R11
4426  	ADCQ     $0x01, R12
4427  	MOVQ     (BP), DX
4428  	MOVQ     DX, R15
4429  	MULXQ    R10, R13, R14
4430  	IMULQ    R12, R15
4431  	MULXQ    R11, AX, DX
4432  	ADDQ     AX, R14
4433  	ADCQ     DX, R15
4434  	MOVQ     8(BP), DX
4435  	MULXQ    R10, R10, AX
4436  	ADDQ     R10, R14
4437  	MULXQ    R11, R11, R8
4438  	ADCQ     R11, R15
4439  	ADCQ     $0x00, R8
4440  	IMULQ    R12, DX
4441  	ADDQ     AX, R15
4442  	ADCQ     DX, R8
4443  	MOVQ     R13, R10
4444  	MOVQ     R14, R11
4445  	MOVQ     R15, R12
4446  	ANDQ     $0x03, R12
4447  	MOVQ     R15, R13
4448  	ANDQ     $-4, R13
4449  	MOVQ     R8, R14
4450  	SHRQ     $0x02, R8, R15
4451  	SHRQ     $0x02, R8
4452  	ADDQ     R13, R10
4453  	ADCQ     R14, R11
4454  	ADCQ     $0x00, R12
4455  	ADDQ     R15, R10
4456  	ADCQ     R8, R11
4457  	ADCQ     $0x00, R12
4458  	VPADDD   Y14, Y0, Y0
4459  	VPADDD   Y9, Y5, Y5
4460  	VPADDD   Y10, Y6, Y6
4461  	VPADDD   Y11, Y7, Y7
4462  	VPXOR    Y0, Y4, Y4
4463  	VPXOR    Y5, Y1, Y1
4464  	VPXOR    Y6, Y2, Y2
4465  	VPXOR    Y7, Y3, Y3
4466  	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
4467  	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
4468  	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
4469  	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
4470  	VPADDD   Y4, Y12, Y12
4471  	VPADDD   Y1, Y13, Y13
4472  	VPADDD   Y2, Y8, Y8
4473  	VPADDD   Y3, Y15, Y15
4474  	VPXOR    Y12, Y14, Y14
4475  	VPXOR    Y13, Y9, Y9
4476  	VPXOR    Y8, Y10, Y10
4477  	VPXOR    Y15, Y11, Y11
4478  	VMOVDQA  Y15, 224(BP)
4479  	VPSLLD   $0x07, Y14, Y15
4480  	VPSRLD   $0x19, Y14, Y14
4481  	VPXOR    Y15, Y14, Y14
4482  	VPSLLD   $0x07, Y9, Y15
4483  	VPSRLD   $0x19, Y9, Y9
4484  	VPXOR    Y15, Y9, Y9
4485  	VPSLLD   $0x07, Y10, Y15
4486  	VPSRLD   $0x19, Y10, Y10
4487  	VPXOR    Y15, Y10, Y10
4488  	VPSLLD   $0x07, Y11, Y15
4489  	VPSRLD   $0x19, Y11, Y11
4490  	VPXOR    Y15, Y11, Y11
4491  	VMOVDQA  224(BP), Y15
4492  	VPALIGNR $0x04, Y14, Y14, Y14
4493  	VPALIGNR $0x04, Y9, Y9, Y9
4494  	VPALIGNR $0x04, Y10, Y10, Y10
4495  	VPALIGNR $0x04, Y11, Y11, Y11
4496  	VPALIGNR $0x08, Y12, Y12, Y12
4497  	VPALIGNR $0x08, Y13, Y13, Y13
4498  	VPALIGNR $0x08, Y8, Y8, Y8
4499  	VPALIGNR $0x08, Y15, Y15, Y15
4500  	VPALIGNR $0x0c, Y4, Y4, Y4
4501  	VPALIGNR $0x0c, Y1, Y1, Y1
4502  	VPALIGNR $0x0c, Y2, Y2, Y2
4503  	VPALIGNR $0x0c, Y3, Y3, Y3
4504  	VPADDD   Y14, Y0, Y0
4505  	VPADDD   Y9, Y5, Y5
4506  	VPADDD   Y10, Y6, Y6
4507  	VPADDD   Y11, Y7, Y7
4508  	VPXOR    Y0, Y4, Y4
4509  	VPXOR    Y5, Y1, Y1
4510  	VPXOR    Y6, Y2, Y2
4511  	VPXOR    Y7, Y3, Y3
4512  	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
4513  	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
4514  	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
4515  	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
4516  	VPADDD   Y4, Y12, Y12
4517  	VPADDD   Y1, Y13, Y13
4518  	VPADDD   Y2, Y8, Y8
4519  	VPADDD   Y3, Y15, Y15
4520  	VPXOR    Y12, Y14, Y14
4521  	VPXOR    Y13, Y9, Y9
4522  	VPXOR    Y8, Y10, Y10
4523  	VPXOR    Y15, Y11, Y11
4524  	ADDQ     16(R9), R10
4525  	ADCQ     24(R9), R11
4526  	ADCQ     $0x01, R12
4527  	MOVQ     (BP), DX
4528  	MOVQ     DX, R15
4529  	MULXQ    R10, R13, R14
4530  	IMULQ    R12, R15
4531  	MULXQ    R11, AX, DX
4532  	ADDQ     AX, R14
4533  	ADCQ     DX, R15
4534  	MOVQ     8(BP), DX
4535  	MULXQ    R10, R10, AX
4536  	ADDQ     R10, R14
4537  	MULXQ    R11, R11, R8
4538  	ADCQ     R11, R15
4539  	ADCQ     $0x00, R8
4540  	IMULQ    R12, DX
4541  	ADDQ     AX, R15
4542  	ADCQ     DX, R8
4543  	MOVQ     R13, R10
4544  	MOVQ     R14, R11
4545  	MOVQ     R15, R12
4546  	ANDQ     $0x03, R12
4547  	MOVQ     R15, R13
4548  	ANDQ     $-4, R13
4549  	MOVQ     R8, R14
4550  	SHRQ     $0x02, R8, R15
4551  	SHRQ     $0x02, R8
4552  	ADDQ     R13, R10
4553  	ADCQ     R14, R11
4554  	ADCQ     $0x00, R12
4555  	ADDQ     R15, R10
4556  	ADCQ     R8, R11
4557  	ADCQ     $0x00, R12
4558  	LEAQ     32(R9), R9
4559  	VMOVDQA  Y15, 224(BP)
4560  	VPSLLD   $0x0c, Y14, Y15
4561  	VPSRLD   $0x14, Y14, Y14
4562  	VPXOR    Y15, Y14, Y14
4563  	VPSLLD   $0x0c, Y9, Y15
4564  	VPSRLD   $0x14, Y9, Y9
4565  	VPXOR    Y15, Y9, Y9
4566  	VPSLLD   $0x0c, Y10, Y15
4567  	VPSRLD   $0x14, Y10, Y10
4568  	VPXOR    Y15, Y10, Y10
4569  	VPSLLD   $0x0c, Y11, Y15
4570  	VPSRLD   $0x14, Y11, Y11
4571  	VPXOR    Y15, Y11, Y11
4572  	VMOVDQA  224(BP), Y15
4573  	VPADDD   Y14, Y0, Y0
4574  	VPADDD   Y9, Y5, Y5
4575  	VPADDD   Y10, Y6, Y6
4576  	VPADDD   Y11, Y7, Y7
4577  	VPXOR    Y0, Y4, Y4
4578  	VPXOR    Y5, Y1, Y1
4579  	VPXOR    Y6, Y2, Y2
4580  	VPXOR    Y7, Y3, Y3
4581  	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
4582  	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
4583  	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
4584  	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
4585  	VPADDD   Y4, Y12, Y12
4586  	VPADDD   Y1, Y13, Y13
4587  	VPADDD   Y2, Y8, Y8
4588  	VPADDD   Y3, Y15, Y15
4589  	VPXOR    Y12, Y14, Y14
4590  	VPXOR    Y13, Y9, Y9
4591  	VPXOR    Y8, Y10, Y10
4592  	VPXOR    Y15, Y11, Y11
4593  	VMOVDQA  Y15, 224(BP)
4594  	VPSLLD   $0x07, Y14, Y15
4595  	VPSRLD   $0x19, Y14, Y14
4596  	VPXOR    Y15, Y14, Y14
4597  	VPSLLD   $0x07, Y9, Y15
4598  	VPSRLD   $0x19, Y9, Y9
4599  	VPXOR    Y15, Y9, Y9
4600  	VPSLLD   $0x07, Y10, Y15
4601  	VPSRLD   $0x19, Y10, Y10
4602  	VPXOR    Y15, Y10, Y10
4603  	VPSLLD   $0x07, Y11, Y15
4604  	VPSRLD   $0x19, Y11, Y11
4605  	VPXOR    Y15, Y11, Y11
4606  	VMOVDQA  224(BP), Y15
4607  	VPALIGNR $0x0c, Y14, Y14, Y14
4608  	VPALIGNR $0x0c, Y9, Y9, Y9
4609  	VPALIGNR $0x0c, Y10, Y10, Y10
4610  	VPALIGNR $0x0c, Y11, Y11, Y11
4611  	VPALIGNR $0x08, Y12, Y12, Y12
4612  	VPALIGNR $0x08, Y13, Y13, Y13
4613  	VPALIGNR $0x08, Y8, Y8, Y8
4614  	VPALIGNR $0x08, Y15, Y15, Y15
4615  	VPALIGNR $0x04, Y4, Y4, Y4
4616  	VPALIGNR $0x04, Y1, Y1, Y1
4617  	VPALIGNR $0x04, Y2, Y2, Y2
4618  	VPALIGNR $0x04, Y3, Y3, Y3
4619  	INCQ     CX
4620  	CMPQ     CX, $0x04
4621  	JLT      openAVX2Tail512LoopB
4622  	CMPQ     CX, $0x0a
4623  	JNE      openAVX2Tail512LoopA
4624  	MOVQ     BX, CX
4625  	SUBQ     $0x00000180, CX
4626  	ANDQ     $-16, CX
4627  
4628  openAVX2Tail512HashLoop:
4629  	TESTQ CX, CX
4630  	JE    openAVX2Tail512HashEnd
4631  	ADDQ  (R9), R10
4632  	ADCQ  8(R9), R11
4633  	ADCQ  $0x01, R12
4634  	MOVQ  (BP), DX
4635  	MOVQ  DX, R15
4636  	MULXQ R10, R13, R14
4637  	IMULQ R12, R15
4638  	MULXQ R11, AX, DX
4639  	ADDQ  AX, R14
4640  	ADCQ  DX, R15
4641  	MOVQ  8(BP), DX
4642  	MULXQ R10, R10, AX
4643  	ADDQ  R10, R14
4644  	MULXQ R11, R11, R8
4645  	ADCQ  R11, R15
4646  	ADCQ  $0x00, R8
4647  	IMULQ R12, DX
4648  	ADDQ  AX, R15
4649  	ADCQ  DX, R8
4650  	MOVQ  R13, R10
4651  	MOVQ  R14, R11
4652  	MOVQ  R15, R12
4653  	ANDQ  $0x03, R12
4654  	MOVQ  R15, R13
4655  	ANDQ  $-4, R13
4656  	MOVQ  R8, R14
4657  	SHRQ  $0x02, R8, R15
4658  	SHRQ  $0x02, R8
4659  	ADDQ  R13, R10
4660  	ADCQ  R14, R11
4661  	ADCQ  $0x00, R12
4662  	ADDQ  R15, R10
4663  	ADCQ  R8, R11
4664  	ADCQ  $0x00, R12
4665  	LEAQ  16(R9), R9
4666  	SUBQ  $0x10, CX
4667  	JMP   openAVX2Tail512HashLoop
4668  
4669  openAVX2Tail512HashEnd:
4670  	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
4671  	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
4672  	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
4673  	VPADDD     ·chacha20Constants<>+0(SB), Y7, Y7
4674  	VPADDD     32(BP), Y14, Y14
4675  	VPADDD     32(BP), Y9, Y9
4676  	VPADDD     32(BP), Y10, Y10
4677  	VPADDD     32(BP), Y11, Y11
4678  	VPADDD     64(BP), Y12, Y12
4679  	VPADDD     64(BP), Y13, Y13
4680  	VPADDD     64(BP), Y8, Y8
4681  	VPADDD     64(BP), Y15, Y15
4682  	VPADDD     96(BP), Y4, Y4
4683  	VPADDD     128(BP), Y1, Y1
4684  	VPADDD     160(BP), Y2, Y2
4685  	VPADDD     192(BP), Y3, Y3
4686  	VMOVDQA    Y15, 224(BP)
4687  	VPERM2I128 $0x02, Y0, Y14, Y15
4688  	VPERM2I128 $0x13, Y0, Y14, Y14
4689  	VPERM2I128 $0x02, Y12, Y4, Y0
4690  	VPERM2I128 $0x13, Y12, Y4, Y12
4691  	VPXOR      (SI), Y15, Y15
4692  	VPXOR      32(SI), Y0, Y0
4693  	VPXOR      64(SI), Y14, Y14
4694  	VPXOR      96(SI), Y12, Y12
4695  	VMOVDQU    Y15, (DI)
4696  	VMOVDQU    Y0, 32(DI)
4697  	VMOVDQU    Y14, 64(DI)
4698  	VMOVDQU    Y12, 96(DI)
4699  	VPERM2I128 $0x02, Y5, Y9, Y0
4700  	VPERM2I128 $0x02, Y13, Y1, Y14
4701  	VPERM2I128 $0x13, Y5, Y9, Y12
4702  	VPERM2I128 $0x13, Y13, Y1, Y4
4703  	VPXOR      128(SI), Y0, Y0
4704  	VPXOR      160(SI), Y14, Y14
4705  	VPXOR      192(SI), Y12, Y12
4706  	VPXOR      224(SI), Y4, Y4
4707  	VMOVDQU    Y0, 128(DI)
4708  	VMOVDQU    Y14, 160(DI)
4709  	VMOVDQU    Y12, 192(DI)
4710  	VMOVDQU    Y4, 224(DI)
4711  	VPERM2I128 $0x02, Y6, Y10, Y0
4712  	VPERM2I128 $0x02, Y8, Y2, Y14
4713  	VPERM2I128 $0x13, Y6, Y10, Y12
4714  	VPERM2I128 $0x13, Y8, Y2, Y4
4715  	VPXOR      256(SI), Y0, Y0
4716  	VPXOR      288(SI), Y14, Y14
4717  	VPXOR      320(SI), Y12, Y12
4718  	VPXOR      352(SI), Y4, Y4
4719  	VMOVDQU    Y0, 256(DI)
4720  	VMOVDQU    Y14, 288(DI)
4721  	VMOVDQU    Y12, 320(DI)
4722  	VMOVDQU    Y4, 352(DI)
4723  	VPERM2I128 $0x02, Y7, Y11, Y0
4724  	VPERM2I128 $0x02, 224(BP), Y3, Y14
4725  	VPERM2I128 $0x13, Y7, Y11, Y12
4726  	VPERM2I128 $0x13, 224(BP), Y3, Y4
4727  	LEAQ       384(SI), SI
4728  	LEAQ       384(DI), DI
4729  	SUBQ       $0x00000180, BX
4730  	JMP        openAVX2TailLoop
4731  
// chacha20Constants holds the ChaCha20 constant row. Read as little-endian
// bytes, the four 32-bit words spell the ASCII string "expand 32-byte k"
// ("expa", "nd 3", "2-by", "te k"). The 16-byte row is stored twice (32 bytes
// total) so that a 256-bit load or add places the same row in both 128-bit
// lanes of a YMM register; the 128-bit (X register) code reads only the first
// copy.
4732  DATA ·chacha20Constants<>+0(SB)/4, $0x61707865
4733  DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e
4734  DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32
4735  DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574
4736  DATA ·chacha20Constants<>+16(SB)/4, $0x61707865
4737  DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e
4738  DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32
4739  DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574
4740  GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32
4741  
// polyClampMask is a 32-byte AND mask. Its low 16 bytes are the 128-bit value
// 0x0ffffffc0ffffffc0ffffffc0fffffff, which clears the top four bits of bytes
// 3, 7, 11 and 15 and the low two bits of bytes 4, 8 and 12. The high 16 bytes
// are all ones, so they leave whatever they are ANDed with unchanged.
// chacha20Poly1305Seal ANDs the low half into X0 ("Clamp and store the key")
// before storing it at (BP), where the Poly1305 multiply code reads it.
// NOTE(review): the all-ones upper half presumably lets a single 256-bit AND
// clamp the first 16 bytes while passing the next 16 through; that 256-bit use
// is outside this excerpt - confirm.
4742  DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff
4743  DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc
4744  DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff
4745  DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff
4746  GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32
4747  
// sseIncMask is a 16-byte value whose lowest 32-bit word is 1 and whose
// remaining words are 0. The SSE path of chacha20Poly1305Seal adds it with
// PADDL to a copy of the previous block's last state row (X9 -> X10 -> X11 ->
// X15), so each successive block gets a counter word one higher than the
// block before it.
4748  DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001
4749  DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000
4750  GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16
4751  
// andMask is a table of fifteen 16-byte masks (240 bytes). The entry at byte
// offset 16*(k-1), for k = 1..15, has its low k bytes set to 0xff and its
// remaining 16-k bytes set to zero. Each entry is written as two 8-byte
// halves: low half first, then high half.
// NOTE(review): presumably indexed by the number of leftover bytes to zero the
// unused tail of a partial 16-byte block; the users of this table are outside
// this excerpt - confirm.
4752  DATA ·andMask<>+0(SB)/8, $0x00000000000000ff
4753  DATA ·andMask<>+8(SB)/8, $0x0000000000000000
4754  DATA ·andMask<>+16(SB)/8, $0x000000000000ffff
4755  DATA ·andMask<>+24(SB)/8, $0x0000000000000000
4756  DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff
4757  DATA ·andMask<>+40(SB)/8, $0x0000000000000000
4758  DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff
4759  DATA ·andMask<>+56(SB)/8, $0x0000000000000000
4760  DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff
4761  DATA ·andMask<>+72(SB)/8, $0x0000000000000000
4762  DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff
4763  DATA ·andMask<>+88(SB)/8, $0x0000000000000000
4764  DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff
4765  DATA ·andMask<>+104(SB)/8, $0x0000000000000000
4766  DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff
4767  DATA ·andMask<>+120(SB)/8, $0x0000000000000000
4768  DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff
4769  DATA ·andMask<>+136(SB)/8, $0x00000000000000ff
4770  DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff
4771  DATA ·andMask<>+152(SB)/8, $0x000000000000ffff
4772  DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff
4773  DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff
4774  DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff
4775  DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff
4776  DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff
4777  DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff
4778  DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff
4779  DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff
4780  DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff
4781  DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff
4782  GLOBL ·andMask<>(SB), RODATA|NOPTR, $240
4783  
// avx2InitMask is a 32-byte value: the lower 128-bit lane is all zero and the
// upper 128-bit lane has 1 in its lowest word (remaining words zero).
// NOTE(review): presumably added to a counter row that was broadcast to both
// lanes, so the upper lane ends up one block ahead of the lower lane; the
// users of this mask are outside this excerpt - confirm.
4784  DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000
4785  DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000
4786  DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001
4787  DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000
4788  GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32
4789  
// rol16 is a byte-shuffle control for PSHUFB/VPSHUFB. Within every 32-bit
// word the source byte indices are (2, 3, 0, 1), i.e. the two 16-bit halves
// are swapped, which rotates each uint32 left by 16 bits. The ROL16 macro
// uses it when GOAMD64_v2 is defined, and the AVX2 code uses it directly via
// VPSHUFB. The 16-byte pattern is stored twice so the 256-bit form applies
// the same permutation in both 128-bit lanes.
4790  DATA ·rol16<>+0(SB)/8, $0x0504070601000302
4791  DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a
4792  DATA ·rol16<>+16(SB)/8, $0x0504070601000302
4793  DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a
4794  GLOBL ·rol16<>(SB), RODATA|NOPTR, $32
4795  
// rol8 is a byte-shuffle control for PSHUFB/VPSHUFB. Within every 32-bit word
// the source byte indices are (3, 0, 1, 2): the top byte moves to the bottom
// and the other three move up one position, which rotates each uint32 left by
// 8 bits. The ROL8 macro uses it when GOAMD64_v2 is defined, and the AVX2 code
// uses it directly via VPSHUFB. The 16-byte pattern is stored twice so the
// 256-bit form applies the same permutation in both 128-bit lanes.
4796  DATA ·rol8<>+0(SB)/8, $0x0605040702010003
4797  DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b
4798  DATA ·rol8<>+16(SB)/8, $0x0605040702010003
4799  DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b
4800  GLOBL ·rol8<>(SB), RODATA|NOPTR, $32
4801  
// avx2IncMask is a 32-byte value with 2 in the lowest word of each 128-bit
// lane and zero elsewhere. The AVX2 tail paths add it with VPADDD to the row
// loaded from 192(BP), chaining Y4 -> Y1 -> Y2 -> Y3, and save each result
// (96(BP), 128(BP), ...) so it can be added back into the state after the
// rounds.
// NOTE(review): the step of 2 is presumably because each YMM register set
// carries two blocks, one per lane (three sets are described as "six
// blocks") - confirm against the counter setup outside this excerpt.
4802  DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002
4803  DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000
4804  DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002
4805  DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000
4806  GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32
4807  
4808  // func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte)
4809  // Requires: AVX, AVX2, BMI2, CMOV, SSE2
4810  TEXT ·chacha20Poly1305Seal(SB), $288-96
4811  	MOVQ SP, BP
4812  	ADDQ $0x20, BP
4813  	ANDQ $-32, BP
4814  	MOVQ dst_base+0(FP), DI
4815  	MOVQ key_base+24(FP), R8
4816  	MOVQ src_base+48(FP), SI
4817  	MOVQ src_len+56(FP), BX
4818  	MOVQ ad_base+72(FP), CX
4819  	CMPB ·useAVX2+0(SB), $0x01
4820  	JE   chacha20Poly1305Seal_AVX2
4821  
4822  	// Special optimization, for very short buffers
4823  	CMPQ BX, $0x80
4824  	JBE  sealSSE128
4825  
4826  	// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
4827  	MOVOU ·chacha20Constants<>+0(SB), X0
4828  	MOVOU 16(R8), X3
4829  	MOVOU 32(R8), X6
4830  	MOVOU 48(R8), X9
4831  
4832  	// Store state on stack for future use
4833  	MOVO X3, 32(BP)
4834  	MOVO X6, 48(BP)
4835  
4836  	// Load state, increment counter blocks
4837  	MOVO  X0, X1
4838  	MOVO  X3, X4
4839  	MOVO  X6, X7
4840  	MOVO  X9, X10
4841  	PADDL ·sseIncMask<>+0(SB), X10
4842  	MOVO  X1, X2
4843  	MOVO  X4, X5
4844  	MOVO  X7, X8
4845  	MOVO  X10, X11
4846  	PADDL ·sseIncMask<>+0(SB), X11
4847  	MOVO  X2, X12
4848  	MOVO  X5, X13
4849  	MOVO  X8, X14
4850  	MOVO  X11, X15
4851  	PADDL ·sseIncMask<>+0(SB), X15
4852  
4853  	// Store counters
4854  	MOVO X9, 80(BP)
4855  	MOVO X10, 96(BP)
4856  	MOVO X11, 112(BP)
4857  	MOVO X15, 128(BP)
4858  	MOVQ $0x0000000a, R9
4859  
4860  sealSSEIntroLoop:
4861  	MOVO  X14, 64(BP)
4862  	PADDD X3, X0
4863  	PXOR  X0, X9
4864  	ROL16(X9, X14)
4865  	PADDD X9, X6
4866  	PXOR  X6, X3
4867  	MOVO  X3, X14
4868  	PSLLL $0x0c, X14
4869  	PSRLL $0x14, X3
4870  	PXOR  X14, X3
4871  	PADDD X3, X0
4872  	PXOR  X0, X9
4873  	ROL8(X9, X14)
4874  	PADDD X9, X6
4875  	PXOR  X6, X3
4876  	MOVO  X3, X14
4877  	PSLLL $0x07, X14
4878  	PSRLL $0x19, X3
4879  	PXOR  X14, X3
4880  	PADDD X4, X1
4881  	PXOR  X1, X10
4882  	ROL16(X10, X14)
4883  	PADDD X10, X7
4884  	PXOR  X7, X4
4885  	MOVO  X4, X14
4886  	PSLLL $0x0c, X14
4887  	PSRLL $0x14, X4
4888  	PXOR  X14, X4
4889  	PADDD X4, X1
4890  	PXOR  X1, X10
4891  	ROL8(X10, X14)
4892  	PADDD X10, X7
4893  	PXOR  X7, X4
4894  	MOVO  X4, X14
4895  	PSLLL $0x07, X14
4896  	PSRLL $0x19, X4
4897  	PXOR  X14, X4
4898  	PADDD X5, X2
4899  	PXOR  X2, X11
4900  	ROL16(X11, X14)
4901  	PADDD X11, X8
4902  	PXOR  X8, X5
4903  	MOVO  X5, X14
4904  	PSLLL $0x0c, X14
4905  	PSRLL $0x14, X5
4906  	PXOR  X14, X5
4907  	PADDD X5, X2
4908  	PXOR  X2, X11
4909  	ROL8(X11, X14)
4910  	PADDD X11, X8
4911  	PXOR  X8, X5
4912  	MOVO  X5, X14
4913  	PSLLL $0x07, X14
4914  	PSRLL $0x19, X5
4915  	PXOR  X14, X5
4916  	MOVO  64(BP), X14
4917  	MOVO  X7, 64(BP)
4918  	PADDD X13, X12
4919  	PXOR  X12, X15
4920  	ROL16(X15, X7)
4921  	PADDD X15, X14
4922  	PXOR  X14, X13
4923  	MOVO  X13, X7
4924  	PSLLL $0x0c, X7
4925  	PSRLL $0x14, X13
4926  	PXOR  X7, X13
4927  	PADDD X13, X12
4928  	PXOR  X12, X15
4929  	ROL8(X15, X7)
4930  	PADDD X15, X14
4931  	PXOR  X14, X13
4932  	MOVO  X13, X7
4933  	PSLLL $0x07, X7
4934  	PSRLL $0x19, X13
4935  	PXOR  X7, X13
4936  	MOVO  64(BP), X7
4937  	BYTE  $0x66
4938  	BYTE  $0x0f
4939  	BYTE  $0x3a
4940  	BYTE  $0x0f
4941  	BYTE  $0xdb
4942  	BYTE  $0x04
4943  	BYTE  $0x66
4944  	BYTE  $0x0f
4945  	BYTE  $0x3a
4946  	BYTE  $0x0f
4947  	BYTE  $0xe4
4948  	BYTE  $0x04
4949  	BYTE  $0x66
4950  	BYTE  $0x0f
4951  	BYTE  $0x3a
4952  	BYTE  $0x0f
4953  	BYTE  $0xed
4954  	BYTE  $0x04
4955  	BYTE  $0x66
4956  	BYTE  $0x45
4957  	BYTE  $0x0f
4958  	BYTE  $0x3a
4959  	BYTE  $0x0f
4960  	BYTE  $0xed
4961  	BYTE  $0x04
4962  	BYTE  $0x66
4963  	BYTE  $0x0f
4964  	BYTE  $0x3a
4965  	BYTE  $0x0f
4966  	BYTE  $0xf6
4967  	BYTE  $0x08
4968  	BYTE  $0x66
4969  	BYTE  $0x0f
4970  	BYTE  $0x3a
4971  	BYTE  $0x0f
4972  	BYTE  $0xff
4973  	BYTE  $0x08
4974  	BYTE  $0x66
4975  	BYTE  $0x45
4976  	BYTE  $0x0f
4977  	BYTE  $0x3a
4978  	BYTE  $0x0f
4979  	BYTE  $0xc0
4980  	BYTE  $0x08
4981  	BYTE  $0x66
4982  	BYTE  $0x45
4983  	BYTE  $0x0f
4984  	BYTE  $0x3a
4985  	BYTE  $0x0f
4986  	BYTE  $0xf6
4987  	BYTE  $0x08
4988  	BYTE  $0x66
4989  	BYTE  $0x45
4990  	BYTE  $0x0f
4991  	BYTE  $0x3a
4992  	BYTE  $0x0f
4993  	BYTE  $0xc9
4994  	BYTE  $0x0c
4995  	BYTE  $0x66
4996  	BYTE  $0x45
4997  	BYTE  $0x0f
4998  	BYTE  $0x3a
4999  	BYTE  $0x0f
5000  	BYTE  $0xd2
5001  	BYTE  $0x0c
5002  	BYTE  $0x66
5003  	BYTE  $0x45
5004  	BYTE  $0x0f
5005  	BYTE  $0x3a
5006  	BYTE  $0x0f
5007  	BYTE  $0xdb
5008  	BYTE  $0x0c
5009  	BYTE  $0x66
5010  	BYTE  $0x45
5011  	BYTE  $0x0f
5012  	BYTE  $0x3a
5013  	BYTE  $0x0f
5014  	BYTE  $0xff
5015  	BYTE  $0x0c
5016  	MOVO  X14, 64(BP)
5017  	PADDD X3, X0
5018  	PXOR  X0, X9
5019  	ROL16(X9, X14)
5020  	PADDD X9, X6
5021  	PXOR  X6, X3
5022  	MOVO  X3, X14
5023  	PSLLL $0x0c, X14
5024  	PSRLL $0x14, X3
5025  	PXOR  X14, X3
5026  	PADDD X3, X0
5027  	PXOR  X0, X9
5028  	ROL8(X9, X14)
5029  	PADDD X9, X6
5030  	PXOR  X6, X3
5031  	MOVO  X3, X14
5032  	PSLLL $0x07, X14
5033  	PSRLL $0x19, X3
5034  	PXOR  X14, X3
5035  	PADDD X4, X1
5036  	PXOR  X1, X10
5037  	ROL16(X10, X14)
5038  	PADDD X10, X7
5039  	PXOR  X7, X4
5040  	MOVO  X4, X14
5041  	PSLLL $0x0c, X14
5042  	PSRLL $0x14, X4
5043  	PXOR  X14, X4
5044  	PADDD X4, X1
5045  	PXOR  X1, X10
5046  	ROL8(X10, X14)
5047  	PADDD X10, X7
5048  	PXOR  X7, X4
5049  	MOVO  X4, X14
5050  	PSLLL $0x07, X14
5051  	PSRLL $0x19, X4
5052  	PXOR  X14, X4
5053  	PADDD X5, X2
5054  	PXOR  X2, X11
5055  	ROL16(X11, X14)
5056  	PADDD X11, X8
5057  	PXOR  X8, X5
5058  	MOVO  X5, X14
5059  	PSLLL $0x0c, X14
5060  	PSRLL $0x14, X5
5061  	PXOR  X14, X5
5062  	PADDD X5, X2
5063  	PXOR  X2, X11
5064  	ROL8(X11, X14)
5065  	PADDD X11, X8
5066  	PXOR  X8, X5
5067  	MOVO  X5, X14
5068  	PSLLL $0x07, X14
5069  	PSRLL $0x19, X5
5070  	PXOR  X14, X5
5071  	MOVO  64(BP), X14
5072  	MOVO  X7, 64(BP)
5073  	PADDD X13, X12
5074  	PXOR  X12, X15
5075  	ROL16(X15, X7)
5076  	PADDD X15, X14
5077  	PXOR  X14, X13
5078  	MOVO  X13, X7
5079  	PSLLL $0x0c, X7
5080  	PSRLL $0x14, X13
5081  	PXOR  X7, X13
5082  	PADDD X13, X12
5083  	PXOR  X12, X15
5084  	ROL8(X15, X7)
5085  	PADDD X15, X14
5086  	PXOR  X14, X13
5087  	MOVO  X13, X7
5088  	PSLLL $0x07, X7
5089  	PSRLL $0x19, X13
5090  	PXOR  X7, X13
5091  	MOVO  64(BP), X7
5092  	BYTE  $0x66
5093  	BYTE  $0x0f
5094  	BYTE  $0x3a
5095  	BYTE  $0x0f
5096  	BYTE  $0xdb
5097  	BYTE  $0x0c
5098  	BYTE  $0x66
5099  	BYTE  $0x0f
5100  	BYTE  $0x3a
5101  	BYTE  $0x0f
5102  	BYTE  $0xe4
5103  	BYTE  $0x0c
5104  	BYTE  $0x66
5105  	BYTE  $0x0f
5106  	BYTE  $0x3a
5107  	BYTE  $0x0f
5108  	BYTE  $0xed
5109  	BYTE  $0x0c
5110  	BYTE  $0x66
5111  	BYTE  $0x45
5112  	BYTE  $0x0f
5113  	BYTE  $0x3a
5114  	BYTE  $0x0f
5115  	BYTE  $0xed
5116  	BYTE  $0x0c
5117  	BYTE  $0x66
5118  	BYTE  $0x0f
5119  	BYTE  $0x3a
5120  	BYTE  $0x0f
5121  	BYTE  $0xf6
5122  	BYTE  $0x08
5123  	BYTE  $0x66
5124  	BYTE  $0x0f
5125  	BYTE  $0x3a
5126  	BYTE  $0x0f
5127  	BYTE  $0xff
5128  	BYTE  $0x08
5129  	BYTE  $0x66
5130  	BYTE  $0x45
5131  	BYTE  $0x0f
5132  	BYTE  $0x3a
5133  	BYTE  $0x0f
5134  	BYTE  $0xc0
5135  	BYTE  $0x08
5136  	BYTE  $0x66
5137  	BYTE  $0x45
5138  	BYTE  $0x0f
5139  	BYTE  $0x3a
5140  	BYTE  $0x0f
5141  	BYTE  $0xf6
5142  	BYTE  $0x08
5143  	BYTE  $0x66
5144  	BYTE  $0x45
5145  	BYTE  $0x0f
5146  	BYTE  $0x3a
5147  	BYTE  $0x0f
5148  	BYTE  $0xc9
5149  	BYTE  $0x04
5150  	BYTE  $0x66
5151  	BYTE  $0x45
5152  	BYTE  $0x0f
5153  	BYTE  $0x3a
5154  	BYTE  $0x0f
5155  	BYTE  $0xd2
5156  	BYTE  $0x04
5157  	BYTE  $0x66
5158  	BYTE  $0x45
5159  	BYTE  $0x0f
5160  	BYTE  $0x3a
5161  	BYTE  $0x0f
5162  	BYTE  $0xdb
5163  	BYTE  $0x04
5164  	BYTE  $0x66
5165  	BYTE  $0x45
5166  	BYTE  $0x0f
5167  	BYTE  $0x3a
5168  	BYTE  $0x0f
5169  	BYTE  $0xff
5170  	BYTE  $0x04
5171  	DECQ  R9
5172  	JNE   sealSSEIntroLoop
5173  
5174  	// Add in the state
5175  	PADDD ·chacha20Constants<>+0(SB), X0
5176  	PADDD ·chacha20Constants<>+0(SB), X1
5177  	PADDD ·chacha20Constants<>+0(SB), X2
5178  	PADDD ·chacha20Constants<>+0(SB), X12
5179  	PADDD 32(BP), X3
5180  	PADDD 32(BP), X4
5181  	PADDD 32(BP), X5
5182  	PADDD 32(BP), X13
5183  	PADDD 48(BP), X7
5184  	PADDD 48(BP), X8
5185  	PADDD 48(BP), X14
5186  	PADDD 96(BP), X10
5187  	PADDD 112(BP), X11
5188  	PADDD 128(BP), X15
5189  
5190  	// Clamp and store the key
5191  	PAND ·polyClampMask<>+0(SB), X0
5192  	MOVO X0, (BP)
5193  	MOVO X3, 16(BP)
5194  
5195  	// Hash AAD
5196  	MOVQ  ad_len+80(FP), R9
5197  	CALL  polyHashADInternal<>(SB)
5198  	MOVOU (SI), X0
5199  	MOVOU 16(SI), X3
5200  	MOVOU 32(SI), X6
5201  	MOVOU 48(SI), X9
5202  	PXOR  X0, X1
5203  	PXOR  X3, X4
5204  	PXOR  X6, X7
5205  	PXOR  X9, X10
5206  	MOVOU X1, (DI)
5207  	MOVOU X4, 16(DI)
5208  	MOVOU X7, 32(DI)
5209  	MOVOU X10, 48(DI)
5210  	MOVOU 64(SI), X0
5211  	MOVOU 80(SI), X3
5212  	MOVOU 96(SI), X6
5213  	MOVOU 112(SI), X9
5214  	PXOR  X0, X2
5215  	PXOR  X3, X5
5216  	PXOR  X6, X8
5217  	PXOR  X9, X11
5218  	MOVOU X2, 64(DI)
5219  	MOVOU X5, 80(DI)
5220  	MOVOU X8, 96(DI)
5221  	MOVOU X11, 112(DI)
5222  	MOVQ  $0x00000080, CX
5223  	SUBQ  $0x80, BX
5224  	LEAQ  128(SI), SI
5225  	MOVO  X12, X1
5226  	MOVO  X13, X4
5227  	MOVO  X14, X7
5228  	MOVO  X15, X10
5229  	CMPQ  BX, $0x40
5230  	JBE   sealSSE128SealHash
5231  	MOVOU (SI), X0
5232  	MOVOU 16(SI), X3
5233  	MOVOU 32(SI), X6
5234  	MOVOU 48(SI), X9
5235  	PXOR  X0, X12
5236  	PXOR  X3, X13
5237  	PXOR  X6, X14
5238  	PXOR  X9, X15
5239  	MOVOU X12, 128(DI)
5240  	MOVOU X13, 144(DI)
5241  	MOVOU X14, 160(DI)
5242  	MOVOU X15, 176(DI)
5243  	ADDQ  $0x40, CX
5244  	SUBQ  $0x40, BX
5245  	LEAQ  64(SI), SI
5246  	MOVQ  $0x00000002, CX
5247  	MOVQ  $0x00000008, R9
5248  	CMPQ  BX, $0x40
5249  	JBE   sealSSETail64
5250  	CMPQ  BX, $0x80
5251  	JBE   sealSSETail128
5252  	CMPQ  BX, $0xc0
5253  	JBE   sealSSETail192
5254  
5255  sealSSEMainLoop:
5256  	// Load state, increment counter blocks
5257  	MOVO  ·chacha20Constants<>+0(SB), X0
5258  	MOVO  32(BP), X3
5259  	MOVO  48(BP), X6
5260  	MOVO  128(BP), X9
5261  	PADDL ·sseIncMask<>+0(SB), X9
5262  	MOVO  X0, X1
5263  	MOVO  X3, X4
5264  	MOVO  X6, X7
5265  	MOVO  X9, X10
5266  	PADDL ·sseIncMask<>+0(SB), X10
5267  	MOVO  X1, X2
5268  	MOVO  X4, X5
5269  	MOVO  X7, X8
5270  	MOVO  X10, X11
5271  	PADDL ·sseIncMask<>+0(SB), X11
5272  	MOVO  X2, X12
5273  	MOVO  X5, X13
5274  	MOVO  X8, X14
5275  	MOVO  X11, X15
5276  	PADDL ·sseIncMask<>+0(SB), X15
5277  
5278  	// Store counters
5279  	MOVO X9, 80(BP)
5280  	MOVO X10, 96(BP)
5281  	MOVO X11, 112(BP)
5282  	MOVO X15, 128(BP)
5283  
5284  sealSSEInnerLoop:
5285  	MOVO  X14, 64(BP)
5286  	PADDD X3, X0
5287  	PXOR  X0, X9
5288  	ROL16(X9, X14)
5289  	PADDD X9, X6
5290  	PXOR  X6, X3
5291  	MOVO  X3, X14
5292  	PSLLL $0x0c, X14
5293  	PSRLL $0x14, X3
5294  	PXOR  X14, X3
5295  	PADDD X3, X0
5296  	PXOR  X0, X9
5297  	ROL8(X9, X14)
5298  	PADDD X9, X6
5299  	PXOR  X6, X3
5300  	MOVO  X3, X14
5301  	PSLLL $0x07, X14
5302  	PSRLL $0x19, X3
5303  	PXOR  X14, X3
5304  	PADDD X4, X1
5305  	PXOR  X1, X10
5306  	ROL16(X10, X14)
5307  	PADDD X10, X7
5308  	PXOR  X7, X4
5309  	MOVO  X4, X14
5310  	PSLLL $0x0c, X14
5311  	PSRLL $0x14, X4
5312  	PXOR  X14, X4
5313  	PADDD X4, X1
5314  	PXOR  X1, X10
5315  	ROL8(X10, X14)
5316  	PADDD X10, X7
5317  	PXOR  X7, X4
5318  	MOVO  X4, X14
5319  	PSLLL $0x07, X14
5320  	PSRLL $0x19, X4
5321  	PXOR  X14, X4
5322  	PADDD X5, X2
5323  	PXOR  X2, X11
5324  	ROL16(X11, X14)
5325  	PADDD X11, X8
5326  	PXOR  X8, X5
5327  	MOVO  X5, X14
5328  	PSLLL $0x0c, X14
5329  	PSRLL $0x14, X5
5330  	PXOR  X14, X5
5331  	PADDD X5, X2
5332  	PXOR  X2, X11
5333  	ROL8(X11, X14)
5334  	PADDD X11, X8
5335  	PXOR  X8, X5
5336  	MOVO  X5, X14
5337  	PSLLL $0x07, X14
5338  	PSRLL $0x19, X5
5339  	PXOR  X14, X5
5340  	MOVO  64(BP), X14
5341  	MOVO  X7, 64(BP)
5342  	PADDD X13, X12
5343  	PXOR  X12, X15
5344  	ROL16(X15, X7)
5345  	PADDD X15, X14
5346  	PXOR  X14, X13
5347  	MOVO  X13, X7
5348  	PSLLL $0x0c, X7
5349  	PSRLL $0x14, X13
5350  	PXOR  X7, X13
5351  	PADDD X13, X12
5352  	PXOR  X12, X15
5353  	ROL8(X15, X7)
5354  	PADDD X15, X14
5355  	PXOR  X14, X13
5356  	MOVO  X13, X7
5357  	PSLLL $0x07, X7
5358  	PSRLL $0x19, X13
5359  	PXOR  X7, X13
5360  	MOVO  64(BP), X7
5361  	ADDQ  (DI), R10
5362  	ADCQ  8(DI), R11
5363  	ADCQ  $0x01, R12
5364  	BYTE  $0x66
5365  	BYTE  $0x0f
5366  	BYTE  $0x3a
5367  	BYTE  $0x0f
5368  	BYTE  $0xdb
5369  	BYTE  $0x04
5370  	BYTE  $0x66
5371  	BYTE  $0x0f
5372  	BYTE  $0x3a
5373  	BYTE  $0x0f
5374  	BYTE  $0xe4
5375  	BYTE  $0x04
5376  	BYTE  $0x66
5377  	BYTE  $0x0f
5378  	BYTE  $0x3a
5379  	BYTE  $0x0f
5380  	BYTE  $0xed
5381  	BYTE  $0x04
5382  	BYTE  $0x66
5383  	BYTE  $0x45
5384  	BYTE  $0x0f
5385  	BYTE  $0x3a
5386  	BYTE  $0x0f
5387  	BYTE  $0xed
5388  	BYTE  $0x04
5389  	BYTE  $0x66
5390  	BYTE  $0x0f
5391  	BYTE  $0x3a
5392  	BYTE  $0x0f
5393  	BYTE  $0xf6
5394  	BYTE  $0x08
5395  	BYTE  $0x66
5396  	BYTE  $0x0f
5397  	BYTE  $0x3a
5398  	BYTE  $0x0f
5399  	BYTE  $0xff
5400  	BYTE  $0x08
5401  	BYTE  $0x66
5402  	BYTE  $0x45
5403  	BYTE  $0x0f
5404  	BYTE  $0x3a
5405  	BYTE  $0x0f
5406  	BYTE  $0xc0
5407  	BYTE  $0x08
5408  	BYTE  $0x66
5409  	BYTE  $0x45
5410  	BYTE  $0x0f
5411  	BYTE  $0x3a
5412  	BYTE  $0x0f
5413  	BYTE  $0xf6
5414  	BYTE  $0x08
5415  	BYTE  $0x66
5416  	BYTE  $0x45
5417  	BYTE  $0x0f
5418  	BYTE  $0x3a
5419  	BYTE  $0x0f
5420  	BYTE  $0xc9
5421  	BYTE  $0x0c
5422  	BYTE  $0x66
5423  	BYTE  $0x45
5424  	BYTE  $0x0f
5425  	BYTE  $0x3a
5426  	BYTE  $0x0f
5427  	BYTE  $0xd2
5428  	BYTE  $0x0c
5429  	BYTE  $0x66
5430  	BYTE  $0x45
5431  	BYTE  $0x0f
5432  	BYTE  $0x3a
5433  	BYTE  $0x0f
5434  	BYTE  $0xdb
5435  	BYTE  $0x0c
5436  	BYTE  $0x66
5437  	BYTE  $0x45
5438  	BYTE  $0x0f
5439  	BYTE  $0x3a
5440  	BYTE  $0x0f
5441  	BYTE  $0xff
5442  	BYTE  $0x0c
5443  	MOVQ  (BP), AX
5444  	MOVQ  AX, R15
5445  	MULQ  R10
5446  	MOVQ  AX, R13
5447  	MOVQ  DX, R14
5448  	MOVQ  (BP), AX
5449  	MULQ  R11
5450  	IMULQ R12, R15
5451  	ADDQ  AX, R14
5452  	ADCQ  DX, R15
5453  	MOVQ  8(BP), AX
5454  	MOVQ  AX, R8
5455  	MULQ  R10
5456  	ADDQ  AX, R14
5457  	ADCQ  $0x00, DX
5458  	MOVQ  DX, R10
5459  	MOVQ  8(BP), AX
5460  	MULQ  R11
5461  	ADDQ  AX, R15
5462  	ADCQ  $0x00, DX
5463  	LEAQ  16(DI), DI
5464  	MOVO  X14, 64(BP)
5465  	PADDD X3, X0
5466  	PXOR  X0, X9
5467  	ROL16(X9, X14)
5468  	PADDD X9, X6
5469  	PXOR  X6, X3
5470  	MOVO  X3, X14
5471  	PSLLL $0x0c, X14
5472  	PSRLL $0x14, X3
5473  	PXOR  X14, X3
5474  	PADDD X3, X0
5475  	PXOR  X0, X9
5476  	ROL8(X9, X14)
5477  	PADDD X9, X6
5478  	PXOR  X6, X3
5479  	MOVO  X3, X14
5480  	PSLLL $0x07, X14
5481  	PSRLL $0x19, X3
5482  	PXOR  X14, X3
5483  	PADDD X4, X1
5484  	PXOR  X1, X10
5485  	ROL16(X10, X14)
5486  	PADDD X10, X7
5487  	PXOR  X7, X4
5488  	MOVO  X4, X14
5489  	PSLLL $0x0c, X14
5490  	PSRLL $0x14, X4
5491  	PXOR  X14, X4
5492  	PADDD X4, X1
5493  	PXOR  X1, X10
5494  	ROL8(X10, X14)
5495  	PADDD X10, X7
5496  	PXOR  X7, X4
5497  	MOVO  X4, X14
5498  	PSLLL $0x07, X14
5499  	PSRLL $0x19, X4
5500  	PXOR  X14, X4
5501  	PADDD X5, X2
5502  	PXOR  X2, X11
5503  	ROL16(X11, X14)
5504  	PADDD X11, X8
5505  	PXOR  X8, X5
5506  	MOVO  X5, X14
5507  	PSLLL $0x0c, X14
5508  	PSRLL $0x14, X5
5509  	PXOR  X14, X5
5510  	PADDD X5, X2
5511  	PXOR  X2, X11
5512  	ROL8(X11, X14)
5513  	PADDD X11, X8
5514  	PXOR  X8, X5
5515  	MOVO  X5, X14
5516  	PSLLL $0x07, X14
5517  	PSRLL $0x19, X5
5518  	PXOR  X14, X5
5519  	MOVO  64(BP), X14
5520  	MOVO  X7, 64(BP)
5521  	IMULQ R12, R8
5522  	ADDQ  R10, R15
5523  	ADCQ  DX, R8
5524  	PADDD X13, X12
5525  	PXOR  X12, X15
5526  	ROL16(X15, X7)
5527  	PADDD X15, X14
5528  	PXOR  X14, X13
5529  	MOVO  X13, X7
5530  	PSLLL $0x0c, X7
5531  	PSRLL $0x14, X13
5532  	PXOR  X7, X13
5533  	PADDD X13, X12
5534  	PXOR  X12, X15
5535  	ROL8(X15, X7)
5536  	PADDD X15, X14
5537  	PXOR  X14, X13
5538  	MOVO  X13, X7
5539  	PSLLL $0x07, X7
5540  	PSRLL $0x19, X13
5541  	PXOR  X7, X13
5542  	MOVO  64(BP), X7
5543  	MOVQ  R13, R10
5544  	MOVQ  R14, R11
5545  	MOVQ  R15, R12
5546  	ANDQ  $0x03, R12
5547  	MOVQ  R15, R13
5548  	ANDQ  $-4, R13
5549  	MOVQ  R8, R14
5550  	SHRQ  $0x02, R8, R15
5551  	SHRQ  $0x02, R8
5552  	ADDQ  R13, R10
5553  	ADCQ  R14, R11
5554  	ADCQ  $0x00, R12
5555  	ADDQ  R15, R10
5556  	ADCQ  R8, R11
5557  	ADCQ  $0x00, R12
5558  	BYTE  $0x66
5559  	BYTE  $0x0f
5560  	BYTE  $0x3a
5561  	BYTE  $0x0f
5562  	BYTE  $0xdb
5563  	BYTE  $0x0c
5564  	BYTE  $0x66
5565  	BYTE  $0x0f
5566  	BYTE  $0x3a
5567  	BYTE  $0x0f
5568  	BYTE  $0xe4
5569  	BYTE  $0x0c
5570  	BYTE  $0x66
5571  	BYTE  $0x0f
5572  	BYTE  $0x3a
5573  	BYTE  $0x0f
5574  	BYTE  $0xed
5575  	BYTE  $0x0c
5576  	BYTE  $0x66
5577  	BYTE  $0x45
5578  	BYTE  $0x0f
5579  	BYTE  $0x3a
5580  	BYTE  $0x0f
5581  	BYTE  $0xed
5582  	BYTE  $0x0c
5583  	BYTE  $0x66
5584  	BYTE  $0x0f
5585  	BYTE  $0x3a
5586  	BYTE  $0x0f
5587  	BYTE  $0xf6
5588  	BYTE  $0x08
5589  	BYTE  $0x66
5590  	BYTE  $0x0f
5591  	BYTE  $0x3a
5592  	BYTE  $0x0f
5593  	BYTE  $0xff
5594  	BYTE  $0x08
5595  	BYTE  $0x66
5596  	BYTE  $0x45
5597  	BYTE  $0x0f
5598  	BYTE  $0x3a
5599  	BYTE  $0x0f
5600  	BYTE  $0xc0
5601  	BYTE  $0x08
5602  	BYTE  $0x66
5603  	BYTE  $0x45
5604  	BYTE  $0x0f
5605  	BYTE  $0x3a
5606  	BYTE  $0x0f
5607  	BYTE  $0xf6
5608  	BYTE  $0x08
5609  	BYTE  $0x66
5610  	BYTE  $0x45
5611  	BYTE  $0x0f
5612  	BYTE  $0x3a
5613  	BYTE  $0x0f
5614  	BYTE  $0xc9
5615  	BYTE  $0x04
5616  	BYTE  $0x66
5617  	BYTE  $0x45
5618  	BYTE  $0x0f
5619  	BYTE  $0x3a
5620  	BYTE  $0x0f
5621  	BYTE  $0xd2
5622  	BYTE  $0x04
5623  	BYTE  $0x66
5624  	BYTE  $0x45
5625  	BYTE  $0x0f
5626  	BYTE  $0x3a
5627  	BYTE  $0x0f
5628  	BYTE  $0xdb
5629  	BYTE  $0x04
5630  	BYTE  $0x66
5631  	BYTE  $0x45
5632  	BYTE  $0x0f
5633  	BYTE  $0x3a
5634  	BYTE  $0x0f
5635  	BYTE  $0xff
5636  	BYTE  $0x04
5637  	DECQ  R9
5638  	JGE   sealSSEInnerLoop
5639  	ADDQ  (DI), R10
5640  	ADCQ  8(DI), R11
5641  	ADCQ  $0x01, R12
5642  	MOVQ  (BP), AX
5643  	MOVQ  AX, R15
5644  	MULQ  R10
5645  	MOVQ  AX, R13
5646  	MOVQ  DX, R14
5647  	MOVQ  (BP), AX
5648  	MULQ  R11
5649  	IMULQ R12, R15
5650  	ADDQ  AX, R14
5651  	ADCQ  DX, R15
5652  	MOVQ  8(BP), AX
5653  	MOVQ  AX, R8
5654  	MULQ  R10
5655  	ADDQ  AX, R14
5656  	ADCQ  $0x00, DX
5657  	MOVQ  DX, R10
5658  	MOVQ  8(BP), AX
5659  	MULQ  R11
5660  	ADDQ  AX, R15
5661  	ADCQ  $0x00, DX
5662  	IMULQ R12, R8
5663  	ADDQ  R10, R15
5664  	ADCQ  DX, R8
5665  	MOVQ  R13, R10
5666  	MOVQ  R14, R11
5667  	MOVQ  R15, R12
5668  	ANDQ  $0x03, R12
5669  	MOVQ  R15, R13
5670  	ANDQ  $-4, R13
5671  	MOVQ  R8, R14
5672  	SHRQ  $0x02, R8, R15
5673  	SHRQ  $0x02, R8
5674  	ADDQ  R13, R10
5675  	ADCQ  R14, R11
5676  	ADCQ  $0x00, R12
5677  	ADDQ  R15, R10
5678  	ADCQ  R8, R11
5679  	ADCQ  $0x00, R12
5680  	LEAQ  16(DI), DI
5681  	DECQ  CX
5682  	JG    sealSSEInnerLoop
5683  
5684  	// Add in the state
5685  	PADDD ·chacha20Constants<>+0(SB), X0
5686  	PADDD ·chacha20Constants<>+0(SB), X1
5687  	PADDD ·chacha20Constants<>+0(SB), X2
5688  	PADDD ·chacha20Constants<>+0(SB), X12
5689  	PADDD 32(BP), X3
5690  	PADDD 32(BP), X4
5691  	PADDD 32(BP), X5
5692  	PADDD 32(BP), X13
5693  	PADDD 48(BP), X6
5694  	PADDD 48(BP), X7
5695  	PADDD 48(BP), X8
5696  	PADDD 48(BP), X14
5697  	PADDD 80(BP), X9
5698  	PADDD 96(BP), X10
5699  	PADDD 112(BP), X11
5700  	PADDD 128(BP), X15
5701  	MOVO  X15, 64(BP)
5702  
5703  	// Load - xor - store
5704  	MOVOU (SI), X15
5705  	PXOR  X15, X0
5706  	MOVOU 16(SI), X15
5707  	PXOR  X15, X3
5708  	MOVOU 32(SI), X15
5709  	PXOR  X15, X6
5710  	MOVOU 48(SI), X15
5711  	PXOR  X15, X9
5712  	MOVOU X0, (DI)
5713  	MOVOU X3, 16(DI)
5714  	MOVOU X6, 32(DI)
5715  	MOVOU X9, 48(DI)
5716  	MOVO  64(BP), X15
5717  	MOVOU 64(SI), X0
5718  	MOVOU 80(SI), X3
5719  	MOVOU 96(SI), X6
5720  	MOVOU 112(SI), X9
5721  	PXOR  X0, X1
5722  	PXOR  X3, X4
5723  	PXOR  X6, X7
5724  	PXOR  X9, X10
5725  	MOVOU X1, 64(DI)
5726  	MOVOU X4, 80(DI)
5727  	MOVOU X7, 96(DI)
5728  	MOVOU X10, 112(DI)
5729  	MOVOU 128(SI), X0
5730  	MOVOU 144(SI), X3
5731  	MOVOU 160(SI), X6
5732  	MOVOU 176(SI), X9
5733  	PXOR  X0, X2
5734  	PXOR  X3, X5
5735  	PXOR  X6, X8
5736  	PXOR  X9, X11
5737  	MOVOU X2, 128(DI)
5738  	MOVOU X5, 144(DI)
5739  	MOVOU X8, 160(DI)
5740  	MOVOU X11, 176(DI)
5741  	ADDQ  $0xc0, SI
5742  	MOVQ  $0x000000c0, CX
5743  	SUBQ  $0xc0, BX
5744  	MOVO  X12, X1
5745  	MOVO  X13, X4
5746  	MOVO  X14, X7
5747  	MOVO  X15, X10
5748  	CMPQ  BX, $0x40
5749  	JBE   sealSSE128SealHash
5750  	MOVOU (SI), X0
5751  	MOVOU 16(SI), X3
5752  	MOVOU 32(SI), X6
5753  	MOVOU 48(SI), X9
5754  	PXOR  X0, X12
5755  	PXOR  X3, X13
5756  	PXOR  X6, X14
5757  	PXOR  X9, X15
5758  	MOVOU X12, 192(DI)
5759  	MOVOU X13, 208(DI)
5760  	MOVOU X14, 224(DI)
5761  	MOVOU X15, 240(DI)
5762  	LEAQ  64(SI), SI
5763  	SUBQ  $0x40, BX
5764  	MOVQ  $0x00000006, CX
5765  	MOVQ  $0x00000004, R9
5766  	CMPQ  BX, $0xc0
5767  	JG    sealSSEMainLoop
5768  	MOVQ  BX, CX
5769  	TESTQ BX, BX
5770  	JE    sealSSE128SealHash
5771  	MOVQ  $0x00000006, CX
5772  	CMPQ  BX, $0x40
5773  	JBE   sealSSETail64
5774  	CMPQ  BX, $0x80
5775  	JBE   sealSSETail128
5776  	JMP   sealSSETail192
5777  
5778  sealSSETail64:
5779  	MOVO  ·chacha20Constants<>+0(SB), X1
5780  	MOVO  32(BP), X4
5781  	MOVO  48(BP), X7
5782  	MOVO  128(BP), X10
5783  	PADDL ·sseIncMask<>+0(SB), X10
5784  	MOVO  X10, 80(BP)
5785  
5786  sealSSETail64LoopA:
5787  	ADDQ  (DI), R10
5788  	ADCQ  8(DI), R11
5789  	ADCQ  $0x01, R12
5790  	MOVQ  (BP), AX
5791  	MOVQ  AX, R15
5792  	MULQ  R10
5793  	MOVQ  AX, R13
5794  	MOVQ  DX, R14
5795  	MOVQ  (BP), AX
5796  	MULQ  R11
5797  	IMULQ R12, R15
5798  	ADDQ  AX, R14
5799  	ADCQ  DX, R15
5800  	MOVQ  8(BP), AX
5801  	MOVQ  AX, R8
5802  	MULQ  R10
5803  	ADDQ  AX, R14
5804  	ADCQ  $0x00, DX
5805  	MOVQ  DX, R10
5806  	MOVQ  8(BP), AX
5807  	MULQ  R11
5808  	ADDQ  AX, R15
5809  	ADCQ  $0x00, DX
5810  	IMULQ R12, R8
5811  	ADDQ  R10, R15
5812  	ADCQ  DX, R8
5813  	MOVQ  R13, R10
5814  	MOVQ  R14, R11
5815  	MOVQ  R15, R12
5816  	ANDQ  $0x03, R12
5817  	MOVQ  R15, R13
5818  	ANDQ  $-4, R13
5819  	MOVQ  R8, R14
5820  	SHRQ  $0x02, R8, R15
5821  	SHRQ  $0x02, R8
5822  	ADDQ  R13, R10
5823  	ADCQ  R14, R11
5824  	ADCQ  $0x00, R12
5825  	ADDQ  R15, R10
5826  	ADCQ  R8, R11
5827  	ADCQ  $0x00, R12
5828  	LEAQ  16(DI), DI
5829  
5830  sealSSETail64LoopB:
5831  	PADDD X4, X1
5832  	PXOR  X1, X10
5833  	ROL16(X10, X13)
5834  	PADDD X10, X7
5835  	PXOR  X7, X4
5836  	MOVO  X4, X13
5837  	PSLLL $0x0c, X13
5838  	PSRLL $0x14, X4
5839  	PXOR  X13, X4
5840  	PADDD X4, X1
5841  	PXOR  X1, X10
5842  	ROL8(X10, X13)
5843  	PADDD X10, X7
5844  	PXOR  X7, X4
5845  	MOVO  X4, X13
5846  	PSLLL $0x07, X13
5847  	PSRLL $0x19, X4
5848  	PXOR  X13, X4
5849  	BYTE  $0x66
5850  	BYTE  $0x0f
5851  	BYTE  $0x3a
5852  	BYTE  $0x0f
5853  	BYTE  $0xe4
5854  	BYTE  $0x04
5855  	BYTE  $0x66
5856  	BYTE  $0x0f
5857  	BYTE  $0x3a
5858  	BYTE  $0x0f
5859  	BYTE  $0xff
5860  	BYTE  $0x08
5861  	BYTE  $0x66
5862  	BYTE  $0x45
5863  	BYTE  $0x0f
5864  	BYTE  $0x3a
5865  	BYTE  $0x0f
5866  	BYTE  $0xd2
5867  	BYTE  $0x0c
5868  	PADDD X4, X1
5869  	PXOR  X1, X10
5870  	ROL16(X10, X13)
5871  	PADDD X10, X7
5872  	PXOR  X7, X4
5873  	MOVO  X4, X13
5874  	PSLLL $0x0c, X13
5875  	PSRLL $0x14, X4
5876  	PXOR  X13, X4
5877  	PADDD X4, X1
5878  	PXOR  X1, X10
5879  	ROL8(X10, X13)
5880  	PADDD X10, X7
5881  	PXOR  X7, X4
5882  	MOVO  X4, X13
5883  	PSLLL $0x07, X13
5884  	PSRLL $0x19, X4
5885  	PXOR  X13, X4
5886  	BYTE  $0x66
5887  	BYTE  $0x0f
5888  	BYTE  $0x3a
5889  	BYTE  $0x0f
5890  	BYTE  $0xe4
5891  	BYTE  $0x0c
5892  	BYTE  $0x66
5893  	BYTE  $0x0f
5894  	BYTE  $0x3a
5895  	BYTE  $0x0f
5896  	BYTE  $0xff
5897  	BYTE  $0x08
5898  	BYTE  $0x66
5899  	BYTE  $0x45
5900  	BYTE  $0x0f
5901  	BYTE  $0x3a
5902  	BYTE  $0x0f
5903  	BYTE  $0xd2
5904  	BYTE  $0x04
5905  	ADDQ  (DI), R10
5906  	ADCQ  8(DI), R11
5907  	ADCQ  $0x01, R12
5908  	MOVQ  (BP), AX
5909  	MOVQ  AX, R15
5910  	MULQ  R10
5911  	MOVQ  AX, R13
5912  	MOVQ  DX, R14
5913  	MOVQ  (BP), AX
5914  	MULQ  R11
5915  	IMULQ R12, R15
5916  	ADDQ  AX, R14
5917  	ADCQ  DX, R15
5918  	MOVQ  8(BP), AX
5919  	MOVQ  AX, R8
5920  	MULQ  R10
5921  	ADDQ  AX, R14
5922  	ADCQ  $0x00, DX
5923  	MOVQ  DX, R10
5924  	MOVQ  8(BP), AX
5925  	MULQ  R11
5926  	ADDQ  AX, R15
5927  	ADCQ  $0x00, DX
5928  	IMULQ R12, R8
5929  	ADDQ  R10, R15
5930  	ADCQ  DX, R8
5931  	MOVQ  R13, R10
5932  	MOVQ  R14, R11
5933  	MOVQ  R15, R12
5934  	ANDQ  $0x03, R12
5935  	MOVQ  R15, R13
5936  	ANDQ  $-4, R13
5937  	MOVQ  R8, R14
5938  	SHRQ  $0x02, R8, R15
5939  	SHRQ  $0x02, R8
5940  	ADDQ  R13, R10
5941  	ADCQ  R14, R11
5942  	ADCQ  $0x00, R12
5943  	ADDQ  R15, R10
5944  	ADCQ  R8, R11
5945  	ADCQ  $0x00, R12
5946  	LEAQ  16(DI), DI
5947  	DECQ  CX
5948  	JG    sealSSETail64LoopA
5949  	DECQ  R9
5950  	JGE   sealSSETail64LoopB
5951  	PADDL ·chacha20Constants<>+0(SB), X1
5952  	PADDL 32(BP), X4
5953  	PADDL 48(BP), X7
5954  	PADDL 80(BP), X10
5955  	JMP   sealSSE128Seal
5956  
5957  sealSSETail128:
5958  	MOVO  ·chacha20Constants<>+0(SB), X0
5959  	MOVO  32(BP), X3
5960  	MOVO  48(BP), X6
5961  	MOVO  128(BP), X9
5962  	PADDL ·sseIncMask<>+0(SB), X9
5963  	MOVO  X9, 80(BP)
5964  	MOVO  X0, X1
5965  	MOVO  X3, X4
5966  	MOVO  X6, X7
5967  	MOVO  X9, X10
5968  	PADDL ·sseIncMask<>+0(SB), X10
5969  	MOVO  X10, 96(BP)
5970  
5971  sealSSETail128LoopA:
5972  	ADDQ  (DI), R10
5973  	ADCQ  8(DI), R11
5974  	ADCQ  $0x01, R12
5975  	MOVQ  (BP), AX
5976  	MOVQ  AX, R15
5977  	MULQ  R10
5978  	MOVQ  AX, R13
5979  	MOVQ  DX, R14
5980  	MOVQ  (BP), AX
5981  	MULQ  R11
5982  	IMULQ R12, R15
5983  	ADDQ  AX, R14
5984  	ADCQ  DX, R15
5985  	MOVQ  8(BP), AX
5986  	MOVQ  AX, R8
5987  	MULQ  R10
5988  	ADDQ  AX, R14
5989  	ADCQ  $0x00, DX
5990  	MOVQ  DX, R10
5991  	MOVQ  8(BP), AX
5992  	MULQ  R11
5993  	ADDQ  AX, R15
5994  	ADCQ  $0x00, DX
5995  	IMULQ R12, R8
5996  	ADDQ  R10, R15
5997  	ADCQ  DX, R8
5998  	MOVQ  R13, R10
5999  	MOVQ  R14, R11
6000  	MOVQ  R15, R12
6001  	ANDQ  $0x03, R12
6002  	MOVQ  R15, R13
6003  	ANDQ  $-4, R13
6004  	MOVQ  R8, R14
6005  	SHRQ  $0x02, R8, R15
6006  	SHRQ  $0x02, R8
6007  	ADDQ  R13, R10
6008  	ADCQ  R14, R11
6009  	ADCQ  $0x00, R12
6010  	ADDQ  R15, R10
6011  	ADCQ  R8, R11
6012  	ADCQ  $0x00, R12
6013  	LEAQ  16(DI), DI
6014  
6015  sealSSETail128LoopB:
6016  	PADDD X3, X0
6017  	PXOR  X0, X9
6018  	ROL16(X9, X12)
6019  	PADDD X9, X6
6020  	PXOR  X6, X3
6021  	MOVO  X3, X12
6022  	PSLLL $0x0c, X12
6023  	PSRLL $0x14, X3
6024  	PXOR  X12, X3
6025  	PADDD X3, X0
6026  	PXOR  X0, X9
6027  	ROL8(X9, X12)
6028  	PADDD X9, X6
6029  	PXOR  X6, X3
6030  	MOVO  X3, X12
6031  	PSLLL $0x07, X12
6032  	PSRLL $0x19, X3
6033  	PXOR  X12, X3
6034  	PADDD X4, X1
6035  	PXOR  X1, X10
6036  	ROL16(X10, X12)
6037  	PADDD X10, X7
6038  	PXOR  X7, X4
6039  	MOVO  X4, X12
6040  	PSLLL $0x0c, X12
6041  	PSRLL $0x14, X4
6042  	PXOR  X12, X4
6043  	PADDD X4, X1
6044  	PXOR  X1, X10
6045  	ROL8(X10, X12)
6046  	PADDD X10, X7
6047  	PXOR  X7, X4
6048  	MOVO  X4, X12
6049  	PSLLL $0x07, X12
6050  	PSRLL $0x19, X4
6051  	PXOR  X12, X4
6052  	BYTE  $0x66
6053  	BYTE  $0x0f
6054  	BYTE  $0x3a
6055  	BYTE  $0x0f
6056  	BYTE  $0xdb
6057  	BYTE  $0x04
6058  	BYTE  $0x66
6059  	BYTE  $0x0f
6060  	BYTE  $0x3a
6061  	BYTE  $0x0f
6062  	BYTE  $0xf6
6063  	BYTE  $0x08
6064  	BYTE  $0x66
6065  	BYTE  $0x45
6066  	BYTE  $0x0f
6067  	BYTE  $0x3a
6068  	BYTE  $0x0f
6069  	BYTE  $0xc9
6070  	BYTE  $0x0c
6071  	BYTE  $0x66
6072  	BYTE  $0x0f
6073  	BYTE  $0x3a
6074  	BYTE  $0x0f
6075  	BYTE  $0xe4
6076  	BYTE  $0x04
6077  	BYTE  $0x66
6078  	BYTE  $0x0f
6079  	BYTE  $0x3a
6080  	BYTE  $0x0f
6081  	BYTE  $0xff
6082  	BYTE  $0x08
6083  	BYTE  $0x66
6084  	BYTE  $0x45
6085  	BYTE  $0x0f
6086  	BYTE  $0x3a
6087  	BYTE  $0x0f
6088  	BYTE  $0xd2
6089  	BYTE  $0x0c
6090  	ADDQ  (DI), R10
6091  	ADCQ  8(DI), R11
6092  	ADCQ  $0x01, R12
6093  	MOVQ  (BP), AX
6094  	MOVQ  AX, R15
6095  	MULQ  R10
6096  	MOVQ  AX, R13
6097  	MOVQ  DX, R14
6098  	MOVQ  (BP), AX
6099  	MULQ  R11
6100  	IMULQ R12, R15
6101  	ADDQ  AX, R14
6102  	ADCQ  DX, R15
6103  	MOVQ  8(BP), AX
6104  	MOVQ  AX, R8
6105  	MULQ  R10
6106  	ADDQ  AX, R14
6107  	ADCQ  $0x00, DX
6108  	MOVQ  DX, R10
6109  	MOVQ  8(BP), AX
6110  	MULQ  R11
6111  	ADDQ  AX, R15
6112  	ADCQ  $0x00, DX
6113  	IMULQ R12, R8
6114  	ADDQ  R10, R15
6115  	ADCQ  DX, R8
6116  	MOVQ  R13, R10
6117  	MOVQ  R14, R11
6118  	MOVQ  R15, R12
6119  	ANDQ  $0x03, R12
6120  	MOVQ  R15, R13
6121  	ANDQ  $-4, R13
6122  	MOVQ  R8, R14
6123  	SHRQ  $0x02, R8, R15
6124  	SHRQ  $0x02, R8
6125  	ADDQ  R13, R10
6126  	ADCQ  R14, R11
6127  	ADCQ  $0x00, R12
6128  	ADDQ  R15, R10
6129  	ADCQ  R8, R11
6130  	ADCQ  $0x00, R12
6131  	LEAQ  16(DI), DI
6132  	PADDD X3, X0
6133  	PXOR  X0, X9
6134  	ROL16(X9, X12)
6135  	PADDD X9, X6
6136  	PXOR  X6, X3
6137  	MOVO  X3, X12
6138  	PSLLL $0x0c, X12
6139  	PSRLL $0x14, X3
6140  	PXOR  X12, X3
6141  	PADDD X3, X0
6142  	PXOR  X0, X9
6143  	ROL8(X9, X12)
6144  	PADDD X9, X6
6145  	PXOR  X6, X3
6146  	MOVO  X3, X12
6147  	PSLLL $0x07, X12
6148  	PSRLL $0x19, X3
6149  	PXOR  X12, X3
6150  	PADDD X4, X1
6151  	PXOR  X1, X10
6152  	ROL16(X10, X12)
6153  	PADDD X10, X7
6154  	PXOR  X7, X4
6155  	MOVO  X4, X12
6156  	PSLLL $0x0c, X12
6157  	PSRLL $0x14, X4
6158  	PXOR  X12, X4
6159  	PADDD X4, X1
6160  	PXOR  X1, X10
6161  	ROL8(X10, X12)
6162  	PADDD X10, X7
6163  	PXOR  X7, X4
6164  	MOVO  X4, X12
6165  	PSLLL $0x07, X12
6166  	PSRLL $0x19, X4
6167  	PXOR  X12, X4
6168  	BYTE  $0x66
6169  	BYTE  $0x0f
6170  	BYTE  $0x3a
6171  	BYTE  $0x0f
6172  	BYTE  $0xdb
6173  	BYTE  $0x0c
6174  	BYTE  $0x66
6175  	BYTE  $0x0f
6176  	BYTE  $0x3a
6177  	BYTE  $0x0f
6178  	BYTE  $0xf6
6179  	BYTE  $0x08
6180  	BYTE  $0x66
6181  	BYTE  $0x45
6182  	BYTE  $0x0f
6183  	BYTE  $0x3a
6184  	BYTE  $0x0f
6185  	BYTE  $0xc9
6186  	BYTE  $0x04
6187  	BYTE  $0x66
6188  	BYTE  $0x0f
6189  	BYTE  $0x3a
6190  	BYTE  $0x0f
6191  	BYTE  $0xe4
6192  	BYTE  $0x0c
6193  	BYTE  $0x66
6194  	BYTE  $0x0f
6195  	BYTE  $0x3a
6196  	BYTE  $0x0f
6197  	BYTE  $0xff
6198  	BYTE  $0x08
6199  	BYTE  $0x66
6200  	BYTE  $0x45
6201  	BYTE  $0x0f
6202  	BYTE  $0x3a
6203  	BYTE  $0x0f
6204  	BYTE  $0xd2
6205  	BYTE  $0x04
6206  	DECQ  CX
6207  	JG    sealSSETail128LoopA
6208  	DECQ  R9
6209  	JGE   sealSSETail128LoopB
6210  	PADDL ·chacha20Constants<>+0(SB), X0
6211  	PADDL ·chacha20Constants<>+0(SB), X1
6212  	PADDL 32(BP), X3
6213  	PADDL 32(BP), X4
6214  	PADDL 48(BP), X6
6215  	PADDL 48(BP), X7
6216  	PADDL 80(BP), X9
6217  	PADDL 96(BP), X10
6218  	MOVOU (SI), X12
6219  	MOVOU 16(SI), X13
6220  	MOVOU 32(SI), X14
6221  	MOVOU 48(SI), X15
6222  	PXOR  X12, X0
6223  	PXOR  X13, X3
6224  	PXOR  X14, X6
6225  	PXOR  X15, X9
6226  	MOVOU X0, (DI)
6227  	MOVOU X3, 16(DI)
6228  	MOVOU X6, 32(DI)
6229  	MOVOU X9, 48(DI)
6230  	MOVQ  $0x00000040, CX
6231  	LEAQ  64(SI), SI
6232  	SUBQ  $0x40, BX
6233  	JMP   sealSSE128SealHash
6234  
6235  sealSSETail192:
6236  	MOVO  ·chacha20Constants<>+0(SB), X0
6237  	MOVO  32(BP), X3
6238  	MOVO  48(BP), X6
6239  	MOVO  128(BP), X9
6240  	PADDL ·sseIncMask<>+0(SB), X9
6241  	MOVO  X9, 80(BP)
6242  	MOVO  X0, X1
6243  	MOVO  X3, X4
6244  	MOVO  X6, X7
6245  	MOVO  X9, X10
6246  	PADDL ·sseIncMask<>+0(SB), X10
6247  	MOVO  X10, 96(BP)
6248  	MOVO  X1, X2
6249  	MOVO  X4, X5
6250  	MOVO  X7, X8
6251  	MOVO  X10, X11
6252  	PADDL ·sseIncMask<>+0(SB), X11
6253  	MOVO  X11, 112(BP)
6254  
6255  sealSSETail192LoopA:
6256  	ADDQ  (DI), R10
6257  	ADCQ  8(DI), R11
6258  	ADCQ  $0x01, R12
6259  	MOVQ  (BP), AX
6260  	MOVQ  AX, R15
6261  	MULQ  R10
6262  	MOVQ  AX, R13
6263  	MOVQ  DX, R14
6264  	MOVQ  (BP), AX
6265  	MULQ  R11
6266  	IMULQ R12, R15
6267  	ADDQ  AX, R14
6268  	ADCQ  DX, R15
6269  	MOVQ  8(BP), AX
6270  	MOVQ  AX, R8
6271  	MULQ  R10
6272  	ADDQ  AX, R14
6273  	ADCQ  $0x00, DX
6274  	MOVQ  DX, R10
6275  	MOVQ  8(BP), AX
6276  	MULQ  R11
6277  	ADDQ  AX, R15
6278  	ADCQ  $0x00, DX
6279  	IMULQ R12, R8
6280  	ADDQ  R10, R15
6281  	ADCQ  DX, R8
6282  	MOVQ  R13, R10
6283  	MOVQ  R14, R11
6284  	MOVQ  R15, R12
6285  	ANDQ  $0x03, R12
6286  	MOVQ  R15, R13
6287  	ANDQ  $-4, R13
6288  	MOVQ  R8, R14
6289  	SHRQ  $0x02, R8, R15
6290  	SHRQ  $0x02, R8
6291  	ADDQ  R13, R10
6292  	ADCQ  R14, R11
6293  	ADCQ  $0x00, R12
6294  	ADDQ  R15, R10
6295  	ADCQ  R8, R11
6296  	ADCQ  $0x00, R12
6297  	LEAQ  16(DI), DI
6298  
6299  sealSSETail192LoopB:
6300  	PADDD X3, X0
6301  	PXOR  X0, X9
6302  	ROL16(X9, X12)
6303  	PADDD X9, X6
6304  	PXOR  X6, X3
6305  	MOVO  X3, X12
6306  	PSLLL $0x0c, X12
6307  	PSRLL $0x14, X3
6308  	PXOR  X12, X3
6309  	PADDD X3, X0
6310  	PXOR  X0, X9
6311  	ROL8(X9, X12)
6312  	PADDD X9, X6
6313  	PXOR  X6, X3
6314  	MOVO  X3, X12
6315  	PSLLL $0x07, X12
6316  	PSRLL $0x19, X3
6317  	PXOR  X12, X3
6318  	PADDD X4, X1
6319  	PXOR  X1, X10
6320  	ROL16(X10, X12)
6321  	PADDD X10, X7
6322  	PXOR  X7, X4
6323  	MOVO  X4, X12
6324  	PSLLL $0x0c, X12
6325  	PSRLL $0x14, X4
6326  	PXOR  X12, X4
6327  	PADDD X4, X1
6328  	PXOR  X1, X10
6329  	ROL8(X10, X12)
6330  	PADDD X10, X7
6331  	PXOR  X7, X4
6332  	MOVO  X4, X12
6333  	PSLLL $0x07, X12
6334  	PSRLL $0x19, X4
6335  	PXOR  X12, X4
6336  	PADDD X5, X2
6337  	PXOR  X2, X11
6338  	ROL16(X11, X12)
6339  	PADDD X11, X8
6340  	PXOR  X8, X5
6341  	MOVO  X5, X12
6342  	PSLLL $0x0c, X12
6343  	PSRLL $0x14, X5
6344  	PXOR  X12, X5
6345  	PADDD X5, X2
6346  	PXOR  X2, X11
6347  	ROL8(X11, X12)
6348  	PADDD X11, X8
6349  	PXOR  X8, X5
6350  	MOVO  X5, X12
6351  	PSLLL $0x07, X12
6352  	PSRLL $0x19, X5
6353  	PXOR  X12, X5
6354  	BYTE  $0x66
6355  	BYTE  $0x0f
6356  	BYTE  $0x3a
6357  	BYTE  $0x0f
6358  	BYTE  $0xdb
6359  	BYTE  $0x04
6360  	BYTE  $0x66
6361  	BYTE  $0x0f
6362  	BYTE  $0x3a
6363  	BYTE  $0x0f
6364  	BYTE  $0xf6
6365  	BYTE  $0x08
6366  	BYTE  $0x66
6367  	BYTE  $0x45
6368  	BYTE  $0x0f
6369  	BYTE  $0x3a
6370  	BYTE  $0x0f
6371  	BYTE  $0xc9
6372  	BYTE  $0x0c
6373  	BYTE  $0x66
6374  	BYTE  $0x0f
6375  	BYTE  $0x3a
6376  	BYTE  $0x0f
6377  	BYTE  $0xe4
6378  	BYTE  $0x04
6379  	BYTE  $0x66
6380  	BYTE  $0x0f
6381  	BYTE  $0x3a
6382  	BYTE  $0x0f
6383  	BYTE  $0xff
6384  	BYTE  $0x08
6385  	BYTE  $0x66
6386  	BYTE  $0x45
6387  	BYTE  $0x0f
6388  	BYTE  $0x3a
6389  	BYTE  $0x0f
6390  	BYTE  $0xd2
6391  	BYTE  $0x0c
6392  	BYTE  $0x66
6393  	BYTE  $0x0f
6394  	BYTE  $0x3a
6395  	BYTE  $0x0f
6396  	BYTE  $0xed
6397  	BYTE  $0x04
6398  	BYTE  $0x66
6399  	BYTE  $0x45
6400  	BYTE  $0x0f
6401  	BYTE  $0x3a
6402  	BYTE  $0x0f
6403  	BYTE  $0xc0
6404  	BYTE  $0x08
6405  	BYTE  $0x66
6406  	BYTE  $0x45
6407  	BYTE  $0x0f
6408  	BYTE  $0x3a
6409  	BYTE  $0x0f
6410  	BYTE  $0xdb
6411  	BYTE  $0x0c
6412  	ADDQ  (DI), R10
6413  	ADCQ  8(DI), R11
6414  	ADCQ  $0x01, R12
6415  	MOVQ  (BP), AX
6416  	MOVQ  AX, R15
6417  	MULQ  R10
6418  	MOVQ  AX, R13
6419  	MOVQ  DX, R14
6420  	MOVQ  (BP), AX
6421  	MULQ  R11
6422  	IMULQ R12, R15
6423  	ADDQ  AX, R14
6424  	ADCQ  DX, R15
6425  	MOVQ  8(BP), AX
6426  	MOVQ  AX, R8
6427  	MULQ  R10
6428  	ADDQ  AX, R14
6429  	ADCQ  $0x00, DX
6430  	MOVQ  DX, R10
6431  	MOVQ  8(BP), AX
6432  	MULQ  R11
6433  	ADDQ  AX, R15
6434  	ADCQ  $0x00, DX
6435  	IMULQ R12, R8
6436  	ADDQ  R10, R15
6437  	ADCQ  DX, R8
6438  	MOVQ  R13, R10
6439  	MOVQ  R14, R11
6440  	MOVQ  R15, R12
6441  	ANDQ  $0x03, R12
6442  	MOVQ  R15, R13
6443  	ANDQ  $-4, R13
6444  	MOVQ  R8, R14
6445  	SHRQ  $0x02, R8, R15
6446  	SHRQ  $0x02, R8
6447  	ADDQ  R13, R10
6448  	ADCQ  R14, R11
6449  	ADCQ  $0x00, R12
6450  	ADDQ  R15, R10
6451  	ADCQ  R8, R11
6452  	ADCQ  $0x00, R12
6453  	LEAQ  16(DI), DI
6454  	PADDD X3, X0
6455  	PXOR  X0, X9
6456  	ROL16(X9, X12)
6457  	PADDD X9, X6
6458  	PXOR  X6, X3
6459  	MOVO  X3, X12
6460  	PSLLL $0x0c, X12
6461  	PSRLL $0x14, X3
6462  	PXOR  X12, X3
6463  	PADDD X3, X0
6464  	PXOR  X0, X9
6465  	ROL8(X9, X12)
6466  	PADDD X9, X6
6467  	PXOR  X6, X3
6468  	MOVO  X3, X12
6469  	PSLLL $0x07, X12
6470  	PSRLL $0x19, X3
6471  	PXOR  X12, X3
6472  	PADDD X4, X1
6473  	PXOR  X1, X10
6474  	ROL16(X10, X12)
6475  	PADDD X10, X7
6476  	PXOR  X7, X4
6477  	MOVO  X4, X12
6478  	PSLLL $0x0c, X12
6479  	PSRLL $0x14, X4
6480  	PXOR  X12, X4
6481  	PADDD X4, X1
6482  	PXOR  X1, X10
6483  	ROL8(X10, X12)
6484  	PADDD X10, X7
6485  	PXOR  X7, X4
6486  	MOVO  X4, X12
6487  	PSLLL $0x07, X12
6488  	PSRLL $0x19, X4
6489  	PXOR  X12, X4
6490  	PADDD X5, X2
6491  	PXOR  X2, X11
6492  	ROL16(X11, X12)
6493  	PADDD X11, X8
6494  	PXOR  X8, X5
6495  	MOVO  X5, X12
6496  	PSLLL $0x0c, X12
6497  	PSRLL $0x14, X5
6498  	PXOR  X12, X5
6499  	PADDD X5, X2
6500  	PXOR  X2, X11
6501  	ROL8(X11, X12)
6502  	PADDD X11, X8
6503  	PXOR  X8, X5
6504  	MOVO  X5, X12
6505  	PSLLL $0x07, X12
6506  	PSRLL $0x19, X5
6507  	PXOR  X12, X5
6508  	BYTE  $0x66
6509  	BYTE  $0x0f
6510  	BYTE  $0x3a
6511  	BYTE  $0x0f
6512  	BYTE  $0xdb
6513  	BYTE  $0x0c
6514  	BYTE  $0x66
6515  	BYTE  $0x0f
6516  	BYTE  $0x3a
6517  	BYTE  $0x0f
6518  	BYTE  $0xf6
6519  	BYTE  $0x08
6520  	BYTE  $0x66
6521  	BYTE  $0x45
6522  	BYTE  $0x0f
6523  	BYTE  $0x3a
6524  	BYTE  $0x0f
6525  	BYTE  $0xc9
6526  	BYTE  $0x04
6527  	BYTE  $0x66
6528  	BYTE  $0x0f
6529  	BYTE  $0x3a
6530  	BYTE  $0x0f
6531  	BYTE  $0xe4
6532  	BYTE  $0x0c
6533  	BYTE  $0x66
6534  	BYTE  $0x0f
6535  	BYTE  $0x3a
6536  	BYTE  $0x0f
6537  	BYTE  $0xff
6538  	BYTE  $0x08
6539  	BYTE  $0x66
6540  	BYTE  $0x45
6541  	BYTE  $0x0f
6542  	BYTE  $0x3a
6543  	BYTE  $0x0f
6544  	BYTE  $0xd2
6545  	BYTE  $0x04
6546  	BYTE  $0x66
6547  	BYTE  $0x0f
6548  	BYTE  $0x3a
6549  	BYTE  $0x0f
6550  	BYTE  $0xed
6551  	BYTE  $0x0c
6552  	BYTE  $0x66
6553  	BYTE  $0x45
6554  	BYTE  $0x0f
6555  	BYTE  $0x3a
6556  	BYTE  $0x0f
6557  	BYTE  $0xc0
6558  	BYTE  $0x08
6559  	BYTE  $0x66
6560  	BYTE  $0x45
6561  	BYTE  $0x0f
6562  	BYTE  $0x3a
6563  	BYTE  $0x0f
6564  	BYTE  $0xdb
6565  	BYTE  $0x04
6566  	DECQ  CX
6567  	JG    sealSSETail192LoopA
6568  	DECQ  R9
6569  	JGE   sealSSETail192LoopB
6570  	PADDL ·chacha20Constants<>+0(SB), X0
6571  	PADDL ·chacha20Constants<>+0(SB), X1
6572  	PADDL ·chacha20Constants<>+0(SB), X2
6573  	PADDL 32(BP), X3
6574  	PADDL 32(BP), X4
6575  	PADDL 32(BP), X5
6576  	PADDL 48(BP), X6
6577  	PADDL 48(BP), X7
6578  	PADDL 48(BP), X8
6579  	PADDL 80(BP), X9
6580  	PADDL 96(BP), X10
6581  	PADDL 112(BP), X11
6582  	MOVOU (SI), X12
6583  	MOVOU 16(SI), X13
6584  	MOVOU 32(SI), X14
6585  	MOVOU 48(SI), X15
6586  	PXOR  X12, X0
6587  	PXOR  X13, X3
6588  	PXOR  X14, X6
6589  	PXOR  X15, X9
6590  	MOVOU X0, (DI)
6591  	MOVOU X3, 16(DI)
6592  	MOVOU X6, 32(DI)
6593  	MOVOU X9, 48(DI)
6594  	MOVOU 64(SI), X12
6595  	MOVOU 80(SI), X13
6596  	MOVOU 96(SI), X14
6597  	MOVOU 112(SI), X15
6598  	PXOR  X12, X1
6599  	PXOR  X13, X4
6600  	PXOR  X14, X7
6601  	PXOR  X15, X10
6602  	MOVOU X1, 64(DI)
6603  	MOVOU X4, 80(DI)
6604  	MOVOU X7, 96(DI)
6605  	MOVOU X10, 112(DI)
6606  	MOVO  X2, X1
6607  	MOVO  X5, X4
6608  	MOVO  X8, X7
6609  	MOVO  X11, X10
6610  	MOVQ  $0x00000080, CX
6611  	LEAQ  128(SI), SI
6612  	SUBQ  $0x80, BX
6613  	JMP   sealSSE128SealHash
6614  
6615  sealSSE128: // Build three ChaCha20 states (X0/X3/X6/X9, X1/X4/X7/X10, X2/X5/X8/X11) for the round loop that follows
6616  	MOVOU ·chacha20Constants<>+0(SB), X0 // row 0 of every state: the constants
6617  	MOVOU 16(R8), X3 // rows 1-3 are loaded from offsets 16/32/48 of the buffer at R8
6618  	MOVOU 32(R8), X6
6619  	MOVOU 48(R8), X9
6620  	MOVO  X0, X1 // second state starts as a copy of the first
6621  	MOVO  X3, X4
6622  	MOVO  X6, X7
6623  	MOVO  X9, X10
6624  	PADDL ·sseIncMask<>+0(SB), X10 // X10 = X9 + sseIncMask (presumably the block-counter increment; the mask is defined outside this view)
6625  	MOVO  X1, X2 // third state starts as a copy of the second
6626  	MOVO  X4, X5
6627  	MOVO  X7, X8
6628  	MOVO  X10, X11
6629  	PADDL ·sseIncMask<>+0(SB), X11 // X11 = X10 + sseIncMask
6630  	MOVO  X3, X13 // keep the initial rows in X13/X14/X15; they are added back after the rounds
6631  	MOVO  X6, X14
6632  	MOVO  X10, X15 // saved row 3 is that of the second state (X10), not X9
6633  	MOVQ  $0x0000000a, R9 // R9 = 10 passes of the loop below
6634  
6635  sealSSE128InnerCipherLoop: // Each pass runs the quarter-round sequence on all three states twice, rotating rows 1-3 by 4/8/12 bytes in between and rotating them back at the end; R9 counts the passes
6636  	PADDD X3, X0 // state 0: X0 += X3; X9 ^= X0; X9 <<<= 16
6637  	PXOR  X0, X9
6638  	ROL16(X9, X12)
6639  	PADDD X9, X6 // X6 += X9; X3 ^= X6; X3 <<<= 12 (X12 is the shift temporary)
6640  	PXOR  X6, X3
6641  	MOVO  X3, X12
6642  	PSLLL $0x0c, X12
6643  	PSRLL $0x14, X3
6644  	PXOR  X12, X3
6645  	PADDD X3, X0 // X0 += X3; X9 ^= X0; X9 <<<= 8
6646  	PXOR  X0, X9
6647  	ROL8(X9, X12)
6648  	PADDD X9, X6 // X6 += X9; X3 ^= X6; X3 <<<= 7
6649  	PXOR  X6, X3
6650  	MOVO  X3, X12
6651  	PSLLL $0x07, X12
6652  	PSRLL $0x19, X3
6653  	PXOR  X12, X3
6654  	PADDD X4, X1 // state 1 (X1/X4/X7/X10): same sequence
6655  	PXOR  X1, X10
6656  	ROL16(X10, X12)
6657  	PADDD X10, X7
6658  	PXOR  X7, X4
6659  	MOVO  X4, X12
6660  	PSLLL $0x0c, X12
6661  	PSRLL $0x14, X4
6662  	PXOR  X12, X4
6663  	PADDD X4, X1
6664  	PXOR  X1, X10
6665  	ROL8(X10, X12)
6666  	PADDD X10, X7
6667  	PXOR  X7, X4
6668  	MOVO  X4, X12
6669  	PSLLL $0x07, X12
6670  	PSRLL $0x19, X4
6671  	PXOR  X12, X4
6672  	PADDD X5, X2 // state 2 (X2/X5/X8/X11): same sequence
6673  	PXOR  X2, X11
6674  	ROL16(X11, X12)
6675  	PADDD X11, X8
6676  	PXOR  X8, X5
6677  	MOVO  X5, X12
6678  	PSLLL $0x0c, X12
6679  	PSRLL $0x14, X5
6680  	PXOR  X12, X5
6681  	PADDD X5, X2
6682  	PXOR  X2, X11
6683  	ROL8(X11, X12)
6684  	PADDD X11, X8
6685  	PXOR  X8, X5
6686  	MOVO  X5, X12
6687  	PSLLL $0x07, X12
6688  	PSRLL $0x19, X5
6689  	PXOR  X12, X5
6690  	BYTE  $0x66 // hand-encoded PALIGNR $4, X3, X3 (66 0F 3A 0F DB 04): rotate row 1 of state 0 by 4 bytes
6691  	BYTE  $0x0f
6692  	BYTE  $0x3a
6693  	BYTE  $0x0f
6694  	BYTE  $0xdb
6695  	BYTE  $0x04
6696  	BYTE  $0x66 // PALIGNR $4, X4, X4
6697  	BYTE  $0x0f
6698  	BYTE  $0x3a
6699  	BYTE  $0x0f
6700  	BYTE  $0xe4
6701  	BYTE  $0x04
6702  	BYTE  $0x66 // PALIGNR $4, X5, X5
6703  	BYTE  $0x0f
6704  	BYTE  $0x3a
6705  	BYTE  $0x0f
6706  	BYTE  $0xed
6707  	BYTE  $0x04
6708  	BYTE  $0x66 // PALIGNR $8, X6, X6
6709  	BYTE  $0x0f
6710  	BYTE  $0x3a
6711  	BYTE  $0x0f
6712  	BYTE  $0xf6
6713  	BYTE  $0x08
6714  	BYTE  $0x66 // PALIGNR $8, X7, X7
6715  	BYTE  $0x0f
6716  	BYTE  $0x3a
6717  	BYTE  $0x0f
6718  	BYTE  $0xff
6719  	BYTE  $0x08
6720  	BYTE  $0x66 // PALIGNR $8, X8, X8 (0x45 is the REX prefix selecting X8-X15)
6721  	BYTE  $0x45
6722  	BYTE  $0x0f
6723  	BYTE  $0x3a
6724  	BYTE  $0x0f
6725  	BYTE  $0xc0
6726  	BYTE  $0x08
6727  	BYTE  $0x66 // PALIGNR $12, X9, X9
6728  	BYTE  $0x45
6729  	BYTE  $0x0f
6730  	BYTE  $0x3a
6731  	BYTE  $0x0f
6732  	BYTE  $0xc9
6733  	BYTE  $0x0c
6734  	BYTE  $0x66 // PALIGNR $12, X10, X10
6735  	BYTE  $0x45
6736  	BYTE  $0x0f
6737  	BYTE  $0x3a
6738  	BYTE  $0x0f
6739  	BYTE  $0xd2
6740  	BYTE  $0x0c
6741  	BYTE  $0x66 // PALIGNR $12, X11, X11
6742  	BYTE  $0x45
6743  	BYTE  $0x0f
6744  	BYTE  $0x3a
6745  	BYTE  $0x0f
6746  	BYTE  $0xdb
6747  	BYTE  $0x0c
6748  	PADDD X3, X0 // second half of the pass: the same quarter-round sequence on the rotated rows, state 0 first
6749  	PXOR  X0, X9
6750  	ROL16(X9, X12)
6751  	PADDD X9, X6
6752  	PXOR  X6, X3
6753  	MOVO  X3, X12
6754  	PSLLL $0x0c, X12
6755  	PSRLL $0x14, X3
6756  	PXOR  X12, X3
6757  	PADDD X3, X0
6758  	PXOR  X0, X9
6759  	ROL8(X9, X12)
6760  	PADDD X9, X6
6761  	PXOR  X6, X3
6762  	MOVO  X3, X12
6763  	PSLLL $0x07, X12
6764  	PSRLL $0x19, X3
6765  	PXOR  X12, X3
6766  	PADDD X4, X1 // state 1
6767  	PXOR  X1, X10
6768  	ROL16(X10, X12)
6769  	PADDD X10, X7
6770  	PXOR  X7, X4
6771  	MOVO  X4, X12
6772  	PSLLL $0x0c, X12
6773  	PSRLL $0x14, X4
6774  	PXOR  X12, X4
6775  	PADDD X4, X1
6776  	PXOR  X1, X10
6777  	ROL8(X10, X12)
6778  	PADDD X10, X7
6779  	PXOR  X7, X4
6780  	MOVO  X4, X12
6781  	PSLLL $0x07, X12
6782  	PSRLL $0x19, X4
6783  	PXOR  X12, X4
6784  	PADDD X5, X2 // state 2
6785  	PXOR  X2, X11
6786  	ROL16(X11, X12)
6787  	PADDD X11, X8
6788  	PXOR  X8, X5
6789  	MOVO  X5, X12
6790  	PSLLL $0x0c, X12
6791  	PSRLL $0x14, X5
6792  	PXOR  X12, X5
6793  	PADDD X5, X2
6794  	PXOR  X2, X11
6795  	ROL8(X11, X12)
6796  	PADDD X11, X8
6797  	PXOR  X8, X5
6798  	MOVO  X5, X12
6799  	PSLLL $0x07, X12
6800  	PSRLL $0x19, X5
6801  	PXOR  X12, X5
6802  	BYTE  $0x66 // PALIGNR $12, X3, X3: the 12/8/4 rotations below complete a 16-byte rotation, restoring the row layout
6803  	BYTE  $0x0f
6804  	BYTE  $0x3a
6805  	BYTE  $0x0f
6806  	BYTE  $0xdb
6807  	BYTE  $0x0c
6808  	BYTE  $0x66 // PALIGNR $12, X4, X4
6809  	BYTE  $0x0f
6810  	BYTE  $0x3a
6811  	BYTE  $0x0f
6812  	BYTE  $0xe4
6813  	BYTE  $0x0c
6814  	BYTE  $0x66 // PALIGNR $12, X5, X5
6815  	BYTE  $0x0f
6816  	BYTE  $0x3a
6817  	BYTE  $0x0f
6818  	BYTE  $0xed
6819  	BYTE  $0x0c
6820  	BYTE  $0x66 // PALIGNR $8, X6, X6
6821  	BYTE  $0x0f
6822  	BYTE  $0x3a
6823  	BYTE  $0x0f
6824  	BYTE  $0xf6
6825  	BYTE  $0x08
6826  	BYTE  $0x66 // PALIGNR $8, X7, X7
6827  	BYTE  $0x0f
6828  	BYTE  $0x3a
6829  	BYTE  $0x0f
6830  	BYTE  $0xff
6831  	BYTE  $0x08
6832  	BYTE  $0x66 // PALIGNR $8, X8, X8
6833  	BYTE  $0x45
6834  	BYTE  $0x0f
6835  	BYTE  $0x3a
6836  	BYTE  $0x0f
6837  	BYTE  $0xc0
6838  	BYTE  $0x08
6839  	BYTE  $0x66 // PALIGNR $4, X9, X9
6840  	BYTE  $0x45
6841  	BYTE  $0x0f
6842  	BYTE  $0x3a
6843  	BYTE  $0x0f
6844  	BYTE  $0xc9
6845  	BYTE  $0x04
6846  	BYTE  $0x66 // PALIGNR $4, X10, X10
6847  	BYTE  $0x45
6848  	BYTE  $0x0f
6849  	BYTE  $0x3a
6850  	BYTE  $0x0f
6851  	BYTE  $0xd2
6852  	BYTE  $0x04
6853  	BYTE  $0x66 // PALIGNR $4, X11, X11
6854  	BYTE  $0x45
6855  	BYTE  $0x0f
6856  	BYTE  $0x3a
6857  	BYTE  $0x0f
6858  	BYTE  $0xdb
6859  	BYTE  $0x04
6860  	DECQ  R9
6861  	JNE   sealSSE128InnerCipherLoop // repeat until R9 reaches zero
6862  
6863  	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded (here A0 = X0 and B0 = X3; X6 and X9 get no feed-forward below)
6864  	PADDL ·chacha20Constants<>+0(SB), X0 // add the initial state back into the round output, row by row
6865  	PADDL ·chacha20Constants<>+0(SB), X1
6866  	PADDL ·chacha20Constants<>+0(SB), X2
6867  	PADDL X13, X3 // X13 = initial row 1 saved in sealSSE128
6868  	PADDL X13, X4
6869  	PADDL X13, X5
6870  	PADDL X14, X7 // X14 = initial row 2; only the second and third states need it
6871  	PADDL X14, X8
6872  	PADDL X15, X10 // X15 = initial row 3 of the second state
6873  	PADDL ·sseIncMask<>+0(SB), X15 // advance X15 to the third state's initial row 3
6874  	PADDL X15, X11
6875  	PAND  ·polyClampMask<>+0(SB), X0 // mask X0 with polyClampMask before it is used as the multiplier at (BP)
6876  	MOVOU X0, (BP) // (BP)..15(BP): clamped first key half, the multiplier in the hashing code
6877  	MOVOU X3, 16(BP) // 16(BP)..31(BP): second key half, added to the accumulator in sealSSEFinalize
6878  
6879  	// Hash
6880  	MOVQ ad_len+80(FP), R9 // R9 = additional-data length, read by polyHashADInternal
6881  	CALL polyHashADInternal<>(SB)
6882  	XORQ CX, CX // CX = 0: no written-but-unhashed output, so sealSSE128SealHash falls straight through to sealSSE128Seal
6883  
6884  sealSSE128SealHash: // Hash CX bytes of already-written output starting at DI, 16 bytes per pass, advancing DI past them
6885  	CMPQ  CX, $0x10
6886  	JB    sealSSE128Seal // fewer than 16 pending bytes left: continue with encryption
6887  	ADDQ  (DI), R10 // accumulator R10:R11:R12 += the 16 bytes at (DI) ...
6888  	ADCQ  8(DI), R11
6889  	ADCQ  $0x01, R12 // ... plus 1 in the third limb (weight 2^128)
6890  	MOVQ  (BP), AX // multiply the accumulator by the 128-bit value at (BP)/8(BP); product limbs are built in R13, R14, R15, R8
6891  	MOVQ  AX, R15
6892  	MULQ  R10
6893  	MOVQ  AX, R13
6894  	MOVQ  DX, R14
6895  	MOVQ  (BP), AX
6896  	MULQ  R11
6897  	IMULQ R12, R15
6898  	ADDQ  AX, R14
6899  	ADCQ  DX, R15
6900  	MOVQ  8(BP), AX
6901  	MOVQ  AX, R8
6902  	MULQ  R10
6903  	ADDQ  AX, R14
6904  	ADCQ  $0x00, DX
6905  	MOVQ  DX, R10 // R10 is reused as a carry holder; its old value is no longer needed
6906  	MOVQ  8(BP), AX
6907  	MULQ  R11
6908  	ADDQ  AX, R15
6909  	ADCQ  $0x00, DX
6910  	IMULQ R12, R8
6911  	ADDQ  R10, R15
6912  	ADCQ  DX, R8
6913  	MOVQ  R13, R10 // reduction: start from the low product limbs
6914  	MOVQ  R14, R11
6915  	MOVQ  R15, R12
6916  	ANDQ  $0x03, R12 // keep only bits 128-129 in the top limb
6917  	MOVQ  R15, R13
6918  	ANDQ  $-4, R13 // R14:R13 = 4 x (product >> 130)
6919  	MOVQ  R8, R14
6920  	SHRQ  $0x02, R8, R15 // R8:R15 = product >> 130
6921  	SHRQ  $0x02, R8
6922  	ADDQ  R13, R10 // add 4x ...
6923  	ADCQ  R14, R11
6924  	ADCQ  $0x00, R12
6925  	ADDQ  R15, R10 // ... and 1x: bits at and above 2^130 are folded back in multiplied by 5
6926  	ADCQ  R8, R11
6927  	ADCQ  $0x00, R12
6928  	SUBQ  $0x10, CX // 16 bytes hashed
6929  	ADDQ  $0x10, DI
6930  	JMP   sealSSE128SealHash
6931  
6932  sealSSE128Seal: // While at least 16 input bytes remain (BX): XOR one block with key stream X1, store it, hash it, then shift the key-stream registers
6933  	CMPQ BX, $0x10
6934  	JB   sealSSETail // fewer than 16 bytes left: handle the partial block
6935  	SUBQ $0x10, BX
6936  
6937  	// Load 16 input bytes and XOR them into the key stream in X1 (original comment said "Load for decryption", but the XOR result stored at DI is what gets hashed below)
6938  	MOVOU (SI), X12
6939  	PXOR  X12, X1
6940  	MOVOU X1, (DI)
6941  	LEAQ  16(SI), SI
6942  	LEAQ  16(DI), DI
6943  
6944  	// Extract for hashing
6945  	MOVQ   X1, R13 // low 8 bytes of the output block
6946  	PSRLDQ $0x08, X1
6947  	MOVQ   X1, R14 // high 8 bytes
6948  	ADDQ   R13, R10 // accumulator R10:R11:R12 += block, plus 1 in the third limb (weight 2^128)
6949  	ADCQ   R14, R11
6950  	ADCQ   $0x01, R12
6951  	MOVQ   (BP), AX // multiply the accumulator by the 128-bit value at (BP)/8(BP); product limbs are built in R13, R14, R15, R8
6952  	MOVQ   AX, R15
6953  	MULQ   R10
6954  	MOVQ   AX, R13
6955  	MOVQ   DX, R14
6956  	MOVQ   (BP), AX
6957  	MULQ   R11
6958  	IMULQ  R12, R15
6959  	ADDQ   AX, R14
6960  	ADCQ   DX, R15
6961  	MOVQ   8(BP), AX
6962  	MOVQ   AX, R8
6963  	MULQ   R10
6964  	ADDQ   AX, R14
6965  	ADCQ   $0x00, DX
6966  	MOVQ   DX, R10
6967  	MOVQ   8(BP), AX
6968  	MULQ   R11
6969  	ADDQ   AX, R15
6970  	ADCQ   $0x00, DX
6971  	IMULQ  R12, R8
6972  	ADDQ   R10, R15
6973  	ADCQ   DX, R8
6974  	MOVQ   R13, R10 // reduction: low limbs, with only bits 128-129 kept in R12
6975  	MOVQ   R14, R11
6976  	MOVQ   R15, R12
6977  	ANDQ   $0x03, R12
6978  	MOVQ   R15, R13
6979  	ANDQ   $-4, R13 // R14:R13 = 4 x (product >> 130)
6980  	MOVQ   R8, R14
6981  	SHRQ   $0x02, R8, R15 // R8:R15 = product >> 130
6982  	SHRQ   $0x02, R8
6983  	ADDQ   R13, R10 // fold the high part back in, multiplied by 4 + 1 = 5
6984  	ADCQ   R14, R11
6985  	ADCQ   $0x00, R12
6986  	ADDQ   R15, R10
6987  	ADCQ   R8, R11
6988  	ADCQ   $0x00, R12
6989  
6990  	// Shift the stream "left": the key stream is consumed in the order X1, X4, X7, X10, X2, X5, X8, X11
6991  	MOVO X4, X1
6992  	MOVO X7, X4
6993  	MOVO X10, X7
6994  	MOVO X2, X10
6995  	MOVO X5, X2
6996  	MOVO X8, X5
6997  	MOVO X11, X8
6998  	JMP  sealSSE128Seal
6999  
7000  sealSSETail: // Encrypt and hash the final partial block of BX (< 16) bytes, if any
7001  	TESTQ BX, BX
7002  	JE    sealSSEFinalize // nothing left: go straight to the tag computation
7003  
7004  	// We can only load the PT one byte at a time to avoid read after end of buffer
7005  	MOVQ BX, R9
7006  	SHLQ $0x04, R9 // R9 = BX * 16, byte offset into the andMask table used below
7007  	LEAQ ·andMask<>+0(SB), R13
7008  	MOVQ BX, CX // CX = number of bytes to load
7009  	LEAQ -1(SI)(BX*1), SI // SI = address of the last input byte; the loop walks backwards
7010  	XORQ R15, R15 // R8:R15 collects the bytes as a 128-bit value (R15 = low half)
7011  	XORQ R8, R8
7012  	XORQ AX, AX // clear AX so the MOVB below leaves the upper bits zero
7013  
7014  sealSSETailLoadLoop:
7015  	SHLQ   $0x08, R15, R8 // shift R8:R15 left by one byte
7016  	SHLQ   $0x08, R15
7017  	MOVB   (SI), AX
7018  	XORQ   AX, R15 // insert the byte at the bottom
7019  	LEAQ   -1(SI), SI
7020  	DECQ   CX
7021  	JNE    sealSSETailLoadLoop
7022  	MOVQ   R15, 64(BP) // 64(BP)..79(BP): scratch slot holding the zero-padded input block
7023  	MOVQ   R8, 72(BP)
7024  	PXOR   64(BP), X1 // X1 = key stream XOR padded input
7025  	MOVOU  X1, (DI) // stores all 16 bytes; bytes at DI+BX and beyond are rewritten by the tag store in sealSSEFinalize
7026  	MOVOU  -16(R13)(R9*1), X12 // andMask entry number BX-1 (16-byte entries; presumably keeps the low BX bytes - table is defined outside this view)
7027  	PAND   X12, X1 // mask the block before hashing it
7028  	MOVQ   X1, R13
7029  	PSRLDQ $0x08, X1
7030  	MOVQ   X1, R14
7031  	ADDQ   R13, R10 // accumulator R10:R11:R12 += masked block, plus 1 in the third limb (weight 2^128)
7032  	ADCQ   R14, R11
7033  	ADCQ   $0x01, R12
7034  	MOVQ   (BP), AX // multiply the accumulator by the 128-bit value at (BP)/8(BP); product limbs are built in R13, R14, R15, R8
7035  	MOVQ   AX, R15
7036  	MULQ   R10
7037  	MOVQ   AX, R13
7038  	MOVQ   DX, R14
7039  	MOVQ   (BP), AX
7040  	MULQ   R11
7041  	IMULQ  R12, R15
7042  	ADDQ   AX, R14
7043  	ADCQ   DX, R15
7044  	MOVQ   8(BP), AX
7045  	MOVQ   AX, R8
7046  	MULQ   R10
7047  	ADDQ   AX, R14
7048  	ADCQ   $0x00, DX
7049  	MOVQ   DX, R10
7050  	MOVQ   8(BP), AX
7051  	MULQ   R11
7052  	ADDQ   AX, R15
7053  	ADCQ   $0x00, DX
7054  	IMULQ  R12, R8
7055  	ADDQ   R10, R15
7056  	ADCQ   DX, R8
7057  	MOVQ   R13, R10 // reduction: low limbs, with only bits 128-129 kept in R12
7058  	MOVQ   R14, R11
7059  	MOVQ   R15, R12
7060  	ANDQ   $0x03, R12
7061  	MOVQ   R15, R13
7062  	ANDQ   $-4, R13 // R14:R13 = 4 x (product >> 130)
7063  	MOVQ   R8, R14
7064  	SHRQ   $0x02, R8, R15 // R8:R15 = product >> 130
7065  	SHRQ   $0x02, R8
7066  	ADDQ   R13, R10 // fold the high part back in, multiplied by 4 + 1 = 5
7067  	ADCQ   R14, R11
7068  	ADCQ   $0x00, R12
7069  	ADDQ   R15, R10
7070  	ADCQ   R8, R11
7071  	ADCQ   $0x00, R12
7072  	ADDQ   BX, DI // DI now points just past the BX output bytes, where the tag is written
7073  
7074  sealSSEFinalize: // Hash the length block, reduce the accumulator, add the second key half and store the 16-byte tag at DI
7075  	// Hash in the buffer lengths
7076  	ADDQ  ad_len+80(FP), R10 // the final block is (ad_len, src_len) as two 64-bit words, plus 1 in the third limb
7077  	ADCQ  src_len+56(FP), R11
7078  	ADCQ  $0x01, R12
7079  	MOVQ  (BP), AX // multiply the accumulator by the 128-bit value at (BP)/8(BP); product limbs are built in R13, R14, R15, R8
7080  	MOVQ  AX, R15
7081  	MULQ  R10
7082  	MOVQ  AX, R13
7083  	MOVQ  DX, R14
7084  	MOVQ  (BP), AX
7085  	MULQ  R11
7086  	IMULQ R12, R15
7087  	ADDQ  AX, R14
7088  	ADCQ  DX, R15
7089  	MOVQ  8(BP), AX
7090  	MOVQ  AX, R8
7091  	MULQ  R10
7092  	ADDQ  AX, R14
7093  	ADCQ  $0x00, DX
7094  	MOVQ  DX, R10
7095  	MOVQ  8(BP), AX
7096  	MULQ  R11
7097  	ADDQ  AX, R15
7098  	ADCQ  $0x00, DX
7099  	IMULQ R12, R8
7100  	ADDQ  R10, R15
7101  	ADCQ  DX, R8
7102  	MOVQ  R13, R10 // reduction: low limbs, with only bits 128-129 kept in R12
7103  	MOVQ  R14, R11
7104  	MOVQ  R15, R12
7105  	ANDQ  $0x03, R12
7106  	MOVQ  R15, R13
7107  	ANDQ  $-4, R13 // R14:R13 = 4 x (product >> 130)
7108  	MOVQ  R8, R14
7109  	SHRQ  $0x02, R8, R15 // R8:R15 = product >> 130
7110  	SHRQ  $0x02, R8
7111  	ADDQ  R13, R10 // fold the high part back in, multiplied by 4 + 1 = 5
7112  	ADCQ  R14, R11
7113  	ADCQ  $0x00, R12
7114  	ADDQ  R15, R10
7115  	ADCQ  R8, R11
7116  	ADCQ  $0x00, R12
7117  
7118  	// Final reduce: subtract 2^130 - 5 (limbs 2^64-5, 2^64-1, 3) and keep the result only if it did not borrow
7119  	MOVQ    R10, R13 // save the unsubtracted accumulator
7120  	MOVQ    R11, R14
7121  	MOVQ    R12, R15
7122  	SUBQ    $-5, R10
7123  	SBBQ    $-1, R11
7124  	SBBQ    $0x03, R12
7125  	CMOVQCS R13, R10 // carry set = borrow: the accumulator was already below 2^130 - 5, so restore it
7126  	CMOVQCS R14, R11
7127  	CMOVQCS R15, R12
7128  
7129  	// Add in the "s" part of the key (16(BP)/24(BP)); the carry out of bit 127 is dropped
7130  	ADDQ 16(BP), R10
7131  	ADCQ 24(BP), R11
7132  
7133  	// Finally store the tag at the end of the message (the low 128 bits, R10:R11)
7134  	MOVQ R10, (DI)
7135  	MOVQ R11, 8(DI)
7136  	RET
7137  
7138  chacha20Poly1305Seal_AVX2: // AVX2 seal path: load the state into Y registers, dispatch short inputs, otherwise set up four state sets for the intro loop
7139  	VZEROUPPER
7140  	VMOVDQU ·chacha20Constants<>+0(SB), Y0 // row 0: constants
7141  	BYTE    $0xc4 // hand-encoded VBROADCASTI128 16(R8), Y14 (C4 42 7D 5A 70 10): row 1 copied into both 128-bit lanes
7142  	BYTE    $0x42
7143  	BYTE    $0x7d
7144  	BYTE    $0x5a
7145  	BYTE    $0x70
7146  	BYTE    $0x10
7147  	BYTE    $0xc4 // VBROADCASTI128 32(R8), Y12: row 2
7148  	BYTE    $0x42
7149  	BYTE    $0x7d
7150  	BYTE    $0x5a
7151  	BYTE    $0x60
7152  	BYTE    $0x20
7153  	BYTE    $0xc4 // VBROADCASTI128 48(R8), Y4: row 3
7154  	BYTE    $0xc2
7155  	BYTE    $0x7d
7156  	BYTE    $0x5a
7157  	BYTE    $0x60
7158  	BYTE    $0x30
7159  	VPADDD  ·avx2InitMask<>+0(SB), Y4, Y4 // Y4 += avx2InitMask (defined outside this view; presumably gives the two lanes different block counters)
7160  
7161  	// Special optimizations, for very short buffers
7162  	CMPQ BX, $0x000000c0
7163  	JBE  seal192AVX2 // BX <= 192 bytes
7164  	CMPQ BX, $0x00000140
7165  	JBE  seal320AVX2 // BX <= 320 bytes
7166  
7167  	// For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
7168  	VMOVDQA Y0, Y5 // row 0 copies for state sets 1-3
7169  	VMOVDQA Y0, Y6
7170  	VMOVDQA Y0, Y7
7171  	VMOVDQA Y14, Y9 // row 1 copies
7172  	VMOVDQA Y14, Y10
7173  	VMOVDQA Y14, Y11
7174  	VMOVDQA Y14, 32(BP) // keep initial row 1 at 32(BP) for the feed-forward and later reloads
7175  	VMOVDQA Y12, Y13 // row 2 copies
7176  	VMOVDQA Y12, Y8
7177  	VMOVDQA Y12, Y15
7178  	VMOVDQA Y12, 64(BP) // keep initial row 2 at 64(BP)
7179  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1 // row 3 of each following set = previous set's row 3 + avx2IncMask
7180  	VMOVDQA Y4, 96(BP) // initial row 3 of sets 0-3 is kept at 96/128/160/192(BP)
7181  	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
7182  	VMOVDQA Y1, 128(BP)
7183  	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
7184  	VMOVDQA Y2, 160(BP)
7185  	VMOVDQA Y3, 192(BP)
7186  	MOVQ    $0x0000000a, R9 // R9 = 10 passes of sealAVX2IntroLoop
7187  
7188  sealAVX2IntroLoop: // One pass over four state sets (Y0/Y14/Y12/Y4, Y5/Y9/Y13/Y1, Y6/Y10/Y8/Y2, Y7/Y11/Y15/Y3): quarter rounds, lane rotation, quarter rounds, rotation back; no hashing here
7189  	VMOVDQA    Y15, 224(BP) // all 16 Y registers are live, so Y15 is spilled to 224(BP) and used as the shift temporary
7190  	VPADDD     Y14, Y0, Y0 // set 0: Y0 += Y14; Y4 ^= Y0; Y4 <<<= 16 (byte shuffle via rol16)
7191  	VPXOR      Y0, Y4, Y4
7192  	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
7193  	VPADDD     Y4, Y12, Y12 // Y12 += Y4; Y14 ^= Y12; Y14 <<<= 12
7194  	VPXOR      Y12, Y14, Y14
7195  	VPSLLD     $0x0c, Y14, Y15
7196  	VPSRLD     $0x14, Y14, Y14
7197  	VPXOR      Y15, Y14, Y14
7198  	VPADDD     Y14, Y0, Y0 // Y0 += Y14; Y4 ^= Y0; Y4 <<<= 8
7199  	VPXOR      Y0, Y4, Y4
7200  	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
7201  	VPADDD     Y4, Y12, Y12 // Y12 += Y4; Y14 ^= Y12; Y14 <<<= 7
7202  	VPXOR      Y12, Y14, Y14
7203  	VPSLLD     $0x07, Y14, Y15
7204  	VPSRLD     $0x19, Y14, Y14
7205  	VPXOR      Y15, Y14, Y14
7206  	VPADDD     Y9, Y5, Y5 // set 1: same sequence
7207  	VPXOR      Y5, Y1, Y1
7208  	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
7209  	VPADDD     Y1, Y13, Y13
7210  	VPXOR      Y13, Y9, Y9
7211  	VPSLLD     $0x0c, Y9, Y15
7212  	VPSRLD     $0x14, Y9, Y9
7213  	VPXOR      Y15, Y9, Y9
7214  	VPADDD     Y9, Y5, Y5
7215  	VPXOR      Y5, Y1, Y1
7216  	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
7217  	VPADDD     Y1, Y13, Y13
7218  	VPXOR      Y13, Y9, Y9
7219  	VPSLLD     $0x07, Y9, Y15
7220  	VPSRLD     $0x19, Y9, Y9
7221  	VPXOR      Y15, Y9, Y9
7222  	VPADDD     Y10, Y6, Y6 // set 2: same sequence
7223  	VPXOR      Y6, Y2, Y2
7224  	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
7225  	VPADDD     Y2, Y8, Y8
7226  	VPXOR      Y8, Y10, Y10
7227  	VPSLLD     $0x0c, Y10, Y15
7228  	VPSRLD     $0x14, Y10, Y10
7229  	VPXOR      Y15, Y10, Y10
7230  	VPADDD     Y10, Y6, Y6
7231  	VPXOR      Y6, Y2, Y2
7232  	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
7233  	VPADDD     Y2, Y8, Y8
7234  	VPXOR      Y8, Y10, Y10
7235  	VPSLLD     $0x07, Y10, Y15
7236  	VPSRLD     $0x19, Y10, Y10
7237  	VPXOR      Y15, Y10, Y10
7238  	VMOVDQA    224(BP), Y15 // set 3 needs the real Y15 back ...
7239  	VMOVDQA    Y13, 224(BP) // ... so Y13 is spilled instead and becomes the temporary
7240  	VPADDD     Y11, Y7, Y7 // set 3: same sequence with Y13 as temporary
7241  	VPXOR      Y7, Y3, Y3
7242  	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
7243  	VPADDD     Y3, Y15, Y15
7244  	VPXOR      Y15, Y11, Y11
7245  	VPSLLD     $0x0c, Y11, Y13
7246  	VPSRLD     $0x14, Y11, Y11
7247  	VPXOR      Y13, Y11, Y11
7248  	VPADDD     Y11, Y7, Y7
7249  	VPXOR      Y7, Y3, Y3
7250  	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
7251  	VPADDD     Y3, Y15, Y15
7252  	VPXOR      Y15, Y11, Y11
7253  	VPSLLD     $0x07, Y11, Y13
7254  	VPSRLD     $0x19, Y11, Y11
7255  	VPXOR      Y13, Y11, Y11
7256  	VMOVDQA    224(BP), Y13 // restore Y13
7257  	VPALIGNR   $0x04, Y14, Y14, Y14 // rotate rows 1/2/3 of every set by 4/8/12 bytes within each 128-bit lane
7258  	VPALIGNR   $0x08, Y12, Y12, Y12
7259  	VPALIGNR   $0x0c, Y4, Y4, Y4
7260  	VPALIGNR   $0x04, Y9, Y9, Y9
7261  	VPALIGNR   $0x08, Y13, Y13, Y13
7262  	VPALIGNR   $0x0c, Y1, Y1, Y1
7263  	VPALIGNR   $0x04, Y10, Y10, Y10
7264  	VPALIGNR   $0x08, Y8, Y8, Y8
7265  	VPALIGNR   $0x0c, Y2, Y2, Y2
7266  	VPALIGNR   $0x04, Y11, Y11, Y11
7267  	VPALIGNR   $0x08, Y15, Y15, Y15
7268  	VPALIGNR   $0x0c, Y3, Y3, Y3
7269  	VMOVDQA    Y15, 224(BP) // second half of the pass: same quarter rounds on the rotated rows
7270  	VPADDD     Y14, Y0, Y0 // set 0
7271  	VPXOR      Y0, Y4, Y4
7272  	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
7273  	VPADDD     Y4, Y12, Y12
7274  	VPXOR      Y12, Y14, Y14
7275  	VPSLLD     $0x0c, Y14, Y15
7276  	VPSRLD     $0x14, Y14, Y14
7277  	VPXOR      Y15, Y14, Y14
7278  	VPADDD     Y14, Y0, Y0
7279  	VPXOR      Y0, Y4, Y4
7280  	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
7281  	VPADDD     Y4, Y12, Y12
7282  	VPXOR      Y12, Y14, Y14
7283  	VPSLLD     $0x07, Y14, Y15
7284  	VPSRLD     $0x19, Y14, Y14
7285  	VPXOR      Y15, Y14, Y14
7286  	VPADDD     Y9, Y5, Y5 // set 1
7287  	VPXOR      Y5, Y1, Y1
7288  	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
7289  	VPADDD     Y1, Y13, Y13
7290  	VPXOR      Y13, Y9, Y9
7291  	VPSLLD     $0x0c, Y9, Y15
7292  	VPSRLD     $0x14, Y9, Y9
7293  	VPXOR      Y15, Y9, Y9
7294  	VPADDD     Y9, Y5, Y5
7295  	VPXOR      Y5, Y1, Y1
7296  	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
7297  	VPADDD     Y1, Y13, Y13
7298  	VPXOR      Y13, Y9, Y9
7299  	VPSLLD     $0x07, Y9, Y15
7300  	VPSRLD     $0x19, Y9, Y9
7301  	VPXOR      Y15, Y9, Y9
7302  	VPADDD     Y10, Y6, Y6 // set 2
7303  	VPXOR      Y6, Y2, Y2
7304  	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
7305  	VPADDD     Y2, Y8, Y8
7306  	VPXOR      Y8, Y10, Y10
7307  	VPSLLD     $0x0c, Y10, Y15
7308  	VPSRLD     $0x14, Y10, Y10
7309  	VPXOR      Y15, Y10, Y10
7310  	VPADDD     Y10, Y6, Y6
7311  	VPXOR      Y6, Y2, Y2
7312  	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
7313  	VPADDD     Y2, Y8, Y8
7314  	VPXOR      Y8, Y10, Y10
7315  	VPSLLD     $0x07, Y10, Y15
7316  	VPSRLD     $0x19, Y10, Y10
7317  	VPXOR      Y15, Y10, Y10
7318  	VMOVDQA    224(BP), Y15 // swap the spill again: Y15 back, Y13 out
7319  	VMOVDQA    Y13, 224(BP)
7320  	VPADDD     Y11, Y7, Y7 // set 3
7321  	VPXOR      Y7, Y3, Y3
7322  	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
7323  	VPADDD     Y3, Y15, Y15
7324  	VPXOR      Y15, Y11, Y11
7325  	VPSLLD     $0x0c, Y11, Y13
7326  	VPSRLD     $0x14, Y11, Y11
7327  	VPXOR      Y13, Y11, Y11
7328  	VPADDD     Y11, Y7, Y7
7329  	VPXOR      Y7, Y3, Y3
7330  	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
7331  	VPADDD     Y3, Y15, Y15
7332  	VPXOR      Y15, Y11, Y11
7333  	VPSLLD     $0x07, Y11, Y13
7334  	VPSRLD     $0x19, Y11, Y11
7335  	VPXOR      Y13, Y11, Y11
7336  	VMOVDQA    224(BP), Y13
7337  	VPALIGNR   $0x0c, Y14, Y14, Y14 // rotate by 12/8/4 bytes: together with the 4/8/12 above this restores the row layout
7338  	VPALIGNR   $0x08, Y12, Y12, Y12
7339  	VPALIGNR   $0x04, Y4, Y4, Y4
7340  	VPALIGNR   $0x0c, Y9, Y9, Y9
7341  	VPALIGNR   $0x08, Y13, Y13, Y13
7342  	VPALIGNR   $0x04, Y1, Y1, Y1
7343  	VPALIGNR   $0x0c, Y10, Y10, Y10
7344  	VPALIGNR   $0x08, Y8, Y8, Y8
7345  	VPALIGNR   $0x04, Y2, Y2, Y2
7346  	VPALIGNR   $0x0c, Y11, Y11, Y11
7347  	VPALIGNR   $0x08, Y15, Y15, Y15
7348  	VPALIGNR   $0x04, Y3, Y3, Y3
7349  	DECQ       R9
7350  	JNE        sealAVX2IntroLoop // repeat until R9 reaches zero
7351  	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0 // feed-forward: add each set's initial rows (constants, 32(BP), 64(BP), 96..192(BP)) to the round output
7352  	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
7353  	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
7354  	VPADDD     ·chacha20Constants<>+0(SB), Y7, Y7
7355  	VPADDD     32(BP), Y14, Y14
7356  	VPADDD     32(BP), Y9, Y9
7357  	VPADDD     32(BP), Y10, Y10
7358  	VPADDD     32(BP), Y11, Y11
7359  	VPADDD     64(BP), Y12, Y12
7360  	VPADDD     64(BP), Y13, Y13
7361  	VPADDD     64(BP), Y8, Y8
7362  	VPADDD     64(BP), Y15, Y15
7363  	VPADDD     96(BP), Y4, Y4
7364  	VPADDD     128(BP), Y1, Y1
7365  	VPADDD     160(BP), Y2, Y2
7366  	VPADDD     192(BP), Y3, Y3
7367  	VPERM2I128 $0x13, Y12, Y4, Y12 // $0x13 gathers the high 128-bit lanes of the two sources, $0x02 the low lanes
7368  	VPERM2I128 $0x02, Y0, Y14, Y4 // Y4 = low lanes of rows 0 and 1 of set 0: the 32 bytes used as the Poly1305 key
7369  	VPERM2I128 $0x13, Y0, Y14, Y0
7370  
7371  	// Clamp and store poly key
7372  	VPAND   ·polyClampMask<>+0(SB), Y4, Y4
7373  	VMOVDQA Y4, (BP) // (BP)..31(BP): (BP)/8(BP) are the multiplier in the hashing code
7374  
7375  	// Hash AD
7376  	MOVQ ad_len+80(FP), R9 // R9 = additional-data length, read by polyHashADInternal
7377  	CALL polyHashADInternal<>(SB)
7378  
7379  	// Can store at least 320 bytes (inputs of 320 bytes or less were dispatched to seal320AVX2 earlier)
7380  	VPXOR      (SI), Y0, Y0 // XOR input with key stream and write output: bytes 0-63
7381  	VPXOR      32(SI), Y12, Y12
7382  	VMOVDQU    Y0, (DI)
7383  	VMOVDQU    Y12, 32(DI)
7384  	VPERM2I128 $0x02, Y5, Y9, Y0 // regroup set 1 into stream order, then bytes 64-191
7385  	VPERM2I128 $0x02, Y13, Y1, Y14
7386  	VPERM2I128 $0x13, Y5, Y9, Y12
7387  	VPERM2I128 $0x13, Y13, Y1, Y4
7388  	VPXOR      64(SI), Y0, Y0
7389  	VPXOR      96(SI), Y14, Y14
7390  	VPXOR      128(SI), Y12, Y12
7391  	VPXOR      160(SI), Y4, Y4
7392  	VMOVDQU    Y0, 64(DI)
7393  	VMOVDQU    Y14, 96(DI)
7394  	VMOVDQU    Y12, 128(DI)
7395  	VMOVDQU    Y4, 160(DI)
7396  	VPERM2I128 $0x02, Y6, Y10, Y0 // set 2: bytes 192-319
7397  	VPERM2I128 $0x02, Y8, Y2, Y14
7398  	VPERM2I128 $0x13, Y6, Y10, Y12
7399  	VPERM2I128 $0x13, Y8, Y2, Y4
7400  	VPXOR      192(SI), Y0, Y0
7401  	VPXOR      224(SI), Y14, Y14
7402  	VPXOR      256(SI), Y12, Y12
7403  	VPXOR      288(SI), Y4, Y4
7404  	VMOVDQU    Y0, 192(DI)
7405  	VMOVDQU    Y14, 224(DI)
7406  	VMOVDQU    Y12, 256(DI)
7407  	VMOVDQU    Y4, 288(DI)
7408  	MOVQ       $0x00000140, CX // CX = 320 bytes written so far (DI has not been advanced)
7409  	SUBQ       $0x00000140, BX // BX = input bytes still to encrypt
7410  	LEAQ       320(SI), SI
7411  	VPERM2I128 $0x02, Y7, Y11, Y0 // set 3 regrouped: 128 more key-stream bytes held in Y0, Y14, Y12, Y4
7412  	VPERM2I128 $0x02, Y15, Y3, Y14
7413  	VPERM2I128 $0x13, Y7, Y11, Y12
7414  	VPERM2I128 $0x13, Y15, Y3, Y4
7415  	CMPQ       BX, $0x80
7416  	JBE        sealAVX2SealHash // 128 bytes or fewer left: handled at sealAVX2SealHash (outside this view) with the key stream above
7417  	VPXOR      (SI), Y0, Y0 // otherwise use those 128 bytes now: output bytes 320-447
7418  	VPXOR      32(SI), Y14, Y14
7419  	VPXOR      64(SI), Y12, Y12
7420  	VPXOR      96(SI), Y4, Y4
7421  	VMOVDQU    Y0, 320(DI)
7422  	VMOVDQU    Y14, 352(DI)
7423  	VMOVDQU    Y12, 384(DI)
7424  	VMOVDQU    Y4, 416(DI)
7425  	SUBQ       $0x80, BX
7426  	LEAQ       128(SI), SI
7427  	MOVQ       $0x00000008, CX // CX and R9 are parameters for the tail routines below (their use is outside this view)
7428  	MOVQ       $0x00000002, R9
7429  	CMPQ       BX, $0x80 // dispatch on the remaining length: <=128, <=256, <=384, <=512 bytes
7430  	JBE        sealAVX2Tail128
7431  	CMPQ       BX, $0x00000100
7432  	JBE        sealAVX2Tail256
7433  	CMPQ       BX, $0x00000180
7434  	JBE        sealAVX2Tail384
7435  	CMPQ       BX, $0x00000200
7436  	JBE        sealAVX2Tail512
7437  
7438  	// We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
7439  	VMOVDQA  ·chacha20Constants<>+0(SB), Y0 // rebuild four fresh state sets: constants, rows from 32(BP)/64(BP)
7440  	VMOVDQA  Y0, Y5
7441  	VMOVDQA  Y0, Y6
7442  	VMOVDQA  Y0, Y7
7443  	VMOVDQA  32(BP), Y14
7444  	VMOVDQA  Y14, Y9
7445  	VMOVDQA  Y14, Y10
7446  	VMOVDQA  Y14, Y11
7447  	VMOVDQA  64(BP), Y12
7448  	VMOVDQA  Y12, Y13
7449  	VMOVDQA  Y12, Y8
7450  	VMOVDQA  Y12, Y15
7451  	VMOVDQA  192(BP), Y4 // row 3 continues from the last value used (192(BP)), each set adding avx2IncMask
7452  	VPADDD   ·avx2IncMask<>+0(SB), Y4, Y4
7453  	VPADDD   ·avx2IncMask<>+0(SB), Y4, Y1
7454  	VPADDD   ·avx2IncMask<>+0(SB), Y1, Y2
7455  	VPADDD   ·avx2IncMask<>+0(SB), Y2, Y3
7456  	VMOVDQA  Y4, 96(BP) // remember the new initial row 3 of each set
7457  	VMOVDQA  Y1, 128(BP)
7458  	VMOVDQA  Y2, 160(BP)
7459  	VMOVDQA  Y3, 192(BP)
7460  	VMOVDQA  Y15, 224(BP) // one full pass without hashing follows; Y15 is spilled to serve as the shift temporary
7461  	VPADDD   Y14, Y0, Y0 // set 0
7462  	VPXOR    Y0, Y4, Y4
7463  	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
7464  	VPADDD   Y4, Y12, Y12
7465  	VPXOR    Y12, Y14, Y14
7466  	VPSLLD   $0x0c, Y14, Y15
7467  	VPSRLD   $0x14, Y14, Y14
7468  	VPXOR    Y15, Y14, Y14
7469  	VPADDD   Y14, Y0, Y0
7470  	VPXOR    Y0, Y4, Y4
7471  	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
7472  	VPADDD   Y4, Y12, Y12
7473  	VPXOR    Y12, Y14, Y14
7474  	VPSLLD   $0x07, Y14, Y15
7475  	VPSRLD   $0x19, Y14, Y14
7476  	VPXOR    Y15, Y14, Y14
7477  	VPADDD   Y9, Y5, Y5 // set 1
7478  	VPXOR    Y5, Y1, Y1
7479  	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
7480  	VPADDD   Y1, Y13, Y13
7481  	VPXOR    Y13, Y9, Y9
7482  	VPSLLD   $0x0c, Y9, Y15
7483  	VPSRLD   $0x14, Y9, Y9
7484  	VPXOR    Y15, Y9, Y9
7485  	VPADDD   Y9, Y5, Y5
7486  	VPXOR    Y5, Y1, Y1
7487  	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
7488  	VPADDD   Y1, Y13, Y13
7489  	VPXOR    Y13, Y9, Y9
7490  	VPSLLD   $0x07, Y9, Y15
7491  	VPSRLD   $0x19, Y9, Y9
7492  	VPXOR    Y15, Y9, Y9
7493  	VPADDD   Y10, Y6, Y6 // set 2
7494  	VPXOR    Y6, Y2, Y2
7495  	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
7496  	VPADDD   Y2, Y8, Y8
7497  	VPXOR    Y8, Y10, Y10
7498  	VPSLLD   $0x0c, Y10, Y15
7499  	VPSRLD   $0x14, Y10, Y10
7500  	VPXOR    Y15, Y10, Y10
7501  	VPADDD   Y10, Y6, Y6
7502  	VPXOR    Y6, Y2, Y2
7503  	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
7504  	VPADDD   Y2, Y8, Y8
7505  	VPXOR    Y8, Y10, Y10
7506  	VPSLLD   $0x07, Y10, Y15
7507  	VPSRLD   $0x19, Y10, Y10
7508  	VPXOR    Y15, Y10, Y10
7509  	VMOVDQA  224(BP), Y15 // swap the spill: Y15 back, Y13 out
7510  	VMOVDQA  Y13, 224(BP)
7511  	VPADDD   Y11, Y7, Y7 // set 3
7512  	VPXOR    Y7, Y3, Y3
7513  	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
7514  	VPADDD   Y3, Y15, Y15
7515  	VPXOR    Y15, Y11, Y11
7516  	VPSLLD   $0x0c, Y11, Y13
7517  	VPSRLD   $0x14, Y11, Y11
7518  	VPXOR    Y13, Y11, Y11
7519  	VPADDD   Y11, Y7, Y7
7520  	VPXOR    Y7, Y3, Y3
7521  	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
7522  	VPADDD   Y3, Y15, Y15
7523  	VPXOR    Y15, Y11, Y11
7524  	VPSLLD   $0x07, Y11, Y13
7525  	VPSRLD   $0x19, Y11, Y11
7526  	VPXOR    Y13, Y11, Y11
7527  	VMOVDQA  224(BP), Y13
7528  	VPALIGNR $0x04, Y14, Y14, Y14 // rotate rows 1/2/3 by 4/8/12 bytes within each lane
7529  	VPALIGNR $0x08, Y12, Y12, Y12
7530  	VPALIGNR $0x0c, Y4, Y4, Y4
7531  	VPALIGNR $0x04, Y9, Y9, Y9
7532  	VPALIGNR $0x08, Y13, Y13, Y13
7533  	VPALIGNR $0x0c, Y1, Y1, Y1
7534  	VPALIGNR $0x04, Y10, Y10, Y10
7535  	VPALIGNR $0x08, Y8, Y8, Y8
7536  	VPALIGNR $0x0c, Y2, Y2, Y2
7537  	VPALIGNR $0x04, Y11, Y11, Y11
7538  	VPALIGNR $0x08, Y15, Y15, Y15
7539  	VPALIGNR $0x0c, Y3, Y3, Y3
7540  	VMOVDQA  Y15, 224(BP) // second half of the pass
7541  	VPADDD   Y14, Y0, Y0 // set 0
7542  	VPXOR    Y0, Y4, Y4
7543  	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
7544  	VPADDD   Y4, Y12, Y12
7545  	VPXOR    Y12, Y14, Y14
7546  	VPSLLD   $0x0c, Y14, Y15
7547  	VPSRLD   $0x14, Y14, Y14
7548  	VPXOR    Y15, Y14, Y14
7549  	VPADDD   Y14, Y0, Y0
7550  	VPXOR    Y0, Y4, Y4
7551  	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
7552  	VPADDD   Y4, Y12, Y12
7553  	VPXOR    Y12, Y14, Y14
7554  	VPSLLD   $0x07, Y14, Y15
7555  	VPSRLD   $0x19, Y14, Y14
7556  	VPXOR    Y15, Y14, Y14
7557  	VPADDD   Y9, Y5, Y5 // set 1
7558  	VPXOR    Y5, Y1, Y1
7559  	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
7560  	VPADDD   Y1, Y13, Y13
7561  	VPXOR    Y13, Y9, Y9
7562  	VPSLLD   $0x0c, Y9, Y15
7563  	VPSRLD   $0x14, Y9, Y9
7564  	VPXOR    Y15, Y9, Y9
7565  	VPADDD   Y9, Y5, Y5
7566  	VPXOR    Y5, Y1, Y1
7567  	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
7568  	VPADDD   Y1, Y13, Y13
7569  	VPXOR    Y13, Y9, Y9
7570  	VPSLLD   $0x07, Y9, Y15
7571  	VPSRLD   $0x19, Y9, Y9
7572  	VPXOR    Y15, Y9, Y9
7573  	VPADDD   Y10, Y6, Y6 // set 2
7574  	VPXOR    Y6, Y2, Y2
7575  	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
7576  	VPADDD   Y2, Y8, Y8
7577  	VPXOR    Y8, Y10, Y10
7578  	VPSLLD   $0x0c, Y10, Y15
7579  	VPSRLD   $0x14, Y10, Y10
7580  	VPXOR    Y15, Y10, Y10
7581  	VPADDD   Y10, Y6, Y6
7582  	VPXOR    Y6, Y2, Y2
7583  	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
7584  	VPADDD   Y2, Y8, Y8
7585  	VPXOR    Y8, Y10, Y10
7586  	VPSLLD   $0x07, Y10, Y15
7587  	VPSRLD   $0x19, Y10, Y10
7588  	VPXOR    Y15, Y10, Y10
7589  	VMOVDQA  224(BP), Y15
7590  	VMOVDQA  Y13, 224(BP)
7591  	VPADDD   Y11, Y7, Y7 // set 3
7592  	VPXOR    Y7, Y3, Y3
7593  	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
7594  	VPADDD   Y3, Y15, Y15
7595  	VPXOR    Y15, Y11, Y11
7596  	VPSLLD   $0x0c, Y11, Y13
7597  	VPSRLD   $0x14, Y11, Y11
7598  	VPXOR    Y13, Y11, Y11
7599  	VPADDD   Y11, Y7, Y7
7600  	VPXOR    Y7, Y3, Y3
7601  	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
7602  	VPADDD   Y3, Y15, Y15
7603  	VPXOR    Y15, Y11, Y11
7604  	VPSLLD   $0x07, Y11, Y13
7605  	VPSRLD   $0x19, Y11, Y11
7606  	VPXOR    Y13, Y11, Y11
7607  	VMOVDQA  224(BP), Y13
7608  	VPALIGNR $0x0c, Y14, Y14, Y14 // rotate back by 12/8/4 bytes
7609  	VPALIGNR $0x08, Y12, Y12, Y12
7610  	VPALIGNR $0x04, Y4, Y4, Y4
7611  	VPALIGNR $0x0c, Y9, Y9, Y9
7612  	VPALIGNR $0x08, Y13, Y13, Y13
7613  	VPALIGNR $0x04, Y1, Y1, Y1
7614  	VPALIGNR $0x0c, Y10, Y10, Y10
7615  	VPALIGNR $0x08, Y8, Y8, Y8
7616  	VPALIGNR $0x04, Y2, Y2, Y2
7617  	VPALIGNR $0x0c, Y11, Y11, Y11
7618  	VPALIGNR $0x08, Y15, Y15, Y15
7619  	VPALIGNR $0x04, Y3, Y3, Y3
7620  	VPADDD   Y14, Y0, Y0 // start of the next pass, all four sets in parallel: the same vector steps sealAVX2InternalLoop performs before sealAVX2InternalLoopStart, minus the hashing
7621  	VPADDD   Y9, Y5, Y5
7622  	VPADDD   Y10, Y6, Y6
7623  	VPADDD   Y11, Y7, Y7
7624  	VPXOR    Y0, Y4, Y4
7625  	VPXOR    Y5, Y1, Y1
7626  	VPXOR    Y6, Y2, Y2
7627  	VPXOR    Y7, Y3, Y3
7628  	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
7629  	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
7630  	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
7631  	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
7632  	VPADDD   Y4, Y12, Y12
7633  	VPADDD   Y1, Y13, Y13
7634  	VPADDD   Y2, Y8, Y8
7635  	VPADDD   Y3, Y15, Y15
7636  	VPXOR    Y12, Y14, Y14
7637  	VPXOR    Y13, Y9, Y9
7638  	VPXOR    Y8, Y10, Y10
7639  	VPXOR    Y15, Y11, Y11
7640  	VMOVDQA  Y15, 224(BP) // spill Y15 for the rotate-by-12 temporaries
7641  	VPSLLD   $0x0c, Y14, Y15
7642  	VPSRLD   $0x14, Y14, Y14
7643  	VPXOR    Y15, Y14, Y14
7644  	VPSLLD   $0x0c, Y9, Y15
7645  	VPSRLD   $0x14, Y9, Y9
7646  	VPXOR    Y15, Y9, Y9
7647  	VPSLLD   $0x0c, Y10, Y15
7648  	VPSRLD   $0x14, Y10, Y10
7649  	VPXOR    Y15, Y10, Y10
7650  	VPSLLD   $0x0c, Y11, Y15
7651  	VPSRLD   $0x14, Y11, Y11
7652  	VPXOR    Y15, Y11, Y11
7653  	VMOVDQA  224(BP), Y15
7654  	SUBQ     $0x10, DI // bias DI by -16 so the 16(DI) read at sealAVX2InternalLoopStart hits the first unhashed output byte
7655  	MOVQ     $0x00000009, CX // CX = 9 (the main loop sets 10; presumably the remaining pass count - the decrement is outside this view)
7656  	JMP      sealAVX2InternalLoopStart
7657  
7658  sealAVX2MainLoop: // Set up four fresh state sets for the next batch of key stream, then fall into sealAVX2InternalLoop
7659  	VMOVDQU ·chacha20Constants<>+0(SB), Y0 // row 0: constants, copied to all four sets
7660  	VMOVDQA Y0, Y5
7661  	VMOVDQA Y0, Y6
7662  	VMOVDQA Y0, Y7
7663  	VMOVDQA 32(BP), Y14 // row 1 from the copy saved at 32(BP)
7664  	VMOVDQA Y14, Y9
7665  	VMOVDQA Y14, Y10
7666  	VMOVDQA Y14, Y11
7667  	VMOVDQA 64(BP), Y12 // row 2 from the copy saved at 64(BP)
7668  	VMOVDQA Y12, Y13
7669  	VMOVDQA Y12, Y8
7670  	VMOVDQA Y12, Y15
7671  	VMOVDQA 192(BP), Y4 // row 3 continues from the last value used (192(BP)), each set adding avx2IncMask
7672  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
7673  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
7674  	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
7675  	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
7676  	VMOVDQA Y4, 96(BP) // remember the new initial row 3 of each set for the later feed-forward
7677  	VMOVDQA Y1, 128(BP)
7678  	VMOVDQA Y2, 160(BP)
7679  	VMOVDQA Y3, 192(BP)
7680  	MOVQ    $0x0000000a, CX // CX = 10 passes of the internal loop
7681  
7682  sealAVX2InternalLoop: // First part of a pass: hash 16 output bytes at (DI) interleaved with the first half of a quarter round on all four sets; falls through to sealAVX2InternalLoopStart
7683  	ADDQ    (DI), R10 // accumulator R10:R11:R12 += 16 bytes at (DI), plus 1 in the third limb (weight 2^128)
7684  	ADCQ    8(DI), R11
7685  	ADCQ    $0x01, R12
7686  	VPADDD  Y14, Y0, Y0 // row 0 += row 1, for all four sets
7687  	VPADDD  Y9, Y5, Y5
7688  	VPADDD  Y10, Y6, Y6
7689  	VPADDD  Y11, Y7, Y7
7690  	MOVQ    (BP), DX // MULX multiplies by DX implicitly: DX = low word of the multiplier at (BP)
7691  	MOVQ    DX, R15
7692  	MULXQ   R10, R13, R14 // R14:R13 = DX * R10
7693  	IMULQ   R12, R15
7694  	MULXQ   R11, AX, DX // DX:AX = DX * R11
7695  	ADDQ    AX, R14
7696  	ADCQ    DX, R15
7697  	VPXOR   Y0, Y4, Y4 // row 3 ^= row 0, then rotate left by 16
7698  	VPXOR   Y5, Y1, Y1
7699  	VPXOR   Y6, Y2, Y2
7700  	VPXOR   Y7, Y3, Y3
7701  	VPSHUFB ·rol16<>+0(SB), Y4, Y4
7702  	VPSHUFB ·rol16<>+0(SB), Y1, Y1
7703  	VPSHUFB ·rol16<>+0(SB), Y2, Y2
7704  	VPSHUFB ·rol16<>+0(SB), Y3, Y3
7705  	MOVQ    8(BP), DX // DX = high word of the multiplier at 8(BP)
7706  	MULXQ   R10, R10, AX // AX:R10 = DX * R10
7707  	ADDQ    R10, R14
7708  	MULXQ   R11, R11, R8 // R8:R11 = DX * R11
7709  	ADCQ    R11, R15
7710  	ADCQ    $0x00, R8
7711  	VPADDD  Y4, Y12, Y12 // row 2 += row 3; row 1 ^= row 2
7712  	VPADDD  Y1, Y13, Y13
7713  	VPADDD  Y2, Y8, Y8
7714  	VPADDD  Y3, Y15, Y15
7715  	VPXOR   Y12, Y14, Y14
7716  	VPXOR   Y13, Y9, Y9
7717  	VPXOR   Y8, Y10, Y10
7718  	VPXOR   Y15, Y11, Y11
7719  	IMULQ   R12, DX
7720  	ADDQ    AX, R15
7721  	ADCQ    DX, R8 // product limbs are now in R13, R14, R15, R8
7722  	VMOVDQA Y15, 224(BP) // spill Y15 so it can serve as the temporary for the rotate-left-by-12 of row 1
7723  	VPSLLD  $0x0c, Y14, Y15
7724  	VPSRLD  $0x14, Y14, Y14
7725  	VPXOR   Y15, Y14, Y14
7726  	VPSLLD  $0x0c, Y9, Y15
7727  	VPSRLD  $0x14, Y9, Y9
7728  	VPXOR   Y15, Y9, Y9
7729  	VPSLLD  $0x0c, Y10, Y15
7730  	VPSRLD  $0x14, Y10, Y10
7731  	VPXOR   Y15, Y10, Y10
7732  	VPSLLD  $0x0c, Y11, Y15
7733  	VPSRLD  $0x14, Y11, Y11
7734  	VPXOR   Y15, Y11, Y11
7735  	VMOVDQA 224(BP), Y15
7736  	MOVQ    R13, R10 // reduction: low limbs, with only bits 128-129 kept in R12
7737  	MOVQ    R14, R11
7738  	MOVQ    R15, R12
7739  	ANDQ    $0x03, R12
7740  	MOVQ    R15, R13
7741  	ANDQ    $-4, R13 // R14:R13 = 4 x (product >> 130)
7742  	MOVQ    R8, R14
7743  	SHRQ    $0x02, R8, R15 // R8:R15 = product >> 130
7744  	SHRQ    $0x02, R8
7745  	ADDQ    R13, R10 // fold the high part back in, multiplied by 4 + 1 = 5
7746  	ADCQ    R14, R11
7747  	ADCQ    $0x00, R12
7748  	ADDQ    R15, R10
7749  	ADCQ    R8, R11
7750  	ADCQ    $0x00, R12
7751  
7752  sealAVX2InternalLoopStart:
7753  	VPADDD   Y14, Y0, Y0
7754  	VPADDD   Y9, Y5, Y5
7755  	VPADDD   Y10, Y6, Y6
7756  	VPADDD   Y11, Y7, Y7
7757  	VPXOR    Y0, Y4, Y4
7758  	VPXOR    Y5, Y1, Y1
7759  	VPXOR    Y6, Y2, Y2
7760  	VPXOR    Y7, Y3, Y3
7761  	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
7762  	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
7763  	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
7764  	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
7765  	ADDQ     16(DI), R10
7766  	ADCQ     24(DI), R11
7767  	ADCQ     $0x01, R12
7768  	VPADDD   Y4, Y12, Y12
7769  	VPADDD   Y1, Y13, Y13
7770  	VPADDD   Y2, Y8, Y8
7771  	VPADDD   Y3, Y15, Y15
7772  	MOVQ     (BP), DX
7773  	MOVQ     DX, R15
7774  	MULXQ    R10, R13, R14
7775  	IMULQ    R12, R15
7776  	MULXQ    R11, AX, DX
7777  	ADDQ     AX, R14
7778  	ADCQ     DX, R15
7779  	VPXOR    Y12, Y14, Y14
7780  	VPXOR    Y13, Y9, Y9
7781  	VPXOR    Y8, Y10, Y10
7782  	VPXOR    Y15, Y11, Y11
7783  	VMOVDQA  Y15, 224(BP)
7784  	VPSLLD   $0x07, Y14, Y15
7785  	VPSRLD   $0x19, Y14, Y14
7786  	VPXOR    Y15, Y14, Y14
7787  	VPSLLD   $0x07, Y9, Y15
7788  	VPSRLD   $0x19, Y9, Y9
7789  	VPXOR    Y15, Y9, Y9
7790  	VPSLLD   $0x07, Y10, Y15
7791  	VPSRLD   $0x19, Y10, Y10
7792  	VPXOR    Y15, Y10, Y10
7793  	VPSLLD   $0x07, Y11, Y15
7794  	VPSRLD   $0x19, Y11, Y11
7795  	VPXOR    Y15, Y11, Y11
7796  	VMOVDQA  224(BP), Y15
7797  	MOVQ     8(BP), DX
7798  	MULXQ    R10, R10, AX
7799  	ADDQ     R10, R14
7800  	MULXQ    R11, R11, R8
7801  	ADCQ     R11, R15
7802  	ADCQ     $0x00, R8
7803  	VPALIGNR $0x04, Y14, Y14, Y14
7804  	VPALIGNR $0x04, Y9, Y9, Y9
7805  	VPALIGNR $0x04, Y10, Y10, Y10
7806  	VPALIGNR $0x04, Y11, Y11, Y11
7807  	VPALIGNR $0x08, Y12, Y12, Y12
7808  	VPALIGNR $0x08, Y13, Y13, Y13
7809  	VPALIGNR $0x08, Y8, Y8, Y8
7810  	VPALIGNR $0x08, Y15, Y15, Y15
7811  	VPALIGNR $0x0c, Y4, Y4, Y4
7812  	VPALIGNR $0x0c, Y1, Y1, Y1
7813  	VPALIGNR $0x0c, Y2, Y2, Y2
7814  	VPALIGNR $0x0c, Y3, Y3, Y3
7815  	VPADDD   Y14, Y0, Y0
7816  	VPADDD   Y9, Y5, Y5
7817  	VPADDD   Y10, Y6, Y6
7818  	VPADDD   Y11, Y7, Y7
7819  	IMULQ    R12, DX
7820  	ADDQ     AX, R15
7821  	ADCQ     DX, R8
7822  	VPXOR    Y0, Y4, Y4
7823  	VPXOR    Y5, Y1, Y1
7824  	VPXOR    Y6, Y2, Y2
7825  	VPXOR    Y7, Y3, Y3
7826  	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
7827  	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
7828  	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
7829  	VPSHUFB  ·rol16<>+0(SB), Y3, Y3
7830  	MOVQ     R13, R10
7831  	MOVQ     R14, R11
7832  	MOVQ     R15, R12
7833  	ANDQ     $0x03, R12
7834  	MOVQ     R15, R13
7835  	ANDQ     $-4, R13
7836  	MOVQ     R8, R14
7837  	SHRQ     $0x02, R8, R15
7838  	SHRQ     $0x02, R8
7839  	ADDQ     R13, R10
7840  	ADCQ     R14, R11
7841  	ADCQ     $0x00, R12
7842  	ADDQ     R15, R10
7843  	ADCQ     R8, R11
7844  	ADCQ     $0x00, R12
7845  	VPADDD   Y4, Y12, Y12
7846  	VPADDD   Y1, Y13, Y13
7847  	VPADDD   Y2, Y8, Y8
7848  	VPADDD   Y3, Y15, Y15
7849  	VPXOR    Y12, Y14, Y14
7850  	VPXOR    Y13, Y9, Y9
7851  	VPXOR    Y8, Y10, Y10
7852  	VPXOR    Y15, Y11, Y11
7853  	ADDQ     32(DI), R10
7854  	ADCQ     40(DI), R11
7855  	ADCQ     $0x01, R12
7856  	LEAQ     48(DI), DI
7857  	VMOVDQA  Y15, 224(BP)
7858  	VPSLLD   $0x0c, Y14, Y15
7859  	VPSRLD   $0x14, Y14, Y14
7860  	VPXOR    Y15, Y14, Y14
7861  	VPSLLD   $0x0c, Y9, Y15
7862  	VPSRLD   $0x14, Y9, Y9
7863  	VPXOR    Y15, Y9, Y9
7864  	VPSLLD   $0x0c, Y10, Y15
7865  	VPSRLD   $0x14, Y10, Y10
7866  	VPXOR    Y15, Y10, Y10
7867  	VPSLLD   $0x0c, Y11, Y15
7868  	VPSRLD   $0x14, Y11, Y11
7869  	VPXOR    Y15, Y11, Y11
7870  	VMOVDQA  224(BP), Y15
7871  	MOVQ     (BP), DX
7872  	MOVQ     DX, R15
7873  	MULXQ    R10, R13, R14
7874  	IMULQ    R12, R15
7875  	MULXQ    R11, AX, DX
7876  	ADDQ     AX, R14
7877  	ADCQ     DX, R15
7878  	VPADDD   Y14, Y0, Y0
7879  	VPADDD   Y9, Y5, Y5
7880  	VPADDD   Y10, Y6, Y6
7881  	VPADDD   Y11, Y7, Y7
7882  	VPXOR    Y0, Y4, Y4
7883  	VPXOR    Y5, Y1, Y1
7884  	VPXOR    Y6, Y2, Y2
7885  	VPXOR    Y7, Y3, Y3
7886  	MOVQ     8(BP), DX
7887  	MULXQ    R10, R10, AX
7888  	ADDQ     R10, R14
7889  	MULXQ    R11, R11, R8
7890  	ADCQ     R11, R15
7891  	ADCQ     $0x00, R8
7892  	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
7893  	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
7894  	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
7895  	VPSHUFB  ·rol8<>+0(SB), Y3, Y3
7896  	VPADDD   Y4, Y12, Y12
7897  	VPADDD   Y1, Y13, Y13
7898  	VPADDD   Y2, Y8, Y8
7899  	VPADDD   Y3, Y15, Y15
7900  	IMULQ    R12, DX
7901  	ADDQ     AX, R15
7902  	ADCQ     DX, R8
7903  	VPXOR    Y12, Y14, Y14
7904  	VPXOR    Y13, Y9, Y9
7905  	VPXOR    Y8, Y10, Y10
7906  	VPXOR    Y15, Y11, Y11
7907  	VMOVDQA  Y15, 224(BP)
7908  	VPSLLD   $0x07, Y14, Y15
7909  	VPSRLD   $0x19, Y14, Y14
7910  	VPXOR    Y15, Y14, Y14
7911  	VPSLLD   $0x07, Y9, Y15
7912  	VPSRLD   $0x19, Y9, Y9
7913  	VPXOR    Y15, Y9, Y9
7914  	VPSLLD   $0x07, Y10, Y15
7915  	VPSRLD   $0x19, Y10, Y10
7916  	VPXOR    Y15, Y10, Y10
7917  	VPSLLD   $0x07, Y11, Y15
7918  	VPSRLD   $0x19, Y11, Y11
7919  	VPXOR    Y15, Y11, Y11
7920  	VMOVDQA  224(BP), Y15
7921  	MOVQ     R13, R10
7922  	MOVQ     R14, R11
7923  	MOVQ     R15, R12
7924  	ANDQ     $0x03, R12
7925  	MOVQ     R15, R13
7926  	ANDQ     $-4, R13
7927  	MOVQ     R8, R14
7928  	SHRQ     $0x02, R8, R15
7929  	SHRQ     $0x02, R8
7930  	ADDQ     R13, R10
7931  	ADCQ     R14, R11
7932  	ADCQ     $0x00, R12
7933  	ADDQ     R15, R10
7934  	ADCQ     R8, R11
7935  	ADCQ     $0x00, R12
7936  	VPALIGNR $0x0c, Y14, Y14, Y14
7937  	VPALIGNR $0x0c, Y9, Y9, Y9
7938  	VPALIGNR $0x0c, Y10, Y10, Y10
7939  	VPALIGNR $0x0c, Y11, Y11, Y11
7940  	VPALIGNR $0x08, Y12, Y12, Y12
7941  	VPALIGNR $0x08, Y13, Y13, Y13
7942  	VPALIGNR $0x08, Y8, Y8, Y8
7943  	VPALIGNR $0x08, Y15, Y15, Y15
7944  	VPALIGNR $0x04, Y4, Y4, Y4
7945  	VPALIGNR $0x04, Y1, Y1, Y1
7946  	VPALIGNR $0x04, Y2, Y2, Y2
7947  	VPALIGNR $0x04, Y3, Y3, Y3
7948  	DECQ     CX
7949  	JNE      sealAVX2InternalLoop
7950  	VPADDD   ·chacha20Constants<>+0(SB), Y0, Y0
7951  	VPADDD   ·chacha20Constants<>+0(SB), Y5, Y5
7952  	VPADDD   ·chacha20Constants<>+0(SB), Y6, Y6
7953  	VPADDD   ·chacha20Constants<>+0(SB), Y7, Y7
7954  	VPADDD   32(BP), Y14, Y14
7955  	VPADDD   32(BP), Y9, Y9
7956  	VPADDD   32(BP), Y10, Y10
7957  	VPADDD   32(BP), Y11, Y11
7958  	VPADDD   64(BP), Y12, Y12
7959  	VPADDD   64(BP), Y13, Y13
7960  	VPADDD   64(BP), Y8, Y8
7961  	VPADDD   64(BP), Y15, Y15
7962  	VPADDD   96(BP), Y4, Y4
7963  	VPADDD   128(BP), Y1, Y1
7964  	VPADDD   160(BP), Y2, Y2
7965  	VPADDD   192(BP), Y3, Y3
7966  	VMOVDQA  Y15, 224(BP)
7967  
7968  	// We only hashed 480 of the 512 bytes available - hash the remaining 32 here
7969  	ADDQ       (DI), R10
7970  	ADCQ       8(DI), R11
7971  	ADCQ       $0x01, R12
7972  	MOVQ       (BP), DX
7973  	MOVQ       DX, R15
7974  	MULXQ      R10, R13, R14
7975  	IMULQ      R12, R15
7976  	MULXQ      R11, AX, DX
7977  	ADDQ       AX, R14
7978  	ADCQ       DX, R15
7979  	MOVQ       8(BP), DX
7980  	MULXQ      R10, R10, AX
7981  	ADDQ       R10, R14
7982  	MULXQ      R11, R11, R8
7983  	ADCQ       R11, R15
7984  	ADCQ       $0x00, R8
7985  	IMULQ      R12, DX
7986  	ADDQ       AX, R15
7987  	ADCQ       DX, R8
7988  	MOVQ       R13, R10
7989  	MOVQ       R14, R11
7990  	MOVQ       R15, R12
7991  	ANDQ       $0x03, R12
7992  	MOVQ       R15, R13
7993  	ANDQ       $-4, R13
7994  	MOVQ       R8, R14
7995  	SHRQ       $0x02, R8, R15
7996  	SHRQ       $0x02, R8
7997  	ADDQ       R13, R10
7998  	ADCQ       R14, R11
7999  	ADCQ       $0x00, R12
8000  	ADDQ       R15, R10
8001  	ADCQ       R8, R11
8002  	ADCQ       $0x00, R12
8003  	LEAQ       32(DI), DI
8004  	VPERM2I128 $0x02, Y0, Y14, Y15
8005  	VPERM2I128 $0x13, Y0, Y14, Y14
8006  	VPERM2I128 $0x02, Y12, Y4, Y0
8007  	VPERM2I128 $0x13, Y12, Y4, Y12
8008  	VPXOR      (SI), Y15, Y15
8009  	VPXOR      32(SI), Y0, Y0
8010  	VPXOR      64(SI), Y14, Y14
8011  	VPXOR      96(SI), Y12, Y12
8012  	VMOVDQU    Y15, (DI)
8013  	VMOVDQU    Y0, 32(DI)
8014  	VMOVDQU    Y14, 64(DI)
8015  	VMOVDQU    Y12, 96(DI)
8016  	VPERM2I128 $0x02, Y5, Y9, Y0
8017  	VPERM2I128 $0x02, Y13, Y1, Y14
8018  	VPERM2I128 $0x13, Y5, Y9, Y12
8019  	VPERM2I128 $0x13, Y13, Y1, Y4
8020  	VPXOR      128(SI), Y0, Y0
8021  	VPXOR      160(SI), Y14, Y14
8022  	VPXOR      192(SI), Y12, Y12
8023  	VPXOR      224(SI), Y4, Y4
8024  	VMOVDQU    Y0, 128(DI)
8025  	VMOVDQU    Y14, 160(DI)
8026  	VMOVDQU    Y12, 192(DI)
8027  	VMOVDQU    Y4, 224(DI)
8028  
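8029  	// ...and the second 16 of those remaining 32 bytes here (DI was already advanced by 32 above, hence the -16(DI)/-8(DI) operands)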
8030  	ADDQ       -16(DI), R10
8031  	ADCQ       -8(DI), R11
8032  	ADCQ       $0x01, R12
8033  	MOVQ       (BP), DX
8034  	MOVQ       DX, R15
8035  	MULXQ      R10, R13, R14
8036  	IMULQ      R12, R15
8037  	MULXQ      R11, AX, DX
8038  	ADDQ       AX, R14
8039  	ADCQ       DX, R15
8040  	MOVQ       8(BP), DX
8041  	MULXQ      R10, R10, AX
8042  	ADDQ       R10, R14
8043  	MULXQ      R11, R11, R8
8044  	ADCQ       R11, R15
8045  	ADCQ       $0x00, R8
8046  	IMULQ      R12, DX
8047  	ADDQ       AX, R15
8048  	ADCQ       DX, R8
8049  	MOVQ       R13, R10
8050  	MOVQ       R14, R11
8051  	MOVQ       R15, R12
8052  	ANDQ       $0x03, R12
8053  	MOVQ       R15, R13
8054  	ANDQ       $-4, R13
8055  	MOVQ       R8, R14
8056  	SHRQ       $0x02, R8, R15
8057  	SHRQ       $0x02, R8
8058  	ADDQ       R13, R10
8059  	ADCQ       R14, R11
8060  	ADCQ       $0x00, R12
8061  	ADDQ       R15, R10
8062  	ADCQ       R8, R11
8063  	ADCQ       $0x00, R12
8064  	VPERM2I128 $0x02, Y6, Y10, Y0
8065  	VPERM2I128 $0x02, Y8, Y2, Y14
8066  	VPERM2I128 $0x13, Y6, Y10, Y12
8067  	VPERM2I128 $0x13, Y8, Y2, Y4
8068  	VPXOR      256(SI), Y0, Y0
8069  	VPXOR      288(SI), Y14, Y14
8070  	VPXOR      320(SI), Y12, Y12
8071  	VPXOR      352(SI), Y4, Y4
8072  	VMOVDQU    Y0, 256(DI)
8073  	VMOVDQU    Y14, 288(DI)
8074  	VMOVDQU    Y12, 320(DI)
8075  	VMOVDQU    Y4, 352(DI)
8076  	VPERM2I128 $0x02, Y7, Y11, Y0
8077  	VPERM2I128 $0x02, 224(BP), Y3, Y14
8078  	VPERM2I128 $0x13, Y7, Y11, Y12
8079  	VPERM2I128 $0x13, 224(BP), Y3, Y4
8080  	VPXOR      384(SI), Y0, Y0
8081  	VPXOR      416(SI), Y14, Y14
8082  	VPXOR      448(SI), Y12, Y12
8083  	VPXOR      480(SI), Y4, Y4
8084  	VMOVDQU    Y0, 384(DI)
8085  	VMOVDQU    Y14, 416(DI)
8086  	VMOVDQU    Y12, 448(DI)
8087  	VMOVDQU    Y4, 480(DI)
8088  	LEAQ       512(SI), SI
8089  	SUBQ       $0x00000200, BX
8090  	CMPQ       BX, $0x00000200
8091  	JG         sealAVX2MainLoop
8092  
8093  	// Tail can only hash 480 bytes
8094  	ADDQ  (DI), R10
8095  	ADCQ  8(DI), R11
8096  	ADCQ  $0x01, R12
8097  	MOVQ  (BP), DX
8098  	MOVQ  DX, R15
8099  	MULXQ R10, R13, R14
8100  	IMULQ R12, R15
8101  	MULXQ R11, AX, DX
8102  	ADDQ  AX, R14
8103  	ADCQ  DX, R15
8104  	MOVQ  8(BP), DX
8105  	MULXQ R10, R10, AX
8106  	ADDQ  R10, R14
8107  	MULXQ R11, R11, R8
8108  	ADCQ  R11, R15
8109  	ADCQ  $0x00, R8
8110  	IMULQ R12, DX
8111  	ADDQ  AX, R15
8112  	ADCQ  DX, R8
8113  	MOVQ  R13, R10
8114  	MOVQ  R14, R11
8115  	MOVQ  R15, R12
8116  	ANDQ  $0x03, R12
8117  	MOVQ  R15, R13
8118  	ANDQ  $-4, R13
8119  	MOVQ  R8, R14
8120  	SHRQ  $0x02, R8, R15
8121  	SHRQ  $0x02, R8
8122  	ADDQ  R13, R10
8123  	ADCQ  R14, R11
8124  	ADCQ  $0x00, R12
8125  	ADDQ  R15, R10
8126  	ADCQ  R8, R11
8127  	ADCQ  $0x00, R12
8128  	ADDQ  16(DI), R10
8129  	ADCQ  24(DI), R11
8130  	ADCQ  $0x01, R12
8131  	MOVQ  (BP), DX
8132  	MOVQ  DX, R15
8133  	MULXQ R10, R13, R14
8134  	IMULQ R12, R15
8135  	MULXQ R11, AX, DX
8136  	ADDQ  AX, R14
8137  	ADCQ  DX, R15
8138  	MOVQ  8(BP), DX
8139  	MULXQ R10, R10, AX
8140  	ADDQ  R10, R14
8141  	MULXQ R11, R11, R8
8142  	ADCQ  R11, R15
8143  	ADCQ  $0x00, R8
8144  	IMULQ R12, DX
8145  	ADDQ  AX, R15
8146  	ADCQ  DX, R8
8147  	MOVQ  R13, R10
8148  	MOVQ  R14, R11
8149  	MOVQ  R15, R12
8150  	ANDQ  $0x03, R12
8151  	MOVQ  R15, R13
8152  	ANDQ  $-4, R13
8153  	MOVQ  R8, R14
8154  	SHRQ  $0x02, R8, R15
8155  	SHRQ  $0x02, R8
8156  	ADDQ  R13, R10
8157  	ADCQ  R14, R11
8158  	ADCQ  $0x00, R12
8159  	ADDQ  R15, R10
8160  	ADCQ  R8, R11
8161  	ADCQ  $0x00, R12
8162  	LEAQ  32(DI), DI
8163  	MOVQ  $0x0000000a, CX
8164  	MOVQ  $0x00000000, R9
8165  	CMPQ  BX, $0x80
8166  	JBE   sealAVX2Tail128
8167  	CMPQ  BX, $0x00000100
8168  	JBE   sealAVX2Tail256
8169  	CMPQ  BX, $0x00000180
8170  	JBE   sealAVX2Tail384
8171  	JMP   sealAVX2Tail512
8172  
8173  seal192AVX2: // Setup for sealAVX2192InnerCipherLoop: build a second register set (Y5,Y9,Y13,Y1) next to (Y0,Y14,Y12,Y4) and save the starting values.
8174  	VMOVDQA Y0, Y5 // second set starts as a copy of the first ...
8175  	VMOVDQA Y14, Y9
8176  	VMOVDQA Y12, Y13
8177  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1 // ... except its last row: Y1 = Y4 + avx2IncMask
8178  	VMOVDQA Y0, Y6 // saved copy; added back to Y0 and Y5 after the loop
8179  	VMOVDQA Y14, Y10 // saved copy; added back to Y14 and Y9 after the loop
8180  	VMOVDQA Y12, Y8 // saved copy; added back to Y12 and Y13 after the loop
8181  	VMOVDQA Y4, Y2 // saved copy; added back to Y4 after the loop
8182  	VMOVDQA Y1, Y15 // saved copy; added back to Y1 after the loop
8183  	MOVQ    $0x0000000a, R9 // R9 = 10: iteration count, decremented at the bottom of sealAVX2192InnerCipherLoop
8184  
8185  sealAVX2192InnerCipherLoop:
8186  	VPADDD     Y14, Y0, Y0
8187  	VPXOR      Y0, Y4, Y4
8188  	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
8189  	VPADDD     Y4, Y12, Y12
8190  	VPXOR      Y12, Y14, Y14
8191  	VPSLLD     $0x0c, Y14, Y3
8192  	VPSRLD     $0x14, Y14, Y14
8193  	VPXOR      Y3, Y14, Y14
8194  	VPADDD     Y14, Y0, Y0
8195  	VPXOR      Y0, Y4, Y4
8196  	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
8197  	VPADDD     Y4, Y12, Y12
8198  	VPXOR      Y12, Y14, Y14
8199  	VPSLLD     $0x07, Y14, Y3
8200  	VPSRLD     $0x19, Y14, Y14
8201  	VPXOR      Y3, Y14, Y14
8202  	VPADDD     Y9, Y5, Y5
8203  	VPXOR      Y5, Y1, Y1
8204  	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
8205  	VPADDD     Y1, Y13, Y13
8206  	VPXOR      Y13, Y9, Y9
8207  	VPSLLD     $0x0c, Y9, Y3
8208  	VPSRLD     $0x14, Y9, Y9
8209  	VPXOR      Y3, Y9, Y9
8210  	VPADDD     Y9, Y5, Y5
8211  	VPXOR      Y5, Y1, Y1
8212  	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
8213  	VPADDD     Y1, Y13, Y13
8214  	VPXOR      Y13, Y9, Y9
8215  	VPSLLD     $0x07, Y9, Y3
8216  	VPSRLD     $0x19, Y9, Y9
8217  	VPXOR      Y3, Y9, Y9
8218  	VPALIGNR   $0x04, Y14, Y14, Y14
8219  	VPALIGNR   $0x04, Y9, Y9, Y9
8220  	VPALIGNR   $0x08, Y12, Y12, Y12
8221  	VPALIGNR   $0x08, Y13, Y13, Y13
8222  	VPALIGNR   $0x0c, Y4, Y4, Y4
8223  	VPALIGNR   $0x0c, Y1, Y1, Y1
8224  	VPADDD     Y14, Y0, Y0
8225  	VPXOR      Y0, Y4, Y4
8226  	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
8227  	VPADDD     Y4, Y12, Y12
8228  	VPXOR      Y12, Y14, Y14
8229  	VPSLLD     $0x0c, Y14, Y3
8230  	VPSRLD     $0x14, Y14, Y14
8231  	VPXOR      Y3, Y14, Y14
8232  	VPADDD     Y14, Y0, Y0
8233  	VPXOR      Y0, Y4, Y4
8234  	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
8235  	VPADDD     Y4, Y12, Y12
8236  	VPXOR      Y12, Y14, Y14
8237  	VPSLLD     $0x07, Y14, Y3
8238  	VPSRLD     $0x19, Y14, Y14
8239  	VPXOR      Y3, Y14, Y14
8240  	VPADDD     Y9, Y5, Y5
8241  	VPXOR      Y5, Y1, Y1
8242  	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
8243  	VPADDD     Y1, Y13, Y13
8244  	VPXOR      Y13, Y9, Y9
8245  	VPSLLD     $0x0c, Y9, Y3
8246  	VPSRLD     $0x14, Y9, Y9
8247  	VPXOR      Y3, Y9, Y9
8248  	VPADDD     Y9, Y5, Y5
8249  	VPXOR      Y5, Y1, Y1
8250  	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
8251  	VPADDD     Y1, Y13, Y13
8252  	VPXOR      Y13, Y9, Y9
8253  	VPSLLD     $0x07, Y9, Y3
8254  	VPSRLD     $0x19, Y9, Y9
8255  	VPXOR      Y3, Y9, Y9
8256  	VPALIGNR   $0x0c, Y14, Y14, Y14
8257  	VPALIGNR   $0x0c, Y9, Y9, Y9
8258  	VPALIGNR   $0x08, Y12, Y12, Y12
8259  	VPALIGNR   $0x08, Y13, Y13, Y13
8260  	VPALIGNR   $0x04, Y4, Y4, Y4
8261  	VPALIGNR   $0x04, Y1, Y1, Y1
8262  	DECQ       R9
8263  	JNE        sealAVX2192InnerCipherLoop
8264  	VPADDD     Y6, Y0, Y0
8265  	VPADDD     Y6, Y5, Y5
8266  	VPADDD     Y10, Y14, Y14
8267  	VPADDD     Y10, Y9, Y9
8268  	VPADDD     Y8, Y12, Y12
8269  	VPADDD     Y8, Y13, Y13
8270  	VPADDD     Y2, Y4, Y4
8271  	VPADDD     Y15, Y1, Y1
8272  	VPERM2I128 $0x02, Y0, Y14, Y3
8273  
8274  	// Clamp and store poly key
8275  	VPAND   ·polyClampMask<>+0(SB), Y3, Y3
8276  	VMOVDQA Y3, (BP)
8277  
8278  	// Stream for up to 192 bytes
8279  	VPERM2I128 $0x13, Y0, Y14, Y0
8280  	VPERM2I128 $0x13, Y12, Y4, Y14
8281  	VPERM2I128 $0x02, Y5, Y9, Y12
8282  	VPERM2I128 $0x02, Y13, Y1, Y4
8283  	VPERM2I128 $0x13, Y5, Y9, Y5
8284  	VPERM2I128 $0x13, Y13, Y1, Y9
8285  
8286  sealAVX2ShortSeal: // Common entry for the short paths (seal192AVX2 falls through to here, seal320AVX2 jumps here).
8287  	// Hash aad: polyHashADInternal zeroes R10, R11, R12 and tests the length passed in R9
8288  	MOVQ ad_len+80(FP), R9 // R9 = ad_len argument
8289  	CALL polyHashADInternal<>(SB)
8290  	XORQ CX, CX // CX = 0, so the CX >= 0x10 test in sealAVX2SealHash fails on first entry from here
8291  
8292  sealAVX2SealHash: // Loop: while CX >= 16, absorb 16 bytes at (DI) into the accumulator R12:R11:R10, then go to sealAVX2ShortSealLoop.
8293  	// itr1 (register CX in this generated code) holds the number of bytes encrypted but not yet hashed
8294  	CMPQ  CX, $0x10
8295  	JB    sealAVX2ShortSealLoop // fewer than 16 pending bytes: leave the hash loop
8296  	ADDQ  (DI), R10 // accumulator += 16 bytes at (DI) ...
8297  	ADCQ  8(DI), R11
8298  	ADCQ  $0x01, R12 // ... plus 1 in the third limb (with carry)
8299  	MOVQ  (BP), AX // (BP) and 8(BP) are the two 64-bit words of the clamped poly key stored earlier
8300  	MOVQ  AX, R15
8301  	MULQ  R10 // DX:AX = (BP) * R10
8302  	MOVQ  AX, R13 // R13 = product limb 0
8303  	MOVQ  DX, R14
8304  	MOVQ  (BP), AX
8305  	MULQ  R11 // DX:AX = (BP) * R11
8306  	IMULQ R12, R15 // R15 = (BP) * R12 (low 64 bits only)
8307  	ADDQ  AX, R14
8308  	ADCQ  DX, R15
8309  	MOVQ  8(BP), AX
8310  	MOVQ  AX, R8
8311  	MULQ  R10 // DX:AX = 8(BP) * R10
8312  	ADDQ  AX, R14 // R14 = product limb 1
8313  	ADCQ  $0x00, DX
8314  	MOVQ  DX, R10 // R10 is free now; reuse it to hold the high half until it is added into R15
8315  	MOVQ  8(BP), AX
8316  	MULQ  R11 // DX:AX = 8(BP) * R11
8317  	ADDQ  AX, R15
8318  	ADCQ  $0x00, DX
8319  	IMULQ R12, R8 // R8 = 8(BP) * R12 (low 64 bits only)
8320  	ADDQ  R10, R15
8321  	ADCQ  DX, R8 // product is now in R8:R15:R14:R13
8322  	MOVQ  R13, R10 // reduction: new accumulator = low 130 bits of the product ...
8323  	MOVQ  R14, R11
8324  	MOVQ  R15, R12
8325  	ANDQ  $0x03, R12 // ... third limb keeps only its low 2 bits
8326  	MOVQ  R15, R13
8327  	ANDQ  $-4, R13 // R14:R13 = R8:R15 with the low 2 bits cleared (= 4*c, where c = R8:R15 >> 2)
8328  	MOVQ  R8, R14
8329  	SHRQ  $0x02, R8, R15 // double-register shift: R15 = low 64 bits of (R8:R15 >> 2)
8330  	SHRQ  $0x02, R8 // R8:R15 = c
8331  	ADDQ  R13, R10 // accumulator += 4*c
8332  	ADCQ  R14, R11
8333  	ADCQ  $0x00, R12
8334  	ADDQ  R15, R10 // accumulator += c (5*c folded in total; presumably reduction mod 2^130-5 as in Poly1305 - confirm against the generator)
8335  	ADCQ  R8, R11
8336  	ADCQ  $0x00, R12
8337  	SUBQ  $0x10, CX // 16 fewer bytes pending
8338  	ADDQ  $0x10, DI // advance past the block just hashed
8339  	JMP   sealAVX2SealHash
8340  
8341  sealAVX2ShortSealLoop:
8342  	CMPQ BX, $0x20
8343  	JB   sealAVX2ShortTail32
8344  	SUBQ $0x20, BX
8345  
8346  	// Load for encryption
8347  	VPXOR   (SI), Y0, Y0
8348  	VMOVDQU Y0, (DI)
8349  	LEAQ    32(SI), SI
8350  
8351  	// Now can hash
8352  	ADDQ  (DI), R10
8353  	ADCQ  8(DI), R11
8354  	ADCQ  $0x01, R12
8355  	MOVQ  (BP), DX
8356  	MOVQ  DX, R15
8357  	MULXQ R10, R13, R14
8358  	IMULQ R12, R15
8359  	MULXQ R11, AX, DX
8360  	ADDQ  AX, R14
8361  	ADCQ  DX, R15
8362  	MOVQ  8(BP), DX
8363  	MULXQ R10, R10, AX
8364  	ADDQ  R10, R14
8365  	MULXQ R11, R11, R8
8366  	ADCQ  R11, R15
8367  	ADCQ  $0x00, R8
8368  	IMULQ R12, DX
8369  	ADDQ  AX, R15
8370  	ADCQ  DX, R8
8371  	MOVQ  R13, R10
8372  	MOVQ  R14, R11
8373  	MOVQ  R15, R12
8374  	ANDQ  $0x03, R12
8375  	MOVQ  R15, R13
8376  	ANDQ  $-4, R13
8377  	MOVQ  R8, R14
8378  	SHRQ  $0x02, R8, R15
8379  	SHRQ  $0x02, R8
8380  	ADDQ  R13, R10
8381  	ADCQ  R14, R11
8382  	ADCQ  $0x00, R12
8383  	ADDQ  R15, R10
8384  	ADCQ  R8, R11
8385  	ADCQ  $0x00, R12
8386  	ADDQ  16(DI), R10
8387  	ADCQ  24(DI), R11
8388  	ADCQ  $0x01, R12
8389  	MOVQ  (BP), DX
8390  	MOVQ  DX, R15
8391  	MULXQ R10, R13, R14
8392  	IMULQ R12, R15
8393  	MULXQ R11, AX, DX
8394  	ADDQ  AX, R14
8395  	ADCQ  DX, R15
8396  	MOVQ  8(BP), DX
8397  	MULXQ R10, R10, AX
8398  	ADDQ  R10, R14
8399  	MULXQ R11, R11, R8
8400  	ADCQ  R11, R15
8401  	ADCQ  $0x00, R8
8402  	IMULQ R12, DX
8403  	ADDQ  AX, R15
8404  	ADCQ  DX, R8
8405  	MOVQ  R13, R10
8406  	MOVQ  R14, R11
8407  	MOVQ  R15, R12
8408  	ANDQ  $0x03, R12
8409  	MOVQ  R15, R13
8410  	ANDQ  $-4, R13
8411  	MOVQ  R8, R14
8412  	SHRQ  $0x02, R8, R15
8413  	SHRQ  $0x02, R8
8414  	ADDQ  R13, R10
8415  	ADCQ  R14, R11
8416  	ADCQ  $0x00, R12
8417  	ADDQ  R15, R10
8418  	ADCQ  R8, R11
8419  	ADCQ  $0x00, R12
8420  	LEAQ  32(DI), DI
8421  
8422  	// Shift stream left
8423  	VMOVDQA Y14, Y0
8424  	VMOVDQA Y12, Y14
8425  	VMOVDQA Y4, Y12
8426  	VMOVDQA Y5, Y4
8427  	VMOVDQA Y9, Y5
8428  	VMOVDQA Y13, Y9
8429  	VMOVDQA Y1, Y13
8430  	VMOVDQA Y6, Y1
8431  	VMOVDQA Y10, Y6
8432  	JMP     sealAVX2ShortSealLoop
8433  
8434  sealAVX2ShortTail32:
8435  	CMPQ    BX, $0x10
8436  	VMOVDQA X0, X1
8437  	JB      sealAVX2ShortDone
8438  	SUBQ    $0x10, BX
8439  
8440  	// Load for encryption
8441  	VPXOR   (SI), X0, X12
8442  	VMOVDQU X12, (DI)
8443  	LEAQ    16(SI), SI
8444  
8445  	// Hash
8446  	ADDQ       (DI), R10
8447  	ADCQ       8(DI), R11
8448  	ADCQ       $0x01, R12
8449  	MOVQ       (BP), DX
8450  	MOVQ       DX, R15
8451  	MULXQ      R10, R13, R14
8452  	IMULQ      R12, R15
8453  	MULXQ      R11, AX, DX
8454  	ADDQ       AX, R14
8455  	ADCQ       DX, R15
8456  	MOVQ       8(BP), DX
8457  	MULXQ      R10, R10, AX
8458  	ADDQ       R10, R14
8459  	MULXQ      R11, R11, R8
8460  	ADCQ       R11, R15
8461  	ADCQ       $0x00, R8
8462  	IMULQ      R12, DX
8463  	ADDQ       AX, R15
8464  	ADCQ       DX, R8
8465  	MOVQ       R13, R10
8466  	MOVQ       R14, R11
8467  	MOVQ       R15, R12
8468  	ANDQ       $0x03, R12
8469  	MOVQ       R15, R13
8470  	ANDQ       $-4, R13
8471  	MOVQ       R8, R14
8472  	SHRQ       $0x02, R8, R15
8473  	SHRQ       $0x02, R8
8474  	ADDQ       R13, R10
8475  	ADCQ       R14, R11
8476  	ADCQ       $0x00, R12
8477  	ADDQ       R15, R10
8478  	ADCQ       R8, R11
8479  	ADCQ       $0x00, R12
8480  	LEAQ       16(DI), DI
8481  	VPERM2I128 $0x11, Y0, Y0, Y0
8482  	VMOVDQA    X0, X1
8483  
8484  sealAVX2ShortDone: // Reached by fall-through from sealAVX2ShortTail32, or by its JB when BX < 0x10.
8485  	VZEROUPPER // zero the upper 128 bits of all YMM registers before leaving the AVX2 code
8486  	JMP sealSSETail // continue at sealSSETail (defined outside this excerpt)
8487  
8488  seal320AVX2: // Setup for sealAVX2320InnerCipherLoop: three register sets (Y0,Y14,Y12,Y4), (Y5,Y9,Y13,Y1), (Y6,Y10,Y8,Y2) plus saved starting rows.
8489  	VMOVDQA Y0, Y5 // second set: copy of the first ...
8490  	VMOVDQA Y14, Y9
8491  	VMOVDQA Y12, Y13
8492  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1 // ... with last row Y1 = Y4 + avx2IncMask
8493  	VMOVDQA Y0, Y6 // third set: copy of the first ...
8494  	VMOVDQA Y14, Y10
8495  	VMOVDQA Y12, Y8
8496  	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2 // ... with last row Y2 = Y1 + avx2IncMask
8497  	VMOVDQA Y14, Y7 // saved copy; added back to Y14, Y9, Y10 after the loop
8498  	VMOVDQA Y12, Y11 // saved copy; added back to Y12, Y13, Y8 after the loop
8499  	VMOVDQA Y4, Y15 // saved copy; after the loop added to Y4, then bumped by avx2IncMask for Y1 and again for Y2
8500  	MOVQ    $0x0000000a, R9 // R9 = 10: iteration count, decremented at the bottom of sealAVX2320InnerCipherLoop
8501  
8502  sealAVX2320InnerCipherLoop:
8503  	VPADDD   Y14, Y0, Y0
8504  	VPXOR    Y0, Y4, Y4
8505  	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
8506  	VPADDD   Y4, Y12, Y12
8507  	VPXOR    Y12, Y14, Y14
8508  	VPSLLD   $0x0c, Y14, Y3
8509  	VPSRLD   $0x14, Y14, Y14
8510  	VPXOR    Y3, Y14, Y14
8511  	VPADDD   Y14, Y0, Y0
8512  	VPXOR    Y0, Y4, Y4
8513  	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
8514  	VPADDD   Y4, Y12, Y12
8515  	VPXOR    Y12, Y14, Y14
8516  	VPSLLD   $0x07, Y14, Y3
8517  	VPSRLD   $0x19, Y14, Y14
8518  	VPXOR    Y3, Y14, Y14
8519  	VPADDD   Y9, Y5, Y5
8520  	VPXOR    Y5, Y1, Y1
8521  	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
8522  	VPADDD   Y1, Y13, Y13
8523  	VPXOR    Y13, Y9, Y9
8524  	VPSLLD   $0x0c, Y9, Y3
8525  	VPSRLD   $0x14, Y9, Y9
8526  	VPXOR    Y3, Y9, Y9
8527  	VPADDD   Y9, Y5, Y5
8528  	VPXOR    Y5, Y1, Y1
8529  	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
8530  	VPADDD   Y1, Y13, Y13
8531  	VPXOR    Y13, Y9, Y9
8532  	VPSLLD   $0x07, Y9, Y3
8533  	VPSRLD   $0x19, Y9, Y9
8534  	VPXOR    Y3, Y9, Y9
8535  	VPADDD   Y10, Y6, Y6
8536  	VPXOR    Y6, Y2, Y2
8537  	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
8538  	VPADDD   Y2, Y8, Y8
8539  	VPXOR    Y8, Y10, Y10
8540  	VPSLLD   $0x0c, Y10, Y3
8541  	VPSRLD   $0x14, Y10, Y10
8542  	VPXOR    Y3, Y10, Y10
8543  	VPADDD   Y10, Y6, Y6
8544  	VPXOR    Y6, Y2, Y2
8545  	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
8546  	VPADDD   Y2, Y8, Y8
8547  	VPXOR    Y8, Y10, Y10
8548  	VPSLLD   $0x07, Y10, Y3
8549  	VPSRLD   $0x19, Y10, Y10
8550  	VPXOR    Y3, Y10, Y10
8551  	VPALIGNR $0x04, Y14, Y14, Y14
8552  	VPALIGNR $0x04, Y9, Y9, Y9
8553  	VPALIGNR $0x04, Y10, Y10, Y10
8554  	VPALIGNR $0x08, Y12, Y12, Y12
8555  	VPALIGNR $0x08, Y13, Y13, Y13
8556  	VPALIGNR $0x08, Y8, Y8, Y8
8557  	VPALIGNR $0x0c, Y4, Y4, Y4
8558  	VPALIGNR $0x0c, Y1, Y1, Y1
8559  	VPALIGNR $0x0c, Y2, Y2, Y2
8560  	VPADDD   Y14, Y0, Y0
8561  	VPXOR    Y0, Y4, Y4
8562  	VPSHUFB  ·rol16<>+0(SB), Y4, Y4
8563  	VPADDD   Y4, Y12, Y12
8564  	VPXOR    Y12, Y14, Y14
8565  	VPSLLD   $0x0c, Y14, Y3
8566  	VPSRLD   $0x14, Y14, Y14
8567  	VPXOR    Y3, Y14, Y14
8568  	VPADDD   Y14, Y0, Y0
8569  	VPXOR    Y0, Y4, Y4
8570  	VPSHUFB  ·rol8<>+0(SB), Y4, Y4
8571  	VPADDD   Y4, Y12, Y12
8572  	VPXOR    Y12, Y14, Y14
8573  	VPSLLD   $0x07, Y14, Y3
8574  	VPSRLD   $0x19, Y14, Y14
8575  	VPXOR    Y3, Y14, Y14
8576  	VPADDD   Y9, Y5, Y5
8577  	VPXOR    Y5, Y1, Y1
8578  	VPSHUFB  ·rol16<>+0(SB), Y1, Y1
8579  	VPADDD   Y1, Y13, Y13
8580  	VPXOR    Y13, Y9, Y9
8581  	VPSLLD   $0x0c, Y9, Y3
8582  	VPSRLD   $0x14, Y9, Y9
8583  	VPXOR    Y3, Y9, Y9
8584  	VPADDD   Y9, Y5, Y5
8585  	VPXOR    Y5, Y1, Y1
8586  	VPSHUFB  ·rol8<>+0(SB), Y1, Y1
8587  	VPADDD   Y1, Y13, Y13
8588  	VPXOR    Y13, Y9, Y9
8589  	VPSLLD   $0x07, Y9, Y3
8590  	VPSRLD   $0x19, Y9, Y9
8591  	VPXOR    Y3, Y9, Y9
8592  	VPADDD   Y10, Y6, Y6
8593  	VPXOR    Y6, Y2, Y2
8594  	VPSHUFB  ·rol16<>+0(SB), Y2, Y2
8595  	VPADDD   Y2, Y8, Y8
8596  	VPXOR    Y8, Y10, Y10
8597  	VPSLLD   $0x0c, Y10, Y3
8598  	VPSRLD   $0x14, Y10, Y10
8599  	VPXOR    Y3, Y10, Y10
8600  	VPADDD   Y10, Y6, Y6
8601  	VPXOR    Y6, Y2, Y2
8602  	VPSHUFB  ·rol8<>+0(SB), Y2, Y2
8603  	VPADDD   Y2, Y8, Y8
8604  	VPXOR    Y8, Y10, Y10
8605  	VPSLLD   $0x07, Y10, Y3
8606  	VPSRLD   $0x19, Y10, Y10
8607  	VPXOR    Y3, Y10, Y10
8608  	VPALIGNR $0x0c, Y14, Y14, Y14
8609  	VPALIGNR $0x0c, Y9, Y9, Y9
8610  	VPALIGNR $0x0c, Y10, Y10, Y10
8611  	VPALIGNR $0x08, Y12, Y12, Y12
8612  	VPALIGNR $0x08, Y13, Y13, Y13
8613  	VPALIGNR $0x08, Y8, Y8, Y8
8614  	VPALIGNR $0x04, Y4, Y4, Y4
8615  	VPALIGNR $0x04, Y1, Y1, Y1
8616  	VPALIGNR $0x04, Y2, Y2, Y2
8617  	DECQ     R9
8618  	JNE      sealAVX2320InnerCipherLoop
8619  	VMOVDQA  ·chacha20Constants<>+0(SB), Y3
8620  	VPADDD   Y3, Y0, Y0
8621  	VPADDD   Y3, Y5, Y5
8622  	VPADDD   Y3, Y6, Y6
8623  	VPADDD   Y7, Y14, Y14
8624  	VPADDD   Y7, Y9, Y9
8625  	VPADDD   Y7, Y10, Y10
8626  	VPADDD   Y11, Y12, Y12
8627  	VPADDD   Y11, Y13, Y13
8628  	VPADDD   Y11, Y8, Y8
8629  	VMOVDQA  ·avx2IncMask<>+0(SB), Y3
8630  	VPADDD   Y15, Y4, Y4
8631  	VPADDD   Y3, Y15, Y15
8632  	VPADDD   Y15, Y1, Y1
8633  	VPADDD   Y3, Y15, Y15
8634  	VPADDD   Y15, Y2, Y2
8635  
8636  	// Clamp and store poly key
8637  	VPERM2I128 $0x02, Y0, Y14, Y3
8638  	VPAND      ·polyClampMask<>+0(SB), Y3, Y3
8639  	VMOVDQA    Y3, (BP)
8640  
8641  	// Stream for up to 320 bytes
8642  	VPERM2I128 $0x13, Y0, Y14, Y0
8643  	VPERM2I128 $0x13, Y12, Y4, Y14
8644  	VPERM2I128 $0x02, Y5, Y9, Y12
8645  	VPERM2I128 $0x02, Y13, Y1, Y4
8646  	VPERM2I128 $0x13, Y5, Y9, Y5
8647  	VPERM2I128 $0x13, Y13, Y1, Y9
8648  	VPERM2I128 $0x02, Y6, Y10, Y13
8649  	VPERM2I128 $0x02, Y8, Y2, Y1
8650  	VPERM2I128 $0x13, Y6, Y10, Y6
8651  	VPERM2I128 $0x13, Y8, Y2, Y10
8652  	JMP        sealAVX2ShortSeal
8653  
8654  sealAVX2Tail128: // Tail setup, taken when BX <= 0x80: load one register set (Y0,Y14,Y12,Y4) for the LoopA/LoopB rounds below.
8655  	VMOVDQA ·chacha20Constants<>+0(SB), Y0
8656  	VMOVDQA 32(BP), Y14 // 32(BP) and 64(BP) are added back to Y14 and Y12 after the rounds
8657  	VMOVDQA 64(BP), Y12
8658  	VMOVDQA 192(BP), Y4 // last row comes from the 192(BP) slot ...
8659  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4 // ... advanced by avx2IncMask
8660  	VMOVDQA Y4, Y1 // saved copy of the starting Y4; added back after the rounds (VPADDD Y1, Y4, Y1)
8661  
8662  sealAVX2Tail128LoopA:
8663  	ADDQ  (DI), R10
8664  	ADCQ  8(DI), R11
8665  	ADCQ  $0x01, R12
8666  	MOVQ  (BP), AX
8667  	MOVQ  AX, R15
8668  	MULQ  R10
8669  	MOVQ  AX, R13
8670  	MOVQ  DX, R14
8671  	MOVQ  (BP), AX
8672  	MULQ  R11
8673  	IMULQ R12, R15
8674  	ADDQ  AX, R14
8675  	ADCQ  DX, R15
8676  	MOVQ  8(BP), AX
8677  	MOVQ  AX, R8
8678  	MULQ  R10
8679  	ADDQ  AX, R14
8680  	ADCQ  $0x00, DX
8681  	MOVQ  DX, R10
8682  	MOVQ  8(BP), AX
8683  	MULQ  R11
8684  	ADDQ  AX, R15
8685  	ADCQ  $0x00, DX
8686  	IMULQ R12, R8
8687  	ADDQ  R10, R15
8688  	ADCQ  DX, R8
8689  	MOVQ  R13, R10
8690  	MOVQ  R14, R11
8691  	MOVQ  R15, R12
8692  	ANDQ  $0x03, R12
8693  	MOVQ  R15, R13
8694  	ANDQ  $-4, R13
8695  	MOVQ  R8, R14
8696  	SHRQ  $0x02, R8, R15
8697  	SHRQ  $0x02, R8
8698  	ADDQ  R13, R10
8699  	ADCQ  R14, R11
8700  	ADCQ  $0x00, R12
8701  	ADDQ  R15, R10
8702  	ADCQ  R8, R11
8703  	ADCQ  $0x00, R12
8704  	LEAQ  16(DI), DI
8705  
8706  sealAVX2Tail128LoopB:
8707  	VPADDD     Y14, Y0, Y0
8708  	VPXOR      Y0, Y4, Y4
8709  	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
8710  	VPADDD     Y4, Y12, Y12
8711  	VPXOR      Y12, Y14, Y14
8712  	VPSLLD     $0x0c, Y14, Y3
8713  	VPSRLD     $0x14, Y14, Y14
8714  	VPXOR      Y3, Y14, Y14
8715  	VPADDD     Y14, Y0, Y0
8716  	VPXOR      Y0, Y4, Y4
8717  	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
8718  	VPADDD     Y4, Y12, Y12
8719  	VPXOR      Y12, Y14, Y14
8720  	VPSLLD     $0x07, Y14, Y3
8721  	VPSRLD     $0x19, Y14, Y14
8722  	VPXOR      Y3, Y14, Y14
8723  	ADDQ       (DI), R10
8724  	ADCQ       8(DI), R11
8725  	ADCQ       $0x01, R12
8726  	MOVQ       (BP), AX
8727  	MOVQ       AX, R15
8728  	MULQ       R10
8729  	MOVQ       AX, R13
8730  	MOVQ       DX, R14
8731  	MOVQ       (BP), AX
8732  	MULQ       R11
8733  	IMULQ      R12, R15
8734  	ADDQ       AX, R14
8735  	ADCQ       DX, R15
8736  	MOVQ       8(BP), AX
8737  	MOVQ       AX, R8
8738  	MULQ       R10
8739  	ADDQ       AX, R14
8740  	ADCQ       $0x00, DX
8741  	MOVQ       DX, R10
8742  	MOVQ       8(BP), AX
8743  	MULQ       R11
8744  	ADDQ       AX, R15
8745  	ADCQ       $0x00, DX
8746  	IMULQ      R12, R8
8747  	ADDQ       R10, R15
8748  	ADCQ       DX, R8
8749  	MOVQ       R13, R10
8750  	MOVQ       R14, R11
8751  	MOVQ       R15, R12
8752  	ANDQ       $0x03, R12
8753  	MOVQ       R15, R13
8754  	ANDQ       $-4, R13
8755  	MOVQ       R8, R14
8756  	SHRQ       $0x02, R8, R15
8757  	SHRQ       $0x02, R8
8758  	ADDQ       R13, R10
8759  	ADCQ       R14, R11
8760  	ADCQ       $0x00, R12
8761  	ADDQ       R15, R10
8762  	ADCQ       R8, R11
8763  	ADCQ       $0x00, R12
8764  	VPALIGNR   $0x04, Y14, Y14, Y14
8765  	VPALIGNR   $0x08, Y12, Y12, Y12
8766  	VPALIGNR   $0x0c, Y4, Y4, Y4
8767  	VPADDD     Y14, Y0, Y0
8768  	VPXOR      Y0, Y4, Y4
8769  	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
8770  	VPADDD     Y4, Y12, Y12
8771  	VPXOR      Y12, Y14, Y14
8772  	VPSLLD     $0x0c, Y14, Y3
8773  	VPSRLD     $0x14, Y14, Y14
8774  	VPXOR      Y3, Y14, Y14
8775  	VPADDD     Y14, Y0, Y0
8776  	VPXOR      Y0, Y4, Y4
8777  	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
8778  	VPADDD     Y4, Y12, Y12
8779  	VPXOR      Y12, Y14, Y14
8780  	VPSLLD     $0x07, Y14, Y3
8781  	VPSRLD     $0x19, Y14, Y14
8782  	VPXOR      Y3, Y14, Y14
8783  	ADDQ       16(DI), R10
8784  	ADCQ       24(DI), R11
8785  	ADCQ       $0x01, R12
8786  	MOVQ       (BP), AX
8787  	MOVQ       AX, R15
8788  	MULQ       R10
8789  	MOVQ       AX, R13
8790  	MOVQ       DX, R14
8791  	MOVQ       (BP), AX
8792  	MULQ       R11
8793  	IMULQ      R12, R15
8794  	ADDQ       AX, R14
8795  	ADCQ       DX, R15
8796  	MOVQ       8(BP), AX
8797  	MOVQ       AX, R8
8798  	MULQ       R10
8799  	ADDQ       AX, R14
8800  	ADCQ       $0x00, DX
8801  	MOVQ       DX, R10
8802  	MOVQ       8(BP), AX
8803  	MULQ       R11
8804  	ADDQ       AX, R15
8805  	ADCQ       $0x00, DX
8806  	IMULQ      R12, R8
8807  	ADDQ       R10, R15
8808  	ADCQ       DX, R8
8809  	MOVQ       R13, R10
8810  	MOVQ       R14, R11
8811  	MOVQ       R15, R12
8812  	ANDQ       $0x03, R12
8813  	MOVQ       R15, R13
8814  	ANDQ       $-4, R13
8815  	MOVQ       R8, R14
8816  	SHRQ       $0x02, R8, R15
8817  	SHRQ       $0x02, R8
8818  	ADDQ       R13, R10
8819  	ADCQ       R14, R11
8820  	ADCQ       $0x00, R12
8821  	ADDQ       R15, R10
8822  	ADCQ       R8, R11
8823  	ADCQ       $0x00, R12
8824  	LEAQ       32(DI), DI
8825  	VPALIGNR   $0x0c, Y14, Y14, Y14
8826  	VPALIGNR   $0x08, Y12, Y12, Y12
8827  	VPALIGNR   $0x04, Y4, Y4, Y4
8828  	DECQ       CX
8829  	JG         sealAVX2Tail128LoopA
8830  	DECQ       R9
8831  	JGE        sealAVX2Tail128LoopB
8832  	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y5
8833  	VPADDD     32(BP), Y14, Y9
8834  	VPADDD     64(BP), Y12, Y13
8835  	VPADDD     Y1, Y4, Y1
8836  	VPERM2I128 $0x02, Y5, Y9, Y0
8837  	VPERM2I128 $0x02, Y13, Y1, Y14
8838  	VPERM2I128 $0x13, Y5, Y9, Y12
8839  	VPERM2I128 $0x13, Y13, Y1, Y4
8840  	JMP        sealAVX2ShortSealLoop
8841  
8842  sealAVX2Tail256: // Tail setup, taken when 0x80 < BX <= 0x100: load two register sets, (Y0,Y14,Y12,Y4) and (Y5,Y9,Y13,Y1).
8843  	VMOVDQA ·chacha20Constants<>+0(SB), Y0
8844  	VMOVDQA ·chacha20Constants<>+0(SB), Y5
8845  	VMOVDQA 32(BP), Y14
8846  	VMOVDQA 32(BP), Y9
8847  	VMOVDQA 64(BP), Y12
8848  	VMOVDQA 64(BP), Y13
8849  	VMOVDQA 192(BP), Y4 // last row comes from the 192(BP) slot ...
8850  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4 // ... first set uses slot + avx2IncMask
8851  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1 // ... second set uses slot + 2*avx2IncMask
8852  	VMOVDQA Y4, Y7 // saved copy of the starting Y4 (NOTE(review): presumably added back after the rounds, as in sealAVX2Tail128 - that code is past this excerpt)
8853  	VMOVDQA Y1, Y11 // saved copy of the starting Y1 (same caveat)
8854  
8855  sealAVX2Tail256LoopA:
8856  	ADDQ  (DI), R10
8857  	ADCQ  8(DI), R11
8858  	ADCQ  $0x01, R12
8859  	MOVQ  (BP), AX
8860  	MOVQ  AX, R15
8861  	MULQ  R10
8862  	MOVQ  AX, R13
8863  	MOVQ  DX, R14
8864  	MOVQ  (BP), AX
8865  	MULQ  R11
8866  	IMULQ R12, R15
8867  	ADDQ  AX, R14
8868  	ADCQ  DX, R15
8869  	MOVQ  8(BP), AX
8870  	MOVQ  AX, R8
8871  	MULQ  R10
8872  	ADDQ  AX, R14
8873  	ADCQ  $0x00, DX
8874  	MOVQ  DX, R10
8875  	MOVQ  8(BP), AX
8876  	MULQ  R11
8877  	ADDQ  AX, R15
8878  	ADCQ  $0x00, DX
8879  	IMULQ R12, R8
8880  	ADDQ  R10, R15
8881  	ADCQ  DX, R8
8882  	MOVQ  R13, R10
8883  	MOVQ  R14, R11
8884  	MOVQ  R15, R12
8885  	ANDQ  $0x03, R12
8886  	MOVQ  R15, R13
8887  	ANDQ  $-4, R13
8888  	MOVQ  R8, R14
8889  	SHRQ  $0x02, R8, R15
8890  	SHRQ  $0x02, R8
8891  	ADDQ  R13, R10
8892  	ADCQ  R14, R11
8893  	ADCQ  $0x00, R12
8894  	ADDQ  R15, R10
8895  	ADCQ  R8, R11
8896  	ADCQ  $0x00, R12
8897  	LEAQ  16(DI), DI
8898  
// sealAVX2Tail256LoopB: one ChaCha20 double round over two YMM state sets
// (Y0/Y14/Y12/Y4 and Y5/Y9/Y13/Y1), interleaved with two 16-byte Poly1305
// block updates read from (DI) and 16(DI). Y3 is the scratch register for the
// shift-based rotates. The Poly1305 accumulator lives in R10 (low), R11, R12.
// NOTE(review): CX and R9 (loop counters), Y7/Y11 (saved counter rows) and the
// contents of (BP)..64(BP) are set up before this label, outside this excerpt.
8899  sealAVX2Tail256LoopB:
	// Column round, first state set: add, xor, rotate left by 16, 12, 8, 7.
	// The 16- and 8-bit rotates use byte-shuffle tables; 12 and 7 use
	// shift-left / shift-right / xor with Y3 as the temporary.
8900  	VPADDD     Y14, Y0, Y0
8901  	VPXOR      Y0, Y4, Y4
8902  	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
8903  	VPADDD     Y4, Y12, Y12
8904  	VPXOR      Y12, Y14, Y14
8905  	VPSLLD     $0x0c, Y14, Y3
8906  	VPSRLD     $0x14, Y14, Y14
8907  	VPXOR      Y3, Y14, Y14
8908  	VPADDD     Y14, Y0, Y0
8909  	VPXOR      Y0, Y4, Y4
8910  	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
8911  	VPADDD     Y4, Y12, Y12
8912  	VPXOR      Y12, Y14, Y14
8913  	VPSLLD     $0x07, Y14, Y3
8914  	VPSRLD     $0x19, Y14, Y14
8915  	VPXOR      Y3, Y14, Y14
	// Column round, second state set.
8916  	VPADDD     Y9, Y5, Y5
8917  	VPXOR      Y5, Y1, Y1
8918  	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
8919  	VPADDD     Y1, Y13, Y13
8920  	VPXOR      Y13, Y9, Y9
8921  	VPSLLD     $0x0c, Y9, Y3
8922  	VPSRLD     $0x14, Y9, Y9
8923  	VPXOR      Y3, Y9, Y9
8924  	VPADDD     Y9, Y5, Y5
8925  	VPXOR      Y5, Y1, Y1
8926  	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
8927  	VPADDD     Y1, Y13, Y13
8928  	VPXOR      Y13, Y9, Y9
8929  	VPSLLD     $0x07, Y9, Y3
8930  	VPSRLD     $0x19, Y9, Y9
8931  	VPXOR      Y3, Y9, Y9
	// Poly1305 block 1: add the 16 bytes at (DI) into the accumulator and
	// add 1 (plus carry) to R12, i.e. set bit 128 of the block.
8932  	ADDQ       (DI), R10
8933  	ADCQ       8(DI), R11
8934  	ADCQ       $0x01, R12
	// Multiply the accumulator by the two 64-bit words at (BP) and 8(BP)
	// (presumably the Poly1305 key r - confirm against the prologue).
	// The product is collected in R13 (low), R14, R15, R8 (high).
8935  	MOVQ       (BP), AX
8936  	MOVQ       AX, R15
8937  	MULQ       R10
8938  	MOVQ       AX, R13
8939  	MOVQ       DX, R14
8940  	MOVQ       (BP), AX
8941  	MULQ       R11
8942  	IMULQ      R12, R15
8943  	ADDQ       AX, R14
8944  	ADCQ       DX, R15
8945  	MOVQ       8(BP), AX
8946  	MOVQ       AX, R8
8947  	MULQ       R10
8948  	ADDQ       AX, R14
8949  	ADCQ       $0x00, DX
8950  	MOVQ       DX, R10
8951  	MOVQ       8(BP), AX
8952  	MULQ       R11
8953  	ADDQ       AX, R15
8954  	ADCQ       $0x00, DX
8955  	IMULQ      R12, R8
8956  	ADDQ       R10, R15
8957  	ADCQ       DX, R8
	// Reduce: keep the low 130 bits in R10:R11:R12, then add the bits above
	// them twice - once with the low two bits masked off (4*c) and once
	// shifted right by two (c) - which folds 5*c back in (mod 2^130 - 5).
8958  	MOVQ       R13, R10
8959  	MOVQ       R14, R11
8960  	MOVQ       R15, R12
8961  	ANDQ       $0x03, R12
8962  	MOVQ       R15, R13
8963  	ANDQ       $-4, R13
8964  	MOVQ       R8, R14
8965  	SHRQ       $0x02, R8, R15
8966  	SHRQ       $0x02, R8
8967  	ADDQ       R13, R10
8968  	ADCQ       R14, R11
8969  	ADCQ       $0x00, R12
8970  	ADDQ       R15, R10
8971  	ADCQ       R8, R11
8972  	ADCQ       $0x00, R12
	// Rotate rows 1, 2, 3 by 1, 2, 3 dwords within each 128-bit lane so the
	// next quarter-rounds operate on the diagonals.
8973  	VPALIGNR   $0x04, Y14, Y14, Y14
8974  	VPALIGNR   $0x04, Y9, Y9, Y9
8975  	VPALIGNR   $0x08, Y12, Y12, Y12
8976  	VPALIGNR   $0x08, Y13, Y13, Y13
8977  	VPALIGNR   $0x0c, Y4, Y4, Y4
8978  	VPALIGNR   $0x0c, Y1, Y1, Y1
	// Diagonal round, first state set.
8979  	VPADDD     Y14, Y0, Y0
8980  	VPXOR      Y0, Y4, Y4
8981  	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
8982  	VPADDD     Y4, Y12, Y12
8983  	VPXOR      Y12, Y14, Y14
8984  	VPSLLD     $0x0c, Y14, Y3
8985  	VPSRLD     $0x14, Y14, Y14
8986  	VPXOR      Y3, Y14, Y14
8987  	VPADDD     Y14, Y0, Y0
8988  	VPXOR      Y0, Y4, Y4
8989  	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
8990  	VPADDD     Y4, Y12, Y12
8991  	VPXOR      Y12, Y14, Y14
8992  	VPSLLD     $0x07, Y14, Y3
8993  	VPSRLD     $0x19, Y14, Y14
8994  	VPXOR      Y3, Y14, Y14
	// Diagonal round, second state set.
8995  	VPADDD     Y9, Y5, Y5
8996  	VPXOR      Y5, Y1, Y1
8997  	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
8998  	VPADDD     Y1, Y13, Y13
8999  	VPXOR      Y13, Y9, Y9
9000  	VPSLLD     $0x0c, Y9, Y3
9001  	VPSRLD     $0x14, Y9, Y9
9002  	VPXOR      Y3, Y9, Y9
9003  	VPADDD     Y9, Y5, Y5
9004  	VPXOR      Y5, Y1, Y1
9005  	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
9006  	VPADDD     Y1, Y13, Y13
9007  	VPXOR      Y13, Y9, Y9
9008  	VPSLLD     $0x07, Y9, Y3
9009  	VPSRLD     $0x19, Y9, Y9
9010  	VPXOR      Y3, Y9, Y9
	// Poly1305 block 2: same absorb / multiply / reduce sequence as above,
	// applied to the 16 bytes at 16(DI).
9011  	ADDQ       16(DI), R10
9012  	ADCQ       24(DI), R11
9013  	ADCQ       $0x01, R12
9014  	MOVQ       (BP), AX
9015  	MOVQ       AX, R15
9016  	MULQ       R10
9017  	MOVQ       AX, R13
9018  	MOVQ       DX, R14
9019  	MOVQ       (BP), AX
9020  	MULQ       R11
9021  	IMULQ      R12, R15
9022  	ADDQ       AX, R14
9023  	ADCQ       DX, R15
9024  	MOVQ       8(BP), AX
9025  	MOVQ       AX, R8
9026  	MULQ       R10
9027  	ADDQ       AX, R14
9028  	ADCQ       $0x00, DX
9029  	MOVQ       DX, R10
9030  	MOVQ       8(BP), AX
9031  	MULQ       R11
9032  	ADDQ       AX, R15
9033  	ADCQ       $0x00, DX
9034  	IMULQ      R12, R8
9035  	ADDQ       R10, R15
9036  	ADCQ       DX, R8
9037  	MOVQ       R13, R10
9038  	MOVQ       R14, R11
9039  	MOVQ       R15, R12
9040  	ANDQ       $0x03, R12
9041  	MOVQ       R15, R13
9042  	ANDQ       $-4, R13
9043  	MOVQ       R8, R14
9044  	SHRQ       $0x02, R8, R15
9045  	SHRQ       $0x02, R8
9046  	ADDQ       R13, R10
9047  	ADCQ       R14, R11
9048  	ADCQ       $0x00, R12
9049  	ADDQ       R15, R10
9050  	ADCQ       R8, R11
9051  	ADCQ       $0x00, R12
	// Both blocks of this pass are hashed; step the hash pointer past them.
9052  	LEAQ       32(DI), DI
	// Undo the diagonal rotation (rows 1, 2, 3 rotated by 3, 2, 1 dwords).
9053  	VPALIGNR   $0x0c, Y14, Y14, Y14
9054  	VPALIGNR   $0x0c, Y9, Y9, Y9
9055  	VPALIGNR   $0x08, Y12, Y12, Y12
9056  	VPALIGNR   $0x08, Y13, Y13, Y13
9057  	VPALIGNR   $0x04, Y4, Y4, Y4
9058  	VPALIGNR   $0x04, Y1, Y1, Y1
	// Loop control: while the decremented CX is still positive, re-enter at
	// LoopA; afterwards keep re-entering LoopB while the decremented R9 is
	// non-negative.
9059  	DECQ       CX
9060  	JG         sealAVX2Tail256LoopA
9061  	DECQ       R9
9062  	JGE        sealAVX2Tail256LoopB
	// Feed-forward: add the initial state rows back into the round output.
	// Y7 and Y11 are presumably the counter rows saved by the setup code
	// that precedes this excerpt - confirm there.
9063  	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
9064  	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
9065  	VPADDD     32(BP), Y14, Y14
9066  	VPADDD     32(BP), Y9, Y9
9067  	VPADDD     64(BP), Y12, Y12
9068  	VPADDD     64(BP), Y13, Y13
9069  	VPADDD     Y7, Y4, Y4
9070  	VPADDD     Y11, Y1, Y1
	// Regroup the first state set into output order: $0x02 gathers the low
	// 128-bit lanes of two rows, $0x13 the high lanes, so each pair of
	// results is one contiguous 64-byte keystream block.
9071  	VPERM2I128 $0x02, Y0, Y14, Y3
9072  	VPERM2I128 $0x02, Y12, Y4, Y7
9073  	VPERM2I128 $0x13, Y0, Y14, Y11
9074  	VPERM2I128 $0x13, Y12, Y4, Y15
	// XOR 128 bytes of input at (SI) with the keystream and store at (DI).
9075  	VPXOR      (SI), Y3, Y3
9076  	VPXOR      32(SI), Y7, Y7
9077  	VPXOR      64(SI), Y11, Y11
9078  	VPXOR      96(SI), Y15, Y15
9079  	VMOVDQU    Y3, (DI)
9080  	VMOVDQU    Y7, 32(DI)
9081  	VMOVDQU    Y11, 64(DI)
9082  	VMOVDQU    Y15, 96(DI)
	// Record 128 in CX, advance the input pointer and shrink the remaining
	// length by the 128 bytes just written.
	// NOTE(review): CX is presumably the count of freshly written bytes that
	// sealAVX2SealHash still has to hash - confirm at that label.
9083  	MOVQ       $0x00000080, CX
9084  	LEAQ       128(SI), SI
9085  	SUBQ       $0x80, BX
	// Leave the second state set, regrouped the same way but not yet XORed,
	// in Y0, Y14, Y12, Y4 for the code at sealAVX2SealHash.
9086  	VPERM2I128 $0x02, Y5, Y9, Y0
9087  	VPERM2I128 $0x02, Y13, Y1, Y14
9088  	VPERM2I128 $0x13, Y5, Y9, Y12
9089  	VPERM2I128 $0x13, Y13, Y1, Y4
9090  	JMP        sealAVX2SealHash
9091  
// sealAVX2Tail384: build three YMM state sets for the 384-byte tail path.
// Each set is four registers (rows 0-3); the loop that follows produces two
// 64-byte keystream blocks per set, one per 128-bit lane.
//   set 1: Y0, Y14, Y12, Y4    set 2: Y5, Y9, Y13, Y1    set 3: Y6, Y10, Y8, Y2
9092  sealAVX2Tail384:
	// Row 0 comes from the constants table; rows 1 and 2 are loaded from
	// 32(BP) and 64(BP) (presumably the key halves - confirm against the
	// prologue). All three sets start with identical rows 0-2.
9093  	VMOVDQA ·chacha20Constants<>+0(SB), Y0
9094  	VMOVDQA Y0, Y5
9095  	VMOVDQA Y0, Y6
9096  	VMOVDQA 32(BP), Y14
9097  	VMOVDQA Y14, Y9
9098  	VMOVDQA Y14, Y10
9099  	VMOVDQA 64(BP), Y12
9100  	VMOVDQA Y12, Y13
9101  	VMOVDQA Y12, Y8
	// Row 3 is the value at 192(BP) with avx2IncMask added once, twice and
	// three times (presumably stepping the block counter in each lane -
	// the mask contents are defined elsewhere in the file).
9102  	VMOVDQA 192(BP), Y4
9103  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
9104  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
9105  	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
	// Keep copies of the three row-3 values; they are added back to the
	// round output after the loop.
9106  	VMOVDQA Y4, Y7
9107  	VMOVDQA Y1, Y11
9108  	VMOVDQA Y2, Y15
9109  
// sealAVX2Tail384LoopA: absorb one extra 16-byte block at (DI) into the
// Poly1305 accumulator R10 (low), R11, R12, advance DI by 16, then fall
// through into the label that follows.
9110  sealAVX2Tail384LoopA:
	// Add the block to the accumulator; the final ADCQ also adds 1 to R12,
	// i.e. sets bit 128 of the block.
9111  	ADDQ  (DI), R10
9112  	ADCQ  8(DI), R11
9113  	ADCQ  $0x01, R12
	// Multiply the accumulator by the two 64-bit words at (BP) and 8(BP)
	// (presumably the Poly1305 key r - confirm against the prologue).
	// The product is collected in R13 (low), R14, R15, R8 (high).
9114  	MOVQ  (BP), AX
9115  	MOVQ  AX, R15
9116  	MULQ  R10
9117  	MOVQ  AX, R13
9118  	MOVQ  DX, R14
9119  	MOVQ  (BP), AX
9120  	MULQ  R11
9121  	IMULQ R12, R15
9122  	ADDQ  AX, R14
9123  	ADCQ  DX, R15
9124  	MOVQ  8(BP), AX
9125  	MOVQ  AX, R8
9126  	MULQ  R10
9127  	ADDQ  AX, R14
9128  	ADCQ  $0x00, DX
9129  	MOVQ  DX, R10
9130  	MOVQ  8(BP), AX
9131  	MULQ  R11
9132  	ADDQ  AX, R15
9133  	ADCQ  $0x00, DX
9134  	IMULQ R12, R8
9135  	ADDQ  R10, R15
9136  	ADCQ  DX, R8
	// Reduce: keep the low 130 bits in R10:R11:R12, then add the bits above
	// them twice - once with the low two bits masked off (4*c) and once
	// shifted right by two (c) - which folds 5*c back in (mod 2^130 - 5).
9137  	MOVQ  R13, R10
9138  	MOVQ  R14, R11
9139  	MOVQ  R15, R12
9140  	ANDQ  $0x03, R12
9141  	MOVQ  R15, R13
9142  	ANDQ  $-4, R13
9143  	MOVQ  R8, R14
9144  	SHRQ  $0x02, R8, R15
9145  	SHRQ  $0x02, R8
9146  	ADDQ  R13, R10
9147  	ADCQ  R14, R11
9148  	ADCQ  $0x00, R12
9149  	ADDQ  R15, R10
9150  	ADCQ  R8, R11
9151  	ADCQ  $0x00, R12
	// Step the hash pointer past the block just absorbed.
9152  	LEAQ  16(DI), DI
9153  
// sealAVX2Tail384LoopB: one ChaCha20 double round over three YMM state sets
// (Y0/Y14/Y12/Y4, Y5/Y9/Y13/Y1, Y6/Y10/Y8/Y2), interleaved with two 16-byte
// Poly1305 block updates read from (DI) and 16(DI). Y3 is the scratch
// register for the shift-based rotates; the Poly1305 accumulator lives in
// R10 (low), R11, R12. Once the CX/R9 counters run out, the code below the
// loop writes 256 bytes of output and hands the third set to
// sealAVX2SealHash.
// NOTE(review): the initial values of CX and R9 are set outside this excerpt.
9154  sealAVX2Tail384LoopB:
	// Column round, set 1: add, xor, rotate left by 16, 12, 8, 7. The 16- and
	// 8-bit rotates use byte-shuffle tables; 12 and 7 use shift/shift/xor.
9155  	VPADDD     Y14, Y0, Y0
9156  	VPXOR      Y0, Y4, Y4
9157  	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
9158  	VPADDD     Y4, Y12, Y12
9159  	VPXOR      Y12, Y14, Y14
9160  	VPSLLD     $0x0c, Y14, Y3
9161  	VPSRLD     $0x14, Y14, Y14
9162  	VPXOR      Y3, Y14, Y14
9163  	VPADDD     Y14, Y0, Y0
9164  	VPXOR      Y0, Y4, Y4
9165  	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
9166  	VPADDD     Y4, Y12, Y12
9167  	VPXOR      Y12, Y14, Y14
9168  	VPSLLD     $0x07, Y14, Y3
9169  	VPSRLD     $0x19, Y14, Y14
9170  	VPXOR      Y3, Y14, Y14
	// Column round, set 2.
9171  	VPADDD     Y9, Y5, Y5
9172  	VPXOR      Y5, Y1, Y1
9173  	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
9174  	VPADDD     Y1, Y13, Y13
9175  	VPXOR      Y13, Y9, Y9
9176  	VPSLLD     $0x0c, Y9, Y3
9177  	VPSRLD     $0x14, Y9, Y9
9178  	VPXOR      Y3, Y9, Y9
9179  	VPADDD     Y9, Y5, Y5
9180  	VPXOR      Y5, Y1, Y1
9181  	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
9182  	VPADDD     Y1, Y13, Y13
9183  	VPXOR      Y13, Y9, Y9
9184  	VPSLLD     $0x07, Y9, Y3
9185  	VPSRLD     $0x19, Y9, Y9
9186  	VPXOR      Y3, Y9, Y9
	// Column round, set 3.
9187  	VPADDD     Y10, Y6, Y6
9188  	VPXOR      Y6, Y2, Y2
9189  	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
9190  	VPADDD     Y2, Y8, Y8
9191  	VPXOR      Y8, Y10, Y10
9192  	VPSLLD     $0x0c, Y10, Y3
9193  	VPSRLD     $0x14, Y10, Y10
9194  	VPXOR      Y3, Y10, Y10
9195  	VPADDD     Y10, Y6, Y6
9196  	VPXOR      Y6, Y2, Y2
9197  	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
9198  	VPADDD     Y2, Y8, Y8
9199  	VPXOR      Y8, Y10, Y10
9200  	VPSLLD     $0x07, Y10, Y3
9201  	VPSRLD     $0x19, Y10, Y10
9202  	VPXOR      Y3, Y10, Y10
	// Poly1305 block 1: add the 16 bytes at (DI) plus bit 128 into the
	// accumulator, multiply by the words at (BP) and 8(BP) (presumably the
	// key r - confirm against the prologue) into R13, R14, R15, R8, then
	// reduce by folding the bits above bit 129 back in as 4*c + c.
9203  	ADDQ       (DI), R10
9204  	ADCQ       8(DI), R11
9205  	ADCQ       $0x01, R12
9206  	MOVQ       (BP), AX
9207  	MOVQ       AX, R15
9208  	MULQ       R10
9209  	MOVQ       AX, R13
9210  	MOVQ       DX, R14
9211  	MOVQ       (BP), AX
9212  	MULQ       R11
9213  	IMULQ      R12, R15
9214  	ADDQ       AX, R14
9215  	ADCQ       DX, R15
9216  	MOVQ       8(BP), AX
9217  	MOVQ       AX, R8
9218  	MULQ       R10
9219  	ADDQ       AX, R14
9220  	ADCQ       $0x00, DX
9221  	MOVQ       DX, R10
9222  	MOVQ       8(BP), AX
9223  	MULQ       R11
9224  	ADDQ       AX, R15
9225  	ADCQ       $0x00, DX
9226  	IMULQ      R12, R8
9227  	ADDQ       R10, R15
9228  	ADCQ       DX, R8
9229  	MOVQ       R13, R10
9230  	MOVQ       R14, R11
9231  	MOVQ       R15, R12
9232  	ANDQ       $0x03, R12
9233  	MOVQ       R15, R13
9234  	ANDQ       $-4, R13
9235  	MOVQ       R8, R14
9236  	SHRQ       $0x02, R8, R15
9237  	SHRQ       $0x02, R8
9238  	ADDQ       R13, R10
9239  	ADCQ       R14, R11
9240  	ADCQ       $0x00, R12
9241  	ADDQ       R15, R10
9242  	ADCQ       R8, R11
9243  	ADCQ       $0x00, R12
	// Rotate rows 1, 2, 3 by 1, 2, 3 dwords within each 128-bit lane so the
	// next quarter-rounds operate on the diagonals.
9244  	VPALIGNR   $0x04, Y14, Y14, Y14
9245  	VPALIGNR   $0x04, Y9, Y9, Y9
9246  	VPALIGNR   $0x04, Y10, Y10, Y10
9247  	VPALIGNR   $0x08, Y12, Y12, Y12
9248  	VPALIGNR   $0x08, Y13, Y13, Y13
9249  	VPALIGNR   $0x08, Y8, Y8, Y8
9250  	VPALIGNR   $0x0c, Y4, Y4, Y4
9251  	VPALIGNR   $0x0c, Y1, Y1, Y1
9252  	VPALIGNR   $0x0c, Y2, Y2, Y2
	// Diagonal round, set 1.
9253  	VPADDD     Y14, Y0, Y0
9254  	VPXOR      Y0, Y4, Y4
9255  	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
9256  	VPADDD     Y4, Y12, Y12
9257  	VPXOR      Y12, Y14, Y14
9258  	VPSLLD     $0x0c, Y14, Y3
9259  	VPSRLD     $0x14, Y14, Y14
9260  	VPXOR      Y3, Y14, Y14
9261  	VPADDD     Y14, Y0, Y0
9262  	VPXOR      Y0, Y4, Y4
9263  	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
9264  	VPADDD     Y4, Y12, Y12
9265  	VPXOR      Y12, Y14, Y14
9266  	VPSLLD     $0x07, Y14, Y3
9267  	VPSRLD     $0x19, Y14, Y14
9268  	VPXOR      Y3, Y14, Y14
	// Diagonal round, set 2.
9269  	VPADDD     Y9, Y5, Y5
9270  	VPXOR      Y5, Y1, Y1
9271  	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
9272  	VPADDD     Y1, Y13, Y13
9273  	VPXOR      Y13, Y9, Y9
9274  	VPSLLD     $0x0c, Y9, Y3
9275  	VPSRLD     $0x14, Y9, Y9
9276  	VPXOR      Y3, Y9, Y9
9277  	VPADDD     Y9, Y5, Y5
9278  	VPXOR      Y5, Y1, Y1
9279  	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
9280  	VPADDD     Y1, Y13, Y13
9281  	VPXOR      Y13, Y9, Y9
9282  	VPSLLD     $0x07, Y9, Y3
9283  	VPSRLD     $0x19, Y9, Y9
9284  	VPXOR      Y3, Y9, Y9
	// Diagonal round, set 3.
9285  	VPADDD     Y10, Y6, Y6
9286  	VPXOR      Y6, Y2, Y2
9287  	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
9288  	VPADDD     Y2, Y8, Y8
9289  	VPXOR      Y8, Y10, Y10
9290  	VPSLLD     $0x0c, Y10, Y3
9291  	VPSRLD     $0x14, Y10, Y10
9292  	VPXOR      Y3, Y10, Y10
9293  	VPADDD     Y10, Y6, Y6
9294  	VPXOR      Y6, Y2, Y2
9295  	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
9296  	VPADDD     Y2, Y8, Y8
9297  	VPXOR      Y8, Y10, Y10
9298  	VPSLLD     $0x07, Y10, Y3
9299  	VPSRLD     $0x19, Y10, Y10
9300  	VPXOR      Y3, Y10, Y10
	// Poly1305 block 2: same absorb / multiply / reduce sequence, applied to
	// the 16 bytes at 16(DI).
9301  	ADDQ       16(DI), R10
9302  	ADCQ       24(DI), R11
9303  	ADCQ       $0x01, R12
9304  	MOVQ       (BP), AX
9305  	MOVQ       AX, R15
9306  	MULQ       R10
9307  	MOVQ       AX, R13
9308  	MOVQ       DX, R14
9309  	MOVQ       (BP), AX
9310  	MULQ       R11
9311  	IMULQ      R12, R15
9312  	ADDQ       AX, R14
9313  	ADCQ       DX, R15
9314  	MOVQ       8(BP), AX
9315  	MOVQ       AX, R8
9316  	MULQ       R10
9317  	ADDQ       AX, R14
9318  	ADCQ       $0x00, DX
9319  	MOVQ       DX, R10
9320  	MOVQ       8(BP), AX
9321  	MULQ       R11
9322  	ADDQ       AX, R15
9323  	ADCQ       $0x00, DX
9324  	IMULQ      R12, R8
9325  	ADDQ       R10, R15
9326  	ADCQ       DX, R8
9327  	MOVQ       R13, R10
9328  	MOVQ       R14, R11
9329  	MOVQ       R15, R12
9330  	ANDQ       $0x03, R12
9331  	MOVQ       R15, R13
9332  	ANDQ       $-4, R13
9333  	MOVQ       R8, R14
9334  	SHRQ       $0x02, R8, R15
9335  	SHRQ       $0x02, R8
9336  	ADDQ       R13, R10
9337  	ADCQ       R14, R11
9338  	ADCQ       $0x00, R12
9339  	ADDQ       R15, R10
9340  	ADCQ       R8, R11
9341  	ADCQ       $0x00, R12
	// Both blocks of this pass are hashed; step the hash pointer past them.
9342  	LEAQ       32(DI), DI
	// Undo the diagonal rotation (rows 1, 2, 3 rotated by 3, 2, 1 dwords).
9343  	VPALIGNR   $0x0c, Y14, Y14, Y14
9344  	VPALIGNR   $0x0c, Y9, Y9, Y9
9345  	VPALIGNR   $0x0c, Y10, Y10, Y10
9346  	VPALIGNR   $0x08, Y12, Y12, Y12
9347  	VPALIGNR   $0x08, Y13, Y13, Y13
9348  	VPALIGNR   $0x08, Y8, Y8, Y8
9349  	VPALIGNR   $0x04, Y4, Y4, Y4
9350  	VPALIGNR   $0x04, Y1, Y1, Y1
9351  	VPALIGNR   $0x04, Y2, Y2, Y2
	// Loop control: while the decremented CX is still positive, re-enter at
	// LoopA; afterwards keep re-entering LoopB while the decremented R9 is
	// non-negative.
9352  	DECQ       CX
9353  	JG         sealAVX2Tail384LoopA
9354  	DECQ       R9
9355  	JGE        sealAVX2Tail384LoopB
	// Feed-forward: add the initial rows back (constants, the rows at
	// 32(BP)/64(BP), and the row-3 copies held in Y7, Y11, Y15).
9356  	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
9357  	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
9358  	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
9359  	VPADDD     32(BP), Y14, Y14
9360  	VPADDD     32(BP), Y9, Y9
9361  	VPADDD     32(BP), Y10, Y10
9362  	VPADDD     64(BP), Y12, Y12
9363  	VPADDD     64(BP), Y13, Y13
9364  	VPADDD     64(BP), Y8, Y8
9365  	VPADDD     Y7, Y4, Y4
9366  	VPADDD     Y11, Y1, Y1
9367  	VPADDD     Y15, Y2, Y2
	// Set 1 -> bytes 0..127: $0x02 gathers the low 128-bit lanes of two rows,
	// $0x13 the high lanes, giving contiguous 64-byte keystream blocks that
	// are XORed with the input at (SI) and stored at (DI).
9368  	VPERM2I128 $0x02, Y0, Y14, Y3
9369  	VPERM2I128 $0x02, Y12, Y4, Y7
9370  	VPERM2I128 $0x13, Y0, Y14, Y11
9371  	VPERM2I128 $0x13, Y12, Y4, Y15
9372  	VPXOR      (SI), Y3, Y3
9373  	VPXOR      32(SI), Y7, Y7
9374  	VPXOR      64(SI), Y11, Y11
9375  	VPXOR      96(SI), Y15, Y15
9376  	VMOVDQU    Y3, (DI)
9377  	VMOVDQU    Y7, 32(DI)
9378  	VMOVDQU    Y11, 64(DI)
9379  	VMOVDQU    Y15, 96(DI)
	// Set 2 -> bytes 128..255.
9380  	VPERM2I128 $0x02, Y5, Y9, Y3
9381  	VPERM2I128 $0x02, Y13, Y1, Y7
9382  	VPERM2I128 $0x13, Y5, Y9, Y11
9383  	VPERM2I128 $0x13, Y13, Y1, Y15
9384  	VPXOR      128(SI), Y3, Y3
9385  	VPXOR      160(SI), Y7, Y7
9386  	VPXOR      192(SI), Y11, Y11
9387  	VPXOR      224(SI), Y15, Y15
9388  	VMOVDQU    Y3, 128(DI)
9389  	VMOVDQU    Y7, 160(DI)
9390  	VMOVDQU    Y11, 192(DI)
9391  	VMOVDQU    Y15, 224(DI)
	// Record 256 in CX, advance the input pointer and shrink the remaining
	// length by the 256 bytes just written.
	// NOTE(review): CX is presumably the count of freshly written bytes that
	// sealAVX2SealHash still has to hash - confirm at that label.
9392  	MOVQ       $0x00000100, CX
9393  	LEAQ       256(SI), SI
9394  	SUBQ       $0x00000100, BX
	// Leave set 3, regrouped the same way but not yet XORed, in
	// Y0, Y14, Y12, Y4 for the code at sealAVX2SealHash.
9395  	VPERM2I128 $0x02, Y6, Y10, Y0
9396  	VPERM2I128 $0x02, Y8, Y2, Y14
9397  	VPERM2I128 $0x13, Y6, Y10, Y12
9398  	VPERM2I128 $0x13, Y8, Y2, Y4
9399  	JMP        sealAVX2SealHash
9400  
// sealAVX2Tail512: build four YMM state sets for the 512-byte tail path.
// Each set is four registers (rows 0-3); the loop that follows produces two
// 64-byte keystream blocks per set, one per 128-bit lane.
//   set 1: Y0, Y14, Y12, Y4     set 2: Y5, Y9, Y13, Y1
//   set 3: Y6, Y10, Y8, Y2      set 4: Y7, Y11, Y15, Y3
// All sixteen YMM registers hold state here, so the row-3 values are saved to
// the stack frame rather than to spare registers.
9401  sealAVX2Tail512:
	// Row 0 comes from the constants table; rows 1 and 2 are loaded from
	// 32(BP) and 64(BP) (presumably the key halves - confirm against the
	// prologue). All four sets start with identical rows 0-2.
9402  	VMOVDQA ·chacha20Constants<>+0(SB), Y0
9403  	VMOVDQA Y0, Y5
9404  	VMOVDQA Y0, Y6
9405  	VMOVDQA Y0, Y7
9406  	VMOVDQA 32(BP), Y14
9407  	VMOVDQA Y14, Y9
9408  	VMOVDQA Y14, Y10
9409  	VMOVDQA Y14, Y11
9410  	VMOVDQA 64(BP), Y12
9411  	VMOVDQA Y12, Y13
9412  	VMOVDQA Y12, Y8
9413  	VMOVDQA Y12, Y15
	// Row 3 is the value at 192(BP) with avx2IncMask added one to four times
	// (presumably stepping the block counter in each lane - the mask
	// contents are defined elsewhere in the file).
9414  	VMOVDQA 192(BP), Y4
9415  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y4
9416  	VPADDD  ·avx2IncMask<>+0(SB), Y4, Y1
9417  	VPADDD  ·avx2IncMask<>+0(SB), Y1, Y2
9418  	VPADDD  ·avx2IncMask<>+0(SB), Y2, Y3
	// Save the four row-3 values for the feed-forward after the loop. Note
	// that 192(BP) is overwritten with the last (most advanced) value.
9419  	VMOVDQA Y4, 96(BP)
9420  	VMOVDQA Y1, 128(BP)
9421  	VMOVDQA Y2, 160(BP)
9422  	VMOVDQA Y3, 192(BP)
9423  
// sealAVX2Tail512LoopA: absorb one extra 16-byte block at (DI) into the
// Poly1305 accumulator R10 (low), R11, R12, advance DI by 16, then fall
// through into the label that follows.
9424  sealAVX2Tail512LoopA:
	// Add the block to the accumulator; the final ADCQ also adds 1 to R12,
	// i.e. sets bit 128 of the block.
9425  	ADDQ  (DI), R10
9426  	ADCQ  8(DI), R11
9427  	ADCQ  $0x01, R12
	// Multiply the accumulator by the two 64-bit words at (BP) and 8(BP)
	// (presumably the Poly1305 key r - confirm against the prologue).
	// The product is collected in R13 (low), R14, R15, R8 (high).
9428  	MOVQ  (BP), AX
9429  	MOVQ  AX, R15
9430  	MULQ  R10
9431  	MOVQ  AX, R13
9432  	MOVQ  DX, R14
9433  	MOVQ  (BP), AX
9434  	MULQ  R11
9435  	IMULQ R12, R15
9436  	ADDQ  AX, R14
9437  	ADCQ  DX, R15
9438  	MOVQ  8(BP), AX
9439  	MOVQ  AX, R8
9440  	MULQ  R10
9441  	ADDQ  AX, R14
9442  	ADCQ  $0x00, DX
9443  	MOVQ  DX, R10
9444  	MOVQ  8(BP), AX
9445  	MULQ  R11
9446  	ADDQ  AX, R15
9447  	ADCQ  $0x00, DX
9448  	IMULQ R12, R8
9449  	ADDQ  R10, R15
9450  	ADCQ  DX, R8
	// Reduce: keep the low 130 bits in R10:R11:R12, then add the bits above
	// them twice - once with the low two bits masked off (4*c) and once
	// shifted right by two (c) - which folds 5*c back in (mod 2^130 - 5).
9451  	MOVQ  R13, R10
9452  	MOVQ  R14, R11
9453  	MOVQ  R15, R12
9454  	ANDQ  $0x03, R12
9455  	MOVQ  R15, R13
9456  	ANDQ  $-4, R13
9457  	MOVQ  R8, R14
9458  	SHRQ  $0x02, R8, R15
9459  	SHRQ  $0x02, R8
9460  	ADDQ  R13, R10
9461  	ADCQ  R14, R11
9462  	ADCQ  $0x00, R12
9463  	ADDQ  R15, R10
9464  	ADCQ  R8, R11
9465  	ADCQ  $0x00, R12
	// Step the hash pointer past the block just absorbed.
9466  	LEAQ  16(DI), DI
9467  
// sealAVX2Tail512LoopB: one ChaCha20 double round over four YMM state sets
// (Y0/Y14/Y12/Y4, Y5/Y9/Y13/Y1, Y6/Y10/Y8/Y2, Y7/Y11/Y15/Y3), processed
// four-wide step by step and interleaved with two 16-byte Poly1305 block
// updates read from (DI) and 16(DI). All sixteen YMM registers hold state, so
// Y15 is spilled to 224(BP) whenever a temporary is needed for the
// shift-based rotates. The Poly1305 accumulator lives in R10 (low), R11, R12.
// Once the CX/R9 counters run out, the code below the loop writes 384 bytes
// of output and hands the fourth set to sealAVX2SealHash.
// NOTE(review): the initial values of CX and R9 are set outside this excerpt.
9468  sealAVX2Tail512LoopB:
	// Column round, first half: a += b; d ^= a; d <<<= 16; c += d; b ^= c.
9469  	VPADDD     Y14, Y0, Y0
9470  	VPADDD     Y9, Y5, Y5
9471  	VPADDD     Y10, Y6, Y6
9472  	VPADDD     Y11, Y7, Y7
9473  	VPXOR      Y0, Y4, Y4
9474  	VPXOR      Y5, Y1, Y1
9475  	VPXOR      Y6, Y2, Y2
9476  	VPXOR      Y7, Y3, Y3
9477  	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
9478  	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
9479  	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
9480  	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
9481  	VPADDD     Y4, Y12, Y12
9482  	VPADDD     Y1, Y13, Y13
9483  	VPADDD     Y2, Y8, Y8
9484  	VPADDD     Y3, Y15, Y15
9485  	VPXOR      Y12, Y14, Y14
9486  	VPXOR      Y13, Y9, Y9
9487  	VPXOR      Y8, Y10, Y10
9488  	VPXOR      Y15, Y11, Y11
	// b <<<= 12 for all four sets; Y15 is spilled so it can act as the
	// shift temporary, then restored.
9489  	VMOVDQA    Y15, 224(BP)
9490  	VPSLLD     $0x0c, Y14, Y15
9491  	VPSRLD     $0x14, Y14, Y14
9492  	VPXOR      Y15, Y14, Y14
9493  	VPSLLD     $0x0c, Y9, Y15
9494  	VPSRLD     $0x14, Y9, Y9
9495  	VPXOR      Y15, Y9, Y9
9496  	VPSLLD     $0x0c, Y10, Y15
9497  	VPSRLD     $0x14, Y10, Y10
9498  	VPXOR      Y15, Y10, Y10
9499  	VPSLLD     $0x0c, Y11, Y15
9500  	VPSRLD     $0x14, Y11, Y11
9501  	VPXOR      Y15, Y11, Y11
9502  	VMOVDQA    224(BP), Y15
	// Poly1305 block 1: add the 16 bytes at (DI) plus bit 128 into the
	// accumulator.
9503  	ADDQ       (DI), R10
9504  	ADCQ       8(DI), R11
9505  	ADCQ       $0x01, R12
	// Multiply by the words at (BP) and 8(BP) (presumably the Poly1305 key
	// r - confirm against the prologue) using MULXQ, whose implicit
	// multiplicand is DX. The product is collected in R13 (low), R14, R15,
	// R8 (high).
9506  	MOVQ       (BP), DX
9507  	MOVQ       DX, R15
9508  	MULXQ      R10, R13, R14
9509  	IMULQ      R12, R15
9510  	MULXQ      R11, AX, DX
9511  	ADDQ       AX, R14
9512  	ADCQ       DX, R15
9513  	MOVQ       8(BP), DX
9514  	MULXQ      R10, R10, AX
9515  	ADDQ       R10, R14
	// MULXQ leaves the flags untouched, so the carry from the ADDQ above is
	// still live for the ADCQ that follows.
9516  	MULXQ      R11, R11, R8
9517  	ADCQ       R11, R15
9518  	ADCQ       $0x00, R8
9519  	IMULQ      R12, DX
9520  	ADDQ       AX, R15
9521  	ADCQ       DX, R8
	// Reduce: keep the low 130 bits in R10:R11:R12, then add the bits above
	// them twice - once with the low two bits masked off (4*c) and once
	// shifted right by two (c) - which folds 5*c back in (mod 2^130 - 5).
9522  	MOVQ       R13, R10
9523  	MOVQ       R14, R11
9524  	MOVQ       R15, R12
9525  	ANDQ       $0x03, R12
9526  	MOVQ       R15, R13
9527  	ANDQ       $-4, R13
9528  	MOVQ       R8, R14
9529  	SHRQ       $0x02, R8, R15
9530  	SHRQ       $0x02, R8
9531  	ADDQ       R13, R10
9532  	ADCQ       R14, R11
9533  	ADCQ       $0x00, R12
9534  	ADDQ       R15, R10
9535  	ADCQ       R8, R11
9536  	ADCQ       $0x00, R12
	// Column round, second half: a += b; d ^= a; d <<<= 8; c += d; b ^= c.
9537  	VPADDD     Y14, Y0, Y0
9538  	VPADDD     Y9, Y5, Y5
9539  	VPADDD     Y10, Y6, Y6
9540  	VPADDD     Y11, Y7, Y7
9541  	VPXOR      Y0, Y4, Y4
9542  	VPXOR      Y5, Y1, Y1
9543  	VPXOR      Y6, Y2, Y2
9544  	VPXOR      Y7, Y3, Y3
9545  	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
9546  	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
9547  	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
9548  	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
9549  	VPADDD     Y4, Y12, Y12
9550  	VPADDD     Y1, Y13, Y13
9551  	VPADDD     Y2, Y8, Y8
9552  	VPADDD     Y3, Y15, Y15
9553  	VPXOR      Y12, Y14, Y14
9554  	VPXOR      Y13, Y9, Y9
9555  	VPXOR      Y8, Y10, Y10
9556  	VPXOR      Y15, Y11, Y11
	// b <<<= 7 for all four sets, again borrowing Y15 as the temporary.
9557  	VMOVDQA    Y15, 224(BP)
9558  	VPSLLD     $0x07, Y14, Y15
9559  	VPSRLD     $0x19, Y14, Y14
9560  	VPXOR      Y15, Y14, Y14
9561  	VPSLLD     $0x07, Y9, Y15
9562  	VPSRLD     $0x19, Y9, Y9
9563  	VPXOR      Y15, Y9, Y9
9564  	VPSLLD     $0x07, Y10, Y15
9565  	VPSRLD     $0x19, Y10, Y10
9566  	VPXOR      Y15, Y10, Y10
9567  	VPSLLD     $0x07, Y11, Y15
9568  	VPSRLD     $0x19, Y11, Y11
9569  	VPXOR      Y15, Y11, Y11
9570  	VMOVDQA    224(BP), Y15
	// Rotate rows 1, 2, 3 by 1, 2, 3 dwords within each 128-bit lane so the
	// next quarter-rounds operate on the diagonals.
9571  	VPALIGNR   $0x04, Y14, Y14, Y14
9572  	VPALIGNR   $0x04, Y9, Y9, Y9
9573  	VPALIGNR   $0x04, Y10, Y10, Y10
9574  	VPALIGNR   $0x04, Y11, Y11, Y11
9575  	VPALIGNR   $0x08, Y12, Y12, Y12
9576  	VPALIGNR   $0x08, Y13, Y13, Y13
9577  	VPALIGNR   $0x08, Y8, Y8, Y8
9578  	VPALIGNR   $0x08, Y15, Y15, Y15
9579  	VPALIGNR   $0x0c, Y4, Y4, Y4
9580  	VPALIGNR   $0x0c, Y1, Y1, Y1
9581  	VPALIGNR   $0x0c, Y2, Y2, Y2
9582  	VPALIGNR   $0x0c, Y3, Y3, Y3
	// Diagonal round, first half: a += b; d ^= a; d <<<= 16; c += d; b ^= c.
9583  	VPADDD     Y14, Y0, Y0
9584  	VPADDD     Y9, Y5, Y5
9585  	VPADDD     Y10, Y6, Y6
9586  	VPADDD     Y11, Y7, Y7
9587  	VPXOR      Y0, Y4, Y4
9588  	VPXOR      Y5, Y1, Y1
9589  	VPXOR      Y6, Y2, Y2
9590  	VPXOR      Y7, Y3, Y3
9591  	VPSHUFB    ·rol16<>+0(SB), Y4, Y4
9592  	VPSHUFB    ·rol16<>+0(SB), Y1, Y1
9593  	VPSHUFB    ·rol16<>+0(SB), Y2, Y2
9594  	VPSHUFB    ·rol16<>+0(SB), Y3, Y3
9595  	VPADDD     Y4, Y12, Y12
9596  	VPADDD     Y1, Y13, Y13
9597  	VPADDD     Y2, Y8, Y8
9598  	VPADDD     Y3, Y15, Y15
9599  	VPXOR      Y12, Y14, Y14
9600  	VPXOR      Y13, Y9, Y9
9601  	VPXOR      Y8, Y10, Y10
9602  	VPXOR      Y15, Y11, Y11
	// Poly1305 block 2: same absorb / MULXQ multiply / reduce sequence,
	// applied to the 16 bytes at 16(DI).
9603  	ADDQ       16(DI), R10
9604  	ADCQ       24(DI), R11
9605  	ADCQ       $0x01, R12
9606  	MOVQ       (BP), DX
9607  	MOVQ       DX, R15
9608  	MULXQ      R10, R13, R14
9609  	IMULQ      R12, R15
9610  	MULXQ      R11, AX, DX
9611  	ADDQ       AX, R14
9612  	ADCQ       DX, R15
9613  	MOVQ       8(BP), DX
9614  	MULXQ      R10, R10, AX
9615  	ADDQ       R10, R14
9616  	MULXQ      R11, R11, R8
9617  	ADCQ       R11, R15
9618  	ADCQ       $0x00, R8
9619  	IMULQ      R12, DX
9620  	ADDQ       AX, R15
9621  	ADCQ       DX, R8
9622  	MOVQ       R13, R10
9623  	MOVQ       R14, R11
9624  	MOVQ       R15, R12
9625  	ANDQ       $0x03, R12
9626  	MOVQ       R15, R13
9627  	ANDQ       $-4, R13
9628  	MOVQ       R8, R14
9629  	SHRQ       $0x02, R8, R15
9630  	SHRQ       $0x02, R8
9631  	ADDQ       R13, R10
9632  	ADCQ       R14, R11
9633  	ADCQ       $0x00, R12
9634  	ADDQ       R15, R10
9635  	ADCQ       R8, R11
9636  	ADCQ       $0x00, R12
	// Both blocks of this pass are hashed; step the hash pointer past them.
9637  	LEAQ       32(DI), DI
	// b <<<= 12 (diagonal round), with Y15 spilled as the temporary.
9638  	VMOVDQA    Y15, 224(BP)
9639  	VPSLLD     $0x0c, Y14, Y15
9640  	VPSRLD     $0x14, Y14, Y14
9641  	VPXOR      Y15, Y14, Y14
9642  	VPSLLD     $0x0c, Y9, Y15
9643  	VPSRLD     $0x14, Y9, Y9
9644  	VPXOR      Y15, Y9, Y9
9645  	VPSLLD     $0x0c, Y10, Y15
9646  	VPSRLD     $0x14, Y10, Y10
9647  	VPXOR      Y15, Y10, Y10
9648  	VPSLLD     $0x0c, Y11, Y15
9649  	VPSRLD     $0x14, Y11, Y11
9650  	VPXOR      Y15, Y11, Y11
9651  	VMOVDQA    224(BP), Y15
	// Diagonal round, second half: a += b; d ^= a; d <<<= 8; c += d; b ^= c.
9652  	VPADDD     Y14, Y0, Y0
9653  	VPADDD     Y9, Y5, Y5
9654  	VPADDD     Y10, Y6, Y6
9655  	VPADDD     Y11, Y7, Y7
9656  	VPXOR      Y0, Y4, Y4
9657  	VPXOR      Y5, Y1, Y1
9658  	VPXOR      Y6, Y2, Y2
9659  	VPXOR      Y7, Y3, Y3
9660  	VPSHUFB    ·rol8<>+0(SB), Y4, Y4
9661  	VPSHUFB    ·rol8<>+0(SB), Y1, Y1
9662  	VPSHUFB    ·rol8<>+0(SB), Y2, Y2
9663  	VPSHUFB    ·rol8<>+0(SB), Y3, Y3
9664  	VPADDD     Y4, Y12, Y12
9665  	VPADDD     Y1, Y13, Y13
9666  	VPADDD     Y2, Y8, Y8
9667  	VPADDD     Y3, Y15, Y15
9668  	VPXOR      Y12, Y14, Y14
9669  	VPXOR      Y13, Y9, Y9
9670  	VPXOR      Y8, Y10, Y10
9671  	VPXOR      Y15, Y11, Y11
	// b <<<= 7 (diagonal round), with Y15 spilled as the temporary.
9672  	VMOVDQA    Y15, 224(BP)
9673  	VPSLLD     $0x07, Y14, Y15
9674  	VPSRLD     $0x19, Y14, Y14
9675  	VPXOR      Y15, Y14, Y14
9676  	VPSLLD     $0x07, Y9, Y15
9677  	VPSRLD     $0x19, Y9, Y9
9678  	VPXOR      Y15, Y9, Y9
9679  	VPSLLD     $0x07, Y10, Y15
9680  	VPSRLD     $0x19, Y10, Y10
9681  	VPXOR      Y15, Y10, Y10
9682  	VPSLLD     $0x07, Y11, Y15
9683  	VPSRLD     $0x19, Y11, Y11
9684  	VPXOR      Y15, Y11, Y11
9685  	VMOVDQA    224(BP), Y15
	// Undo the diagonal rotation (rows 1, 2, 3 rotated by 3, 2, 1 dwords).
9686  	VPALIGNR   $0x0c, Y14, Y14, Y14
9687  	VPALIGNR   $0x0c, Y9, Y9, Y9
9688  	VPALIGNR   $0x0c, Y10, Y10, Y10
9689  	VPALIGNR   $0x0c, Y11, Y11, Y11
9690  	VPALIGNR   $0x08, Y12, Y12, Y12
9691  	VPALIGNR   $0x08, Y13, Y13, Y13
9692  	VPALIGNR   $0x08, Y8, Y8, Y8
9693  	VPALIGNR   $0x08, Y15, Y15, Y15
9694  	VPALIGNR   $0x04, Y4, Y4, Y4
9695  	VPALIGNR   $0x04, Y1, Y1, Y1
9696  	VPALIGNR   $0x04, Y2, Y2, Y2
9697  	VPALIGNR   $0x04, Y3, Y3, Y3
	// Loop control: while the decremented CX is still positive, re-enter at
	// LoopA; afterwards keep re-entering LoopB while the decremented R9 is
	// non-negative.
9698  	DECQ       CX
9699  	JG         sealAVX2Tail512LoopA
9700  	DECQ       R9
9701  	JGE        sealAVX2Tail512LoopB
	// Feed-forward: add the initial rows back (constants, the rows at
	// 32(BP)/64(BP), and the row-3 values read from 96..192(BP)).
9702  	VPADDD     ·chacha20Constants<>+0(SB), Y0, Y0
9703  	VPADDD     ·chacha20Constants<>+0(SB), Y5, Y5
9704  	VPADDD     ·chacha20Constants<>+0(SB), Y6, Y6
9705  	VPADDD     ·chacha20Constants<>+0(SB), Y7, Y7
9706  	VPADDD     32(BP), Y14, Y14
9707  	VPADDD     32(BP), Y9, Y9
9708  	VPADDD     32(BP), Y10, Y10
9709  	VPADDD     32(BP), Y11, Y11
9710  	VPADDD     64(BP), Y12, Y12
9711  	VPADDD     64(BP), Y13, Y13
9712  	VPADDD     64(BP), Y8, Y8
9713  	VPADDD     64(BP), Y15, Y15
9714  	VPADDD     96(BP), Y4, Y4
9715  	VPADDD     128(BP), Y1, Y1
9716  	VPADDD     160(BP), Y2, Y2
9717  	VPADDD     192(BP), Y3, Y3
	// Park set 4's row 2 at 224(BP) so Y15 can serve as the output
	// temporary; it is read back from memory at the end of this block.
9718  	VMOVDQA    Y15, 224(BP)
	// Set 1 -> bytes 0..127, one 32-byte piece at a time: $0x02 gathers the
	// low 128-bit lanes of two rows, $0x13 the high lanes; each piece is
	// XORed with the input at (SI) and stored at (DI).
9719  	VPERM2I128 $0x02, Y0, Y14, Y15
9720  	VPXOR      (SI), Y15, Y15
9721  	VMOVDQU    Y15, (DI)
9722  	VPERM2I128 $0x02, Y12, Y4, Y15
9723  	VPXOR      32(SI), Y15, Y15
9724  	VMOVDQU    Y15, 32(DI)
9725  	VPERM2I128 $0x13, Y0, Y14, Y15
9726  	VPXOR      64(SI), Y15, Y15
9727  	VMOVDQU    Y15, 64(DI)
9728  	VPERM2I128 $0x13, Y12, Y4, Y15
9729  	VPXOR      96(SI), Y15, Y15
9730  	VMOVDQU    Y15, 96(DI)
	// Set 2 -> bytes 128..255 (set 1's registers are free to reuse now).
9731  	VPERM2I128 $0x02, Y5, Y9, Y0
9732  	VPERM2I128 $0x02, Y13, Y1, Y14
9733  	VPERM2I128 $0x13, Y5, Y9, Y12
9734  	VPERM2I128 $0x13, Y13, Y1, Y4
9735  	VPXOR      128(SI), Y0, Y0
9736  	VPXOR      160(SI), Y14, Y14
9737  	VPXOR      192(SI), Y12, Y12
9738  	VPXOR      224(SI), Y4, Y4
9739  	VMOVDQU    Y0, 128(DI)
9740  	VMOVDQU    Y14, 160(DI)
9741  	VMOVDQU    Y12, 192(DI)
9742  	VMOVDQU    Y4, 224(DI)
	// Set 3 -> bytes 256..383.
9743  	VPERM2I128 $0x02, Y6, Y10, Y0
9744  	VPERM2I128 $0x02, Y8, Y2, Y14
9745  	VPERM2I128 $0x13, Y6, Y10, Y12
9746  	VPERM2I128 $0x13, Y8, Y2, Y4
9747  	VPXOR      256(SI), Y0, Y0
9748  	VPXOR      288(SI), Y14, Y14
9749  	VPXOR      320(SI), Y12, Y12
9750  	VPXOR      352(SI), Y4, Y4
9751  	VMOVDQU    Y0, 256(DI)
9752  	VMOVDQU    Y14, 288(DI)
9753  	VMOVDQU    Y12, 320(DI)
9754  	VMOVDQU    Y4, 352(DI)
	// Record 384 in CX, advance the input pointer and shrink the remaining
	// length by the 384 bytes just written.
	// NOTE(review): CX is presumably the count of freshly written bytes that
	// sealAVX2SealHash still has to hash - confirm at that label.
9755  	MOVQ       $0x00000180, CX
9756  	LEAQ       384(SI), SI
9757  	SUBQ       $0x00000180, BX
	// Leave set 4, regrouped the same way but not yet XORed, in
	// Y0, Y14, Y12, Y4 for the code at sealAVX2SealHash. Its row 2 is taken
	// directly from the 224(BP) spill slot.
9758  	VPERM2I128 $0x02, Y7, Y11, Y0
9759  	VPERM2I128 $0x02, 224(BP), Y3, Y14
9760  	VPERM2I128 $0x13, Y7, Y11, Y12
9761  	VPERM2I128 $0x13, 224(BP), Y3, Y4
9762  	JMP        sealAVX2SealHash
9763