// Code generated by command: go run gen_amd64_compress_asm.go -out ../compress/blocks_amd64.s -stubs ../compress/blocks_amd64.go -pkg compress. DO NOT EDIT.

//go:build !purego

#include "textflag.h"

// first_8_blake_consts holds eight 32-bit constants (c0..c7 in the naming used
// by the state-matrix comments in blocksSSE2), stored as little-endian dwords
// in natural order:
//   +0:  c0 = 0x243f6a88, c1 = 0x85a308d3, c2 = 0x13198a2e, c3 = 0x03707344
//   +16: c4 = 0xa4093822, c5 = 0x299f31d0, c6 = 0x082efa98, c7 = 0xec4e6c89
// blocksSSE2 loads the first 16 bytes and XORs them with the vector read from
// 32(state) to form row2, and XORs the second 16 bytes with the broadcast
// counter words to form row3.
DATA first_8_blake_consts<>+0(SB)/8, $0x85a308d3243f6a88
DATA first_8_blake_consts<>+8(SB)/8, $0x0370734413198a2e
DATA first_8_blake_consts<>+16(SB)/8, $0x299f31d0a4093822
DATA first_8_blake_consts<>+24(SB)/8, $0xec4e6c89082efa98
GLOBL first_8_blake_consts<>(SB), RODATA|NOPTR, $32

// permuted_blake_consts holds the round constants pre-arranged per round so
// that a gathered vector of four message words can be combined with its four
// constants using a single PXOR.
//
// Layout: 10 groups of 64 bytes, one group per round. Each group is four
// 16-byte vectors (four 32-bit dwords each, little-endian), consumed in order:
//   +0  column step, first message/constant addition
//   +16 column step, second message/constant addition
//   +32 diagonal step, first message/constant addition
//   +48 diagonal step, second message/constant addition
// For example the first vector is (c1, c3, c5, c7), which blocksSSE2 XORs with
// the message words taken from stack slots 0, 8, 16 and 24.
//
// Rounds beyond the tenth wrap around: round 11 reads offset 0 again, round 12
// reads offset 64, and so on.

// Round 1 (also round 11).
DATA permuted_blake_consts<>+0(SB)/8, $0x0370734485a308d3
DATA permuted_blake_consts<>+8(SB)/8, $0xec4e6c89299f31d0
DATA permuted_blake_consts<>+16(SB)/8, $0x13198a2e243f6a88
DATA permuted_blake_consts<>+24(SB)/8, $0x082efa98a4093822
DATA permuted_blake_consts<>+32(SB)/8, $0x34e90c6c38d01377
DATA permuted_blake_consts<>+40(SB)/8, $0xb5470917c97c50dd
DATA permuted_blake_consts<>+48(SB)/8, $0xbe5466cf452821e6
DATA permuted_blake_consts<>+56(SB)/8, $0x3f84d5b5c0ac29b7
// Round 2 (also round 12).
DATA permuted_blake_consts<>+64(SB)/8, $0x452821e6be5466cf
DATA permuted_blake_consts<>+72(SB)/8, $0x082efa98b5470917
DATA permuted_blake_consts<>+80(SB)/8, $0xa40938223f84d5b5
DATA permuted_blake_consts<>+88(SB)/8, $0xc97c50dd38d01377
DATA permuted_blake_consts<>+96(SB)/8, $0x13198a2ec0ac29b7
DATA permuted_blake_consts<>+104(SB)/8, $0x03707344ec4e6c89
DATA permuted_blake_consts<>+112(SB)/8, $0x243f6a8885a308d3
DATA permuted_blake_consts<>+120(SB)/8, $0x299f31d034e90c6c
// Round 3.
DATA permuted_blake_consts<>+128(SB)/8, $0x243f6a88452821e6
DATA permuted_blake_consts<>+136(SB)/8, $0xc97c50dd13198a2e
DATA permuted_blake_consts<>+144(SB)/8, $0xc0ac29b734e90c6c
DATA permuted_blake_consts<>+152(SB)/8, $0xb5470917299f31d0
DATA permuted_blake_consts<>+160(SB)/8, $0x082efa983f84d5b5
DATA permuted_blake_consts<>+168(SB)/8, $0xa409382285a308d3
DATA permuted_blake_consts<>+176(SB)/8, $0x03707344be5466cf
DATA permuted_blake_consts<>+184(SB)/8, $0x38d01377ec4e6c89
// Round 4.
DATA permuted_blake_consts<>+192(SB)/8, $0x85a308d338d01377
DATA permuted_blake_consts<>+200(SB)/8, $0x3f84d5b5c0ac29b7
DATA permuted_blake_consts<>+208(SB)/8, $0x03707344ec4e6c89
DATA permuted_blake_consts<>+216(SB)/8, $0x34e90c6cc97c50dd
DATA permuted_blake_consts<>+224(SB)/8, $0xbe5466cf082efa98
DATA permuted_blake_consts<>+232(SB)/8, $0x452821e6243f6a88
DATA permuted_blake_consts<>+240(SB)/8, $0x299f31d013198a2e
DATA permuted_blake_consts<>+248(SB)/8, $0xb5470917a4093822
// Round 5.
DATA permuted_blake_consts<>+256(SB)/8, $0xec4e6c89243f6a88
DATA permuted_blake_consts<>+264(SB)/8, $0xb5470917a4093822
DATA permuted_blake_consts<>+272(SB)/8, $0x299f31d038d01377
DATA permuted_blake_consts<>+280(SB)/8, $0xbe5466cf13198a2e
DATA permuted_blake_consts<>+288(SB)/8, $0xc0ac29b785a308d3
DATA permuted_blake_consts<>+296(SB)/8, $0xc97c50dd452821e6
DATA permuted_blake_consts<>+304(SB)/8, $0x34e90c6c3f84d5b5
DATA permuted_blake_consts<>+312(SB)/8, $0x03707344082efa98
// Round 6.
DATA permuted_blake_consts<>+320(SB)/8, $0xbe5466cfc0ac29b7
DATA permuted_blake_consts<>+328(SB)/8, $0x0370734434e90c6c
DATA permuted_blake_consts<>+336(SB)/8, $0x082efa9813198a2e
DATA permuted_blake_consts<>+344(SB)/8, $0x452821e6243f6a88
DATA permuted_blake_consts<>+352(SB)/8, $0x299f31d0c97c50dd
DATA permuted_blake_consts<>+360(SB)/8, $0x38d013773f84d5b5
DATA permuted_blake_consts<>+368(SB)/8, $0xec4e6c89a4093822
DATA permuted_blake_consts<>+376(SB)/8, $0x85a308d3b5470917
// Round 7.
DATA permuted_blake_consts<>+384(SB)/8, $0xb5470917299f31d0
DATA permuted_blake_consts<>+392(SB)/8, $0xbe5466cfc97c50dd
DATA permuted_blake_consts<>+400(SB)/8, $0x85a308d3c0ac29b7
DATA permuted_blake_consts<>+408(SB)/8, $0xa40938223f84d5b5
DATA permuted_blake_consts<>+416(SB)/8, $0x03707344ec4e6c89
DATA permuted_blake_consts<>+424(SB)/8, $0x34e90c6c13198a2e
DATA permuted_blake_consts<>+432(SB)/8, $0x082efa98243f6a88
DATA permuted_blake_consts<>+440(SB)/8, $0x452821e638d01377
// Round 8.
DATA permuted_blake_consts<>+448(SB)/8, $0x3f84d5b534e90c6c
DATA permuted_blake_consts<>+456(SB)/8, $0x38d0137785a308d3
DATA permuted_blake_consts<>+464(SB)/8, $0xec4e6c89c97c50dd
DATA permuted_blake_consts<>+472(SB)/8, $0x03707344c0ac29b7
DATA permuted_blake_consts<>+480(SB)/8, $0xa4093822243f6a88
DATA permuted_blake_consts<>+488(SB)/8, $0xbe5466cf082efa98
DATA permuted_blake_consts<>+496(SB)/8, $0xb5470917299f31d0
DATA permuted_blake_consts<>+504(SB)/8, $0x13198a2e452821e6
// Round 9.
DATA permuted_blake_consts<>+512(SB)/8, $0x38d01377b5470917
DATA permuted_blake_consts<>+520(SB)/8, $0x452821e603707344
DATA permuted_blake_consts<>+528(SB)/8, $0x3f84d5b5082efa98
DATA permuted_blake_consts<>+536(SB)/8, $0x243f6a8834e90c6c
DATA permuted_blake_consts<>+544(SB)/8, $0xec4e6c8913198a2e
DATA permuted_blake_consts<>+552(SB)/8, $0x299f31d0a4093822
DATA permuted_blake_consts<>+560(SB)/8, $0xc97c50ddc0ac29b7
DATA permuted_blake_consts<>+568(SB)/8, $0xbe5466cf85a308d3
// Round 10.
DATA permuted_blake_consts<>+576(SB)/8, $0xa409382213198a2e
DATA permuted_blake_consts<>+584(SB)/8, $0x299f31d0082efa98
DATA permuted_blake_consts<>+592(SB)/8, $0x452821e6be5466cf
DATA permuted_blake_consts<>+600(SB)/8, $0x85a308d3ec4e6c89
DATA permuted_blake_consts<>+608(SB)/8, $0x3f84d5b534e90c6c
DATA permuted_blake_consts<>+616(SB)/8, $0x243f6a88c0ac29b7
DATA permuted_blake_consts<>+624(SB)/8, $0x38d01377b5470917
DATA permuted_blake_consts<>+632(SB)/8, $0xc97c50dd03707344
GLOBL permuted_blake_consts<>(SB), RODATA|NOPTR, $640

// shuffle_rotr8_4x32 is a 16-byte index table. Read in memory order the bytes
// are 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12: within every 32-bit lane,
// output byte i selects input byte (i+1) mod 4, i.e. each little-endian dword
// is rotated right by 8 bits.
// NOTE(review): presumably a PSHUFB control mask for an SSSE3+ code path; no
// consumer appears in the visible part of this file — confirm against the
// generator.
DATA shuffle_rotr8_4x32<>+0(SB)/8, $0x0407060500030201
DATA shuffle_rotr8_4x32<>+8(SB)/8, $0x0c0f0e0d080b0a09
GLOBL shuffle_rotr8_4x32<>(SB), RODATA|NOPTR, $16

// shuffle_rotr16_4x32 is a 16-byte index table. Read in memory order the bytes
// are 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13: within every 32-bit lane the
// two 16-bit halves are swapped, i.e. each dword is rotated by 16 bits.
// NOTE(review): presumably a PSHUFB control mask for an SSSE3+ code path; no
// consumer appears in the visible part of this file — confirm against the
// generator.
DATA shuffle_rotr16_4x32<>+0(SB)/8, $0x0504070601000302
DATA shuffle_rotr16_4x32<>+8(SB)/8, $0x0d0c0f0e09080b0a
GLOBL shuffle_rotr16_4x32<>(SB), RODATA|NOPTR, $16

// shuffle_le_to_be_4x32 is a 16-byte index table. Read in memory order the
// bytes are 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12: the four bytes of every
// 32-bit lane are reversed, converting each dword between little- and
// big-endian byte order (the same per-word effect as the BSWAPL sequence in
// blocksSSE2).
// NOTE(review): presumably a PSHUFB control mask for an SSSE3+ code path; no
// consumer appears in the visible part of this file — confirm against the
// generator.
DATA shuffle_le_to_be_4x32<>+0(SB)/8, $0x0405060700010203
DATA shuffle_le_to_be_4x32<>+8(SB)/8, $0x0c0d0e0f08090a0b
GLOBL shuffle_le_to_be_4x32<>(SB), RODATA|NOPTR, $16

// func blocksSSE2(state *State, msg []byte, counter uint64)
// Requires: SSE2
TEXT ·blocksSSE2(SB), $64-40
	MOVQ state+0(FP), AX
	MOVQ counter+32(FP), CX
	MOVQ msg_base+8(FP), DX
	MOVQ msg_len+16(FP), BX

	// Convert message len to number of blocks for loop counter.
	SHRQ $0x06, BX

	// Initialize state matrix.
	// row0 = |v0  v1  v2  v3|   |  h0     h1     h2     h3 |
	// row1 = |v4  v5  v6  v7|   |  h4     h5     h6     h7 |
	MOVOU 32(AX), X0
	MOVOU (AX), X1
	MOVOU 16(AX), X2

compressLoop:
	// row2 = |v8  v9  va  vb| = |s0^c0  s1^c1  s2^c2  s3^c3|
	// row3 = |vc  vd  ve  vf|   |t0^c4  t0^c5  t1^c6  t1^c7|
	MOVOU  first_8_blake_consts<>+0(SB), X3
	PXOR   X0, X3
	MOVD   CX, X4
	PSHUFD $0x50, X4, X4
	PXOR   first_8_blake_consts<>+16(SB), X4
	MOVO   X1, X5
	MOVO   X2, X6

	// Convert message to big endian.
	MOVL   (DX), SI
	MOVL   4(DX), DI
	MOVL   8(DX), R8
	MOVL   12(DX), R9
	MOVL   16(DX), R10
	MOVL   20(DX), R11
	MOVL   24(DX), R12
	MOVL   28(DX), R13
	BSWAPL SI
	MOVL   SI, (SP)
	BSWAPL DI
	MOVL   DI, 4(SP)
	BSWAPL R8
	MOVL   R8, 8(SP)
	BSWAPL R9
	MOVL   R9, 12(SP)
	BSWAPL R10
	MOVL   R10, 16(SP)
	BSWAPL R11
	MOVL   R11, 20(SP)
	BSWAPL R12
	MOVL   R12, 24(SP)
	BSWAPL R13
	MOVL   R13, 28(SP)
	MOVL   32(DX), SI
	MOVL   36(DX), DI
	MOVL   40(DX), R8
	MOVL   44(DX), R9
	MOVL   48(DX), R10
	MOVL   52(DX), R11
	MOVL   56(DX), R12
	MOVL   60(DX), R13
	BSWAPL SI
	MOVL   SI, 32(SP)
	BSWAPL DI
	MOVL   DI, 36(SP)
	BSWAPL R8
	MOVL   R8, 40(SP)
	BSWAPL R9
	MOVL   R9, 44(SP)
	BSWAPL R10
	MOVL   R10, 48(SP)
	BSWAPL R11
	MOVL   R11, 52(SP)
	BSWAPL R12
	MOVL   R12, 56(SP)
	BSWAPL R13
	MOVL   R13, 60(SP)

	// Round 1 column step.
	MOVD       24(SP), X9
	MOVD       16(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       8(SP), X7
	MOVD       (SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+0(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       28(SP), X9
	MOVD       20(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       12(SP), X7
	MOVD       4(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+16(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 1 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x93, X4, X4

	// Round 1 diagonal step part 2: column step.
	MOVD       56(SP), X9
	MOVD       48(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       40(SP), X7
	MOVD       32(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+32(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       60(SP), X9
	MOVD       52(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       44(SP), X7
	MOVD       36(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+48(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 1 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x39, X4, X4

	// Round 2 column step.
	MOVD       52(SP), X9
	MOVD       36(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       16(SP), X7
	MOVD       56(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+64(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       24(SP), X9
	MOVD       60(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       32(SP), X7
	MOVD       40(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+80(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 2 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x93, X4, X4

	// Round 2 diagonal step part 2: column step.
	MOVD       20(SP), X9
	MOVD       44(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       (SP), X7
	MOVD       4(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+96(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       12(SP), X9
	MOVD       28(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       8(SP), X7
	MOVD       48(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+112(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 2 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x39, X4, X4

	// Round 3 column step.
	MOVD       60(SP), X9
	MOVD       20(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       48(SP), X7
	MOVD       44(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+128(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       52(SP), X9
	MOVD       8(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       (SP), X7
	MOVD       32(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+144(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 3 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x93, X4, X4

	// Round 3 diagonal step part 2: column step.
	MOVD       36(SP), X9
	MOVD       28(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       12(SP), X7
	MOVD       40(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+160(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       16(SP), X9
	MOVD       4(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       24(SP), X7
	MOVD       56(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+176(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 3 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x39, X4, X4

	// Round 4 column step.
	MOVD       44(SP), X9
	MOVD       52(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       12(SP), X7
	MOVD       28(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+192(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       56(SP), X9
	MOVD       48(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       4(SP), X7
	MOVD       36(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+208(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 4 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x93, X4, X4

	// Round 4 diagonal step part 2: column step.
	MOVD       60(SP), X9
	MOVD       16(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       20(SP), X7
	MOVD       8(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+224(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       32(SP), X9
	MOVD       (SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       40(SP), X7
	MOVD       24(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+240(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 4 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x39, X4, X4

	// Round 5 column step.
	MOVD       40(SP), X9
	MOVD       8(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       20(SP), X7
	MOVD       36(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+256(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       60(SP), X9
	MOVD       16(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       28(SP), X7
	MOVD       (SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+272(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 5 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x93, X4, X4

	// Round 5 diagonal step part 2: column step.
	MOVD       12(SP), X9
	MOVD       24(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       44(SP), X7
	MOVD       56(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+288(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       52(SP), X9
	MOVD       32(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       48(SP), X7
	MOVD       4(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+304(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 5 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x39, X4, X4

	// Round 6 column step.
	MOVD       32(SP), X9
	MOVD       (SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       24(SP), X7
	MOVD       8(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+320(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       12(SP), X9
	MOVD       44(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       40(SP), X7
	MOVD       48(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+336(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 6 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x93, X4, X4

	// Round 6 diagonal step part 2: column step.
	MOVD       4(SP), X9
	MOVD       60(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       28(SP), X7
	MOVD       16(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+352(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       36(SP), X9
	MOVD       56(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       20(SP), X7
	MOVD       52(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+368(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 6 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x39, X4, X4

	// Round 7 column step.
	MOVD       16(SP), X9
	MOVD       56(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       4(SP), X7
	MOVD       48(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+384(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       40(SP), X9
	MOVD       52(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       60(SP), X7
	MOVD       20(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+400(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 7 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x93, X4, X4

	// Round 7 diagonal step part 2: column step.
	MOVD       32(SP), X9
	MOVD       36(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       24(SP), X7
	MOVD       (SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+416(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       44(SP), X9
	MOVD       8(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       12(SP), X7
	MOVD       28(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+432(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 7 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x39, X4, X4

	// Round 8 column step.
	MOVD       12(SP), X9
	MOVD       48(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       28(SP), X7
	MOVD       52(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+448(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       36(SP), X9
	MOVD       4(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       56(SP), X7
	MOVD       44(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+464(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 8 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x93, X4, X4

	// Round 8 diagonal step part 2: column step.
	MOVD       8(SP), X9
	MOVD       32(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       60(SP), X7
	MOVD       20(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+480(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       40(SP), X9
	MOVD       24(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       16(SP), X7
	MOVD       (SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+496(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 8 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x39, X4, X4

	// Round 9 column step.
	MOVD       (SP), X9
	MOVD       44(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       56(SP), X7
	MOVD       24(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+512(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       32(SP), X9
	MOVD       12(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       36(SP), X7
	MOVD       60(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+528(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 9 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x93, X4, X4

	// Round 9 diagonal step part 2: column step.
	MOVD       40(SP), X9
	MOVD       4(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       52(SP), X7
	MOVD       48(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+544(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       20(SP), X9
	MOVD       16(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       28(SP), X7
	MOVD       8(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+560(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 9 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x39, X4, X4

	// Round 10 column step.
	MOVD       4(SP), X9
	MOVD       28(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       32(SP), X7
	MOVD       40(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+576(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       20(SP), X9
	MOVD       24(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       16(SP), X7
	MOVD       8(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+592(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 10 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x93, X4, X4

	// Round 10 diagonal step part 2: column step.
	MOVD       52(SP), X9
	MOVD       12(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       36(SP), X7
	MOVD       60(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+608(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       (SP), X9
	MOVD       48(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       56(SP), X7
	MOVD       44(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+624(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 10 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x39, X4, X4

	// Round 11 column step.
	MOVD       24(SP), X9
	MOVD       16(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       8(SP), X7
	MOVD       (SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+0(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       28(SP), X9
	MOVD       20(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       12(SP), X7
	MOVD       4(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+16(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 11 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x93, X4, X4

	// Round 11 diagonal step part 2: column step.
	MOVD       56(SP), X9
	MOVD       48(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       40(SP), X7
	MOVD       32(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+32(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       60(SP), X9
	MOVD       52(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       44(SP), X7
	MOVD       36(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+48(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 11 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x39, X4, X4

	// Round 12 column step.
	MOVD       52(SP), X9
	MOVD       36(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       16(SP), X7
	MOVD       56(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+64(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       24(SP), X9
	MOVD       60(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       32(SP), X7
	MOVD       40(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+80(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 12 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x93, X4, X4

	// Round 12 diagonal step part 2: column step.
	MOVD       20(SP), X9
	MOVD       44(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       (SP), X7
	MOVD       4(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+96(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       12(SP), X9
	MOVD       28(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       8(SP), X7
	MOVD       48(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+112(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 12 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x39, X4, X4

	// Round 13 column step.
	MOVD       60(SP), X9
	MOVD       20(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       48(SP), X7
	MOVD       44(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+128(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       52(SP), X9
	MOVD       8(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       (SP), X7
	MOVD       32(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+144(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 13 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x93, X4, X4

	// Round 13 diagonal step part 2: column step.
	MOVD       36(SP), X9
	MOVD       28(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       12(SP), X7
	MOVD       40(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+160(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       16(SP), X9
	MOVD       4(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       24(SP), X7
	MOVD       56(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+176(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 13 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x39, X4, X4

	// Round 14 column step.
	MOVD       44(SP), X9
	MOVD       52(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       12(SP), X7
	MOVD       28(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+192(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       56(SP), X9
	MOVD       48(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       4(SP), X7
	MOVD       36(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+208(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 14 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x93, X4, X4

	// Round 14 diagonal step part 2: column step.
	MOVD       60(SP), X9
	MOVD       16(SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       20(SP), X7
	MOVD       8(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+224(SB), X8
	PXOR       X9, X8
	PADDD      X8, X1
	MOVD       32(SP), X9
	MOVD       (SP), X7
	MOVOA      X7, X8
	PUNPCKLLQ  X9, X8
	MOVD       40(SP), X7
	MOVD       24(SP), X9
	PUNPCKLLQ  X7, X9
	PUNPCKLQDQ X8, X9
	MOVOU      permuted_blake_consts<>+240(SB), X8
	PXOR       X9, X8
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x10, X7
	PSLLL      $0x10, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x0c, X7
	PSLLL      $0x14, X2
	PXOR       X7, X2
	PADDD      X8, X1
	PADDD      X2, X1
	PXOR       X1, X4
	MOVO       X4, X7
	PSRLL      $0x08, X7
	PSLLL      $0x18, X4
	PXOR       X7, X4
	PADDD      X4, X3
	PXOR       X3, X2
	MOVO       X2, X7
	PSRLL      $0x07, X7
	PSLLL      $0x19, X2
	PXOR       X7, X2

	// Round 14 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X2, X2
	PSHUFD $0x4e, X3, X3
	PSHUFD $0x39, X4, X4

	// Finally the chain value is defined as:
	// h'0 = h0^s0^v0^v8
	// h'1 = h1^s1^v1^v9
	// h'2 = h2^s2^v2^va
	// h'3 = h3^s3^v3^vb
	// h'4 = h4^s0^v4^vc
	// h'5 = h5^s1^v5^vd
	// h'6 = h6^s2^v6^ve
	// h'7 = h7^s3^v7^vf
	PXOR X5, X1
	PXOR X0, X1
	PXOR X3, X1
	PXOR X6, X2
	PXOR X0, X2
	PXOR X4, X2

	// Either terminate the loop when there are no more full blocks
	// to compress or move the message pointer to the next block of
	// bytes to compress, increment the message bits counter
	// accordingly, and loop back around to compress it.
	DECQ BX
	JZ   done
	LEAQ 64(DX), DX
	ADDQ $0x00000200, CX
	JMP  compressLoop

done:
	// Output the resulting chain value.
	MOVOU X1, (AX)
	MOVOU X2, 16(AX)
	RET

// func blocksSSE41(state *State, msg []byte, counter uint64)
// Requires: SSE2, SSE4.1, SSSE3
//
// blocksSSE41 compresses every full 64-byte block of msg into the chain value
// held in the first 32 bytes of state, running 14 rounds per block.  Bytes of
// msg beyond the last full block are ignored (msg_len is shifted right by 6).
// counter is the message bit counter used for the first block; it is advanced
// by 512 before each subsequent block.
//
// When msg holds no full block the routine returns without reading msg and
// without writing state.  Without that check the loop body would run once on
// fewer than 64 bytes and the DECQ/JZ pair below would wrap the block count
// instead of terminating.
// NOTE(review): this file is generated; the same check needs to be added to
// gen_amd64_compress_asm.go or it will be lost on regeneration.
//
// Register roles:
//   AX        state pointer
//   BX        number of blocks left to compress
//   CX        message bit counter for the current block
//   DX        pointer to the current message block
//   X0-X3     the 16 message words of the current block after the byte swap
//   X4, X5    byte shuffle masks (shuffle_rotr8_4x32, shuffle_rotr16_4x32)
//             applied with PSHUFB in place of the shift pairs used for the
//             other two rotations
//   X6        s0..s3, loaded once from 32(AX)
//   X7, X8    rows 0 and 1 of the state matrix; hold the chain value between
//             blocks
//   X9, X10   rows 2 and 3 of the state matrix
//   X11, X12  copy of the chain value taken at the start of each block
//   X13-X15   scratch
//
// Every "column step" below has the same shape: two vectors of message words
// are gathered with PSHUFD/PBLENDW and XORed with a 16-byte slice of
// permuted_blake_consts, then mixed into the rows.  The 12- and 7-bit right
// rotations are done with a PSRLL/PSLLL pair whose counts sum to 32.  Rounds
// 11 through 14 reuse the word selections and constant offsets of rounds 1
// through 4.
TEXT ·blocksSSE41(SB), NOSPLIT, $0-40
	MOVQ state+0(FP), AX
	MOVQ counter+32(FP), CX
	MOVQ msg_base+8(FP), DX
	MOVQ msg_len+16(FP), BX

	// Populate registers for faster right rotations.
	MOVOU shuffle_rotr8_4x32<>+0(SB), X4
	MOVOU shuffle_rotr16_4x32<>+0(SB), X5

	// Convert message len to number of blocks for loop counter.
	SHRQ $0x06, BX

	// Nothing to compress when the message is shorter than one block.
	// Leave the state untouched rather than entering the loop with a
	// zero block count.
	TESTQ BX, BX
	JZ    noBlocks

	// Initialize state matrix.
	// row0 = |v0  v1  v2  v3|   |  h0     h1     h2     h3 |
	// row1 = |v4  v5  v6  v7|   |  h4     h5     h6     h7 |
	MOVOU 32(AX), X6
	MOVOU (AX), X7
	MOVOU 16(AX), X8

compressLoop:
	// row2 = |v8  v9  va  vb| = |s0^c0  s1^c1  s2^c2  s3^c3|
	// row3 = |vc  vd  ve  vf| = |t0^c4  t0^c5  t1^c6  t1^c7|
	// X11 and X12 keep the incoming chain value for the final XOR.
	MOVOU  first_8_blake_consts<>+0(SB), X9
	PXOR   X6, X9
	MOVD   CX, X10
	PSHUFD $0x50, X10, X10
	PXOR   first_8_blake_consts<>+16(SB), X10
	MOVO   X7, X11
	MOVO   X8, X12

	// Convert message to big endian.
	MOVOU  shuffle_le_to_be_4x32<>+0(SB), X13
	MOVOU  (DX), X0
	PSHUFB X13, X0
	MOVOU  16(DX), X1
	PSHUFB X13, X1
	MOVOU  32(DX), X2
	PSHUFB X13, X2
	MOVOU  48(DX), X3
	PSHUFB X13, X3

	// Round 1 column step.
	PSHUFD  $0x08, X0, X14
	PSHUFD  $0x80, X1, X13
	PBLENDW $0xf0, X13, X14
	MOVOU   permuted_blake_consts<>+0(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x0d, X0, X14
	PSHUFD  $0xd0, X1, X13
	PBLENDW $0xf0, X13, X14
	MOVOU   permuted_blake_consts<>+16(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 1 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x93, X10, X10

	// Round 1 diagonal step part 2: column step.
	PSHUFD  $0x08, X2, X14
	PSHUFD  $0x80, X3, X13
	PBLENDW $0xf0, X13, X14
	MOVOU   permuted_blake_consts<>+32(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x0d, X2, X14
	PSHUFD  $0xd0, X3, X13
	PBLENDW $0xf0, X13, X14
	MOVOU   permuted_blake_consts<>+48(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 1 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x39, X10, X10

	// Round 2 column step.
	PSHUFD  $0x00, X1, X14
	PSHUFD  $0x10, X2, X13
	PBLENDW $0x30, X13, X14
	PSHUFD  $0x42, X3, X13
	PBLENDW $0xc3, X13, X14
	MOVOU   permuted_blake_consts<>+64(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x80, X1, X14
	PSHUFD  $0x02, X2, X13
	PBLENDW $0x0f, X13, X14
	PSHUFD  $0x30, X3, X13
	PBLENDW $0x30, X13, X14
	MOVOU   permuted_blake_consts<>+80(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 2 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x93, X10, X10

	// Round 2 diagonal step part 2: column step.
	PSHUFD  $0x01, X0, X14
	PSHUFD  $0x40, X1, X13
	PBLENDW $0xc0, X13, X14
	PSHUFD  $0x30, X2, X13
	PBLENDW $0x30, X13, X14
	MOVOU   permuted_blake_consts<>+96(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0xc8, X0, X14
	PSHUFD  $0x30, X1, X13
	PBLENDW $0x30, X13, X14
	PSHUFD  $0x00, X3, X13
	PBLENDW $0x03, X13, X14
	MOVOU   permuted_blake_consts<>+112(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 2 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x39, X10, X10

	// Round 3 column step.
	PSHUFD  $0x10, X1, X14
	PSHUFD  $0x03, X2, X13
	PBLENDW $0x03, X13, X14
	PSHUFD  $0xc0, X3, X13
	PBLENDW $0xcc, X13, X14
	MOVOU   permuted_blake_consts<>+128(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x20, X0, X14
	PSHUFD  $0x00, X2, X13
	PBLENDW $0x03, X13, X14
	PSHUFD  $0x40, X3, X13
	PBLENDW $0xc0, X13, X14
	MOVOU   permuted_blake_consts<>+144(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 3 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x93, X10, X10

	// Round 3 diagonal step part 2: column step.
	PSHUFD  $0x0c, X0, X14
	PSHUFD  $0x30, X1, X13
	PBLENDW $0x30, X13, X14
	PSHUFD  $0x42, X2, X13
	PBLENDW $0xc3, X13, X14
	MOVOU   permuted_blake_consts<>+160(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x10, X0, X14
	PSHUFD  $0x08, X1, X13
	PBLENDW $0xcc, X13, X14
	PSHUFD  $0x02, X3, X13
	PBLENDW $0x03, X13, X14
	MOVOU   permuted_blake_consts<>+176(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 3 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x39, X10, X10

	// Round 4 column step.
	PSHUFD  $0x0c, X0, X14
	PSHUFD  $0x03, X1, X13
	PBLENDW $0x03, X13, X14
	PSHUFD  $0xc0, X2, X13
	PBLENDW $0xc0, X13, X14
	PSHUFD  $0x10, X3, X13
	PBLENDW $0x30, X13, X14
	MOVOU   permuted_blake_consts<>+192(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x04, X0, X14
	PSHUFD  $0x01, X2, X13
	PBLENDW $0x03, X13, X14
	PSHUFD  $0x80, X3, X13
	PBLENDW $0xf0, X13, X14
	MOVOU   permuted_blake_consts<>+208(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 4 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x93, X10, X10

	// Round 4 diagonal step part 2: column step.
	PSHUFD  $0x02, X0, X14
	PSHUFD  $0x04, X1, X13
	PBLENDW $0x3c, X13, X14
	PSHUFD  $0xc0, X3, X13
	PBLENDW $0xc0, X13, X14
	MOVOU   permuted_blake_consts<>+224(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x00, X0, X14
	PSHUFD  $0x02, X1, X13
	PBLENDW $0x03, X13, X14
	PSHUFD  $0x08, X2, X13
	PBLENDW $0xcc, X13, X14
	MOVOU   permuted_blake_consts<>+240(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 4 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x39, X10, X10

	// Round 5 column step.
	PSHUFD  $0x20, X0, X14
	PSHUFD  $0x04, X1, X13
	PBLENDW $0x0c, X13, X14
	PSHUFD  $0x81, X2, X13
	PBLENDW $0xc3, X13, X14
	MOVOU   permuted_blake_consts<>+256(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x00, X0, X14
	PSHUFD  $0x0c, X1, X13
	PBLENDW $0x3c, X13, X14
	PSHUFD  $0xc0, X3, X13
	PBLENDW $0xc0, X13, X14
	MOVOU   permuted_blake_consts<>+272(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 5 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x93, X10, X10

	// Round 5 diagonal step part 2: column step.
	PSHUFD  $0xc0, X0, X14
	PSHUFD  $0x20, X1, X13
	PBLENDW $0x30, X13, X14
	PSHUFD  $0x0c, X2, X13
	PBLENDW $0x0c, X13, X14
	PSHUFD  $0x02, X3, X13
	PBLENDW $0x03, X13, X14
	MOVOU   permuted_blake_consts<>+288(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x01, X0, X14
	PSHUFD  $0x00, X2, X13
	PBLENDW $0x30, X13, X14
	PSHUFD  $0x40, X3, X13
	PBLENDW $0xcc, X13, X14
	MOVOU   permuted_blake_consts<>+304(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 5 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x39, X10, X10

	// Round 6 column step.
	PSHUFD  $0x02, X0, X14
	PSHUFD  $0x08, X1, X13
	PBLENDW $0x0c, X13, X14
	PSHUFD  $0x00, X2, X13
	PBLENDW $0xc0, X13, X14
	MOVOU   permuted_blake_consts<>+320(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0xc0, X0, X14
	PSHUFD  $0x38, X2, X13
	PBLENDW $0x3c, X13, X14
	PSHUFD  $0x00, X3, X13
	PBLENDW $0x03, X13, X14
	MOVOU   permuted_blake_consts<>+336(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 6 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x93, X10, X10

	// Round 6 diagonal step part 2: column step.
	PSHUFD  $0x40, X0, X14
	PSHUFD  $0x0c, X1, X13
	PBLENDW $0x0f, X13, X14
	PSHUFD  $0x30, X3, X13
	PBLENDW $0x30, X13, X14
	MOVOU   permuted_blake_consts<>+352(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x04, X1, X14
	PSHUFD  $0x40, X2, X13
	PBLENDW $0xc0, X13, X14
	PSHUFD  $0x21, X3, X13
	PBLENDW $0x33, X13, X14
	MOVOU   permuted_blake_consts<>+368(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 6 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x39, X10, X10

	// Round 7 column step.
	PSHUFD  $0x04, X0, X14
	PSHUFD  $0x00, X1, X13
	PBLENDW $0xc0, X13, X14
	PSHUFD  $0x20, X3, X13
	PBLENDW $0x33, X13, X14
	MOVOU   permuted_blake_consts<>+384(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x01, X1, X14
	PSHUFD  $0x80, X2, X13
	PBLENDW $0xc0, X13, X14
	PSHUFD  $0x1c, X3, X13
	PBLENDW $0x3c, X13, X14
	MOVOU   permuted_blake_consts<>+400(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 7 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x93, X10, X10

	// Round 7 diagonal step part 2: column step.
	PSHUFD  $0x00, X0, X14
	PSHUFD  $0x08, X1, X13
	PBLENDW $0x0c, X13, X14
	PSHUFD  $0x10, X2, X13
	PBLENDW $0xf0, X13, X14
	MOVOU   permuted_blake_consts<>+416(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x2c, X0, X14
	PSHUFD  $0x03, X1, X13
	PBLENDW $0x03, X13, X14
	PSHUFD  $0xc0, X2, X13
	PBLENDW $0xc0, X13, X14
	MOVOU   permuted_blake_consts<>+432(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 7 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x39, X10, X10

	// Round 8 column step.
	PSHUFD  $0xc0, X0, X14
	PSHUFD  $0x0c, X1, X13
	PBLENDW $0x0c, X13, X14
	PSHUFD  $0x01, X3, X13
	PBLENDW $0x33, X13, X14
	MOVOU   permuted_blake_consts<>+448(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x10, X0, X14
	PSHUFD  $0x43, X2, X13
	PBLENDW $0xc3, X13, X14
	PSHUFD  $0x08, X3, X13
	PBLENDW $0x0c, X13, X14
	MOVOU   permuted_blake_consts<>+464(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 8 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x93, X10, X10

	// Round 8 diagonal step part 2: column step.
	PSHUFD  $0x80, X0, X14
	PSHUFD  $0x01, X1, X13
	PBLENDW $0x03, X13, X14
	PSHUFD  $0x00, X2, X13
	PBLENDW $0x30, X13, X14
	PSHUFD  $0x0c, X3, X13
	PBLENDW $0x0c, X13, X14
	MOVOU   permuted_blake_consts<>+480(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x00, X0, X14
	PSHUFD  $0x20, X1, X13
	PBLENDW $0x3c, X13, X14
	PSHUFD  $0x80, X2, X13
	PBLENDW $0xc0, X13, X14
	MOVOU   permuted_blake_consts<>+496(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 8 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x39, X10, X10

	// Round 9 column step.
	PSHUFD  $0x00, X0, X14
	PSHUFD  $0x02, X1, X13
	PBLENDW $0x03, X13, X14
	PSHUFD  $0x30, X2, X13
	PBLENDW $0x30, X13, X14
	PSHUFD  $0x08, X3, X13
	PBLENDW $0x0c, X13, X14
	MOVOU   permuted_blake_consts<>+512(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x30, X0, X14
	PSHUFD  $0x04, X2, X13
	PBLENDW $0xcc, X13, X14
	PSHUFD  $0x03, X3, X13
	PBLENDW $0x03, X13, X14
	MOVOU   permuted_blake_consts<>+528(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 9 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x93, X10, X10

	// Round 9 diagonal step part 2: column step.
	PSHUFD  $0x10, X0, X14
	PSHUFD  $0x80, X2, X13
	PBLENDW $0xc0, X13, X14
	PSHUFD  $0x04, X3, X13
	PBLENDW $0x0f, X13, X14
	MOVOU   permuted_blake_consts<>+544(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x02, X0, X14
	PSHUFD  $0x4c, X1, X13
	PBLENDW $0xfc, X13, X14
	MOVOU   permuted_blake_consts<>+560(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 9 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x39, X10, X10

	// Round 10 column step.
	PSHUFD  $0x40, X0, X14
	PSHUFD  $0x30, X1, X13
	PBLENDW $0x30, X13, X14
	PSHUFD  $0x02, X2, X13
	PBLENDW $0x0f, X13, X14
	MOVOU   permuted_blake_consts<>+576(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x02, X0, X14
	PSHUFD  $0x60, X1, X13
	PBLENDW $0xfc, X13, X14
	MOVOU   permuted_blake_consts<>+592(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 10 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x93, X10, X10

	// Round 10 diagonal step part 2: column step.
	PSHUFD  $0x30, X0, X14
	PSHUFD  $0x04, X2, X13
	PBLENDW $0x0c, X13, X14
	PSHUFD  $0x43, X3, X13
	PBLENDW $0xc3, X13, X14
	MOVOU   permuted_blake_consts<>+608(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x00, X0, X14
	PSHUFD  $0x03, X2, X13
	PBLENDW $0x03, X13, X14
	PSHUFD  $0x08, X3, X13
	PBLENDW $0x3c, X13, X14
	MOVOU   permuted_blake_consts<>+624(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 10 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x39, X10, X10

	// Round 11 column step.
	PSHUFD  $0x08, X0, X14
	PSHUFD  $0x80, X1, X13
	PBLENDW $0xf0, X13, X14
	MOVOU   permuted_blake_consts<>+0(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x0d, X0, X14
	PSHUFD  $0xd0, X1, X13
	PBLENDW $0xf0, X13, X14
	MOVOU   permuted_blake_consts<>+16(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 11 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x93, X10, X10

	// Round 11 diagonal step part 2: column step.
	PSHUFD  $0x08, X2, X14
	PSHUFD  $0x80, X3, X13
	PBLENDW $0xf0, X13, X14
	MOVOU   permuted_blake_consts<>+32(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x0d, X2, X14
	PSHUFD  $0xd0, X3, X13
	PBLENDW $0xf0, X13, X14
	MOVOU   permuted_blake_consts<>+48(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 11 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x39, X10, X10

	// Round 12 column step.
	PSHUFD  $0x00, X1, X14
	PSHUFD  $0x10, X2, X13
	PBLENDW $0x30, X13, X14
	PSHUFD  $0x42, X3, X13
	PBLENDW $0xc3, X13, X14
	MOVOU   permuted_blake_consts<>+64(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x80, X1, X14
	PSHUFD  $0x02, X2, X13
	PBLENDW $0x0f, X13, X14
	PSHUFD  $0x30, X3, X13
	PBLENDW $0x30, X13, X14
	MOVOU   permuted_blake_consts<>+80(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 12 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x93, X10, X10

	// Round 12 diagonal step part 2: column step.
	PSHUFD  $0x01, X0, X14
	PSHUFD  $0x40, X1, X13
	PBLENDW $0xc0, X13, X14
	PSHUFD  $0x30, X2, X13
	PBLENDW $0x30, X13, X14
	MOVOU   permuted_blake_consts<>+96(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0xc8, X0, X14
	PSHUFD  $0x30, X1, X13
	PBLENDW $0x30, X13, X14
	PSHUFD  $0x00, X3, X13
	PBLENDW $0x03, X13, X14
	MOVOU   permuted_blake_consts<>+112(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 12 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x39, X10, X10

	// Round 13 column step.
	PSHUFD  $0x10, X1, X14
	PSHUFD  $0x03, X2, X13
	PBLENDW $0x03, X13, X14
	PSHUFD  $0xc0, X3, X13
	PBLENDW $0xcc, X13, X14
	MOVOU   permuted_blake_consts<>+128(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x20, X0, X14
	PSHUFD  $0x00, X2, X13
	PBLENDW $0x03, X13, X14
	PSHUFD  $0x40, X3, X13
	PBLENDW $0xc0, X13, X14
	MOVOU   permuted_blake_consts<>+144(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 13 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x93, X10, X10

	// Round 13 diagonal step part 2: column step.
	PSHUFD  $0x0c, X0, X14
	PSHUFD  $0x30, X1, X13
	PBLENDW $0x30, X13, X14
	PSHUFD  $0x42, X2, X13
	PBLENDW $0xc3, X13, X14
	MOVOU   permuted_blake_consts<>+160(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x10, X0, X14
	PSHUFD  $0x08, X1, X13
	PBLENDW $0xcc, X13, X14
	PSHUFD  $0x02, X3, X13
	PBLENDW $0x03, X13, X14
	MOVOU   permuted_blake_consts<>+176(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 13 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x39, X10, X10

	// Round 14 column step.
	PSHUFD  $0x0c, X0, X14
	PSHUFD  $0x03, X1, X13
	PBLENDW $0x03, X13, X14
	PSHUFD  $0xc0, X2, X13
	PBLENDW $0xc0, X13, X14
	PSHUFD  $0x10, X3, X13
	PBLENDW $0x30, X13, X14
	MOVOU   permuted_blake_consts<>+192(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x04, X0, X14
	PSHUFD  $0x01, X2, X13
	PBLENDW $0x03, X13, X14
	PSHUFD  $0x80, X3, X13
	PBLENDW $0xf0, X13, X14
	MOVOU   permuted_blake_consts<>+208(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 14 diagonal step part 1: diagonalize.
	PSHUFD $0x39, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x93, X10, X10

	// Round 14 diagonal step part 2: column step.
	PSHUFD  $0x02, X0, X14
	PSHUFD  $0x04, X1, X13
	PBLENDW $0x3c, X13, X14
	PSHUFD  $0xc0, X3, X13
	PBLENDW $0xc0, X13, X14
	MOVOU   permuted_blake_consts<>+224(SB), X15
	PXOR    X14, X15
	PADDD   X15, X7
	PSHUFD  $0x00, X0, X14
	PSHUFD  $0x02, X1, X13
	PBLENDW $0x03, X13, X14
	PSHUFD  $0x08, X2, X13
	PBLENDW $0xcc, X13, X14
	MOVOU   permuted_blake_consts<>+240(SB), X15
	PXOR    X14, X15
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X5, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x0c, X13
	PSLLL   $0x14, X8
	PXOR    X13, X8
	PADDD   X15, X7
	PADDD   X8, X7
	PXOR    X7, X10
	PSHUFB  X4, X10
	PADDD   X10, X9
	PXOR    X9, X8
	MOVO    X8, X13
	PSRLL   $0x07, X13
	PSLLL   $0x19, X8
	PXOR    X13, X8

	// Round 14 diagonal step part 3: undiagonalize.
	PSHUFD $0x93, X8, X8
	PSHUFD $0x4e, X9, X9
	PSHUFD $0x39, X10, X10

	// Finally the chain value is defined as:
	// h'0 = h0^s0^v0^v8
	// h'1 = h1^s1^v1^v9
	// h'2 = h2^s2^v2^va
	// h'3 = h3^s3^v3^vb
	// h'4 = h4^s0^v4^vc
	// h'5 = h5^s1^v5^vd
	// h'6 = h6^s2^v6^ve
	// h'7 = h7^s3^v7^vf
	PXOR X11, X7
	PXOR X6, X7
	PXOR X9, X7
	PXOR X12, X8
	PXOR X6, X8
	PXOR X10, X8

	// Either terminate the loop when there are no more full blocks
	// to compress or move the message pointer to the next block of
	// bytes to compress, increment the message bits counter
	// accordingly, and loop back around to compress it.
	DECQ BX
	JZ   done
	LEAQ 64(DX), DX
	ADDQ $0x00000200, CX
	JMP  compressLoop

done:
	// Output the resulting chain value.
	MOVOU X7, (AX)
	MOVOU X8, 16(AX)

noBlocks:
	RET

// func blocksAVX(state *State, msg []byte, counter uint64)
// Requires: AVX
//
// blocksAVX runs the 14-round compression function below over every full
// 64-byte block of msg and chains the result through the first 32 bytes of
// state.  Only 128-bit VEX-encoded instructions are used.
//
// Memory layout read from state (as used by the loads below):
//   bytes  0..31  chain value h0..h7 (read on entry, overwritten on exit)
//   bytes 32..47  salt s0..s3        (read only)
//
// counter is the message bits counter used for the first block.  It is
// advanced by 0x200 (512 bits) for each additional block.  Trailing bytes of
// msg that do not make up a full 64-byte block are ignored.  When msg holds
// no full block at all the function returns without compressing anything and
// the chain value is left as it was.
//
// Register roles:
//   AX      state pointer
//   BX      number of blocks left to compress
//   CX      message bits counter for the current block
//   DX      pointer to the current message block
//   X0-X3   the 16 message words of the current block, byte swapped
//   X4, X5  byte shuffle masks for the right rotations by 8 and by 16
//   X6      salt s0..s3
//   X7-X10  rows 0..3 of the state matrix (X7, X8 hold the chain value
//           between blocks)
//   X11,X12 copy of the chain value on entry to the current block
//   X13,X14 scratch (message word gathering, rotation halves)
//   X15     gathered message words xored with the permuted constants
TEXT ·blocksAVX(SB), NOSPLIT, $0-40
	MOVQ state+0(FP), AX
	MOVQ counter+32(FP), CX
	MOVQ msg_base+8(FP), DX
	MOVQ msg_len+16(FP), BX

	// Populate registers for fast right rotations.
	VMOVDQU shuffle_rotr8_4x32<>+0(SB), X4
	VMOVDQU shuffle_rotr16_4x32<>+0(SB), X5

	// Convert message len to number of blocks for loop counter.
	SHRQ $0x06, BX

	// Initialize state matrix.
	// row0 = |v0  v1  v2  v3|   |  h0     h1     h2     h3 |
	// row1 = |v4  v5  v6  v7|   |  h4     h5     h6     h7 |
	VMOVDQU 32(AX), X6
	VMOVDQU (AX), X7
	VMOVDQU 16(AX), X8

	// Nothing to do when msg does not hold a single full block.  Without
	// this check the loop body would read 64 bytes that are not there and
	// the DECQ at the bottom of the loop would wrap the zero block count
	// around, so the loop would keep reading far past the end of msg.
	// X7 and X8 already hold the unmodified chain value at this point, so
	// the stores at done write back exactly what was loaded.
	TESTQ BX, BX
	JZ    done

compressLoop:
	// row2 = |v8  v9  va  vb| = |s0^c0  s1^c1  s2^c2  s3^c3|
	// row3 = |vc  vd  ve  vf|   |t0^c4  t0^c5  t1^c6  t1^c7|
	VMOVDQU first_8_blake_consts<>+0(SB), X9
	VPXOR   X6, X9, X9
	VMOVQ   CX, X10
	VPSHUFD $0x50, X10, X10
	VPXOR   first_8_blake_consts<>+16(SB), X10, X10
	VMOVDQA X7, X11
	VMOVDQA X8, X12

	// Convert message to big endian.
	VMOVDQU shuffle_le_to_be_4x32<>+0(SB), X13
	VMOVDQU (DX), X0
	VPSHUFB X13, X0, X0
	VMOVDQU 16(DX), X1
	VPSHUFB X13, X1, X1
	VMOVDQU 32(DX), X2
	VPSHUFB X13, X2, X2
	VMOVDQU 48(DX), X3
	VPSHUFB X13, X3, X3

	// Round 1 column step.
	VPSHUFD  $0x08, X0, X14
	VPSHUFD  $0x80, X1, X13
	VPBLENDW $0xf0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+0(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x0d, X0, X14
	VPSHUFD  $0xd0, X1, X13
	VPBLENDW $0xf0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+16(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 1 diagonal step part 1: diagonalize.
	VPSHUFD $0x39, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x93, X10, X10

	// Round 1 diagonal step part 2: column step.
	VPSHUFD  $0x08, X2, X14
	VPSHUFD  $0x80, X3, X13
	VPBLENDW $0xf0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+32(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x0d, X2, X14
	VPSHUFD  $0xd0, X3, X13
	VPBLENDW $0xf0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+48(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 1 diagonal step part 3: undiagonalize.
	VPSHUFD $0x93, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x39, X10, X10

	// Round 2 column step.
	VPSHUFD  $0x00, X1, X14
	VPSHUFD  $0x10, X2, X13
	VPBLENDW $0x30, X13, X14, X14
	VPSHUFD  $0x42, X3, X13
	VPBLENDW $0xc3, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+64(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x80, X1, X14
	VPSHUFD  $0x02, X2, X13
	VPBLENDW $0x0f, X13, X14, X14
	VPSHUFD  $0x30, X3, X13
	VPBLENDW $0x30, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+80(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 2 diagonal step part 1: diagonalize.
	VPSHUFD $0x39, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x93, X10, X10

	// Round 2 diagonal step part 2: column step.
	VPSHUFD  $0x01, X0, X14
	VPSHUFD  $0x40, X1, X13
	VPBLENDW $0xc0, X13, X14, X14
	VPSHUFD  $0x30, X2, X13
	VPBLENDW $0x30, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+96(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0xc8, X0, X14
	VPSHUFD  $0x30, X1, X13
	VPBLENDW $0x30, X13, X14, X14
	VPSHUFD  $0x00, X3, X13
	VPBLENDW $0x03, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+112(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 2 diagonal step part 3: undiagonalize.
	VPSHUFD $0x93, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x39, X10, X10

	// Round 3 column step.
	VPSHUFD  $0x10, X1, X14
	VPSHUFD  $0x03, X2, X13
	VPBLENDW $0x03, X13, X14, X14
	VPSHUFD  $0xc0, X3, X13
	VPBLENDW $0xcc, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+128(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x20, X0, X14
	VPSHUFD  $0x00, X2, X13
	VPBLENDW $0x03, X13, X14, X14
	VPSHUFD  $0x40, X3, X13
	VPBLENDW $0xc0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+144(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 3 diagonal step part 1: diagonalize.
	VPSHUFD $0x39, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x93, X10, X10

	// Round 3 diagonal step part 2: column step.
	VPSHUFD  $0x0c, X0, X14
	VPSHUFD  $0x30, X1, X13
	VPBLENDW $0x30, X13, X14, X14
	VPSHUFD  $0x42, X2, X13
	VPBLENDW $0xc3, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+160(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x10, X0, X14
	VPSHUFD  $0x08, X1, X13
	VPBLENDW $0xcc, X13, X14, X14
	VPSHUFD  $0x02, X3, X13
	VPBLENDW $0x03, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+176(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 3 diagonal step part 3: undiagonalize.
	VPSHUFD $0x93, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x39, X10, X10

	// Round 4 column step.
	VPSHUFD  $0x0c, X0, X14
	VPSHUFD  $0x03, X1, X13
	VPBLENDW $0x03, X13, X14, X14
	VPSHUFD  $0xc0, X2, X13
	VPBLENDW $0xc0, X13, X14, X14
	VPSHUFD  $0x10, X3, X13
	VPBLENDW $0x30, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+192(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x04, X0, X14
	VPSHUFD  $0x01, X2, X13
	VPBLENDW $0x03, X13, X14, X14
	VPSHUFD  $0x80, X3, X13
	VPBLENDW $0xf0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+208(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 4 diagonal step part 1: diagonalize.
	VPSHUFD $0x39, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x93, X10, X10

	// Round 4 diagonal step part 2: column step.
	VPSHUFD  $0x02, X0, X14
	VPSHUFD  $0x04, X1, X13
	VPBLENDW $0x3c, X13, X14, X14
	VPSHUFD  $0xc0, X3, X13
	VPBLENDW $0xc0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+224(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x00, X0, X14
	VPSHUFD  $0x02, X1, X13
	VPBLENDW $0x03, X13, X14, X14
	VPSHUFD  $0x08, X2, X13
	VPBLENDW $0xcc, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+240(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 4 diagonal step part 3: undiagonalize.
	VPSHUFD $0x93, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x39, X10, X10

	// Round 5 column step.
	VPSHUFD  $0x20, X0, X14
	VPSHUFD  $0x04, X1, X13
	VPBLENDW $0x0c, X13, X14, X14
	VPSHUFD  $0x81, X2, X13
	VPBLENDW $0xc3, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+256(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x00, X0, X14
	VPSHUFD  $0x0c, X1, X13
	VPBLENDW $0x3c, X13, X14, X14
	VPSHUFD  $0xc0, X3, X13
	VPBLENDW $0xc0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+272(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 5 diagonal step part 1: diagonalize.
	VPSHUFD $0x39, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x93, X10, X10

	// Round 5 diagonal step part 2: column step.
	VPSHUFD  $0xc0, X0, X14
	VPSHUFD  $0x20, X1, X13
	VPBLENDW $0x30, X13, X14, X14
	VPSHUFD  $0x0c, X2, X13
	VPBLENDW $0x0c, X13, X14, X14
	VPSHUFD  $0x02, X3, X13
	VPBLENDW $0x03, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+288(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x01, X0, X14
	VPSHUFD  $0x00, X2, X13
	VPBLENDW $0x30, X13, X14, X14
	VPSHUFD  $0x40, X3, X13
	VPBLENDW $0xcc, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+304(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 5 diagonal step part 3: undiagonalize.
	VPSHUFD $0x93, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x39, X10, X10

	// Round 6 column step.
	VPSHUFD  $0x02, X0, X14
	VPSHUFD  $0x08, X1, X13
	VPBLENDW $0x0c, X13, X14, X14
	VPSHUFD  $0x00, X2, X13
	VPBLENDW $0xc0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+320(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0xc0, X0, X14
	VPSHUFD  $0x38, X2, X13
	VPBLENDW $0x3c, X13, X14, X14
	VPSHUFD  $0x00, X3, X13
	VPBLENDW $0x03, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+336(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 6 diagonal step part 1: diagonalize.
	VPSHUFD $0x39, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x93, X10, X10

	// Round 6 diagonal step part 2: column step.
	VPSHUFD  $0x40, X0, X14
	VPSHUFD  $0x0c, X1, X13
	VPBLENDW $0x0f, X13, X14, X14
	VPSHUFD  $0x30, X3, X13
	VPBLENDW $0x30, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+352(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x04, X1, X14
	VPSHUFD  $0x40, X2, X13
	VPBLENDW $0xc0, X13, X14, X14
	VPSHUFD  $0x21, X3, X13
	VPBLENDW $0x33, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+368(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 6 diagonal step part 3: undiagonalize.
	VPSHUFD $0x93, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x39, X10, X10

	// Round 7 column step.
	VPSHUFD  $0x04, X0, X14
	VPSHUFD  $0x00, X1, X13
	VPBLENDW $0xc0, X13, X14, X14
	VPSHUFD  $0x20, X3, X13
	VPBLENDW $0x33, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+384(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x01, X1, X14
	VPSHUFD  $0x80, X2, X13
	VPBLENDW $0xc0, X13, X14, X14
	VPSHUFD  $0x1c, X3, X13
	VPBLENDW $0x3c, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+400(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 7 diagonal step part 1: diagonalize.
	VPSHUFD $0x39, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x93, X10, X10

	// Round 7 diagonal step part 2: column step.
	VPSHUFD  $0x00, X0, X14
	VPSHUFD  $0x08, X1, X13
	VPBLENDW $0x0c, X13, X14, X14
	VPSHUFD  $0x10, X2, X13
	VPBLENDW $0xf0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+416(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x2c, X0, X14
	VPSHUFD  $0x03, X1, X13
	VPBLENDW $0x03, X13, X14, X14
	VPSHUFD  $0xc0, X2, X13
	VPBLENDW $0xc0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+432(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 7 diagonal step part 3: undiagonalize.
	VPSHUFD $0x93, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x39, X10, X10

	// Round 8 column step.
	VPSHUFD  $0xc0, X0, X14
	VPSHUFD  $0x0c, X1, X13
	VPBLENDW $0x0c, X13, X14, X14
	VPSHUFD  $0x01, X3, X13
	VPBLENDW $0x33, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+448(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x10, X0, X14
	VPSHUFD  $0x43, X2, X13
	VPBLENDW $0xc3, X13, X14, X14
	VPSHUFD  $0x08, X3, X13
	VPBLENDW $0x0c, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+464(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 8 diagonal step part 1: diagonalize.
	VPSHUFD $0x39, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x93, X10, X10

	// Round 8 diagonal step part 2: column step.
	VPSHUFD  $0x80, X0, X14
	VPSHUFD  $0x01, X1, X13
	VPBLENDW $0x03, X13, X14, X14
	VPSHUFD  $0x00, X2, X13
	VPBLENDW $0x30, X13, X14, X14
	VPSHUFD  $0x0c, X3, X13
	VPBLENDW $0x0c, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+480(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x00, X0, X14
	VPSHUFD  $0x20, X1, X13
	VPBLENDW $0x3c, X13, X14, X14
	VPSHUFD  $0x80, X2, X13
	VPBLENDW $0xc0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+496(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 8 diagonal step part 3: undiagonalize.
	VPSHUFD $0x93, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x39, X10, X10

	// Round 9 column step.
	VPSHUFD  $0x00, X0, X14
	VPSHUFD  $0x02, X1, X13
	VPBLENDW $0x03, X13, X14, X14
	VPSHUFD  $0x30, X2, X13
	VPBLENDW $0x30, X13, X14, X14
	VPSHUFD  $0x08, X3, X13
	VPBLENDW $0x0c, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+512(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x30, X0, X14
	VPSHUFD  $0x04, X2, X13
	VPBLENDW $0xcc, X13, X14, X14
	VPSHUFD  $0x03, X3, X13
	VPBLENDW $0x03, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+528(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 9 diagonal step part 1: diagonalize.
	VPSHUFD $0x39, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x93, X10, X10

	// Round 9 diagonal step part 2: column step.
	VPSHUFD  $0x10, X0, X14
	VPSHUFD  $0x80, X2, X13
	VPBLENDW $0xc0, X13, X14, X14
	VPSHUFD  $0x04, X3, X13
	VPBLENDW $0x0f, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+544(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x02, X0, X14
	VPSHUFD  $0x4c, X1, X13
	VPBLENDW $0xfc, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+560(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 9 diagonal step part 3: undiagonalize.
	VPSHUFD $0x93, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x39, X10, X10

	// Round 10 column step.
	VPSHUFD  $0x40, X0, X14
	VPSHUFD  $0x30, X1, X13
	VPBLENDW $0x30, X13, X14, X14
	VPSHUFD  $0x02, X2, X13
	VPBLENDW $0x0f, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+576(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x02, X0, X14
	VPSHUFD  $0x60, X1, X13
	VPBLENDW $0xfc, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+592(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 10 diagonal step part 1: diagonalize.
	VPSHUFD $0x39, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x93, X10, X10

	// Round 10 diagonal step part 2: column step.
	VPSHUFD  $0x30, X0, X14
	VPSHUFD  $0x04, X2, X13
	VPBLENDW $0x0c, X13, X14, X14
	VPSHUFD  $0x43, X3, X13
	VPBLENDW $0xc3, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+608(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x00, X0, X14
	VPSHUFD  $0x03, X2, X13
	VPBLENDW $0x03, X13, X14, X14
	VPSHUFD  $0x08, X3, X13
	VPBLENDW $0x3c, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+624(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 10 diagonal step part 3: undiagonalize.
	VPSHUFD $0x93, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x39, X10, X10

	// Round 11 column step.
	VPSHUFD  $0x08, X0, X14
	VPSHUFD  $0x80, X1, X13
	VPBLENDW $0xf0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+0(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x0d, X0, X14
	VPSHUFD  $0xd0, X1, X13
	VPBLENDW $0xf0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+16(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 11 diagonal step part 1: diagonalize.
	VPSHUFD $0x39, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x93, X10, X10

	// Round 11 diagonal step part 2: column step.
	VPSHUFD  $0x08, X2, X14
	VPSHUFD  $0x80, X3, X13
	VPBLENDW $0xf0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+32(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x0d, X2, X14
	VPSHUFD  $0xd0, X3, X13
	VPBLENDW $0xf0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+48(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 11 diagonal step part 3: undiagonalize.
	VPSHUFD $0x93, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x39, X10, X10

	// Round 12 column step.
	VPSHUFD  $0x00, X1, X14
	VPSHUFD  $0x10, X2, X13
	VPBLENDW $0x30, X13, X14, X14
	VPSHUFD  $0x42, X3, X13
	VPBLENDW $0xc3, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+64(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x80, X1, X14
	VPSHUFD  $0x02, X2, X13
	VPBLENDW $0x0f, X13, X14, X14
	VPSHUFD  $0x30, X3, X13
	VPBLENDW $0x30, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+80(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 12 diagonal step part 1: diagonalize.
	VPSHUFD $0x39, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x93, X10, X10

	// Round 12 diagonal step part 2: column step.
	VPSHUFD  $0x01, X0, X14
	VPSHUFD  $0x40, X1, X13
	VPBLENDW $0xc0, X13, X14, X14
	VPSHUFD  $0x30, X2, X13
	VPBLENDW $0x30, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+96(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0xc8, X0, X14
	VPSHUFD  $0x30, X1, X13
	VPBLENDW $0x30, X13, X14, X14
	VPSHUFD  $0x00, X3, X13
	VPBLENDW $0x03, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+112(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 12 diagonal step part 3: undiagonalize.
	VPSHUFD $0x93, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x39, X10, X10

	// Round 13 column step.
	VPSHUFD  $0x10, X1, X14
	VPSHUFD  $0x03, X2, X13
	VPBLENDW $0x03, X13, X14, X14
	VPSHUFD  $0xc0, X3, X13
	VPBLENDW $0xcc, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+128(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x20, X0, X14
	VPSHUFD  $0x00, X2, X13
	VPBLENDW $0x03, X13, X14, X14
	VPSHUFD  $0x40, X3, X13
	VPBLENDW $0xc0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+144(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 13 diagonal step part 1: diagonalize.
	VPSHUFD $0x39, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x93, X10, X10

	// Round 13 diagonal step part 2: column step.
	VPSHUFD  $0x0c, X0, X14
	VPSHUFD  $0x30, X1, X13
	VPBLENDW $0x30, X13, X14, X14
	VPSHUFD  $0x42, X2, X13
	VPBLENDW $0xc3, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+160(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x10, X0, X14
	VPSHUFD  $0x08, X1, X13
	VPBLENDW $0xcc, X13, X14, X14
	VPSHUFD  $0x02, X3, X13
	VPBLENDW $0x03, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+176(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 13 diagonal step part 3: undiagonalize.
	VPSHUFD $0x93, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x39, X10, X10

	// Round 14 column step.
	VPSHUFD  $0x0c, X0, X14
	VPSHUFD  $0x03, X1, X13
	VPBLENDW $0x03, X13, X14, X14
	VPSHUFD  $0xc0, X2, X13
	VPBLENDW $0xc0, X13, X14, X14
	VPSHUFD  $0x10, X3, X13
	VPBLENDW $0x30, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+192(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x04, X0, X14
	VPSHUFD  $0x01, X2, X13
	VPBLENDW $0x03, X13, X14, X14
	VPSHUFD  $0x80, X3, X13
	VPBLENDW $0xf0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+208(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 14 diagonal step part 1: diagonalize.
	VPSHUFD $0x39, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x93, X10, X10

	// Round 14 diagonal step part 2: column step.
	VPSHUFD  $0x02, X0, X14
	VPSHUFD  $0x04, X1, X13
	VPBLENDW $0x3c, X13, X14, X14
	VPSHUFD  $0xc0, X3, X13
	VPBLENDW $0xc0, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+224(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X15, X7, X7
	VPSHUFD  $0x00, X0, X14
	VPSHUFD  $0x02, X1, X13
	VPBLENDW $0x03, X13, X14, X14
	VPSHUFD  $0x08, X2, X13
	VPBLENDW $0xcc, X13, X14, X14
	VMOVDQU  permuted_blake_consts<>+240(SB), X15
	VPXOR    X14, X15, X15
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X5, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x0c, X8, X13
	VPSLLD   $0x14, X8, X8
	VPXOR    X13, X8, X8
	VPADDD   X15, X7, X7
	VPADDD   X8, X7, X7
	VPXOR    X7, X10, X10
	VPSHUFB  X4, X10, X10
	VPADDD   X10, X9, X9
	VPXOR    X9, X8, X8
	VPSRLD   $0x07, X8, X13
	VPSLLD   $0x19, X8, X8
	VPXOR    X13, X8, X8

	// Round 14 diagonal step part 3: undiagonalize.
	VPSHUFD $0x93, X8, X8
	VPSHUFD $0x4e, X9, X9
	VPSHUFD $0x39, X10, X10

	// Finally the chain value is defined as:
	// h'0 = h0^s0^v0^v8
	// h'1 = h1^s1^v1^v9
	// h'2 = h2^s2^v2^va
	// h'3 = h3^s3^v3^vb
	// h'4 = h4^s0^v4^vc
	// h'5 = h5^s1^v5^vd
	// h'6 = h6^s2^v6^ve
	// h'7 = h7^s3^v7^vf
	VPXOR X11, X7, X7
	VPXOR X6, X7, X7
	VPXOR X9, X7, X7
	VPXOR X12, X8, X8
	VPXOR X6, X8, X8
	VPXOR X10, X8, X8

	// Either terminate the loop when there are no more full blocks
	// to compress or move the message pointer to the next block of
	// bytes to compress, increment the message bits counter
	// accordingly, and loop back around to compress it.
	DECQ BX
	JZ   done
	LEAQ 64(DX), DX
	ADDQ $0x00000200, CX
	JMP  compressLoop

done:
	// Output the resulting chain value.
	VMOVDQU X7, (AX)
	VMOVDQU X8, 16(AX)
	RET
