blocks_amd64.s raw
1 // Code generated by command: go run gen_amd64_compress_asm.go -out ../compress/blocks_amd64.s -stubs ../compress/blocks_amd64.go -pkg compress. DO NOT EDIT.
2
3 //go:build !purego
4
5 #include "textflag.h"
6
// first_8_blake_consts is a 32-byte read-only table (no pointers) holding
// eight 32-bit constants. amd64 stores each 8-byte DATA word little-endian,
// so the 32-bit lanes in memory order are:
//   0x243f6a88 0x85a308d3 0x13198a2e 0x03707344
//   0xa4093822 0x299f31d0 0x082efa98 0xec4e6c89
// The row2/row3 comments in blocksSSE2 label these lanes c0..c7.
// At the top of compressLoop, blocksSSE2 XORs the low 16 bytes (c0..c3) with
// X0 to form row2, and XORs the high 16 bytes (c4..c7) into the broadcast
// counter lanes to form row3.
7 DATA first_8_blake_consts<>+0(SB)/8, $0x85a308d3243f6a88
8 DATA first_8_blake_consts<>+8(SB)/8, $0x0370734413198a2e
9 DATA first_8_blake_consts<>+16(SB)/8, $0x299f31d0a4093822
10 DATA first_8_blake_consts<>+24(SB)/8, $0xec4e6c89082efa98
11 GLOBL first_8_blake_consts<>(SB), RODATA|NOPTR, $32
12
// permuted_blake_consts is a 640-byte read-only table (no pointers) of round
// constants, laid out as 10 groups of 64 bytes, one group per round.
// Within a group, the two 16-byte vectors at +0 and +16 are used by the
// round's column step and the two at +32 and +48 by its diagonal step.
// blocksSSE2 loads each vector with MOVOU, XORs it with four message words
// gathered from the byte-swapped block on the stack, and adds the result
// into X1. The constants are stored already arranged in the lane order of
// the message words they are paired with, so no shuffle is needed at run time.
// Rounds beyond the tenth wrap around: round 11 reloads offsets 0..63 and
// round 12 reloads offsets 64..127.
//
// Round 1 (offsets 0..63); reused by round 11.
13 DATA permuted_blake_consts<>+0(SB)/8, $0x0370734485a308d3
14 DATA permuted_blake_consts<>+8(SB)/8, $0xec4e6c89299f31d0
15 DATA permuted_blake_consts<>+16(SB)/8, $0x13198a2e243f6a88
16 DATA permuted_blake_consts<>+24(SB)/8, $0x082efa98a4093822
17 DATA permuted_blake_consts<>+32(SB)/8, $0x34e90c6c38d01377
18 DATA permuted_blake_consts<>+40(SB)/8, $0xb5470917c97c50dd
19 DATA permuted_blake_consts<>+48(SB)/8, $0xbe5466cf452821e6
20 DATA permuted_blake_consts<>+56(SB)/8, $0x3f84d5b5c0ac29b7
// Round 2 (offsets 64..127); reused by round 12.
21 DATA permuted_blake_consts<>+64(SB)/8, $0x452821e6be5466cf
22 DATA permuted_blake_consts<>+72(SB)/8, $0x082efa98b5470917
23 DATA permuted_blake_consts<>+80(SB)/8, $0xa40938223f84d5b5
24 DATA permuted_blake_consts<>+88(SB)/8, $0xc97c50dd38d01377
25 DATA permuted_blake_consts<>+96(SB)/8, $0x13198a2ec0ac29b7
26 DATA permuted_blake_consts<>+104(SB)/8, $0x03707344ec4e6c89
27 DATA permuted_blake_consts<>+112(SB)/8, $0x243f6a8885a308d3
28 DATA permuted_blake_consts<>+120(SB)/8, $0x299f31d034e90c6c
// Round 3 (offsets 128..191).
29 DATA permuted_blake_consts<>+128(SB)/8, $0x243f6a88452821e6
30 DATA permuted_blake_consts<>+136(SB)/8, $0xc97c50dd13198a2e
31 DATA permuted_blake_consts<>+144(SB)/8, $0xc0ac29b734e90c6c
32 DATA permuted_blake_consts<>+152(SB)/8, $0xb5470917299f31d0
33 DATA permuted_blake_consts<>+160(SB)/8, $0x082efa983f84d5b5
34 DATA permuted_blake_consts<>+168(SB)/8, $0xa409382285a308d3
35 DATA permuted_blake_consts<>+176(SB)/8, $0x03707344be5466cf
36 DATA permuted_blake_consts<>+184(SB)/8, $0x38d01377ec4e6c89
// Round 4 (offsets 192..255).
37 DATA permuted_blake_consts<>+192(SB)/8, $0x85a308d338d01377
38 DATA permuted_blake_consts<>+200(SB)/8, $0x3f84d5b5c0ac29b7
39 DATA permuted_blake_consts<>+208(SB)/8, $0x03707344ec4e6c89
40 DATA permuted_blake_consts<>+216(SB)/8, $0x34e90c6cc97c50dd
41 DATA permuted_blake_consts<>+224(SB)/8, $0xbe5466cf082efa98
42 DATA permuted_blake_consts<>+232(SB)/8, $0x452821e6243f6a88
43 DATA permuted_blake_consts<>+240(SB)/8, $0x299f31d013198a2e
44 DATA permuted_blake_consts<>+248(SB)/8, $0xb5470917a4093822
// Round 5 (offsets 256..319).
45 DATA permuted_blake_consts<>+256(SB)/8, $0xec4e6c89243f6a88
46 DATA permuted_blake_consts<>+264(SB)/8, $0xb5470917a4093822
47 DATA permuted_blake_consts<>+272(SB)/8, $0x299f31d038d01377
48 DATA permuted_blake_consts<>+280(SB)/8, $0xbe5466cf13198a2e
49 DATA permuted_blake_consts<>+288(SB)/8, $0xc0ac29b785a308d3
50 DATA permuted_blake_consts<>+296(SB)/8, $0xc97c50dd452821e6
51 DATA permuted_blake_consts<>+304(SB)/8, $0x34e90c6c3f84d5b5
52 DATA permuted_blake_consts<>+312(SB)/8, $0x03707344082efa98
// Round 6 (offsets 320..383).
53 DATA permuted_blake_consts<>+320(SB)/8, $0xbe5466cfc0ac29b7
54 DATA permuted_blake_consts<>+328(SB)/8, $0x0370734434e90c6c
55 DATA permuted_blake_consts<>+336(SB)/8, $0x082efa9813198a2e
56 DATA permuted_blake_consts<>+344(SB)/8, $0x452821e6243f6a88
57 DATA permuted_blake_consts<>+352(SB)/8, $0x299f31d0c97c50dd
58 DATA permuted_blake_consts<>+360(SB)/8, $0x38d013773f84d5b5
59 DATA permuted_blake_consts<>+368(SB)/8, $0xec4e6c89a4093822
60 DATA permuted_blake_consts<>+376(SB)/8, $0x85a308d3b5470917
// Round 7 (offsets 384..447).
61 DATA permuted_blake_consts<>+384(SB)/8, $0xb5470917299f31d0
62 DATA permuted_blake_consts<>+392(SB)/8, $0xbe5466cfc97c50dd
63 DATA permuted_blake_consts<>+400(SB)/8, $0x85a308d3c0ac29b7
64 DATA permuted_blake_consts<>+408(SB)/8, $0xa40938223f84d5b5
65 DATA permuted_blake_consts<>+416(SB)/8, $0x03707344ec4e6c89
66 DATA permuted_blake_consts<>+424(SB)/8, $0x34e90c6c13198a2e
67 DATA permuted_blake_consts<>+432(SB)/8, $0x082efa98243f6a88
68 DATA permuted_blake_consts<>+440(SB)/8, $0x452821e638d01377
// Round 8 (offsets 448..511).
69 DATA permuted_blake_consts<>+448(SB)/8, $0x3f84d5b534e90c6c
70 DATA permuted_blake_consts<>+456(SB)/8, $0x38d0137785a308d3
71 DATA permuted_blake_consts<>+464(SB)/8, $0xec4e6c89c97c50dd
72 DATA permuted_blake_consts<>+472(SB)/8, $0x03707344c0ac29b7
73 DATA permuted_blake_consts<>+480(SB)/8, $0xa4093822243f6a88
74 DATA permuted_blake_consts<>+488(SB)/8, $0xbe5466cf082efa98
75 DATA permuted_blake_consts<>+496(SB)/8, $0xb5470917299f31d0
76 DATA permuted_blake_consts<>+504(SB)/8, $0x13198a2e452821e6
// Round 9 (offsets 512..575).
77 DATA permuted_blake_consts<>+512(SB)/8, $0x38d01377b5470917
78 DATA permuted_blake_consts<>+520(SB)/8, $0x452821e603707344
79 DATA permuted_blake_consts<>+528(SB)/8, $0x3f84d5b5082efa98
80 DATA permuted_blake_consts<>+536(SB)/8, $0x243f6a8834e90c6c
81 DATA permuted_blake_consts<>+544(SB)/8, $0xec4e6c8913198a2e
82 DATA permuted_blake_consts<>+552(SB)/8, $0x299f31d0a4093822
83 DATA permuted_blake_consts<>+560(SB)/8, $0xc97c50ddc0ac29b7
84 DATA permuted_blake_consts<>+568(SB)/8, $0xbe5466cf85a308d3
// Round 10 (offsets 576..639).
85 DATA permuted_blake_consts<>+576(SB)/8, $0xa409382213198a2e
86 DATA permuted_blake_consts<>+584(SB)/8, $0x299f31d0082efa98
87 DATA permuted_blake_consts<>+592(SB)/8, $0x452821e6be5466cf
88 DATA permuted_blake_consts<>+600(SB)/8, $0x85a308d3ec4e6c89
89 DATA permuted_blake_consts<>+608(SB)/8, $0x3f84d5b534e90c6c
90 DATA permuted_blake_consts<>+616(SB)/8, $0x243f6a88c0ac29b7
91 DATA permuted_blake_consts<>+624(SB)/8, $0x38d01377b5470917
92 DATA permuted_blake_consts<>+632(SB)/8, $0xc97c50dd03707344
93 GLOBL permuted_blake_consts<>(SB), RODATA|NOPTR, $640
94
// shuffle_rotr8_4x32 is a 16-byte read-only table of byte indices. In memory
// order the bytes are 01 02 03 00 | 05 06 07 04 | 09 0a 0b 08 | 0d 0e 0f 0c,
// so when used as a byte-shuffle control each 32-bit lane takes its source
// bytes 1,2,3,0, i.e. every lane is rotated right by 8 bits.
// NOTE(review): nothing in the visible part of this file references this
// table (blocksSSE2 rotates with PSRLL/PSLLL/PXOR); presumably it is a PSHUFB
// mask for another variant defined elsewhere in the file. Confirm.
95 DATA shuffle_rotr8_4x32<>+0(SB)/8, $0x0407060500030201
96 DATA shuffle_rotr8_4x32<>+8(SB)/8, $0x0c0f0e0d080b0a09
97 GLOBL shuffle_rotr8_4x32<>(SB), RODATA|NOPTR, $16
98
// shuffle_rotr16_4x32 is a 16-byte read-only table of byte indices. In memory
// order the bytes are 02 03 00 01 | 06 07 04 05 | 0a 0b 08 09 | 0e 0f 0c 0d,
// so when used as a byte-shuffle control each 32-bit lane takes its source
// bytes 2,3,0,1, i.e. every lane is rotated right by 16 bits.
// NOTE(review): nothing in the visible part of this file references this
// table (blocksSSE2 rotates with PSRLL/PSLLL/PXOR); presumably it is a PSHUFB
// mask for another variant defined elsewhere in the file. Confirm.
99 DATA shuffle_rotr16_4x32<>+0(SB)/8, $0x0504070601000302
100 DATA shuffle_rotr16_4x32<>+8(SB)/8, $0x0d0c0f0e09080b0a
101 GLOBL shuffle_rotr16_4x32<>(SB), RODATA|NOPTR, $16
102
// shuffle_le_to_be_4x32 is a 16-byte read-only table of byte indices. In
// memory order the bytes are 03 02 01 00 | 07 06 05 04 | 0b 0a 09 08 |
// 0f 0e 0d 0c, so when used as a byte-shuffle control it reverses the byte
// order inside each of the four 32-bit lanes (endianness swap per word).
// NOTE(review): nothing in the visible part of this file references this
// table (blocksSSE2 byte-swaps the message with BSWAPL); presumably it is a
// PSHUFB mask for another variant defined elsewhere in the file. Confirm.
103 DATA shuffle_le_to_be_4x32<>+0(SB)/8, $0x0405060700010203
104 DATA shuffle_le_to_be_4x32<>+8(SB)/8, $0x0c0d0e0f08090a0b
105 GLOBL shuffle_le_to_be_4x32<>(SB), RODATA|NOPTR, $16
106
107 // func blocksSSE2(state *State, msg []byte, counter uint64)
108 // Requires: SSE2
109 TEXT ·blocksSSE2(SB), $64-40
110 MOVQ state+0(FP), AX
111 MOVQ counter+32(FP), CX
112 MOVQ msg_base+8(FP), DX
113 MOVQ msg_len+16(FP), BX
114
115 // Convert message len to number of blocks for loop counter.
116 SHRQ $0x06, BX
117
118 // Initialize state matrix.
119 // row0 = |v0 v1 v2 v3| | h0 h1 h2 h3 |
120 // row1 = |v4 v5 v6 v7| | h4 h5 h6 h7 |
121 MOVOU 32(AX), X0
122 MOVOU (AX), X1
123 MOVOU 16(AX), X2
124
125 compressLoop:
126 // row2 = |v8 v9 va vb| = |s0^c0 s1^c1 s2^c2 s3^c3|
127 // row3 = |vc vd ve vf| |t0^c4 t0^c5 t1^c6 t1^c7|
128 MOVOU first_8_blake_consts<>+0(SB), X3
129 PXOR X0, X3
130 MOVD CX, X4
131 PSHUFD $0x50, X4, X4
132 PXOR first_8_blake_consts<>+16(SB), X4
133 MOVO X1, X5
134 MOVO X2, X6
135
136 // Convert message to big endian.
137 MOVL (DX), SI
138 MOVL 4(DX), DI
139 MOVL 8(DX), R8
140 MOVL 12(DX), R9
141 MOVL 16(DX), R10
142 MOVL 20(DX), R11
143 MOVL 24(DX), R12
144 MOVL 28(DX), R13
145 BSWAPL SI
146 MOVL SI, (SP)
147 BSWAPL DI
148 MOVL DI, 4(SP)
149 BSWAPL R8
150 MOVL R8, 8(SP)
151 BSWAPL R9
152 MOVL R9, 12(SP)
153 BSWAPL R10
154 MOVL R10, 16(SP)
155 BSWAPL R11
156 MOVL R11, 20(SP)
157 BSWAPL R12
158 MOVL R12, 24(SP)
159 BSWAPL R13
160 MOVL R13, 28(SP)
161 MOVL 32(DX), SI
162 MOVL 36(DX), DI
163 MOVL 40(DX), R8
164 MOVL 44(DX), R9
165 MOVL 48(DX), R10
166 MOVL 52(DX), R11
167 MOVL 56(DX), R12
168 MOVL 60(DX), R13
169 BSWAPL SI
170 MOVL SI, 32(SP)
171 BSWAPL DI
172 MOVL DI, 36(SP)
173 BSWAPL R8
174 MOVL R8, 40(SP)
175 BSWAPL R9
176 MOVL R9, 44(SP)
177 BSWAPL R10
178 MOVL R10, 48(SP)
179 BSWAPL R11
180 MOVL R11, 52(SP)
181 BSWAPL R12
182 MOVL R12, 56(SP)
183 BSWAPL R13
184 MOVL R13, 60(SP)
185
186 // Round 1 column step.
187 MOVD 24(SP), X9
188 MOVD 16(SP), X7
189 MOVOA X7, X8
190 PUNPCKLLQ X9, X8
191 MOVD 8(SP), X7
192 MOVD (SP), X9
193 PUNPCKLLQ X7, X9
194 PUNPCKLQDQ X8, X9
195 MOVOU permuted_blake_consts<>+0(SB), X8
196 PXOR X9, X8
197 PADDD X8, X1
198 MOVD 28(SP), X9
199 MOVD 20(SP), X7
200 MOVOA X7, X8
201 PUNPCKLLQ X9, X8
202 MOVD 12(SP), X7
203 MOVD 4(SP), X9
204 PUNPCKLLQ X7, X9
205 PUNPCKLQDQ X8, X9
206 MOVOU permuted_blake_consts<>+16(SB), X8
207 PXOR X9, X8
208 PADDD X2, X1
209 PXOR X1, X4
210 MOVO X4, X7
211 PSRLL $0x10, X7
212 PSLLL $0x10, X4
213 PXOR X7, X4
214 PADDD X4, X3
215 PXOR X3, X2
216 MOVO X2, X7
217 PSRLL $0x0c, X7
218 PSLLL $0x14, X2
219 PXOR X7, X2
220 PADDD X8, X1
221 PADDD X2, X1
222 PXOR X1, X4
223 MOVO X4, X7
224 PSRLL $0x08, X7
225 PSLLL $0x18, X4
226 PXOR X7, X4
227 PADDD X4, X3
228 PXOR X3, X2
229 MOVO X2, X7
230 PSRLL $0x07, X7
231 PSLLL $0x19, X2
232 PXOR X7, X2
233
234 // Round 1 diagonal step part 1: diagonalize.
235 PSHUFD $0x39, X2, X2
236 PSHUFD $0x4e, X3, X3
237 PSHUFD $0x93, X4, X4
238
239 // Round 1 diagonal step part 2: column step.
240 MOVD 56(SP), X9
241 MOVD 48(SP), X7
242 MOVOA X7, X8
243 PUNPCKLLQ X9, X8
244 MOVD 40(SP), X7
245 MOVD 32(SP), X9
246 PUNPCKLLQ X7, X9
247 PUNPCKLQDQ X8, X9
248 MOVOU permuted_blake_consts<>+32(SB), X8
249 PXOR X9, X8
250 PADDD X8, X1
251 MOVD 60(SP), X9
252 MOVD 52(SP), X7
253 MOVOA X7, X8
254 PUNPCKLLQ X9, X8
255 MOVD 44(SP), X7
256 MOVD 36(SP), X9
257 PUNPCKLLQ X7, X9
258 PUNPCKLQDQ X8, X9
259 MOVOU permuted_blake_consts<>+48(SB), X8
260 PXOR X9, X8
261 PADDD X2, X1
262 PXOR X1, X4
263 MOVO X4, X7
264 PSRLL $0x10, X7
265 PSLLL $0x10, X4
266 PXOR X7, X4
267 PADDD X4, X3
268 PXOR X3, X2
269 MOVO X2, X7
270 PSRLL $0x0c, X7
271 PSLLL $0x14, X2
272 PXOR X7, X2
273 PADDD X8, X1
274 PADDD X2, X1
275 PXOR X1, X4
276 MOVO X4, X7
277 PSRLL $0x08, X7
278 PSLLL $0x18, X4
279 PXOR X7, X4
280 PADDD X4, X3
281 PXOR X3, X2
282 MOVO X2, X7
283 PSRLL $0x07, X7
284 PSLLL $0x19, X2
285 PXOR X7, X2
286
287 // Round 1 diagonal step part 3: undiagonalize.
288 PSHUFD $0x93, X2, X2
289 PSHUFD $0x4e, X3, X3
290 PSHUFD $0x39, X4, X4
291
292 // Round 2 column step.
293 MOVD 52(SP), X9
294 MOVD 36(SP), X7
295 MOVOA X7, X8
296 PUNPCKLLQ X9, X8
297 MOVD 16(SP), X7
298 MOVD 56(SP), X9
299 PUNPCKLLQ X7, X9
300 PUNPCKLQDQ X8, X9
301 MOVOU permuted_blake_consts<>+64(SB), X8
302 PXOR X9, X8
303 PADDD X8, X1
304 MOVD 24(SP), X9
305 MOVD 60(SP), X7
306 MOVOA X7, X8
307 PUNPCKLLQ X9, X8
308 MOVD 32(SP), X7
309 MOVD 40(SP), X9
310 PUNPCKLLQ X7, X9
311 PUNPCKLQDQ X8, X9
312 MOVOU permuted_blake_consts<>+80(SB), X8
313 PXOR X9, X8
314 PADDD X2, X1
315 PXOR X1, X4
316 MOVO X4, X7
317 PSRLL $0x10, X7
318 PSLLL $0x10, X4
319 PXOR X7, X4
320 PADDD X4, X3
321 PXOR X3, X2
322 MOVO X2, X7
323 PSRLL $0x0c, X7
324 PSLLL $0x14, X2
325 PXOR X7, X2
326 PADDD X8, X1
327 PADDD X2, X1
328 PXOR X1, X4
329 MOVO X4, X7
330 PSRLL $0x08, X7
331 PSLLL $0x18, X4
332 PXOR X7, X4
333 PADDD X4, X3
334 PXOR X3, X2
335 MOVO X2, X7
336 PSRLL $0x07, X7
337 PSLLL $0x19, X2
338 PXOR X7, X2
339
340 // Round 2 diagonal step part 1: diagonalize.
341 PSHUFD $0x39, X2, X2
342 PSHUFD $0x4e, X3, X3
343 PSHUFD $0x93, X4, X4
344
345 // Round 2 diagonal step part 2: column step.
346 MOVD 20(SP), X9
347 MOVD 44(SP), X7
348 MOVOA X7, X8
349 PUNPCKLLQ X9, X8
350 MOVD (SP), X7
351 MOVD 4(SP), X9
352 PUNPCKLLQ X7, X9
353 PUNPCKLQDQ X8, X9
354 MOVOU permuted_blake_consts<>+96(SB), X8
355 PXOR X9, X8
356 PADDD X8, X1
357 MOVD 12(SP), X9
358 MOVD 28(SP), X7
359 MOVOA X7, X8
360 PUNPCKLLQ X9, X8
361 MOVD 8(SP), X7
362 MOVD 48(SP), X9
363 PUNPCKLLQ X7, X9
364 PUNPCKLQDQ X8, X9
365 MOVOU permuted_blake_consts<>+112(SB), X8
366 PXOR X9, X8
367 PADDD X2, X1
368 PXOR X1, X4
369 MOVO X4, X7
370 PSRLL $0x10, X7
371 PSLLL $0x10, X4
372 PXOR X7, X4
373 PADDD X4, X3
374 PXOR X3, X2
375 MOVO X2, X7
376 PSRLL $0x0c, X7
377 PSLLL $0x14, X2
378 PXOR X7, X2
379 PADDD X8, X1
380 PADDD X2, X1
381 PXOR X1, X4
382 MOVO X4, X7
383 PSRLL $0x08, X7
384 PSLLL $0x18, X4
385 PXOR X7, X4
386 PADDD X4, X3
387 PXOR X3, X2
388 MOVO X2, X7
389 PSRLL $0x07, X7
390 PSLLL $0x19, X2
391 PXOR X7, X2
392
393 // Round 2 diagonal step part 3: undiagonalize.
394 PSHUFD $0x93, X2, X2
395 PSHUFD $0x4e, X3, X3
396 PSHUFD $0x39, X4, X4
397
398 // Round 3 column step.
399 MOVD 60(SP), X9
400 MOVD 20(SP), X7
401 MOVOA X7, X8
402 PUNPCKLLQ X9, X8
403 MOVD 48(SP), X7
404 MOVD 44(SP), X9
405 PUNPCKLLQ X7, X9
406 PUNPCKLQDQ X8, X9
407 MOVOU permuted_blake_consts<>+128(SB), X8
408 PXOR X9, X8
409 PADDD X8, X1
410 MOVD 52(SP), X9
411 MOVD 8(SP), X7
412 MOVOA X7, X8
413 PUNPCKLLQ X9, X8
414 MOVD (SP), X7
415 MOVD 32(SP), X9
416 PUNPCKLLQ X7, X9
417 PUNPCKLQDQ X8, X9
418 MOVOU permuted_blake_consts<>+144(SB), X8
419 PXOR X9, X8
420 PADDD X2, X1
421 PXOR X1, X4
422 MOVO X4, X7
423 PSRLL $0x10, X7
424 PSLLL $0x10, X4
425 PXOR X7, X4
426 PADDD X4, X3
427 PXOR X3, X2
428 MOVO X2, X7
429 PSRLL $0x0c, X7
430 PSLLL $0x14, X2
431 PXOR X7, X2
432 PADDD X8, X1
433 PADDD X2, X1
434 PXOR X1, X4
435 MOVO X4, X7
436 PSRLL $0x08, X7
437 PSLLL $0x18, X4
438 PXOR X7, X4
439 PADDD X4, X3
440 PXOR X3, X2
441 MOVO X2, X7
442 PSRLL $0x07, X7
443 PSLLL $0x19, X2
444 PXOR X7, X2
445
446 // Round 3 diagonal step part 1: diagonalize.
447 PSHUFD $0x39, X2, X2
448 PSHUFD $0x4e, X3, X3
449 PSHUFD $0x93, X4, X4
450
451 // Round 3 diagonal step part 2: column step.
452 MOVD 36(SP), X9
453 MOVD 28(SP), X7
454 MOVOA X7, X8
455 PUNPCKLLQ X9, X8
456 MOVD 12(SP), X7
457 MOVD 40(SP), X9
458 PUNPCKLLQ X7, X9
459 PUNPCKLQDQ X8, X9
460 MOVOU permuted_blake_consts<>+160(SB), X8
461 PXOR X9, X8
462 PADDD X8, X1
463 MOVD 16(SP), X9
464 MOVD 4(SP), X7
465 MOVOA X7, X8
466 PUNPCKLLQ X9, X8
467 MOVD 24(SP), X7
468 MOVD 56(SP), X9
469 PUNPCKLLQ X7, X9
470 PUNPCKLQDQ X8, X9
471 MOVOU permuted_blake_consts<>+176(SB), X8
472 PXOR X9, X8
473 PADDD X2, X1
474 PXOR X1, X4
475 MOVO X4, X7
476 PSRLL $0x10, X7
477 PSLLL $0x10, X4
478 PXOR X7, X4
479 PADDD X4, X3
480 PXOR X3, X2
481 MOVO X2, X7
482 PSRLL $0x0c, X7
483 PSLLL $0x14, X2
484 PXOR X7, X2
485 PADDD X8, X1
486 PADDD X2, X1
487 PXOR X1, X4
488 MOVO X4, X7
489 PSRLL $0x08, X7
490 PSLLL $0x18, X4
491 PXOR X7, X4
492 PADDD X4, X3
493 PXOR X3, X2
494 MOVO X2, X7
495 PSRLL $0x07, X7
496 PSLLL $0x19, X2
497 PXOR X7, X2
498
499 // Round 3 diagonal step part 3: undiagonalize.
500 PSHUFD $0x93, X2, X2
501 PSHUFD $0x4e, X3, X3
502 PSHUFD $0x39, X4, X4
503
504 // Round 4 column step.
505 MOVD 44(SP), X9
506 MOVD 52(SP), X7
507 MOVOA X7, X8
508 PUNPCKLLQ X9, X8
509 MOVD 12(SP), X7
510 MOVD 28(SP), X9
511 PUNPCKLLQ X7, X9
512 PUNPCKLQDQ X8, X9
513 MOVOU permuted_blake_consts<>+192(SB), X8
514 PXOR X9, X8
515 PADDD X8, X1
516 MOVD 56(SP), X9
517 MOVD 48(SP), X7
518 MOVOA X7, X8
519 PUNPCKLLQ X9, X8
520 MOVD 4(SP), X7
521 MOVD 36(SP), X9
522 PUNPCKLLQ X7, X9
523 PUNPCKLQDQ X8, X9
524 MOVOU permuted_blake_consts<>+208(SB), X8
525 PXOR X9, X8
526 PADDD X2, X1
527 PXOR X1, X4
528 MOVO X4, X7
529 PSRLL $0x10, X7
530 PSLLL $0x10, X4
531 PXOR X7, X4
532 PADDD X4, X3
533 PXOR X3, X2
534 MOVO X2, X7
535 PSRLL $0x0c, X7
536 PSLLL $0x14, X2
537 PXOR X7, X2
538 PADDD X8, X1
539 PADDD X2, X1
540 PXOR X1, X4
541 MOVO X4, X7
542 PSRLL $0x08, X7
543 PSLLL $0x18, X4
544 PXOR X7, X4
545 PADDD X4, X3
546 PXOR X3, X2
547 MOVO X2, X7
548 PSRLL $0x07, X7
549 PSLLL $0x19, X2
550 PXOR X7, X2
551
552 // Round 4 diagonal step part 1: diagonalize.
553 PSHUFD $0x39, X2, X2
554 PSHUFD $0x4e, X3, X3
555 PSHUFD $0x93, X4, X4
556
557 // Round 4 diagonal step part 2: column step.
558 MOVD 60(SP), X9
559 MOVD 16(SP), X7
560 MOVOA X7, X8
561 PUNPCKLLQ X9, X8
562 MOVD 20(SP), X7
563 MOVD 8(SP), X9
564 PUNPCKLLQ X7, X9
565 PUNPCKLQDQ X8, X9
566 MOVOU permuted_blake_consts<>+224(SB), X8
567 PXOR X9, X8
568 PADDD X8, X1
569 MOVD 32(SP), X9
570 MOVD (SP), X7
571 MOVOA X7, X8
572 PUNPCKLLQ X9, X8
573 MOVD 40(SP), X7
574 MOVD 24(SP), X9
575 PUNPCKLLQ X7, X9
576 PUNPCKLQDQ X8, X9
577 MOVOU permuted_blake_consts<>+240(SB), X8
578 PXOR X9, X8
579 PADDD X2, X1
580 PXOR X1, X4
581 MOVO X4, X7
582 PSRLL $0x10, X7
583 PSLLL $0x10, X4
584 PXOR X7, X4
585 PADDD X4, X3
586 PXOR X3, X2
587 MOVO X2, X7
588 PSRLL $0x0c, X7
589 PSLLL $0x14, X2
590 PXOR X7, X2
591 PADDD X8, X1
592 PADDD X2, X1
593 PXOR X1, X4
594 MOVO X4, X7
595 PSRLL $0x08, X7
596 PSLLL $0x18, X4
597 PXOR X7, X4
598 PADDD X4, X3
599 PXOR X3, X2
600 MOVO X2, X7
601 PSRLL $0x07, X7
602 PSLLL $0x19, X2
603 PXOR X7, X2
604
605 // Round 4 diagonal step part 3: undiagonalize.
606 PSHUFD $0x93, X2, X2
607 PSHUFD $0x4e, X3, X3
608 PSHUFD $0x39, X4, X4
609
610 // Round 5 column step.
611 MOVD 40(SP), X9
612 MOVD 8(SP), X7
613 MOVOA X7, X8
614 PUNPCKLLQ X9, X8
615 MOVD 20(SP), X7
616 MOVD 36(SP), X9
617 PUNPCKLLQ X7, X9
618 PUNPCKLQDQ X8, X9
619 MOVOU permuted_blake_consts<>+256(SB), X8
620 PXOR X9, X8
621 PADDD X8, X1
622 MOVD 60(SP), X9
623 MOVD 16(SP), X7
624 MOVOA X7, X8
625 PUNPCKLLQ X9, X8
626 MOVD 28(SP), X7
627 MOVD (SP), X9
628 PUNPCKLLQ X7, X9
629 PUNPCKLQDQ X8, X9
630 MOVOU permuted_blake_consts<>+272(SB), X8
631 PXOR X9, X8
632 PADDD X2, X1
633 PXOR X1, X4
634 MOVO X4, X7
635 PSRLL $0x10, X7
636 PSLLL $0x10, X4
637 PXOR X7, X4
638 PADDD X4, X3
639 PXOR X3, X2
640 MOVO X2, X7
641 PSRLL $0x0c, X7
642 PSLLL $0x14, X2
643 PXOR X7, X2
644 PADDD X8, X1
645 PADDD X2, X1
646 PXOR X1, X4
647 MOVO X4, X7
648 PSRLL $0x08, X7
649 PSLLL $0x18, X4
650 PXOR X7, X4
651 PADDD X4, X3
652 PXOR X3, X2
653 MOVO X2, X7
654 PSRLL $0x07, X7
655 PSLLL $0x19, X2
656 PXOR X7, X2
657
658 // Round 5 diagonal step part 1: diagonalize.
659 PSHUFD $0x39, X2, X2
660 PSHUFD $0x4e, X3, X3
661 PSHUFD $0x93, X4, X4
662
663 // Round 5 diagonal step part 2: column step.
664 MOVD 12(SP), X9
665 MOVD 24(SP), X7
666 MOVOA X7, X8
667 PUNPCKLLQ X9, X8
668 MOVD 44(SP), X7
669 MOVD 56(SP), X9
670 PUNPCKLLQ X7, X9
671 PUNPCKLQDQ X8, X9
672 MOVOU permuted_blake_consts<>+288(SB), X8
673 PXOR X9, X8
674 PADDD X8, X1
675 MOVD 52(SP), X9
676 MOVD 32(SP), X7
677 MOVOA X7, X8
678 PUNPCKLLQ X9, X8
679 MOVD 48(SP), X7
680 MOVD 4(SP), X9
681 PUNPCKLLQ X7, X9
682 PUNPCKLQDQ X8, X9
683 MOVOU permuted_blake_consts<>+304(SB), X8
684 PXOR X9, X8
685 PADDD X2, X1
686 PXOR X1, X4
687 MOVO X4, X7
688 PSRLL $0x10, X7
689 PSLLL $0x10, X4
690 PXOR X7, X4
691 PADDD X4, X3
692 PXOR X3, X2
693 MOVO X2, X7
694 PSRLL $0x0c, X7
695 PSLLL $0x14, X2
696 PXOR X7, X2
697 PADDD X8, X1
698 PADDD X2, X1
699 PXOR X1, X4
700 MOVO X4, X7
701 PSRLL $0x08, X7
702 PSLLL $0x18, X4
703 PXOR X7, X4
704 PADDD X4, X3
705 PXOR X3, X2
706 MOVO X2, X7
707 PSRLL $0x07, X7
708 PSLLL $0x19, X2
709 PXOR X7, X2
710
711 // Round 5 diagonal step part 3: undiagonalize.
712 PSHUFD $0x93, X2, X2
713 PSHUFD $0x4e, X3, X3
714 PSHUFD $0x39, X4, X4
715
716 // Round 6 column step.
717 MOVD 32(SP), X9
718 MOVD (SP), X7
719 MOVOA X7, X8
720 PUNPCKLLQ X9, X8
721 MOVD 24(SP), X7
722 MOVD 8(SP), X9
723 PUNPCKLLQ X7, X9
724 PUNPCKLQDQ X8, X9
725 MOVOU permuted_blake_consts<>+320(SB), X8
726 PXOR X9, X8
727 PADDD X8, X1
728 MOVD 12(SP), X9
729 MOVD 44(SP), X7
730 MOVOA X7, X8
731 PUNPCKLLQ X9, X8
732 MOVD 40(SP), X7
733 MOVD 48(SP), X9
734 PUNPCKLLQ X7, X9
735 PUNPCKLQDQ X8, X9
736 MOVOU permuted_blake_consts<>+336(SB), X8
737 PXOR X9, X8
738 PADDD X2, X1
739 PXOR X1, X4
740 MOVO X4, X7
741 PSRLL $0x10, X7
742 PSLLL $0x10, X4
743 PXOR X7, X4
744 PADDD X4, X3
745 PXOR X3, X2
746 MOVO X2, X7
747 PSRLL $0x0c, X7
748 PSLLL $0x14, X2
749 PXOR X7, X2
750 PADDD X8, X1
751 PADDD X2, X1
752 PXOR X1, X4
753 MOVO X4, X7
754 PSRLL $0x08, X7
755 PSLLL $0x18, X4
756 PXOR X7, X4
757 PADDD X4, X3
758 PXOR X3, X2
759 MOVO X2, X7
760 PSRLL $0x07, X7
761 PSLLL $0x19, X2
762 PXOR X7, X2
763
764 // Round 6 diagonal step part 1: diagonalize.
765 PSHUFD $0x39, X2, X2
766 PSHUFD $0x4e, X3, X3
767 PSHUFD $0x93, X4, X4
768
769 // Round 6 diagonal step part 2: column step.
770 MOVD 4(SP), X9
771 MOVD 60(SP), X7
772 MOVOA X7, X8
773 PUNPCKLLQ X9, X8
774 MOVD 28(SP), X7
775 MOVD 16(SP), X9
776 PUNPCKLLQ X7, X9
777 PUNPCKLQDQ X8, X9
778 MOVOU permuted_blake_consts<>+352(SB), X8
779 PXOR X9, X8
780 PADDD X8, X1
781 MOVD 36(SP), X9
782 MOVD 56(SP), X7
783 MOVOA X7, X8
784 PUNPCKLLQ X9, X8
785 MOVD 20(SP), X7
786 MOVD 52(SP), X9
787 PUNPCKLLQ X7, X9
788 PUNPCKLQDQ X8, X9
789 MOVOU permuted_blake_consts<>+368(SB), X8
790 PXOR X9, X8
791 PADDD X2, X1
792 PXOR X1, X4
793 MOVO X4, X7
794 PSRLL $0x10, X7
795 PSLLL $0x10, X4
796 PXOR X7, X4
797 PADDD X4, X3
798 PXOR X3, X2
799 MOVO X2, X7
800 PSRLL $0x0c, X7
801 PSLLL $0x14, X2
802 PXOR X7, X2
803 PADDD X8, X1
804 PADDD X2, X1
805 PXOR X1, X4
806 MOVO X4, X7
807 PSRLL $0x08, X7
808 PSLLL $0x18, X4
809 PXOR X7, X4
810 PADDD X4, X3
811 PXOR X3, X2
812 MOVO X2, X7
813 PSRLL $0x07, X7
814 PSLLL $0x19, X2
815 PXOR X7, X2
816
817 // Round 6 diagonal step part 3: undiagonalize.
818 PSHUFD $0x93, X2, X2
819 PSHUFD $0x4e, X3, X3
820 PSHUFD $0x39, X4, X4
821
822 // Round 7 column step.
823 MOVD 16(SP), X9
824 MOVD 56(SP), X7
825 MOVOA X7, X8
826 PUNPCKLLQ X9, X8
827 MOVD 4(SP), X7
828 MOVD 48(SP), X9
829 PUNPCKLLQ X7, X9
830 PUNPCKLQDQ X8, X9
831 MOVOU permuted_blake_consts<>+384(SB), X8
832 PXOR X9, X8
833 PADDD X8, X1
834 MOVD 40(SP), X9
835 MOVD 52(SP), X7
836 MOVOA X7, X8
837 PUNPCKLLQ X9, X8
838 MOVD 60(SP), X7
839 MOVD 20(SP), X9
840 PUNPCKLLQ X7, X9
841 PUNPCKLQDQ X8, X9
842 MOVOU permuted_blake_consts<>+400(SB), X8
843 PXOR X9, X8
844 PADDD X2, X1
845 PXOR X1, X4
846 MOVO X4, X7
847 PSRLL $0x10, X7
848 PSLLL $0x10, X4
849 PXOR X7, X4
850 PADDD X4, X3
851 PXOR X3, X2
852 MOVO X2, X7
853 PSRLL $0x0c, X7
854 PSLLL $0x14, X2
855 PXOR X7, X2
856 PADDD X8, X1
857 PADDD X2, X1
858 PXOR X1, X4
859 MOVO X4, X7
860 PSRLL $0x08, X7
861 PSLLL $0x18, X4
862 PXOR X7, X4
863 PADDD X4, X3
864 PXOR X3, X2
865 MOVO X2, X7
866 PSRLL $0x07, X7
867 PSLLL $0x19, X2
868 PXOR X7, X2
869
870 // Round 7 diagonal step part 1: diagonalize.
871 PSHUFD $0x39, X2, X2
872 PSHUFD $0x4e, X3, X3
873 PSHUFD $0x93, X4, X4
874
875 // Round 7 diagonal step part 2: column step.
876 MOVD 32(SP), X9
877 MOVD 36(SP), X7
878 MOVOA X7, X8
879 PUNPCKLLQ X9, X8
880 MOVD 24(SP), X7
881 MOVD (SP), X9
882 PUNPCKLLQ X7, X9
883 PUNPCKLQDQ X8, X9
884 MOVOU permuted_blake_consts<>+416(SB), X8
885 PXOR X9, X8
886 PADDD X8, X1
887 MOVD 44(SP), X9
888 MOVD 8(SP), X7
889 MOVOA X7, X8
890 PUNPCKLLQ X9, X8
891 MOVD 12(SP), X7
892 MOVD 28(SP), X9
893 PUNPCKLLQ X7, X9
894 PUNPCKLQDQ X8, X9
895 MOVOU permuted_blake_consts<>+432(SB), X8
896 PXOR X9, X8
897 PADDD X2, X1
898 PXOR X1, X4
899 MOVO X4, X7
900 PSRLL $0x10, X7
901 PSLLL $0x10, X4
902 PXOR X7, X4
903 PADDD X4, X3
904 PXOR X3, X2
905 MOVO X2, X7
906 PSRLL $0x0c, X7
907 PSLLL $0x14, X2
908 PXOR X7, X2
909 PADDD X8, X1
910 PADDD X2, X1
911 PXOR X1, X4
912 MOVO X4, X7
913 PSRLL $0x08, X7
914 PSLLL $0x18, X4
915 PXOR X7, X4
916 PADDD X4, X3
917 PXOR X3, X2
918 MOVO X2, X7
919 PSRLL $0x07, X7
920 PSLLL $0x19, X2
921 PXOR X7, X2
922
923 // Round 7 diagonal step part 3: undiagonalize.
924 PSHUFD $0x93, X2, X2
925 PSHUFD $0x4e, X3, X3
926 PSHUFD $0x39, X4, X4
927
928 // Round 8 column step.
929 MOVD 12(SP), X9
930 MOVD 48(SP), X7
931 MOVOA X7, X8
932 PUNPCKLLQ X9, X8
933 MOVD 28(SP), X7
934 MOVD 52(SP), X9
935 PUNPCKLLQ X7, X9
936 PUNPCKLQDQ X8, X9
937 MOVOU permuted_blake_consts<>+448(SB), X8
938 PXOR X9, X8
939 PADDD X8, X1
940 MOVD 36(SP), X9
941 MOVD 4(SP), X7
942 MOVOA X7, X8
943 PUNPCKLLQ X9, X8
944 MOVD 56(SP), X7
945 MOVD 44(SP), X9
946 PUNPCKLLQ X7, X9
947 PUNPCKLQDQ X8, X9
948 MOVOU permuted_blake_consts<>+464(SB), X8
949 PXOR X9, X8
950 PADDD X2, X1
951 PXOR X1, X4
952 MOVO X4, X7
953 PSRLL $0x10, X7
954 PSLLL $0x10, X4
955 PXOR X7, X4
956 PADDD X4, X3
957 PXOR X3, X2
958 MOVO X2, X7
959 PSRLL $0x0c, X7
960 PSLLL $0x14, X2
961 PXOR X7, X2
962 PADDD X8, X1
963 PADDD X2, X1
964 PXOR X1, X4
965 MOVO X4, X7
966 PSRLL $0x08, X7
967 PSLLL $0x18, X4
968 PXOR X7, X4
969 PADDD X4, X3
970 PXOR X3, X2
971 MOVO X2, X7
972 PSRLL $0x07, X7
973 PSLLL $0x19, X2
974 PXOR X7, X2
975
976 // Round 8 diagonal step part 1: diagonalize.
977 PSHUFD $0x39, X2, X2
978 PSHUFD $0x4e, X3, X3
979 PSHUFD $0x93, X4, X4
980
981 // Round 8 diagonal step part 2: column step.
982 MOVD 8(SP), X9
983 MOVD 32(SP), X7
984 MOVOA X7, X8
985 PUNPCKLLQ X9, X8
986 MOVD 60(SP), X7
987 MOVD 20(SP), X9
988 PUNPCKLLQ X7, X9
989 PUNPCKLQDQ X8, X9
990 MOVOU permuted_blake_consts<>+480(SB), X8
991 PXOR X9, X8
992 PADDD X8, X1
993 MOVD 40(SP), X9
994 MOVD 24(SP), X7
995 MOVOA X7, X8
996 PUNPCKLLQ X9, X8
997 MOVD 16(SP), X7
998 MOVD (SP), X9
999 PUNPCKLLQ X7, X9
1000 PUNPCKLQDQ X8, X9
1001 MOVOU permuted_blake_consts<>+496(SB), X8
1002 PXOR X9, X8
1003 PADDD X2, X1
1004 PXOR X1, X4
1005 MOVO X4, X7
1006 PSRLL $0x10, X7
1007 PSLLL $0x10, X4
1008 PXOR X7, X4
1009 PADDD X4, X3
1010 PXOR X3, X2
1011 MOVO X2, X7
1012 PSRLL $0x0c, X7
1013 PSLLL $0x14, X2
1014 PXOR X7, X2
1015 PADDD X8, X1
1016 PADDD X2, X1
1017 PXOR X1, X4
1018 MOVO X4, X7
1019 PSRLL $0x08, X7
1020 PSLLL $0x18, X4
1021 PXOR X7, X4
1022 PADDD X4, X3
1023 PXOR X3, X2
1024 MOVO X2, X7
1025 PSRLL $0x07, X7
1026 PSLLL $0x19, X2
1027 PXOR X7, X2
1028
1029 // Round 8 diagonal step part 3: undiagonalize.
1030 PSHUFD $0x93, X2, X2
1031 PSHUFD $0x4e, X3, X3
1032 PSHUFD $0x39, X4, X4
1033
1034 // Round 9 column step.
1035 MOVD (SP), X9
1036 MOVD 44(SP), X7
1037 MOVOA X7, X8
1038 PUNPCKLLQ X9, X8
1039 MOVD 56(SP), X7
1040 MOVD 24(SP), X9
1041 PUNPCKLLQ X7, X9
1042 PUNPCKLQDQ X8, X9
1043 MOVOU permuted_blake_consts<>+512(SB), X8
1044 PXOR X9, X8
1045 PADDD X8, X1
1046 MOVD 32(SP), X9
1047 MOVD 12(SP), X7
1048 MOVOA X7, X8
1049 PUNPCKLLQ X9, X8
1050 MOVD 36(SP), X7
1051 MOVD 60(SP), X9
1052 PUNPCKLLQ X7, X9
1053 PUNPCKLQDQ X8, X9
1054 MOVOU permuted_blake_consts<>+528(SB), X8
1055 PXOR X9, X8
1056 PADDD X2, X1
1057 PXOR X1, X4
1058 MOVO X4, X7
1059 PSRLL $0x10, X7
1060 PSLLL $0x10, X4
1061 PXOR X7, X4
1062 PADDD X4, X3
1063 PXOR X3, X2
1064 MOVO X2, X7
1065 PSRLL $0x0c, X7
1066 PSLLL $0x14, X2
1067 PXOR X7, X2
1068 PADDD X8, X1
1069 PADDD X2, X1
1070 PXOR X1, X4
1071 MOVO X4, X7
1072 PSRLL $0x08, X7
1073 PSLLL $0x18, X4
1074 PXOR X7, X4
1075 PADDD X4, X3
1076 PXOR X3, X2
1077 MOVO X2, X7
1078 PSRLL $0x07, X7
1079 PSLLL $0x19, X2
1080 PXOR X7, X2
1081
1082 // Round 9 diagonal step part 1: diagonalize.
1083 PSHUFD $0x39, X2, X2
1084 PSHUFD $0x4e, X3, X3
1085 PSHUFD $0x93, X4, X4
1086
1087 // Round 9 diagonal step part 2: column step.
1088 MOVD 40(SP), X9
1089 MOVD 4(SP), X7
1090 MOVOA X7, X8
1091 PUNPCKLLQ X9, X8
1092 MOVD 52(SP), X7
1093 MOVD 48(SP), X9
1094 PUNPCKLLQ X7, X9
1095 PUNPCKLQDQ X8, X9
1096 MOVOU permuted_blake_consts<>+544(SB), X8
1097 PXOR X9, X8
1098 PADDD X8, X1
1099 MOVD 20(SP), X9
1100 MOVD 16(SP), X7
1101 MOVOA X7, X8
1102 PUNPCKLLQ X9, X8
1103 MOVD 28(SP), X7
1104 MOVD 8(SP), X9
1105 PUNPCKLLQ X7, X9
1106 PUNPCKLQDQ X8, X9
1107 MOVOU permuted_blake_consts<>+560(SB), X8
1108 PXOR X9, X8
1109 PADDD X2, X1
1110 PXOR X1, X4
1111 MOVO X4, X7
1112 PSRLL $0x10, X7
1113 PSLLL $0x10, X4
1114 PXOR X7, X4
1115 PADDD X4, X3
1116 PXOR X3, X2
1117 MOVO X2, X7
1118 PSRLL $0x0c, X7
1119 PSLLL $0x14, X2
1120 PXOR X7, X2
1121 PADDD X8, X1
1122 PADDD X2, X1
1123 PXOR X1, X4
1124 MOVO X4, X7
1125 PSRLL $0x08, X7
1126 PSLLL $0x18, X4
1127 PXOR X7, X4
1128 PADDD X4, X3
1129 PXOR X3, X2
1130 MOVO X2, X7
1131 PSRLL $0x07, X7
1132 PSLLL $0x19, X2
1133 PXOR X7, X2
1134
1135 // Round 9 diagonal step part 3: undiagonalize.
1136 PSHUFD $0x93, X2, X2
1137 PSHUFD $0x4e, X3, X3
1138 PSHUFD $0x39, X4, X4
1139
1140 // Round 10 column step.
1141 MOVD 4(SP), X9
1142 MOVD 28(SP), X7
1143 MOVOA X7, X8
1144 PUNPCKLLQ X9, X8
1145 MOVD 32(SP), X7
1146 MOVD 40(SP), X9
1147 PUNPCKLLQ X7, X9
1148 PUNPCKLQDQ X8, X9
1149 MOVOU permuted_blake_consts<>+576(SB), X8
1150 PXOR X9, X8
1151 PADDD X8, X1
1152 MOVD 20(SP), X9
1153 MOVD 24(SP), X7
1154 MOVOA X7, X8
1155 PUNPCKLLQ X9, X8
1156 MOVD 16(SP), X7
1157 MOVD 8(SP), X9
1158 PUNPCKLLQ X7, X9
1159 PUNPCKLQDQ X8, X9
1160 MOVOU permuted_blake_consts<>+592(SB), X8
1161 PXOR X9, X8
1162 PADDD X2, X1
1163 PXOR X1, X4
1164 MOVO X4, X7
1165 PSRLL $0x10, X7
1166 PSLLL $0x10, X4
1167 PXOR X7, X4
1168 PADDD X4, X3
1169 PXOR X3, X2
1170 MOVO X2, X7
1171 PSRLL $0x0c, X7
1172 PSLLL $0x14, X2
1173 PXOR X7, X2
1174 PADDD X8, X1
1175 PADDD X2, X1
1176 PXOR X1, X4
1177 MOVO X4, X7
1178 PSRLL $0x08, X7
1179 PSLLL $0x18, X4
1180 PXOR X7, X4
1181 PADDD X4, X3
1182 PXOR X3, X2
1183 MOVO X2, X7
1184 PSRLL $0x07, X7
1185 PSLLL $0x19, X2
1186 PXOR X7, X2
1187
1188 // Round 10 diagonal step part 1: diagonalize.
1189 PSHUFD $0x39, X2, X2
1190 PSHUFD $0x4e, X3, X3
1191 PSHUFD $0x93, X4, X4
1192
1193 // Round 10 diagonal step part 2: column step.
1194 MOVD 52(SP), X9
1195 MOVD 12(SP), X7
1196 MOVOA X7, X8
1197 PUNPCKLLQ X9, X8
1198 MOVD 36(SP), X7
1199 MOVD 60(SP), X9
1200 PUNPCKLLQ X7, X9
1201 PUNPCKLQDQ X8, X9
1202 MOVOU permuted_blake_consts<>+608(SB), X8
1203 PXOR X9, X8
1204 PADDD X8, X1
1205 MOVD (SP), X9
1206 MOVD 48(SP), X7
1207 MOVOA X7, X8
1208 PUNPCKLLQ X9, X8
1209 MOVD 56(SP), X7
1210 MOVD 44(SP), X9
1211 PUNPCKLLQ X7, X9
1212 PUNPCKLQDQ X8, X9
1213 MOVOU permuted_blake_consts<>+624(SB), X8
1214 PXOR X9, X8
1215 PADDD X2, X1
1216 PXOR X1, X4
1217 MOVO X4, X7
1218 PSRLL $0x10, X7
1219 PSLLL $0x10, X4
1220 PXOR X7, X4
1221 PADDD X4, X3
1222 PXOR X3, X2
1223 MOVO X2, X7
1224 PSRLL $0x0c, X7
1225 PSLLL $0x14, X2
1226 PXOR X7, X2
1227 PADDD X8, X1
1228 PADDD X2, X1
1229 PXOR X1, X4
1230 MOVO X4, X7
1231 PSRLL $0x08, X7
1232 PSLLL $0x18, X4
1233 PXOR X7, X4
1234 PADDD X4, X3
1235 PXOR X3, X2
1236 MOVO X2, X7
1237 PSRLL $0x07, X7
1238 PSLLL $0x19, X2
1239 PXOR X7, X2
1240
1241 // Round 10 diagonal step part 3: undiagonalize.
1242 PSHUFD $0x93, X2, X2
1243 PSHUFD $0x4e, X3, X3
1244 PSHUFD $0x39, X4, X4
1245
1246 // Round 11 column step.
1247 MOVD 24(SP), X9
1248 MOVD 16(SP), X7
1249 MOVOA X7, X8
1250 PUNPCKLLQ X9, X8
1251 MOVD 8(SP), X7
1252 MOVD (SP), X9
1253 PUNPCKLLQ X7, X9
1254 PUNPCKLQDQ X8, X9
1255 MOVOU permuted_blake_consts<>+0(SB), X8
1256 PXOR X9, X8
1257 PADDD X8, X1
1258 MOVD 28(SP), X9
1259 MOVD 20(SP), X7
1260 MOVOA X7, X8
1261 PUNPCKLLQ X9, X8
1262 MOVD 12(SP), X7
1263 MOVD 4(SP), X9
1264 PUNPCKLLQ X7, X9
1265 PUNPCKLQDQ X8, X9
1266 MOVOU permuted_blake_consts<>+16(SB), X8
1267 PXOR X9, X8
1268 PADDD X2, X1
1269 PXOR X1, X4
1270 MOVO X4, X7
1271 PSRLL $0x10, X7
1272 PSLLL $0x10, X4
1273 PXOR X7, X4
1274 PADDD X4, X3
1275 PXOR X3, X2
1276 MOVO X2, X7
1277 PSRLL $0x0c, X7
1278 PSLLL $0x14, X2
1279 PXOR X7, X2
1280 PADDD X8, X1
1281 PADDD X2, X1
1282 PXOR X1, X4
1283 MOVO X4, X7
1284 PSRLL $0x08, X7
1285 PSLLL $0x18, X4
1286 PXOR X7, X4
1287 PADDD X4, X3
1288 PXOR X3, X2
1289 MOVO X2, X7
1290 PSRLL $0x07, X7
1291 PSLLL $0x19, X2
1292 PXOR X7, X2
1293
1294 // Round 11 diagonal step part 1: diagonalize.
1295 PSHUFD $0x39, X2, X2
1296 PSHUFD $0x4e, X3, X3
1297 PSHUFD $0x93, X4, X4
1298
1299 // Round 11 diagonal step part 2: column step.
1300 MOVD 56(SP), X9
1301 MOVD 48(SP), X7
1302 MOVOA X7, X8
1303 PUNPCKLLQ X9, X8
1304 MOVD 40(SP), X7
1305 MOVD 32(SP), X9
1306 PUNPCKLLQ X7, X9
1307 PUNPCKLQDQ X8, X9
1308 MOVOU permuted_blake_consts<>+32(SB), X8
1309 PXOR X9, X8
1310 PADDD X8, X1
1311 MOVD 60(SP), X9
1312 MOVD 52(SP), X7
1313 MOVOA X7, X8
1314 PUNPCKLLQ X9, X8
1315 MOVD 44(SP), X7
1316 MOVD 36(SP), X9
1317 PUNPCKLLQ X7, X9
1318 PUNPCKLQDQ X8, X9
1319 MOVOU permuted_blake_consts<>+48(SB), X8
1320 PXOR X9, X8
1321 PADDD X2, X1
1322 PXOR X1, X4
1323 MOVO X4, X7
1324 PSRLL $0x10, X7
1325 PSLLL $0x10, X4
1326 PXOR X7, X4
1327 PADDD X4, X3
1328 PXOR X3, X2
1329 MOVO X2, X7
1330 PSRLL $0x0c, X7
1331 PSLLL $0x14, X2
1332 PXOR X7, X2
1333 PADDD X8, X1
1334 PADDD X2, X1
1335 PXOR X1, X4
1336 MOVO X4, X7
1337 PSRLL $0x08, X7
1338 PSLLL $0x18, X4
1339 PXOR X7, X4
1340 PADDD X4, X3
1341 PXOR X3, X2
1342 MOVO X2, X7
1343 PSRLL $0x07, X7
1344 PSLLL $0x19, X2
1345 PXOR X7, X2
1346
1347 // Round 11 diagonal step part 3: undiagonalize.
1348 PSHUFD $0x93, X2, X2
1349 PSHUFD $0x4e, X3, X3
1350 PSHUFD $0x39, X4, X4
1351
1352 // Round 12 column step.
1353 MOVD 52(SP), X9
1354 MOVD 36(SP), X7
1355 MOVOA X7, X8
1356 PUNPCKLLQ X9, X8
1357 MOVD 16(SP), X7
1358 MOVD 56(SP), X9
1359 PUNPCKLLQ X7, X9
1360 PUNPCKLQDQ X8, X9
1361 MOVOU permuted_blake_consts<>+64(SB), X8
1362 PXOR X9, X8
1363 PADDD X8, X1
1364 MOVD 24(SP), X9
1365 MOVD 60(SP), X7
1366 MOVOA X7, X8
1367 PUNPCKLLQ X9, X8
1368 MOVD 32(SP), X7
1369 MOVD 40(SP), X9
1370 PUNPCKLLQ X7, X9
1371 PUNPCKLQDQ X8, X9
1372 MOVOU permuted_blake_consts<>+80(SB), X8
1373 PXOR X9, X8
1374 PADDD X2, X1
1375 PXOR X1, X4
1376 MOVO X4, X7
1377 PSRLL $0x10, X7
1378 PSLLL $0x10, X4
1379 PXOR X7, X4
1380 PADDD X4, X3
1381 PXOR X3, X2
1382 MOVO X2, X7
1383 PSRLL $0x0c, X7
1384 PSLLL $0x14, X2
1385 PXOR X7, X2
1386 PADDD X8, X1
1387 PADDD X2, X1
1388 PXOR X1, X4
1389 MOVO X4, X7
1390 PSRLL $0x08, X7
1391 PSLLL $0x18, X4
1392 PXOR X7, X4
1393 PADDD X4, X3
1394 PXOR X3, X2
1395 MOVO X2, X7
1396 PSRLL $0x07, X7
1397 PSLLL $0x19, X2
1398 PXOR X7, X2
1399
1400 // Round 12 diagonal step part 1: diagonalize.
1401 PSHUFD $0x39, X2, X2
1402 PSHUFD $0x4e, X3, X3
1403 PSHUFD $0x93, X4, X4
1404
1405 // Round 12 diagonal step part 2: column step.
1406 MOVD 20(SP), X9
1407 MOVD 44(SP), X7
1408 MOVOA X7, X8
1409 PUNPCKLLQ X9, X8
1410 MOVD (SP), X7
1411 MOVD 4(SP), X9
1412 PUNPCKLLQ X7, X9
1413 PUNPCKLQDQ X8, X9
1414 MOVOU permuted_blake_consts<>+96(SB), X8
1415 PXOR X9, X8
1416 PADDD X8, X1
1417 MOVD 12(SP), X9
1418 MOVD 28(SP), X7
1419 MOVOA X7, X8
1420 PUNPCKLLQ X9, X8
1421 MOVD 8(SP), X7
1422 MOVD 48(SP), X9
1423 PUNPCKLLQ X7, X9
1424 PUNPCKLQDQ X8, X9
1425 MOVOU permuted_blake_consts<>+112(SB), X8
1426 PXOR X9, X8
1427 PADDD X2, X1
1428 PXOR X1, X4
1429 MOVO X4, X7
1430 PSRLL $0x10, X7
1431 PSLLL $0x10, X4
1432 PXOR X7, X4
1433 PADDD X4, X3
1434 PXOR X3, X2
1435 MOVO X2, X7
1436 PSRLL $0x0c, X7
1437 PSLLL $0x14, X2
1438 PXOR X7, X2
1439 PADDD X8, X1
1440 PADDD X2, X1
1441 PXOR X1, X4
1442 MOVO X4, X7
1443 PSRLL $0x08, X7
1444 PSLLL $0x18, X4
1445 PXOR X7, X4
1446 PADDD X4, X3
1447 PXOR X3, X2
1448 MOVO X2, X7
1449 PSRLL $0x07, X7
1450 PSLLL $0x19, X2
1451 PXOR X7, X2
1452
1453 // Round 12 diagonal step part 3: undiagonalize.
1454 PSHUFD $0x93, X2, X2
1455 PSHUFD $0x4e, X3, X3
1456 PSHUFD $0x39, X4, X4
1457
1458 // Round 13 column step.
1459 MOVD 60(SP), X9
1460 MOVD 20(SP), X7
1461 MOVOA X7, X8
1462 PUNPCKLLQ X9, X8
1463 MOVD 48(SP), X7
1464 MOVD 44(SP), X9
1465 PUNPCKLLQ X7, X9
1466 PUNPCKLQDQ X8, X9
1467 MOVOU permuted_blake_consts<>+128(SB), X8
1468 PXOR X9, X8
1469 PADDD X8, X1
1470 MOVD 52(SP), X9
1471 MOVD 8(SP), X7
1472 MOVOA X7, X8
1473 PUNPCKLLQ X9, X8
1474 MOVD (SP), X7
1475 MOVD 32(SP), X9
1476 PUNPCKLLQ X7, X9
1477 PUNPCKLQDQ X8, X9
1478 MOVOU permuted_blake_consts<>+144(SB), X8
1479 PXOR X9, X8
1480 PADDD X2, X1
1481 PXOR X1, X4
1482 MOVO X4, X7
1483 PSRLL $0x10, X7
1484 PSLLL $0x10, X4
1485 PXOR X7, X4
1486 PADDD X4, X3
1487 PXOR X3, X2
1488 MOVO X2, X7
1489 PSRLL $0x0c, X7
1490 PSLLL $0x14, X2
1491 PXOR X7, X2
1492 PADDD X8, X1
1493 PADDD X2, X1
1494 PXOR X1, X4
1495 MOVO X4, X7
1496 PSRLL $0x08, X7
1497 PSLLL $0x18, X4
1498 PXOR X7, X4
1499 PADDD X4, X3
1500 PXOR X3, X2
1501 MOVO X2, X7
1502 PSRLL $0x07, X7
1503 PSLLL $0x19, X2
1504 PXOR X7, X2
1505
1506 // Round 13 diagonal step part 1: diagonalize.
1507 PSHUFD $0x39, X2, X2
1508 PSHUFD $0x4e, X3, X3
1509 PSHUFD $0x93, X4, X4
1510
1511 // Round 13 diagonal step part 2: column step.
1512 MOVD 36(SP), X9
1513 MOVD 28(SP), X7
1514 MOVOA X7, X8
1515 PUNPCKLLQ X9, X8
1516 MOVD 12(SP), X7
1517 MOVD 40(SP), X9
1518 PUNPCKLLQ X7, X9
1519 PUNPCKLQDQ X8, X9
1520 MOVOU permuted_blake_consts<>+160(SB), X8
1521 PXOR X9, X8
1522 PADDD X8, X1
1523 MOVD 16(SP), X9
1524 MOVD 4(SP), X7
1525 MOVOA X7, X8
1526 PUNPCKLLQ X9, X8
1527 MOVD 24(SP), X7
1528 MOVD 56(SP), X9
1529 PUNPCKLLQ X7, X9
1530 PUNPCKLQDQ X8, X9
1531 MOVOU permuted_blake_consts<>+176(SB), X8
1532 PXOR X9, X8
1533 PADDD X2, X1
1534 PXOR X1, X4
1535 MOVO X4, X7
1536 PSRLL $0x10, X7
1537 PSLLL $0x10, X4
1538 PXOR X7, X4
1539 PADDD X4, X3
1540 PXOR X3, X2
1541 MOVO X2, X7
1542 PSRLL $0x0c, X7
1543 PSLLL $0x14, X2
1544 PXOR X7, X2
1545 PADDD X8, X1
1546 PADDD X2, X1
1547 PXOR X1, X4
1548 MOVO X4, X7
1549 PSRLL $0x08, X7
1550 PSLLL $0x18, X4
1551 PXOR X7, X4
1552 PADDD X4, X3
1553 PXOR X3, X2
1554 MOVO X2, X7
1555 PSRLL $0x07, X7
1556 PSLLL $0x19, X2
1557 PXOR X7, X2
1558
1559 // Round 13 diagonal step part 3: undiagonalize.
1560 PSHUFD $0x93, X2, X2
1561 PSHUFD $0x4e, X3, X3
1562 PSHUFD $0x39, X4, X4
1563
1564 // Round 14 column step.
1565 MOVD 44(SP), X9
1566 MOVD 52(SP), X7
1567 MOVOA X7, X8
1568 PUNPCKLLQ X9, X8
1569 MOVD 12(SP), X7
1570 MOVD 28(SP), X9
1571 PUNPCKLLQ X7, X9
1572 PUNPCKLQDQ X8, X9
1573 MOVOU permuted_blake_consts<>+192(SB), X8
1574 PXOR X9, X8
1575 PADDD X8, X1
1576 MOVD 56(SP), X9
1577 MOVD 48(SP), X7
1578 MOVOA X7, X8
1579 PUNPCKLLQ X9, X8
1580 MOVD 4(SP), X7
1581 MOVD 36(SP), X9
1582 PUNPCKLLQ X7, X9
1583 PUNPCKLQDQ X8, X9
1584 MOVOU permuted_blake_consts<>+208(SB), X8
1585 PXOR X9, X8
1586 PADDD X2, X1
1587 PXOR X1, X4
1588 MOVO X4, X7
1589 PSRLL $0x10, X7
1590 PSLLL $0x10, X4
1591 PXOR X7, X4
1592 PADDD X4, X3
1593 PXOR X3, X2
1594 MOVO X2, X7
1595 PSRLL $0x0c, X7
1596 PSLLL $0x14, X2
1597 PXOR X7, X2
1598 PADDD X8, X1
1599 PADDD X2, X1
1600 PXOR X1, X4
1601 MOVO X4, X7
1602 PSRLL $0x08, X7
1603 PSLLL $0x18, X4
1604 PXOR X7, X4
1605 PADDD X4, X3
1606 PXOR X3, X2
1607 MOVO X2, X7
1608 PSRLL $0x07, X7
1609 PSLLL $0x19, X2
1610 PXOR X7, X2
1611
1612 // Round 14 diagonal step part 1: diagonalize.
1613 PSHUFD $0x39, X2, X2
1614 PSHUFD $0x4e, X3, X3
1615 PSHUFD $0x93, X4, X4
1616
1617 // Round 14 diagonal step part 2: column step.
1618 MOVD 60(SP), X9
1619 MOVD 16(SP), X7
1620 MOVOA X7, X8
1621 PUNPCKLLQ X9, X8
1622 MOVD 20(SP), X7
1623 MOVD 8(SP), X9
1624 PUNPCKLLQ X7, X9
1625 PUNPCKLQDQ X8, X9
1626 MOVOU permuted_blake_consts<>+224(SB), X8
1627 PXOR X9, X8
1628 PADDD X8, X1
1629 MOVD 32(SP), X9
1630 MOVD (SP), X7
1631 MOVOA X7, X8
1632 PUNPCKLLQ X9, X8
1633 MOVD 40(SP), X7
1634 MOVD 24(SP), X9
1635 PUNPCKLLQ X7, X9
1636 PUNPCKLQDQ X8, X9
1637 MOVOU permuted_blake_consts<>+240(SB), X8
1638 PXOR X9, X8
1639 PADDD X2, X1
1640 PXOR X1, X4
1641 MOVO X4, X7
1642 PSRLL $0x10, X7
1643 PSLLL $0x10, X4
1644 PXOR X7, X4
1645 PADDD X4, X3
1646 PXOR X3, X2
1647 MOVO X2, X7
1648 PSRLL $0x0c, X7
1649 PSLLL $0x14, X2
1650 PXOR X7, X2
1651 PADDD X8, X1
1652 PADDD X2, X1
1653 PXOR X1, X4
1654 MOVO X4, X7
1655 PSRLL $0x08, X7
1656 PSLLL $0x18, X4
1657 PXOR X7, X4
1658 PADDD X4, X3
1659 PXOR X3, X2
1660 MOVO X2, X7
1661 PSRLL $0x07, X7
1662 PSLLL $0x19, X2
1663 PXOR X7, X2
1664
1665 // Round 14 diagonal step part 3: undiagonalize.
1666 PSHUFD $0x93, X2, X2
1667 PSHUFD $0x4e, X3, X3
1668 PSHUFD $0x39, X4, X4
1669
1670 // Finally the chain value is defined as:
1671 // h'0 = h0^s0^v0^v8
1672 // h'1 = h1^s1^v1^v9
1673 // h'2 = h2^s2^v2^va
1674 // h'3 = h3^s3^v3^vb
1675 // h'4 = h4^s0^v4^vc
1676 // h'5 = h5^s1^v5^vd
1677 // h'6 = h6^s2^v6^ve
1678 // h'7 = h7^s3^v7^vf
1679 PXOR X5, X1
1680 PXOR X0, X1
1681 PXOR X3, X1
1682 PXOR X6, X2
1683 PXOR X0, X2
1684 PXOR X4, X2
1685
1686 // Either terminate the loop when there are no more full blocks
1687 // to compress or move the message pointer to the next block of
1688 // bytes to compress, increment the message bits counter
1689 // accordingly, and loop back around to compress it.
1690 DECQ BX
1691 JZ done
1692 LEAQ 64(DX), DX
1693 ADDQ $0x00000200, CX
1694 JMP compressLoop
1695
1696 done:
1697 // Output the resulting chain value.
1698 MOVOU X1, (AX)
1699 MOVOU X2, 16(AX)
1700 RET
1701
1702 // func blocksSSE41(state *State, msg []byte, counter uint64)
1703 // Requires: SSE2, SSE4.1, SSSE3
1704 TEXT ·blocksSSE41(SB), NOSPLIT, $0-40
1705 MOVQ state+0(FP), AX
1706 MOVQ counter+32(FP), CX
1707 MOVQ msg_base+8(FP), DX
1708 MOVQ msg_len+16(FP), BX
1709
1710 // Populate registers for faster right rotations.
1711 MOVOU shuffle_rotr8_4x32<>+0(SB), X4
1712 MOVOU shuffle_rotr16_4x32<>+0(SB), X5
1713
1714 // Convert message len to number of blocks for loop counter.
1715 SHRQ $0x06, BX
1716
1717 // Initialize state matrix.
1718 // row0 = |v0 v1 v2 v3| = |h0 h1 h2 h3|
1719 // row1 = |v4 v5 v6 v7| = |h4 h5 h6 h7|
1720 MOVOU 32(AX), X6
1721 MOVOU (AX), X7
1722 MOVOU 16(AX), X8
1723
1724 compressLoop:
1725 // row2 = |v8 v9 va vb| = |s0^c0 s1^c1 s2^c2 s3^c3|
1726 // row3 = |vc vd ve vf| = |t0^c4 t0^c5 t1^c6 t1^c7|
1727 MOVOU first_8_blake_consts<>+0(SB), X9
1728 PXOR X6, X9
1729 MOVD CX, X10
1730 PSHUFD $0x50, X10, X10
1731 PXOR first_8_blake_consts<>+16(SB), X10
1732 MOVO X7, X11
1733 MOVO X8, X12
1734
1735 // Convert message to big endian.
1736 MOVOU shuffle_le_to_be_4x32<>+0(SB), X13
1737 MOVOU (DX), X0
1738 PSHUFB X13, X0
1739 MOVOU 16(DX), X1
1740 PSHUFB X13, X1
1741 MOVOU 32(DX), X2
1742 PSHUFB X13, X2
1743 MOVOU 48(DX), X3
1744 PSHUFB X13, X3
1745
1746 // Round 1 column step.
1747 PSHUFD $0x08, X0, X14
1748 PSHUFD $0x80, X1, X13
1749 PBLENDW $0xf0, X13, X14
1750 MOVOU permuted_blake_consts<>+0(SB), X15
1751 PXOR X14, X15
1752 PADDD X15, X7
1753 PSHUFD $0x0d, X0, X14
1754 PSHUFD $0xd0, X1, X13
1755 PBLENDW $0xf0, X13, X14
1756 MOVOU permuted_blake_consts<>+16(SB), X15
1757 PXOR X14, X15
1758 PADDD X8, X7
1759 PXOR X7, X10
1760 PSHUFB X5, X10
1761 PADDD X10, X9
1762 PXOR X9, X8
1763 MOVO X8, X13
1764 PSRLL $0x0c, X13
1765 PSLLL $0x14, X8
1766 PXOR X13, X8
1767 PADDD X15, X7
1768 PADDD X8, X7
1769 PXOR X7, X10
1770 PSHUFB X4, X10
1771 PADDD X10, X9
1772 PXOR X9, X8
1773 MOVO X8, X13
1774 PSRLL $0x07, X13
1775 PSLLL $0x19, X8
1776 PXOR X13, X8
1777
1778 // Round 1 diagonal step part 1: diagonalize.
1779 PSHUFD $0x39, X8, X8
1780 PSHUFD $0x4e, X9, X9
1781 PSHUFD $0x93, X10, X10
1782
1783 // Round 1 diagonal step part 2: column step.
1784 PSHUFD $0x08, X2, X14
1785 PSHUFD $0x80, X3, X13
1786 PBLENDW $0xf0, X13, X14
1787 MOVOU permuted_blake_consts<>+32(SB), X15
1788 PXOR X14, X15
1789 PADDD X15, X7
1790 PSHUFD $0x0d, X2, X14
1791 PSHUFD $0xd0, X3, X13
1792 PBLENDW $0xf0, X13, X14
1793 MOVOU permuted_blake_consts<>+48(SB), X15
1794 PXOR X14, X15
1795 PADDD X8, X7
1796 PXOR X7, X10
1797 PSHUFB X5, X10
1798 PADDD X10, X9
1799 PXOR X9, X8
1800 MOVO X8, X13
1801 PSRLL $0x0c, X13
1802 PSLLL $0x14, X8
1803 PXOR X13, X8
1804 PADDD X15, X7
1805 PADDD X8, X7
1806 PXOR X7, X10
1807 PSHUFB X4, X10
1808 PADDD X10, X9
1809 PXOR X9, X8
1810 MOVO X8, X13
1811 PSRLL $0x07, X13
1812 PSLLL $0x19, X8
1813 PXOR X13, X8
1814
1815 // Round 1 diagonal step part 3: undiagonalize.
1816 PSHUFD $0x93, X8, X8
1817 PSHUFD $0x4e, X9, X9
1818 PSHUFD $0x39, X10, X10
1819
1820 // Round 2 column step.
1821 PSHUFD $0x00, X1, X14
1822 PSHUFD $0x10, X2, X13
1823 PBLENDW $0x30, X13, X14
1824 PSHUFD $0x42, X3, X13
1825 PBLENDW $0xc3, X13, X14
1826 MOVOU permuted_blake_consts<>+64(SB), X15
1827 PXOR X14, X15
1828 PADDD X15, X7
1829 PSHUFD $0x80, X1, X14
1830 PSHUFD $0x02, X2, X13
1831 PBLENDW $0x0f, X13, X14
1832 PSHUFD $0x30, X3, X13
1833 PBLENDW $0x30, X13, X14
1834 MOVOU permuted_blake_consts<>+80(SB), X15
1835 PXOR X14, X15
1836 PADDD X8, X7
1837 PXOR X7, X10
1838 PSHUFB X5, X10
1839 PADDD X10, X9
1840 PXOR X9, X8
1841 MOVO X8, X13
1842 PSRLL $0x0c, X13
1843 PSLLL $0x14, X8
1844 PXOR X13, X8
1845 PADDD X15, X7
1846 PADDD X8, X7
1847 PXOR X7, X10
1848 PSHUFB X4, X10
1849 PADDD X10, X9
1850 PXOR X9, X8
1851 MOVO X8, X13
1852 PSRLL $0x07, X13
1853 PSLLL $0x19, X8
1854 PXOR X13, X8
1855
1856 // Round 2 diagonal step part 1: diagonalize.
1857 PSHUFD $0x39, X8, X8
1858 PSHUFD $0x4e, X9, X9
1859 PSHUFD $0x93, X10, X10
1860
1861 // Round 2 diagonal step part 2: column step.
1862 PSHUFD $0x01, X0, X14
1863 PSHUFD $0x40, X1, X13
1864 PBLENDW $0xc0, X13, X14
1865 PSHUFD $0x30, X2, X13
1866 PBLENDW $0x30, X13, X14
1867 MOVOU permuted_blake_consts<>+96(SB), X15
1868 PXOR X14, X15
1869 PADDD X15, X7
1870 PSHUFD $0xc8, X0, X14
1871 PSHUFD $0x30, X1, X13
1872 PBLENDW $0x30, X13, X14
1873 PSHUFD $0x00, X3, X13
1874 PBLENDW $0x03, X13, X14
1875 MOVOU permuted_blake_consts<>+112(SB), X15
1876 PXOR X14, X15
1877 PADDD X8, X7
1878 PXOR X7, X10
1879 PSHUFB X5, X10
1880 PADDD X10, X9
1881 PXOR X9, X8
1882 MOVO X8, X13
1883 PSRLL $0x0c, X13
1884 PSLLL $0x14, X8
1885 PXOR X13, X8
1886 PADDD X15, X7
1887 PADDD X8, X7
1888 PXOR X7, X10
1889 PSHUFB X4, X10
1890 PADDD X10, X9
1891 PXOR X9, X8
1892 MOVO X8, X13
1893 PSRLL $0x07, X13
1894 PSLLL $0x19, X8
1895 PXOR X13, X8
1896
1897 // Round 2 diagonal step part 3: undiagonalize.
1898 PSHUFD $0x93, X8, X8
1899 PSHUFD $0x4e, X9, X9
1900 PSHUFD $0x39, X10, X10
1901
1902 // Round 3 column step.
1903 PSHUFD $0x10, X1, X14
1904 PSHUFD $0x03, X2, X13
1905 PBLENDW $0x03, X13, X14
1906 PSHUFD $0xc0, X3, X13
1907 PBLENDW $0xcc, X13, X14
1908 MOVOU permuted_blake_consts<>+128(SB), X15
1909 PXOR X14, X15
1910 PADDD X15, X7
1911 PSHUFD $0x20, X0, X14
1912 PSHUFD $0x00, X2, X13
1913 PBLENDW $0x03, X13, X14
1914 PSHUFD $0x40, X3, X13
1915 PBLENDW $0xc0, X13, X14
1916 MOVOU permuted_blake_consts<>+144(SB), X15
1917 PXOR X14, X15
1918 PADDD X8, X7
1919 PXOR X7, X10
1920 PSHUFB X5, X10
1921 PADDD X10, X9
1922 PXOR X9, X8
1923 MOVO X8, X13
1924 PSRLL $0x0c, X13
1925 PSLLL $0x14, X8
1926 PXOR X13, X8
1927 PADDD X15, X7
1928 PADDD X8, X7
1929 PXOR X7, X10
1930 PSHUFB X4, X10
1931 PADDD X10, X9
1932 PXOR X9, X8
1933 MOVO X8, X13
1934 PSRLL $0x07, X13
1935 PSLLL $0x19, X8
1936 PXOR X13, X8
1937
1938 // Round 3 diagonal step part 1: diagonalize.
1939 PSHUFD $0x39, X8, X8
1940 PSHUFD $0x4e, X9, X9
1941 PSHUFD $0x93, X10, X10
1942
1943 // Round 3 diagonal step part 2: column step.
1944 PSHUFD $0x0c, X0, X14
1945 PSHUFD $0x30, X1, X13
1946 PBLENDW $0x30, X13, X14
1947 PSHUFD $0x42, X2, X13
1948 PBLENDW $0xc3, X13, X14
1949 MOVOU permuted_blake_consts<>+160(SB), X15
1950 PXOR X14, X15
1951 PADDD X15, X7
1952 PSHUFD $0x10, X0, X14
1953 PSHUFD $0x08, X1, X13
1954 PBLENDW $0xcc, X13, X14
1955 PSHUFD $0x02, X3, X13
1956 PBLENDW $0x03, X13, X14
1957 MOVOU permuted_blake_consts<>+176(SB), X15
1958 PXOR X14, X15
1959 PADDD X8, X7
1960 PXOR X7, X10
1961 PSHUFB X5, X10
1962 PADDD X10, X9
1963 PXOR X9, X8
1964 MOVO X8, X13
1965 PSRLL $0x0c, X13
1966 PSLLL $0x14, X8
1967 PXOR X13, X8
1968 PADDD X15, X7
1969 PADDD X8, X7
1970 PXOR X7, X10
1971 PSHUFB X4, X10
1972 PADDD X10, X9
1973 PXOR X9, X8
1974 MOVO X8, X13
1975 PSRLL $0x07, X13
1976 PSLLL $0x19, X8
1977 PXOR X13, X8
1978
1979 // Round 3 diagonal step part 3: undiagonalize.
1980 PSHUFD $0x93, X8, X8
1981 PSHUFD $0x4e, X9, X9
1982 PSHUFD $0x39, X10, X10
1983
1984 // Round 4 column step.
1985 PSHUFD $0x0c, X0, X14
1986 PSHUFD $0x03, X1, X13
1987 PBLENDW $0x03, X13, X14
1988 PSHUFD $0xc0, X2, X13
1989 PBLENDW $0xc0, X13, X14
1990 PSHUFD $0x10, X3, X13
1991 PBLENDW $0x30, X13, X14
1992 MOVOU permuted_blake_consts<>+192(SB), X15
1993 PXOR X14, X15
1994 PADDD X15, X7
1995 PSHUFD $0x04, X0, X14
1996 PSHUFD $0x01, X2, X13
1997 PBLENDW $0x03, X13, X14
1998 PSHUFD $0x80, X3, X13
1999 PBLENDW $0xf0, X13, X14
2000 MOVOU permuted_blake_consts<>+208(SB), X15
2001 PXOR X14, X15
2002 PADDD X8, X7
2003 PXOR X7, X10
2004 PSHUFB X5, X10
2005 PADDD X10, X9
2006 PXOR X9, X8
2007 MOVO X8, X13
2008 PSRLL $0x0c, X13
2009 PSLLL $0x14, X8
2010 PXOR X13, X8
2011 PADDD X15, X7
2012 PADDD X8, X7
2013 PXOR X7, X10
2014 PSHUFB X4, X10
2015 PADDD X10, X9
2016 PXOR X9, X8
2017 MOVO X8, X13
2018 PSRLL $0x07, X13
2019 PSLLL $0x19, X8
2020 PXOR X13, X8
2021
2022 // Round 4 diagonal step part 1: diagonalize.
2023 PSHUFD $0x39, X8, X8
2024 PSHUFD $0x4e, X9, X9
2025 PSHUFD $0x93, X10, X10
2026
2027 // Round 4 diagonal step part 2: column step.
2028 PSHUFD $0x02, X0, X14
2029 PSHUFD $0x04, X1, X13
2030 PBLENDW $0x3c, X13, X14
2031 PSHUFD $0xc0, X3, X13
2032 PBLENDW $0xc0, X13, X14
2033 MOVOU permuted_blake_consts<>+224(SB), X15
2034 PXOR X14, X15
2035 PADDD X15, X7
2036 PSHUFD $0x00, X0, X14
2037 PSHUFD $0x02, X1, X13
2038 PBLENDW $0x03, X13, X14
2039 PSHUFD $0x08, X2, X13
2040 PBLENDW $0xcc, X13, X14
2041 MOVOU permuted_blake_consts<>+240(SB), X15
2042 PXOR X14, X15
2043 PADDD X8, X7
2044 PXOR X7, X10
2045 PSHUFB X5, X10
2046 PADDD X10, X9
2047 PXOR X9, X8
2048 MOVO X8, X13
2049 PSRLL $0x0c, X13
2050 PSLLL $0x14, X8
2051 PXOR X13, X8
2052 PADDD X15, X7
2053 PADDD X8, X7
2054 PXOR X7, X10
2055 PSHUFB X4, X10
2056 PADDD X10, X9
2057 PXOR X9, X8
2058 MOVO X8, X13
2059 PSRLL $0x07, X13
2060 PSLLL $0x19, X8
2061 PXOR X13, X8
2062
2063 // Round 4 diagonal step part 3: undiagonalize.
2064 PSHUFD $0x93, X8, X8
2065 PSHUFD $0x4e, X9, X9
2066 PSHUFD $0x39, X10, X10
2067
2068 // Round 5 column step.
2069 PSHUFD $0x20, X0, X14
2070 PSHUFD $0x04, X1, X13
2071 PBLENDW $0x0c, X13, X14
2072 PSHUFD $0x81, X2, X13
2073 PBLENDW $0xc3, X13, X14
2074 MOVOU permuted_blake_consts<>+256(SB), X15
2075 PXOR X14, X15
2076 PADDD X15, X7
2077 PSHUFD $0x00, X0, X14
2078 PSHUFD $0x0c, X1, X13
2079 PBLENDW $0x3c, X13, X14
2080 PSHUFD $0xc0, X3, X13
2081 PBLENDW $0xc0, X13, X14
2082 MOVOU permuted_blake_consts<>+272(SB), X15
2083 PXOR X14, X15
2084 PADDD X8, X7
2085 PXOR X7, X10
2086 PSHUFB X5, X10
2087 PADDD X10, X9
2088 PXOR X9, X8
2089 MOVO X8, X13
2090 PSRLL $0x0c, X13
2091 PSLLL $0x14, X8
2092 PXOR X13, X8
2093 PADDD X15, X7
2094 PADDD X8, X7
2095 PXOR X7, X10
2096 PSHUFB X4, X10
2097 PADDD X10, X9
2098 PXOR X9, X8
2099 MOVO X8, X13
2100 PSRLL $0x07, X13
2101 PSLLL $0x19, X8
2102 PXOR X13, X8
2103
2104 // Round 5 diagonal step part 1: diagonalize.
2105 PSHUFD $0x39, X8, X8
2106 PSHUFD $0x4e, X9, X9
2107 PSHUFD $0x93, X10, X10
2108
2109 // Round 5 diagonal step part 2: column step.
2110 PSHUFD $0xc0, X0, X14
2111 PSHUFD $0x20, X1, X13
2112 PBLENDW $0x30, X13, X14
2113 PSHUFD $0x0c, X2, X13
2114 PBLENDW $0x0c, X13, X14
2115 PSHUFD $0x02, X3, X13
2116 PBLENDW $0x03, X13, X14
2117 MOVOU permuted_blake_consts<>+288(SB), X15
2118 PXOR X14, X15
2119 PADDD X15, X7
2120 PSHUFD $0x01, X0, X14
2121 PSHUFD $0x00, X2, X13
2122 PBLENDW $0x30, X13, X14
2123 PSHUFD $0x40, X3, X13
2124 PBLENDW $0xcc, X13, X14
2125 MOVOU permuted_blake_consts<>+304(SB), X15
2126 PXOR X14, X15
2127 PADDD X8, X7
2128 PXOR X7, X10
2129 PSHUFB X5, X10
2130 PADDD X10, X9
2131 PXOR X9, X8
2132 MOVO X8, X13
2133 PSRLL $0x0c, X13
2134 PSLLL $0x14, X8
2135 PXOR X13, X8
2136 PADDD X15, X7
2137 PADDD X8, X7
2138 PXOR X7, X10
2139 PSHUFB X4, X10
2140 PADDD X10, X9
2141 PXOR X9, X8
2142 MOVO X8, X13
2143 PSRLL $0x07, X13
2144 PSLLL $0x19, X8
2145 PXOR X13, X8
2146
2147 // Round 5 diagonal step part 3: undiagonalize.
2148 PSHUFD $0x93, X8, X8
2149 PSHUFD $0x4e, X9, X9
2150 PSHUFD $0x39, X10, X10
2151
2152 // Round 6 column step.
2153 PSHUFD $0x02, X0, X14
2154 PSHUFD $0x08, X1, X13
2155 PBLENDW $0x0c, X13, X14
2156 PSHUFD $0x00, X2, X13
2157 PBLENDW $0xc0, X13, X14
2158 MOVOU permuted_blake_consts<>+320(SB), X15
2159 PXOR X14, X15
2160 PADDD X15, X7
2161 PSHUFD $0xc0, X0, X14
2162 PSHUFD $0x38, X2, X13
2163 PBLENDW $0x3c, X13, X14
2164 PSHUFD $0x00, X3, X13
2165 PBLENDW $0x03, X13, X14
2166 MOVOU permuted_blake_consts<>+336(SB), X15
2167 PXOR X14, X15
2168 PADDD X8, X7
2169 PXOR X7, X10
2170 PSHUFB X5, X10
2171 PADDD X10, X9
2172 PXOR X9, X8
2173 MOVO X8, X13
2174 PSRLL $0x0c, X13
2175 PSLLL $0x14, X8
2176 PXOR X13, X8
2177 PADDD X15, X7
2178 PADDD X8, X7
2179 PXOR X7, X10
2180 PSHUFB X4, X10
2181 PADDD X10, X9
2182 PXOR X9, X8
2183 MOVO X8, X13
2184 PSRLL $0x07, X13
2185 PSLLL $0x19, X8
2186 PXOR X13, X8
2187
2188 // Round 6 diagonal step part 1: diagonalize.
2189 PSHUFD $0x39, X8, X8
2190 PSHUFD $0x4e, X9, X9
2191 PSHUFD $0x93, X10, X10
2192
2193 // Round 6 diagonal step part 2: column step.
2194 PSHUFD $0x40, X0, X14
2195 PSHUFD $0x0c, X1, X13
2196 PBLENDW $0x0f, X13, X14
2197 PSHUFD $0x30, X3, X13
2198 PBLENDW $0x30, X13, X14
2199 MOVOU permuted_blake_consts<>+352(SB), X15
2200 PXOR X14, X15
2201 PADDD X15, X7
2202 PSHUFD $0x04, X1, X14
2203 PSHUFD $0x40, X2, X13
2204 PBLENDW $0xc0, X13, X14
2205 PSHUFD $0x21, X3, X13
2206 PBLENDW $0x33, X13, X14
2207 MOVOU permuted_blake_consts<>+368(SB), X15
2208 PXOR X14, X15
2209 PADDD X8, X7
2210 PXOR X7, X10
2211 PSHUFB X5, X10
2212 PADDD X10, X9
2213 PXOR X9, X8
2214 MOVO X8, X13
2215 PSRLL $0x0c, X13
2216 PSLLL $0x14, X8
2217 PXOR X13, X8
2218 PADDD X15, X7
2219 PADDD X8, X7
2220 PXOR X7, X10
2221 PSHUFB X4, X10
2222 PADDD X10, X9
2223 PXOR X9, X8
2224 MOVO X8, X13
2225 PSRLL $0x07, X13
2226 PSLLL $0x19, X8
2227 PXOR X13, X8
2228
2229 // Round 6 diagonal step part 3: undiagonalize.
2230 PSHUFD $0x93, X8, X8
2231 PSHUFD $0x4e, X9, X9
2232 PSHUFD $0x39, X10, X10
2233
2234 // Round 7 column step.
2235 PSHUFD $0x04, X0, X14
2236 PSHUFD $0x00, X1, X13
2237 PBLENDW $0xc0, X13, X14
2238 PSHUFD $0x20, X3, X13
2239 PBLENDW $0x33, X13, X14
2240 MOVOU permuted_blake_consts<>+384(SB), X15
2241 PXOR X14, X15
2242 PADDD X15, X7
2243 PSHUFD $0x01, X1, X14
2244 PSHUFD $0x80, X2, X13
2245 PBLENDW $0xc0, X13, X14
2246 PSHUFD $0x1c, X3, X13
2247 PBLENDW $0x3c, X13, X14
2248 MOVOU permuted_blake_consts<>+400(SB), X15
2249 PXOR X14, X15
2250 PADDD X8, X7
2251 PXOR X7, X10
2252 PSHUFB X5, X10
2253 PADDD X10, X9
2254 PXOR X9, X8
2255 MOVO X8, X13
2256 PSRLL $0x0c, X13
2257 PSLLL $0x14, X8
2258 PXOR X13, X8
2259 PADDD X15, X7
2260 PADDD X8, X7
2261 PXOR X7, X10
2262 PSHUFB X4, X10
2263 PADDD X10, X9
2264 PXOR X9, X8
2265 MOVO X8, X13
2266 PSRLL $0x07, X13
2267 PSLLL $0x19, X8
2268 PXOR X13, X8
2269
2270 // Round 7 diagonal step part 1: diagonalize.
2271 PSHUFD $0x39, X8, X8
2272 PSHUFD $0x4e, X9, X9
2273 PSHUFD $0x93, X10, X10
2274
2275 // Round 7 diagonal step part 2: column step.
2276 PSHUFD $0x00, X0, X14
2277 PSHUFD $0x08, X1, X13
2278 PBLENDW $0x0c, X13, X14
2279 PSHUFD $0x10, X2, X13
2280 PBLENDW $0xf0, X13, X14
2281 MOVOU permuted_blake_consts<>+416(SB), X15
2282 PXOR X14, X15
2283 PADDD X15, X7
2284 PSHUFD $0x2c, X0, X14
2285 PSHUFD $0x03, X1, X13
2286 PBLENDW $0x03, X13, X14
2287 PSHUFD $0xc0, X2, X13
2288 PBLENDW $0xc0, X13, X14
2289 MOVOU permuted_blake_consts<>+432(SB), X15
2290 PXOR X14, X15
2291 PADDD X8, X7
2292 PXOR X7, X10
2293 PSHUFB X5, X10
2294 PADDD X10, X9
2295 PXOR X9, X8
2296 MOVO X8, X13
2297 PSRLL $0x0c, X13
2298 PSLLL $0x14, X8
2299 PXOR X13, X8
2300 PADDD X15, X7
2301 PADDD X8, X7
2302 PXOR X7, X10
2303 PSHUFB X4, X10
2304 PADDD X10, X9
2305 PXOR X9, X8
2306 MOVO X8, X13
2307 PSRLL $0x07, X13
2308 PSLLL $0x19, X8
2309 PXOR X13, X8
2310
2311 // Round 7 diagonal step part 3: undiagonalize.
2312 PSHUFD $0x93, X8, X8
2313 PSHUFD $0x4e, X9, X9
2314 PSHUFD $0x39, X10, X10
2315
2316 // Round 8 column step.
2317 PSHUFD $0xc0, X0, X14
2318 PSHUFD $0x0c, X1, X13
2319 PBLENDW $0x0c, X13, X14
2320 PSHUFD $0x01, X3, X13
2321 PBLENDW $0x33, X13, X14
2322 MOVOU permuted_blake_consts<>+448(SB), X15
2323 PXOR X14, X15
2324 PADDD X15, X7
2325 PSHUFD $0x10, X0, X14
2326 PSHUFD $0x43, X2, X13
2327 PBLENDW $0xc3, X13, X14
2328 PSHUFD $0x08, X3, X13
2329 PBLENDW $0x0c, X13, X14
2330 MOVOU permuted_blake_consts<>+464(SB), X15
2331 PXOR X14, X15
2332 PADDD X8, X7
2333 PXOR X7, X10
2334 PSHUFB X5, X10
2335 PADDD X10, X9
2336 PXOR X9, X8
2337 MOVO X8, X13
2338 PSRLL $0x0c, X13
2339 PSLLL $0x14, X8
2340 PXOR X13, X8
2341 PADDD X15, X7
2342 PADDD X8, X7
2343 PXOR X7, X10
2344 PSHUFB X4, X10
2345 PADDD X10, X9
2346 PXOR X9, X8
2347 MOVO X8, X13
2348 PSRLL $0x07, X13
2349 PSLLL $0x19, X8
2350 PXOR X13, X8
2351
2352 // Round 8 diagonal step part 1: diagonalize.
2353 PSHUFD $0x39, X8, X8
2354 PSHUFD $0x4e, X9, X9
2355 PSHUFD $0x93, X10, X10
2356
2357 // Round 8 diagonal step part 2: column step.
2358 PSHUFD $0x80, X0, X14
2359 PSHUFD $0x01, X1, X13
2360 PBLENDW $0x03, X13, X14
2361 PSHUFD $0x00, X2, X13
2362 PBLENDW $0x30, X13, X14
2363 PSHUFD $0x0c, X3, X13
2364 PBLENDW $0x0c, X13, X14
2365 MOVOU permuted_blake_consts<>+480(SB), X15
2366 PXOR X14, X15
2367 PADDD X15, X7
2368 PSHUFD $0x00, X0, X14
2369 PSHUFD $0x20, X1, X13
2370 PBLENDW $0x3c, X13, X14
2371 PSHUFD $0x80, X2, X13
2372 PBLENDW $0xc0, X13, X14
2373 MOVOU permuted_blake_consts<>+496(SB), X15
2374 PXOR X14, X15
2375 PADDD X8, X7
2376 PXOR X7, X10
2377 PSHUFB X5, X10
2378 PADDD X10, X9
2379 PXOR X9, X8
2380 MOVO X8, X13
2381 PSRLL $0x0c, X13
2382 PSLLL $0x14, X8
2383 PXOR X13, X8
2384 PADDD X15, X7
2385 PADDD X8, X7
2386 PXOR X7, X10
2387 PSHUFB X4, X10
2388 PADDD X10, X9
2389 PXOR X9, X8
2390 MOVO X8, X13
2391 PSRLL $0x07, X13
2392 PSLLL $0x19, X8
2393 PXOR X13, X8
2394
2395 // Round 8 diagonal step part 3: undiagonalize.
2396 PSHUFD $0x93, X8, X8
2397 PSHUFD $0x4e, X9, X9
2398 PSHUFD $0x39, X10, X10
2399
2400 // Round 9 column step.
2401 PSHUFD $0x00, X0, X14
2402 PSHUFD $0x02, X1, X13
2403 PBLENDW $0x03, X13, X14
2404 PSHUFD $0x30, X2, X13
2405 PBLENDW $0x30, X13, X14
2406 PSHUFD $0x08, X3, X13
2407 PBLENDW $0x0c, X13, X14
2408 MOVOU permuted_blake_consts<>+512(SB), X15
2409 PXOR X14, X15
2410 PADDD X15, X7
2411 PSHUFD $0x30, X0, X14
2412 PSHUFD $0x04, X2, X13
2413 PBLENDW $0xcc, X13, X14
2414 PSHUFD $0x03, X3, X13
2415 PBLENDW $0x03, X13, X14
2416 MOVOU permuted_blake_consts<>+528(SB), X15
2417 PXOR X14, X15
2418 PADDD X8, X7
2419 PXOR X7, X10
2420 PSHUFB X5, X10
2421 PADDD X10, X9
2422 PXOR X9, X8
2423 MOVO X8, X13
2424 PSRLL $0x0c, X13
2425 PSLLL $0x14, X8
2426 PXOR X13, X8
2427 PADDD X15, X7
2428 PADDD X8, X7
2429 PXOR X7, X10
2430 PSHUFB X4, X10
2431 PADDD X10, X9
2432 PXOR X9, X8
2433 MOVO X8, X13
2434 PSRLL $0x07, X13
2435 PSLLL $0x19, X8
2436 PXOR X13, X8
2437
2438 // Round 9 diagonal step part 1: diagonalize.
2439 PSHUFD $0x39, X8, X8
2440 PSHUFD $0x4e, X9, X9
2441 PSHUFD $0x93, X10, X10
2442
2443 // Round 9 diagonal step part 2: column step.
2444 PSHUFD $0x10, X0, X14
2445 PSHUFD $0x80, X2, X13
2446 PBLENDW $0xc0, X13, X14
2447 PSHUFD $0x04, X3, X13
2448 PBLENDW $0x0f, X13, X14
2449 MOVOU permuted_blake_consts<>+544(SB), X15
2450 PXOR X14, X15
2451 PADDD X15, X7
2452 PSHUFD $0x02, X0, X14
2453 PSHUFD $0x4c, X1, X13
2454 PBLENDW $0xfc, X13, X14
2455 MOVOU permuted_blake_consts<>+560(SB), X15
2456 PXOR X14, X15
2457 PADDD X8, X7
2458 PXOR X7, X10
2459 PSHUFB X5, X10
2460 PADDD X10, X9
2461 PXOR X9, X8
2462 MOVO X8, X13
2463 PSRLL $0x0c, X13
2464 PSLLL $0x14, X8
2465 PXOR X13, X8
2466 PADDD X15, X7
2467 PADDD X8, X7
2468 PXOR X7, X10
2469 PSHUFB X4, X10
2470 PADDD X10, X9
2471 PXOR X9, X8
2472 MOVO X8, X13
2473 PSRLL $0x07, X13
2474 PSLLL $0x19, X8
2475 PXOR X13, X8
2476
2477 // Round 9 diagonal step part 3: undiagonalize.
2478 PSHUFD $0x93, X8, X8
2479 PSHUFD $0x4e, X9, X9
2480 PSHUFD $0x39, X10, X10
2481
2482 // Round 10 column step.
2483 PSHUFD $0x40, X0, X14
2484 PSHUFD $0x30, X1, X13
2485 PBLENDW $0x30, X13, X14
2486 PSHUFD $0x02, X2, X13
2487 PBLENDW $0x0f, X13, X14
2488 MOVOU permuted_blake_consts<>+576(SB), X15
2489 PXOR X14, X15
2490 PADDD X15, X7
2491 PSHUFD $0x02, X0, X14
2492 PSHUFD $0x60, X1, X13
2493 PBLENDW $0xfc, X13, X14
2494 MOVOU permuted_blake_consts<>+592(SB), X15
2495 PXOR X14, X15
2496 PADDD X8, X7
2497 PXOR X7, X10
2498 PSHUFB X5, X10
2499 PADDD X10, X9
2500 PXOR X9, X8
2501 MOVO X8, X13
2502 PSRLL $0x0c, X13
2503 PSLLL $0x14, X8
2504 PXOR X13, X8
2505 PADDD X15, X7
2506 PADDD X8, X7
2507 PXOR X7, X10
2508 PSHUFB X4, X10
2509 PADDD X10, X9
2510 PXOR X9, X8
2511 MOVO X8, X13
2512 PSRLL $0x07, X13
2513 PSLLL $0x19, X8
2514 PXOR X13, X8
2515
2516 // Round 10 diagonal step part 1: diagonalize.
2517 PSHUFD $0x39, X8, X8
2518 PSHUFD $0x4e, X9, X9
2519 PSHUFD $0x93, X10, X10
2520
2521 // Round 10 diagonal step part 2: column step.
2522 PSHUFD $0x30, X0, X14
2523 PSHUFD $0x04, X2, X13
2524 PBLENDW $0x0c, X13, X14
2525 PSHUFD $0x43, X3, X13
2526 PBLENDW $0xc3, X13, X14
2527 MOVOU permuted_blake_consts<>+608(SB), X15
2528 PXOR X14, X15
2529 PADDD X15, X7
2530 PSHUFD $0x00, X0, X14
2531 PSHUFD $0x03, X2, X13
2532 PBLENDW $0x03, X13, X14
2533 PSHUFD $0x08, X3, X13
2534 PBLENDW $0x3c, X13, X14
2535 MOVOU permuted_blake_consts<>+624(SB), X15
2536 PXOR X14, X15
2537 PADDD X8, X7
2538 PXOR X7, X10
2539 PSHUFB X5, X10
2540 PADDD X10, X9
2541 PXOR X9, X8
2542 MOVO X8, X13
2543 PSRLL $0x0c, X13
2544 PSLLL $0x14, X8
2545 PXOR X13, X8
2546 PADDD X15, X7
2547 PADDD X8, X7
2548 PXOR X7, X10
2549 PSHUFB X4, X10
2550 PADDD X10, X9
2551 PXOR X9, X8
2552 MOVO X8, X13
2553 PSRLL $0x07, X13
2554 PSLLL $0x19, X8
2555 PXOR X13, X8
2556
2557 // Round 10 diagonal step part 3: undiagonalize.
2558 PSHUFD $0x93, X8, X8
2559 PSHUFD $0x4e, X9, X9
2560 PSHUFD $0x39, X10, X10
2561
2562 // Round 11 column step.
2563 PSHUFD $0x08, X0, X14
2564 PSHUFD $0x80, X1, X13
2565 PBLENDW $0xf0, X13, X14
2566 MOVOU permuted_blake_consts<>+0(SB), X15
2567 PXOR X14, X15
2568 PADDD X15, X7
2569 PSHUFD $0x0d, X0, X14
2570 PSHUFD $0xd0, X1, X13
2571 PBLENDW $0xf0, X13, X14
2572 MOVOU permuted_blake_consts<>+16(SB), X15
2573 PXOR X14, X15
2574 PADDD X8, X7
2575 PXOR X7, X10
2576 PSHUFB X5, X10
2577 PADDD X10, X9
2578 PXOR X9, X8
2579 MOVO X8, X13
2580 PSRLL $0x0c, X13
2581 PSLLL $0x14, X8
2582 PXOR X13, X8
2583 PADDD X15, X7
2584 PADDD X8, X7
2585 PXOR X7, X10
2586 PSHUFB X4, X10
2587 PADDD X10, X9
2588 PXOR X9, X8
2589 MOVO X8, X13
2590 PSRLL $0x07, X13
2591 PSLLL $0x19, X8
2592 PXOR X13, X8
2593
2594 // Round 11 diagonal step part 1: diagonalize.
2595 PSHUFD $0x39, X8, X8
2596 PSHUFD $0x4e, X9, X9
2597 PSHUFD $0x93, X10, X10
2598
2599 // Round 11 diagonal step part 2: column step.
2600 PSHUFD $0x08, X2, X14
2601 PSHUFD $0x80, X3, X13
2602 PBLENDW $0xf0, X13, X14
2603 MOVOU permuted_blake_consts<>+32(SB), X15
2604 PXOR X14, X15
2605 PADDD X15, X7
2606 PSHUFD $0x0d, X2, X14
2607 PSHUFD $0xd0, X3, X13
2608 PBLENDW $0xf0, X13, X14
2609 MOVOU permuted_blake_consts<>+48(SB), X15
2610 PXOR X14, X15
2611 PADDD X8, X7
2612 PXOR X7, X10
2613 PSHUFB X5, X10
2614 PADDD X10, X9
2615 PXOR X9, X8
2616 MOVO X8, X13
2617 PSRLL $0x0c, X13
2618 PSLLL $0x14, X8
2619 PXOR X13, X8
2620 PADDD X15, X7
2621 PADDD X8, X7
2622 PXOR X7, X10
2623 PSHUFB X4, X10
2624 PADDD X10, X9
2625 PXOR X9, X8
2626 MOVO X8, X13
2627 PSRLL $0x07, X13
2628 PSLLL $0x19, X8
2629 PXOR X13, X8
2630
2631 // Round 11 diagonal step part 3: undiagonalize.
2632 PSHUFD $0x93, X8, X8
2633 PSHUFD $0x4e, X9, X9
2634 PSHUFD $0x39, X10, X10
2635
2636 // Round 12 column step.
2637 PSHUFD $0x00, X1, X14
2638 PSHUFD $0x10, X2, X13
2639 PBLENDW $0x30, X13, X14
2640 PSHUFD $0x42, X3, X13
2641 PBLENDW $0xc3, X13, X14
2642 MOVOU permuted_blake_consts<>+64(SB), X15
2643 PXOR X14, X15
2644 PADDD X15, X7
2645 PSHUFD $0x80, X1, X14
2646 PSHUFD $0x02, X2, X13
2647 PBLENDW $0x0f, X13, X14
2648 PSHUFD $0x30, X3, X13
2649 PBLENDW $0x30, X13, X14
2650 MOVOU permuted_blake_consts<>+80(SB), X15
2651 PXOR X14, X15
2652 PADDD X8, X7
2653 PXOR X7, X10
2654 PSHUFB X5, X10
2655 PADDD X10, X9
2656 PXOR X9, X8
2657 MOVO X8, X13
2658 PSRLL $0x0c, X13
2659 PSLLL $0x14, X8
2660 PXOR X13, X8
2661 PADDD X15, X7
2662 PADDD X8, X7
2663 PXOR X7, X10
2664 PSHUFB X4, X10
2665 PADDD X10, X9
2666 PXOR X9, X8
2667 MOVO X8, X13
2668 PSRLL $0x07, X13
2669 PSLLL $0x19, X8
2670 PXOR X13, X8
2671
2672 // Round 12 diagonal step part 1: diagonalize.
2673 PSHUFD $0x39, X8, X8
2674 PSHUFD $0x4e, X9, X9
2675 PSHUFD $0x93, X10, X10
2676
2677 // Round 12 diagonal step part 2: column step.
2678 PSHUFD $0x01, X0, X14
2679 PSHUFD $0x40, X1, X13
2680 PBLENDW $0xc0, X13, X14
2681 PSHUFD $0x30, X2, X13
2682 PBLENDW $0x30, X13, X14
2683 MOVOU permuted_blake_consts<>+96(SB), X15
2684 PXOR X14, X15
2685 PADDD X15, X7
2686 PSHUFD $0xc8, X0, X14
2687 PSHUFD $0x30, X1, X13
2688 PBLENDW $0x30, X13, X14
2689 PSHUFD $0x00, X3, X13
2690 PBLENDW $0x03, X13, X14
2691 MOVOU permuted_blake_consts<>+112(SB), X15
2692 PXOR X14, X15
2693 PADDD X8, X7
2694 PXOR X7, X10
2695 PSHUFB X5, X10
2696 PADDD X10, X9
2697 PXOR X9, X8
2698 MOVO X8, X13
2699 PSRLL $0x0c, X13
2700 PSLLL $0x14, X8
2701 PXOR X13, X8
2702 PADDD X15, X7
2703 PADDD X8, X7
2704 PXOR X7, X10
2705 PSHUFB X4, X10
2706 PADDD X10, X9
2707 PXOR X9, X8
2708 MOVO X8, X13
2709 PSRLL $0x07, X13
2710 PSLLL $0x19, X8
2711 PXOR X13, X8
2712
2713 // Round 12 diagonal step part 3: undiagonalize.
2714 PSHUFD $0x93, X8, X8
2715 PSHUFD $0x4e, X9, X9
2716 PSHUFD $0x39, X10, X10
2717
2718 // Round 13 column step.
2719 PSHUFD $0x10, X1, X14
2720 PSHUFD $0x03, X2, X13
2721 PBLENDW $0x03, X13, X14
2722 PSHUFD $0xc0, X3, X13
2723 PBLENDW $0xcc, X13, X14
2724 MOVOU permuted_blake_consts<>+128(SB), X15
2725 PXOR X14, X15
2726 PADDD X15, X7
2727 PSHUFD $0x20, X0, X14
2728 PSHUFD $0x00, X2, X13
2729 PBLENDW $0x03, X13, X14
2730 PSHUFD $0x40, X3, X13
2731 PBLENDW $0xc0, X13, X14
2732 MOVOU permuted_blake_consts<>+144(SB), X15
2733 PXOR X14, X15
2734 PADDD X8, X7
2735 PXOR X7, X10
2736 PSHUFB X5, X10
2737 PADDD X10, X9
2738 PXOR X9, X8
2739 MOVO X8, X13
2740 PSRLL $0x0c, X13
2741 PSLLL $0x14, X8
2742 PXOR X13, X8
2743 PADDD X15, X7
2744 PADDD X8, X7
2745 PXOR X7, X10
2746 PSHUFB X4, X10
2747 PADDD X10, X9
2748 PXOR X9, X8
2749 MOVO X8, X13
2750 PSRLL $0x07, X13
2751 PSLLL $0x19, X8
2752 PXOR X13, X8
2753
2754 // Round 13 diagonal step part 1: diagonalize.
2755 PSHUFD $0x39, X8, X8
2756 PSHUFD $0x4e, X9, X9
2757 PSHUFD $0x93, X10, X10
2758
2759 // Round 13 diagonal step part 2: column step.
2760 PSHUFD $0x0c, X0, X14
2761 PSHUFD $0x30, X1, X13
2762 PBLENDW $0x30, X13, X14
2763 PSHUFD $0x42, X2, X13
2764 PBLENDW $0xc3, X13, X14
2765 MOVOU permuted_blake_consts<>+160(SB), X15
2766 PXOR X14, X15
2767 PADDD X15, X7
2768 PSHUFD $0x10, X0, X14
2769 PSHUFD $0x08, X1, X13
2770 PBLENDW $0xcc, X13, X14
2771 PSHUFD $0x02, X3, X13
2772 PBLENDW $0x03, X13, X14
2773 MOVOU permuted_blake_consts<>+176(SB), X15
2774 PXOR X14, X15
2775 PADDD X8, X7
2776 PXOR X7, X10
2777 PSHUFB X5, X10
2778 PADDD X10, X9
2779 PXOR X9, X8
2780 MOVO X8, X13
2781 PSRLL $0x0c, X13
2782 PSLLL $0x14, X8
2783 PXOR X13, X8
2784 PADDD X15, X7
2785 PADDD X8, X7
2786 PXOR X7, X10
2787 PSHUFB X4, X10
2788 PADDD X10, X9
2789 PXOR X9, X8
2790 MOVO X8, X13
2791 PSRLL $0x07, X13
2792 PSLLL $0x19, X8
2793 PXOR X13, X8
2794
2795 // Round 13 diagonal step part 3: undiagonalize.
2796 PSHUFD $0x93, X8, X8
2797 PSHUFD $0x4e, X9, X9
2798 PSHUFD $0x39, X10, X10
2799
2800 // Round 14 column step.
2801 PSHUFD $0x0c, X0, X14
2802 PSHUFD $0x03, X1, X13
2803 PBLENDW $0x03, X13, X14
2804 PSHUFD $0xc0, X2, X13
2805 PBLENDW $0xc0, X13, X14
2806 PSHUFD $0x10, X3, X13
2807 PBLENDW $0x30, X13, X14
2808 MOVOU permuted_blake_consts<>+192(SB), X15
2809 PXOR X14, X15
2810 PADDD X15, X7
2811 PSHUFD $0x04, X0, X14
2812 PSHUFD $0x01, X2, X13
2813 PBLENDW $0x03, X13, X14
2814 PSHUFD $0x80, X3, X13
2815 PBLENDW $0xf0, X13, X14
2816 MOVOU permuted_blake_consts<>+208(SB), X15
2817 PXOR X14, X15
2818 PADDD X8, X7
2819 PXOR X7, X10
2820 PSHUFB X5, X10
2821 PADDD X10, X9
2822 PXOR X9, X8
2823 MOVO X8, X13
2824 PSRLL $0x0c, X13
2825 PSLLL $0x14, X8
2826 PXOR X13, X8
2827 PADDD X15, X7
2828 PADDD X8, X7
2829 PXOR X7, X10
2830 PSHUFB X4, X10
2831 PADDD X10, X9
2832 PXOR X9, X8
2833 MOVO X8, X13
2834 PSRLL $0x07, X13
2835 PSLLL $0x19, X8
2836 PXOR X13, X8
2837
2838 // Round 14 diagonal step part 1: diagonalize.
2839 PSHUFD $0x39, X8, X8
2840 PSHUFD $0x4e, X9, X9
2841 PSHUFD $0x93, X10, X10
2842
2843 // Round 14 diagonal step part 2: column step.
2844 PSHUFD $0x02, X0, X14
2845 PSHUFD $0x04, X1, X13
2846 PBLENDW $0x3c, X13, X14
2847 PSHUFD $0xc0, X3, X13
2848 PBLENDW $0xc0, X13, X14
2849 MOVOU permuted_blake_consts<>+224(SB), X15
2850 PXOR X14, X15
2851 PADDD X15, X7
2852 PSHUFD $0x00, X0, X14
2853 PSHUFD $0x02, X1, X13
2854 PBLENDW $0x03, X13, X14
2855 PSHUFD $0x08, X2, X13
2856 PBLENDW $0xcc, X13, X14
2857 MOVOU permuted_blake_consts<>+240(SB), X15
2858 PXOR X14, X15
2859 PADDD X8, X7
2860 PXOR X7, X10
2861 PSHUFB X5, X10
2862 PADDD X10, X9
2863 PXOR X9, X8
2864 MOVO X8, X13
2865 PSRLL $0x0c, X13
2866 PSLLL $0x14, X8
2867 PXOR X13, X8
2868 PADDD X15, X7
2869 PADDD X8, X7
2870 PXOR X7, X10
2871 PSHUFB X4, X10
2872 PADDD X10, X9
2873 PXOR X9, X8
2874 MOVO X8, X13
2875 PSRLL $0x07, X13
2876 PSLLL $0x19, X8
2877 PXOR X13, X8
2878
2879 // Round 14 diagonal step part 3: undiagonalize.
2880 PSHUFD $0x93, X8, X8
2881 PSHUFD $0x4e, X9, X9
2882 PSHUFD $0x39, X10, X10
2883
2884 // Finally the chain value is defined as:
2885 // h'0 = h0^s0^v0^v8
2886 // h'1 = h1^s1^v1^v9
2887 // h'2 = h2^s2^v2^va
2888 // h'3 = h3^s3^v3^vb
2889 // h'4 = h4^s0^v4^vc
2890 // h'5 = h5^s1^v5^vd
2891 // h'6 = h6^s2^v6^ve
2892 // h'7 = h7^s3^v7^vf
2893 PXOR X11, X7
2894 PXOR X6, X7
2895 PXOR X9, X7
2896 PXOR X12, X8
2897 PXOR X6, X8
2898 PXOR X10, X8
2899
2900 // Either terminate the loop when there are no more full blocks
2901 // to compress or move the message pointer to the next block of
2902 // bytes to compress, increment the message bits counter
2903 // accordingly, and loop back around to compress it.
2904 DECQ BX
2905 JZ done
2906 LEAQ 64(DX), DX
2907 ADDQ $0x00000200, CX
2908 JMP compressLoop
2909
2910 done:
2911 // Output the resulting chain value.
2912 MOVOU X7, (AX)
2913 MOVOU X8, 16(AX)
2914 RET
2915
2916 // func blocksAVX(state *State, msg []byte, counter uint64)
2917 // Requires: AVX
2918 TEXT ·blocksAVX(SB), NOSPLIT, $0-40
2919 MOVQ state+0(FP), AX
2920 MOVQ counter+32(FP), CX
2921 MOVQ msg_base+8(FP), DX
2922 MOVQ msg_len+16(FP), BX
2923
2924 // Populate registers for fast right rotations.
2925 VMOVDQU shuffle_rotr8_4x32<>+0(SB), X4
2926 VMOVDQU shuffle_rotr16_4x32<>+0(SB), X5
2927
2928 // Convert message len to number of blocks for loop counter.
2929 SHRQ $0x06, BX
2930
2931 // Initialize state matrix.
2932 // row0 = |v0 v1 v2 v3| = | h0 h1 h2 h3 |
2933 // row1 = |v4 v5 v6 v7| = | h4 h5 h6 h7 |
2934 VMOVDQU 32(AX), X6
2935 VMOVDQU (AX), X7
2936 VMOVDQU 16(AX), X8
2937
2938 compressLoop:
2939 // row2 = |v8 v9 va vb| = |s0^c0 s1^c1 s2^c2 s3^c3|
2940 // row3 = |vc vd ve vf| = |t0^c4 t0^c5 t1^c6 t1^c7|
2941 VMOVDQU first_8_blake_consts<>+0(SB), X9
2942 VPXOR X6, X9, X9
2943 VMOVQ CX, X10
2944 VPSHUFD $0x50, X10, X10
2945 VPXOR first_8_blake_consts<>+16(SB), X10, X10
2946 VMOVDQA X7, X11
2947 VMOVDQA X8, X12
2948
2949 // Convert message to big endian.
2950 VMOVDQU shuffle_le_to_be_4x32<>+0(SB), X13
2951 VMOVDQU (DX), X0
2952 VPSHUFB X13, X0, X0
2953 VMOVDQU 16(DX), X1
2954 VPSHUFB X13, X1, X1
2955 VMOVDQU 32(DX), X2
2956 VPSHUFB X13, X2, X2
2957 VMOVDQU 48(DX), X3
2958 VPSHUFB X13, X3, X3
2959
2960 // Round 1 column step.
2961 VPSHUFD $0x08, X0, X14
2962 VPSHUFD $0x80, X1, X13
2963 VPBLENDW $0xf0, X13, X14, X14
2964 VMOVDQU permuted_blake_consts<>+0(SB), X15
2965 VPXOR X14, X15, X15
2966 VPADDD X15, X7, X7
2967 VPSHUFD $0x0d, X0, X14
2968 VPSHUFD $0xd0, X1, X13
2969 VPBLENDW $0xf0, X13, X14, X14
2970 VMOVDQU permuted_blake_consts<>+16(SB), X15
2971 VPXOR X14, X15, X15
2972 VPADDD X8, X7, X7
2973 VPXOR X7, X10, X10
2974 VPSHUFB X5, X10, X10
2975 VPADDD X10, X9, X9
2976 VPXOR X9, X8, X8
2977 VPSRLD $0x0c, X8, X13
2978 VPSLLD $0x14, X8, X8
2979 VPXOR X13, X8, X8
2980 VPADDD X15, X7, X7
2981 VPADDD X8, X7, X7
2982 VPXOR X7, X10, X10
2983 VPSHUFB X4, X10, X10
2984 VPADDD X10, X9, X9
2985 VPXOR X9, X8, X8
2986 VPSRLD $0x07, X8, X13
2987 VPSLLD $0x19, X8, X8
2988 VPXOR X13, X8, X8
2989
2990 // Round 1 diagonal step part 1: diagonalize.
2991 VPSHUFD $0x39, X8, X8
2992 VPSHUFD $0x4e, X9, X9
2993 VPSHUFD $0x93, X10, X10
2994
2995 // Round 1 diagonal step part 2: column step.
2996 VPSHUFD $0x08, X2, X14
2997 VPSHUFD $0x80, X3, X13
2998 VPBLENDW $0xf0, X13, X14, X14
2999 VMOVDQU permuted_blake_consts<>+32(SB), X15
3000 VPXOR X14, X15, X15
3001 VPADDD X15, X7, X7
3002 VPSHUFD $0x0d, X2, X14
3003 VPSHUFD $0xd0, X3, X13
3004 VPBLENDW $0xf0, X13, X14, X14
3005 VMOVDQU permuted_blake_consts<>+48(SB), X15
3006 VPXOR X14, X15, X15
3007 VPADDD X8, X7, X7
3008 VPXOR X7, X10, X10
3009 VPSHUFB X5, X10, X10
3010 VPADDD X10, X9, X9
3011 VPXOR X9, X8, X8
3012 VPSRLD $0x0c, X8, X13
3013 VPSLLD $0x14, X8, X8
3014 VPXOR X13, X8, X8
3015 VPADDD X15, X7, X7
3016 VPADDD X8, X7, X7
3017 VPXOR X7, X10, X10
3018 VPSHUFB X4, X10, X10
3019 VPADDD X10, X9, X9
3020 VPXOR X9, X8, X8
3021 VPSRLD $0x07, X8, X13
3022 VPSLLD $0x19, X8, X8
3023 VPXOR X13, X8, X8
3024
3025 // Round 1 diagonal step part 3: undiagonalize.
3026 VPSHUFD $0x93, X8, X8
3027 VPSHUFD $0x4e, X9, X9
3028 VPSHUFD $0x39, X10, X10
3029
3030 // Round 2 column step.
3031 VPSHUFD $0x00, X1, X14
3032 VPSHUFD $0x10, X2, X13
3033 VPBLENDW $0x30, X13, X14, X14
3034 VPSHUFD $0x42, X3, X13
3035 VPBLENDW $0xc3, X13, X14, X14
3036 VMOVDQU permuted_blake_consts<>+64(SB), X15
3037 VPXOR X14, X15, X15
3038 VPADDD X15, X7, X7
3039 VPSHUFD $0x80, X1, X14
3040 VPSHUFD $0x02, X2, X13
3041 VPBLENDW $0x0f, X13, X14, X14
3042 VPSHUFD $0x30, X3, X13
3043 VPBLENDW $0x30, X13, X14, X14
3044 VMOVDQU permuted_blake_consts<>+80(SB), X15
3045 VPXOR X14, X15, X15
3046 VPADDD X8, X7, X7
3047 VPXOR X7, X10, X10
3048 VPSHUFB X5, X10, X10
3049 VPADDD X10, X9, X9
3050 VPXOR X9, X8, X8
3051 VPSRLD $0x0c, X8, X13
3052 VPSLLD $0x14, X8, X8
3053 VPXOR X13, X8, X8
3054 VPADDD X15, X7, X7
3055 VPADDD X8, X7, X7
3056 VPXOR X7, X10, X10
3057 VPSHUFB X4, X10, X10
3058 VPADDD X10, X9, X9
3059 VPXOR X9, X8, X8
3060 VPSRLD $0x07, X8, X13
3061 VPSLLD $0x19, X8, X8
3062 VPXOR X13, X8, X8
3063
3064 // Round 2 diagonal step part 1: diagonalize.
3065 VPSHUFD $0x39, X8, X8
3066 VPSHUFD $0x4e, X9, X9
3067 VPSHUFD $0x93, X10, X10
3068
3069 // Round 2 diagonal step part 2: column step.
3070 VPSHUFD $0x01, X0, X14
3071 VPSHUFD $0x40, X1, X13
3072 VPBLENDW $0xc0, X13, X14, X14
3073 VPSHUFD $0x30, X2, X13
3074 VPBLENDW $0x30, X13, X14, X14
3075 VMOVDQU permuted_blake_consts<>+96(SB), X15
3076 VPXOR X14, X15, X15
3077 VPADDD X15, X7, X7
3078 VPSHUFD $0xc8, X0, X14
3079 VPSHUFD $0x30, X1, X13
3080 VPBLENDW $0x30, X13, X14, X14
3081 VPSHUFD $0x00, X3, X13
3082 VPBLENDW $0x03, X13, X14, X14
3083 VMOVDQU permuted_blake_consts<>+112(SB), X15
3084 VPXOR X14, X15, X15
3085 VPADDD X8, X7, X7
3086 VPXOR X7, X10, X10
3087 VPSHUFB X5, X10, X10
3088 VPADDD X10, X9, X9
3089 VPXOR X9, X8, X8
3090 VPSRLD $0x0c, X8, X13
3091 VPSLLD $0x14, X8, X8
3092 VPXOR X13, X8, X8
3093 VPADDD X15, X7, X7
3094 VPADDD X8, X7, X7
3095 VPXOR X7, X10, X10
3096 VPSHUFB X4, X10, X10
3097 VPADDD X10, X9, X9
3098 VPXOR X9, X8, X8
3099 VPSRLD $0x07, X8, X13
3100 VPSLLD $0x19, X8, X8
3101 VPXOR X13, X8, X8
3102
3103 // Round 2 diagonal step part 3: undiagonalize.
3104 VPSHUFD $0x93, X8, X8
3105 VPSHUFD $0x4e, X9, X9
3106 VPSHUFD $0x39, X10, X10
3107
3108 // Round 3 column step.
3109 VPSHUFD $0x10, X1, X14
3110 VPSHUFD $0x03, X2, X13
3111 VPBLENDW $0x03, X13, X14, X14
3112 VPSHUFD $0xc0, X3, X13
3113 VPBLENDW $0xcc, X13, X14, X14
3114 VMOVDQU permuted_blake_consts<>+128(SB), X15
3115 VPXOR X14, X15, X15
3116 VPADDD X15, X7, X7
3117 VPSHUFD $0x20, X0, X14
3118 VPSHUFD $0x00, X2, X13
3119 VPBLENDW $0x03, X13, X14, X14
3120 VPSHUFD $0x40, X3, X13
3121 VPBLENDW $0xc0, X13, X14, X14
3122 VMOVDQU permuted_blake_consts<>+144(SB), X15
3123 VPXOR X14, X15, X15
3124 VPADDD X8, X7, X7
3125 VPXOR X7, X10, X10
3126 VPSHUFB X5, X10, X10
3127 VPADDD X10, X9, X9
3128 VPXOR X9, X8, X8
3129 VPSRLD $0x0c, X8, X13
3130 VPSLLD $0x14, X8, X8
3131 VPXOR X13, X8, X8
3132 VPADDD X15, X7, X7
3133 VPADDD X8, X7, X7
3134 VPXOR X7, X10, X10
3135 VPSHUFB X4, X10, X10
3136 VPADDD X10, X9, X9
3137 VPXOR X9, X8, X8
3138 VPSRLD $0x07, X8, X13
3139 VPSLLD $0x19, X8, X8
3140 VPXOR X13, X8, X8
3141
3142 // Round 3 diagonal step part 1: diagonalize.
3143 VPSHUFD $0x39, X8, X8
3144 VPSHUFD $0x4e, X9, X9
3145 VPSHUFD $0x93, X10, X10
3146
3147 // Round 3 diagonal step part 2: column step.
3148 VPSHUFD $0x0c, X0, X14
3149 VPSHUFD $0x30, X1, X13
3150 VPBLENDW $0x30, X13, X14, X14
3151 VPSHUFD $0x42, X2, X13
3152 VPBLENDW $0xc3, X13, X14, X14
3153 VMOVDQU permuted_blake_consts<>+160(SB), X15
3154 VPXOR X14, X15, X15
3155 VPADDD X15, X7, X7
3156 VPSHUFD $0x10, X0, X14
3157 VPSHUFD $0x08, X1, X13
3158 VPBLENDW $0xcc, X13, X14, X14
3159 VPSHUFD $0x02, X3, X13
3160 VPBLENDW $0x03, X13, X14, X14
3161 VMOVDQU permuted_blake_consts<>+176(SB), X15
3162 VPXOR X14, X15, X15
3163 VPADDD X8, X7, X7
3164 VPXOR X7, X10, X10
3165 VPSHUFB X5, X10, X10
3166 VPADDD X10, X9, X9
3167 VPXOR X9, X8, X8
3168 VPSRLD $0x0c, X8, X13
3169 VPSLLD $0x14, X8, X8
3170 VPXOR X13, X8, X8
3171 VPADDD X15, X7, X7
3172 VPADDD X8, X7, X7
3173 VPXOR X7, X10, X10
3174 VPSHUFB X4, X10, X10
3175 VPADDD X10, X9, X9
3176 VPXOR X9, X8, X8
3177 VPSRLD $0x07, X8, X13
3178 VPSLLD $0x19, X8, X8
3179 VPXOR X13, X8, X8
3180
3181 // Round 3 diagonal step part 3: undiagonalize.
3182 VPSHUFD $0x93, X8, X8
3183 VPSHUFD $0x4e, X9, X9
3184 VPSHUFD $0x39, X10, X10
3185
3186 // Round 4 column step.
3187 VPSHUFD $0x0c, X0, X14
3188 VPSHUFD $0x03, X1, X13
3189 VPBLENDW $0x03, X13, X14, X14
3190 VPSHUFD $0xc0, X2, X13
3191 VPBLENDW $0xc0, X13, X14, X14
3192 VPSHUFD $0x10, X3, X13
3193 VPBLENDW $0x30, X13, X14, X14
3194 VMOVDQU permuted_blake_consts<>+192(SB), X15
3195 VPXOR X14, X15, X15
3196 VPADDD X15, X7, X7
3197 VPSHUFD $0x04, X0, X14
3198 VPSHUFD $0x01, X2, X13
3199 VPBLENDW $0x03, X13, X14, X14
3200 VPSHUFD $0x80, X3, X13
3201 VPBLENDW $0xf0, X13, X14, X14
3202 VMOVDQU permuted_blake_consts<>+208(SB), X15
3203 VPXOR X14, X15, X15
3204 VPADDD X8, X7, X7
3205 VPXOR X7, X10, X10
3206 VPSHUFB X5, X10, X10
3207 VPADDD X10, X9, X9
3208 VPXOR X9, X8, X8
3209 VPSRLD $0x0c, X8, X13
3210 VPSLLD $0x14, X8, X8
3211 VPXOR X13, X8, X8
3212 VPADDD X15, X7, X7
3213 VPADDD X8, X7, X7
3214 VPXOR X7, X10, X10
3215 VPSHUFB X4, X10, X10
3216 VPADDD X10, X9, X9
3217 VPXOR X9, X8, X8
3218 VPSRLD $0x07, X8, X13
3219 VPSLLD $0x19, X8, X8
3220 VPXOR X13, X8, X8
3221
3222 // Round 4 diagonal step part 1: diagonalize.
3223 VPSHUFD $0x39, X8, X8
3224 VPSHUFD $0x4e, X9, X9
3225 VPSHUFD $0x93, X10, X10
3226
3227 // Round 4 diagonal step part 2: column step.
3228 VPSHUFD $0x02, X0, X14
3229 VPSHUFD $0x04, X1, X13
3230 VPBLENDW $0x3c, X13, X14, X14
3231 VPSHUFD $0xc0, X3, X13
3232 VPBLENDW $0xc0, X13, X14, X14
3233 VMOVDQU permuted_blake_consts<>+224(SB), X15
3234 VPXOR X14, X15, X15
3235 VPADDD X15, X7, X7
3236 VPSHUFD $0x00, X0, X14
3237 VPSHUFD $0x02, X1, X13
3238 VPBLENDW $0x03, X13, X14, X14
3239 VPSHUFD $0x08, X2, X13
3240 VPBLENDW $0xcc, X13, X14, X14
3241 VMOVDQU permuted_blake_consts<>+240(SB), X15
3242 VPXOR X14, X15, X15
3243 VPADDD X8, X7, X7
3244 VPXOR X7, X10, X10
3245 VPSHUFB X5, X10, X10
3246 VPADDD X10, X9, X9
3247 VPXOR X9, X8, X8
3248 VPSRLD $0x0c, X8, X13
3249 VPSLLD $0x14, X8, X8
3250 VPXOR X13, X8, X8
3251 VPADDD X15, X7, X7
3252 VPADDD X8, X7, X7
3253 VPXOR X7, X10, X10
3254 VPSHUFB X4, X10, X10
3255 VPADDD X10, X9, X9
3256 VPXOR X9, X8, X8
3257 VPSRLD $0x07, X8, X13
3258 VPSLLD $0x19, X8, X8
3259 VPXOR X13, X8, X8
3260
3261 // Round 4 diagonal step part 3: undiagonalize.
3262 VPSHUFD $0x93, X8, X8
3263 VPSHUFD $0x4e, X9, X9
3264 VPSHUFD $0x39, X10, X10
3265
3266 // Round 5 column step.
3267 VPSHUFD $0x20, X0, X14
3268 VPSHUFD $0x04, X1, X13
3269 VPBLENDW $0x0c, X13, X14, X14
3270 VPSHUFD $0x81, X2, X13
3271 VPBLENDW $0xc3, X13, X14, X14
3272 VMOVDQU permuted_blake_consts<>+256(SB), X15
3273 VPXOR X14, X15, X15
3274 VPADDD X15, X7, X7
3275 VPSHUFD $0x00, X0, X14
3276 VPSHUFD $0x0c, X1, X13
3277 VPBLENDW $0x3c, X13, X14, X14
3278 VPSHUFD $0xc0, X3, X13
3279 VPBLENDW $0xc0, X13, X14, X14
3280 VMOVDQU permuted_blake_consts<>+272(SB), X15
3281 VPXOR X14, X15, X15
3282 VPADDD X8, X7, X7
3283 VPXOR X7, X10, X10
3284 VPSHUFB X5, X10, X10
3285 VPADDD X10, X9, X9
3286 VPXOR X9, X8, X8
3287 VPSRLD $0x0c, X8, X13
3288 VPSLLD $0x14, X8, X8
3289 VPXOR X13, X8, X8
3290 VPADDD X15, X7, X7
3291 VPADDD X8, X7, X7
3292 VPXOR X7, X10, X10
3293 VPSHUFB X4, X10, X10
3294 VPADDD X10, X9, X9
3295 VPXOR X9, X8, X8
3296 VPSRLD $0x07, X8, X13
3297 VPSLLD $0x19, X8, X8
3298 VPXOR X13, X8, X8
3299
3300 // Round 5 diagonal step part 1: diagonalize.
3301 VPSHUFD $0x39, X8, X8
3302 VPSHUFD $0x4e, X9, X9
3303 VPSHUFD $0x93, X10, X10
3304
3305 // Round 5 diagonal step part 2: column step.
3306 VPSHUFD $0xc0, X0, X14
3307 VPSHUFD $0x20, X1, X13
3308 VPBLENDW $0x30, X13, X14, X14
3309 VPSHUFD $0x0c, X2, X13
3310 VPBLENDW $0x0c, X13, X14, X14
3311 VPSHUFD $0x02, X3, X13
3312 VPBLENDW $0x03, X13, X14, X14
3313 VMOVDQU permuted_blake_consts<>+288(SB), X15
3314 VPXOR X14, X15, X15
3315 VPADDD X15, X7, X7
3316 VPSHUFD $0x01, X0, X14
3317 VPSHUFD $0x00, X2, X13
3318 VPBLENDW $0x30, X13, X14, X14
3319 VPSHUFD $0x40, X3, X13
3320 VPBLENDW $0xcc, X13, X14, X14
3321 VMOVDQU permuted_blake_consts<>+304(SB), X15
3322 VPXOR X14, X15, X15
3323 VPADDD X8, X7, X7
3324 VPXOR X7, X10, X10
3325 VPSHUFB X5, X10, X10
3326 VPADDD X10, X9, X9
3327 VPXOR X9, X8, X8
3328 VPSRLD $0x0c, X8, X13
3329 VPSLLD $0x14, X8, X8
3330 VPXOR X13, X8, X8
3331 VPADDD X15, X7, X7
3332 VPADDD X8, X7, X7
3333 VPXOR X7, X10, X10
3334 VPSHUFB X4, X10, X10
3335 VPADDD X10, X9, X9
3336 VPXOR X9, X8, X8
3337 VPSRLD $0x07, X8, X13
3338 VPSLLD $0x19, X8, X8
3339 VPXOR X13, X8, X8
3340
3341 // Round 5 diagonal step part 3: undiagonalize.
3342 VPSHUFD $0x93, X8, X8
3343 VPSHUFD $0x4e, X9, X9
3344 VPSHUFD $0x39, X10, X10
3345
3346 // Round 6 column step.
3347 VPSHUFD $0x02, X0, X14
3348 VPSHUFD $0x08, X1, X13
3349 VPBLENDW $0x0c, X13, X14, X14
3350 VPSHUFD $0x00, X2, X13
3351 VPBLENDW $0xc0, X13, X14, X14
3352 VMOVDQU permuted_blake_consts<>+320(SB), X15
3353 VPXOR X14, X15, X15
3354 VPADDD X15, X7, X7
3355 VPSHUFD $0xc0, X0, X14
3356 VPSHUFD $0x38, X2, X13
3357 VPBLENDW $0x3c, X13, X14, X14
3358 VPSHUFD $0x00, X3, X13
3359 VPBLENDW $0x03, X13, X14, X14
3360 VMOVDQU permuted_blake_consts<>+336(SB), X15
3361 VPXOR X14, X15, X15
3362 VPADDD X8, X7, X7
3363 VPXOR X7, X10, X10
3364 VPSHUFB X5, X10, X10
3365 VPADDD X10, X9, X9
3366 VPXOR X9, X8, X8
3367 VPSRLD $0x0c, X8, X13
3368 VPSLLD $0x14, X8, X8
3369 VPXOR X13, X8, X8
3370 VPADDD X15, X7, X7
3371 VPADDD X8, X7, X7
3372 VPXOR X7, X10, X10
3373 VPSHUFB X4, X10, X10
3374 VPADDD X10, X9, X9
3375 VPXOR X9, X8, X8
3376 VPSRLD $0x07, X8, X13
3377 VPSLLD $0x19, X8, X8
3378 VPXOR X13, X8, X8
3379
3380 // Round 6 diagonal step part 1: diagonalize.
3381 VPSHUFD $0x39, X8, X8
3382 VPSHUFD $0x4e, X9, X9
3383 VPSHUFD $0x93, X10, X10
3384
3385 // Round 6 diagonal step part 2: column step.
3386 VPSHUFD $0x40, X0, X14
3387 VPSHUFD $0x0c, X1, X13
3388 VPBLENDW $0x0f, X13, X14, X14
3389 VPSHUFD $0x30, X3, X13
3390 VPBLENDW $0x30, X13, X14, X14
3391 VMOVDQU permuted_blake_consts<>+352(SB), X15
3392 VPXOR X14, X15, X15
3393 VPADDD X15, X7, X7
3394 VPSHUFD $0x04, X1, X14
3395 VPSHUFD $0x40, X2, X13
3396 VPBLENDW $0xc0, X13, X14, X14
3397 VPSHUFD $0x21, X3, X13
3398 VPBLENDW $0x33, X13, X14, X14
3399 VMOVDQU permuted_blake_consts<>+368(SB), X15
3400 VPXOR X14, X15, X15
3401 VPADDD X8, X7, X7
3402 VPXOR X7, X10, X10
3403 VPSHUFB X5, X10, X10
3404 VPADDD X10, X9, X9
3405 VPXOR X9, X8, X8
3406 VPSRLD $0x0c, X8, X13
3407 VPSLLD $0x14, X8, X8
3408 VPXOR X13, X8, X8
3409 VPADDD X15, X7, X7
3410 VPADDD X8, X7, X7
3411 VPXOR X7, X10, X10
3412 VPSHUFB X4, X10, X10
3413 VPADDD X10, X9, X9
3414 VPXOR X9, X8, X8
3415 VPSRLD $0x07, X8, X13
3416 VPSLLD $0x19, X8, X8
3417 VPXOR X13, X8, X8
3418
3419 // Round 6 diagonal step part 3: undiagonalize.
3420 VPSHUFD $0x93, X8, X8
3421 VPSHUFD $0x4e, X9, X9
3422 VPSHUFD $0x39, X10, X10
3423
3424 // Round 7 column step.
3425 VPSHUFD $0x04, X0, X14
3426 VPSHUFD $0x00, X1, X13
3427 VPBLENDW $0xc0, X13, X14, X14
3428 VPSHUFD $0x20, X3, X13
3429 VPBLENDW $0x33, X13, X14, X14
3430 VMOVDQU permuted_blake_consts<>+384(SB), X15
3431 VPXOR X14, X15, X15
3432 VPADDD X15, X7, X7
3433 VPSHUFD $0x01, X1, X14
3434 VPSHUFD $0x80, X2, X13
3435 VPBLENDW $0xc0, X13, X14, X14
3436 VPSHUFD $0x1c, X3, X13
3437 VPBLENDW $0x3c, X13, X14, X14
3438 VMOVDQU permuted_blake_consts<>+400(SB), X15
3439 VPXOR X14, X15, X15
3440 VPADDD X8, X7, X7
3441 VPXOR X7, X10, X10
3442 VPSHUFB X5, X10, X10
3443 VPADDD X10, X9, X9
3444 VPXOR X9, X8, X8
3445 VPSRLD $0x0c, X8, X13
3446 VPSLLD $0x14, X8, X8
3447 VPXOR X13, X8, X8
3448 VPADDD X15, X7, X7
3449 VPADDD X8, X7, X7
3450 VPXOR X7, X10, X10
3451 VPSHUFB X4, X10, X10
3452 VPADDD X10, X9, X9
3453 VPXOR X9, X8, X8
3454 VPSRLD $0x07, X8, X13
3455 VPSLLD $0x19, X8, X8
3456 VPXOR X13, X8, X8
3457
3458 // Round 7 diagonal step part 1: diagonalize.
3459 VPSHUFD $0x39, X8, X8
3460 VPSHUFD $0x4e, X9, X9
3461 VPSHUFD $0x93, X10, X10
3462
3463 // Round 7 diagonal step part 2: column step.
3464 VPSHUFD $0x00, X0, X14
3465 VPSHUFD $0x08, X1, X13
3466 VPBLENDW $0x0c, X13, X14, X14
3467 VPSHUFD $0x10, X2, X13
3468 VPBLENDW $0xf0, X13, X14, X14
3469 VMOVDQU permuted_blake_consts<>+416(SB), X15
3470 VPXOR X14, X15, X15
3471 VPADDD X15, X7, X7
3472 VPSHUFD $0x2c, X0, X14
3473 VPSHUFD $0x03, X1, X13
3474 VPBLENDW $0x03, X13, X14, X14
3475 VPSHUFD $0xc0, X2, X13
3476 VPBLENDW $0xc0, X13, X14, X14
3477 VMOVDQU permuted_blake_consts<>+432(SB), X15
3478 VPXOR X14, X15, X15
3479 VPADDD X8, X7, X7
3480 VPXOR X7, X10, X10
3481 VPSHUFB X5, X10, X10
3482 VPADDD X10, X9, X9
3483 VPXOR X9, X8, X8
3484 VPSRLD $0x0c, X8, X13
3485 VPSLLD $0x14, X8, X8
3486 VPXOR X13, X8, X8
3487 VPADDD X15, X7, X7
3488 VPADDD X8, X7, X7
3489 VPXOR X7, X10, X10
3490 VPSHUFB X4, X10, X10
3491 VPADDD X10, X9, X9
3492 VPXOR X9, X8, X8
3493 VPSRLD $0x07, X8, X13
3494 VPSLLD $0x19, X8, X8
3495 VPXOR X13, X8, X8
3496
3497 // Round 7 diagonal step part 3: undiagonalize.
3498 VPSHUFD $0x93, X8, X8
3499 VPSHUFD $0x4e, X9, X9
3500 VPSHUFD $0x39, X10, X10
3501
3502 // Round 8 column step.
3503 VPSHUFD $0xc0, X0, X14
3504 VPSHUFD $0x0c, X1, X13
3505 VPBLENDW $0x0c, X13, X14, X14
3506 VPSHUFD $0x01, X3, X13
3507 VPBLENDW $0x33, X13, X14, X14
3508 VMOVDQU permuted_blake_consts<>+448(SB), X15
3509 VPXOR X14, X15, X15
3510 VPADDD X15, X7, X7
3511 VPSHUFD $0x10, X0, X14
3512 VPSHUFD $0x43, X2, X13
3513 VPBLENDW $0xc3, X13, X14, X14
3514 VPSHUFD $0x08, X3, X13
3515 VPBLENDW $0x0c, X13, X14, X14
3516 VMOVDQU permuted_blake_consts<>+464(SB), X15
3517 VPXOR X14, X15, X15
3518 VPADDD X8, X7, X7
3519 VPXOR X7, X10, X10
3520 VPSHUFB X5, X10, X10
3521 VPADDD X10, X9, X9
3522 VPXOR X9, X8, X8
3523 VPSRLD $0x0c, X8, X13
3524 VPSLLD $0x14, X8, X8
3525 VPXOR X13, X8, X8
3526 VPADDD X15, X7, X7
3527 VPADDD X8, X7, X7
3528 VPXOR X7, X10, X10
3529 VPSHUFB X4, X10, X10
3530 VPADDD X10, X9, X9
3531 VPXOR X9, X8, X8
3532 VPSRLD $0x07, X8, X13
3533 VPSLLD $0x19, X8, X8
3534 VPXOR X13, X8, X8
3535
3536 // Round 8 diagonal step part 1: diagonalize.
3537 VPSHUFD $0x39, X8, X8
3538 VPSHUFD $0x4e, X9, X9
3539 VPSHUFD $0x93, X10, X10
3540
3541 // Round 8 diagonal step part 2: column step.
3542 VPSHUFD $0x80, X0, X14
3543 VPSHUFD $0x01, X1, X13
3544 VPBLENDW $0x03, X13, X14, X14
3545 VPSHUFD $0x00, X2, X13
3546 VPBLENDW $0x30, X13, X14, X14
3547 VPSHUFD $0x0c, X3, X13
3548 VPBLENDW $0x0c, X13, X14, X14
3549 VMOVDQU permuted_blake_consts<>+480(SB), X15
3550 VPXOR X14, X15, X15
3551 VPADDD X15, X7, X7
3552 VPSHUFD $0x00, X0, X14
3553 VPSHUFD $0x20, X1, X13
3554 VPBLENDW $0x3c, X13, X14, X14
3555 VPSHUFD $0x80, X2, X13
3556 VPBLENDW $0xc0, X13, X14, X14
3557 VMOVDQU permuted_blake_consts<>+496(SB), X15
3558 VPXOR X14, X15, X15
3559 VPADDD X8, X7, X7
3560 VPXOR X7, X10, X10
3561 VPSHUFB X5, X10, X10
3562 VPADDD X10, X9, X9
3563 VPXOR X9, X8, X8
3564 VPSRLD $0x0c, X8, X13
3565 VPSLLD $0x14, X8, X8
3566 VPXOR X13, X8, X8
3567 VPADDD X15, X7, X7
3568 VPADDD X8, X7, X7
3569 VPXOR X7, X10, X10
3570 VPSHUFB X4, X10, X10
3571 VPADDD X10, X9, X9
3572 VPXOR X9, X8, X8
3573 VPSRLD $0x07, X8, X13
3574 VPSLLD $0x19, X8, X8
3575 VPXOR X13, X8, X8
3576
3577 // Round 8 diagonal step part 3: undiagonalize.
3578 VPSHUFD $0x93, X8, X8
3579 VPSHUFD $0x4e, X9, X9
3580 VPSHUFD $0x39, X10, X10
3581
3582 // Round 9 column step.
3583 VPSHUFD $0x00, X0, X14
3584 VPSHUFD $0x02, X1, X13
3585 VPBLENDW $0x03, X13, X14, X14
3586 VPSHUFD $0x30, X2, X13
3587 VPBLENDW $0x30, X13, X14, X14
3588 VPSHUFD $0x08, X3, X13
3589 VPBLENDW $0x0c, X13, X14, X14
3590 VMOVDQU permuted_blake_consts<>+512(SB), X15
3591 VPXOR X14, X15, X15
3592 VPADDD X15, X7, X7
3593 VPSHUFD $0x30, X0, X14
3594 VPSHUFD $0x04, X2, X13
3595 VPBLENDW $0xcc, X13, X14, X14
3596 VPSHUFD $0x03, X3, X13
3597 VPBLENDW $0x03, X13, X14, X14
3598 VMOVDQU permuted_blake_consts<>+528(SB), X15
3599 VPXOR X14, X15, X15
3600 VPADDD X8, X7, X7
3601 VPXOR X7, X10, X10
3602 VPSHUFB X5, X10, X10
3603 VPADDD X10, X9, X9
3604 VPXOR X9, X8, X8
3605 VPSRLD $0x0c, X8, X13
3606 VPSLLD $0x14, X8, X8
3607 VPXOR X13, X8, X8
3608 VPADDD X15, X7, X7
3609 VPADDD X8, X7, X7
3610 VPXOR X7, X10, X10
3611 VPSHUFB X4, X10, X10
3612 VPADDD X10, X9, X9
3613 VPXOR X9, X8, X8
3614 VPSRLD $0x07, X8, X13
3615 VPSLLD $0x19, X8, X8
3616 VPXOR X13, X8, X8
3617
3618 // Round 9 diagonal step part 1: diagonalize.
3619 VPSHUFD $0x39, X8, X8
3620 VPSHUFD $0x4e, X9, X9
3621 VPSHUFD $0x93, X10, X10
3622
3623 // Round 9 diagonal step part 2: column step.
3624 VPSHUFD $0x10, X0, X14
3625 VPSHUFD $0x80, X2, X13
3626 VPBLENDW $0xc0, X13, X14, X14
3627 VPSHUFD $0x04, X3, X13
3628 VPBLENDW $0x0f, X13, X14, X14
3629 VMOVDQU permuted_blake_consts<>+544(SB), X15
3630 VPXOR X14, X15, X15
3631 VPADDD X15, X7, X7
3632 VPSHUFD $0x02, X0, X14
3633 VPSHUFD $0x4c, X1, X13
3634 VPBLENDW $0xfc, X13, X14, X14
3635 VMOVDQU permuted_blake_consts<>+560(SB), X15
3636 VPXOR X14, X15, X15
3637 VPADDD X8, X7, X7
3638 VPXOR X7, X10, X10
3639 VPSHUFB X5, X10, X10
3640 VPADDD X10, X9, X9
3641 VPXOR X9, X8, X8
3642 VPSRLD $0x0c, X8, X13
3643 VPSLLD $0x14, X8, X8
3644 VPXOR X13, X8, X8
3645 VPADDD X15, X7, X7
3646 VPADDD X8, X7, X7
3647 VPXOR X7, X10, X10
3648 VPSHUFB X4, X10, X10
3649 VPADDD X10, X9, X9
3650 VPXOR X9, X8, X8
3651 VPSRLD $0x07, X8, X13
3652 VPSLLD $0x19, X8, X8
3653 VPXOR X13, X8, X8
3654
3655 // Round 9 diagonal step part 3: undiagonalize.
3656 VPSHUFD $0x93, X8, X8
3657 VPSHUFD $0x4e, X9, X9
3658 VPSHUFD $0x39, X10, X10
3659
3660 // Round 10 column step.
3661 VPSHUFD $0x40, X0, X14
3662 VPSHUFD $0x30, X1, X13
3663 VPBLENDW $0x30, X13, X14, X14
3664 VPSHUFD $0x02, X2, X13
3665 VPBLENDW $0x0f, X13, X14, X14
3666 VMOVDQU permuted_blake_consts<>+576(SB), X15
3667 VPXOR X14, X15, X15
3668 VPADDD X15, X7, X7
3669 VPSHUFD $0x02, X0, X14
3670 VPSHUFD $0x60, X1, X13
3671 VPBLENDW $0xfc, X13, X14, X14
3672 VMOVDQU permuted_blake_consts<>+592(SB), X15
3673 VPXOR X14, X15, X15
3674 VPADDD X8, X7, X7
3675 VPXOR X7, X10, X10
3676 VPSHUFB X5, X10, X10
3677 VPADDD X10, X9, X9
3678 VPXOR X9, X8, X8
3679 VPSRLD $0x0c, X8, X13
3680 VPSLLD $0x14, X8, X8
3681 VPXOR X13, X8, X8
3682 VPADDD X15, X7, X7
3683 VPADDD X8, X7, X7
3684 VPXOR X7, X10, X10
3685 VPSHUFB X4, X10, X10
3686 VPADDD X10, X9, X9
3687 VPXOR X9, X8, X8
3688 VPSRLD $0x07, X8, X13
3689 VPSLLD $0x19, X8, X8
3690 VPXOR X13, X8, X8
3691
3692 // Round 10 diagonal step part 1: diagonalize.
3693 VPSHUFD $0x39, X8, X8
3694 VPSHUFD $0x4e, X9, X9
3695 VPSHUFD $0x93, X10, X10
3696
3697 // Round 10 diagonal step part 2: column step.
3698 VPSHUFD $0x30, X0, X14
3699 VPSHUFD $0x04, X2, X13
3700 VPBLENDW $0x0c, X13, X14, X14
3701 VPSHUFD $0x43, X3, X13
3702 VPBLENDW $0xc3, X13, X14, X14
3703 VMOVDQU permuted_blake_consts<>+608(SB), X15
3704 VPXOR X14, X15, X15
3705 VPADDD X15, X7, X7
3706 VPSHUFD $0x00, X0, X14
3707 VPSHUFD $0x03, X2, X13
3708 VPBLENDW $0x03, X13, X14, X14
3709 VPSHUFD $0x08, X3, X13
3710 VPBLENDW $0x3c, X13, X14, X14
3711 VMOVDQU permuted_blake_consts<>+624(SB), X15
3712 VPXOR X14, X15, X15
3713 VPADDD X8, X7, X7
3714 VPXOR X7, X10, X10
3715 VPSHUFB X5, X10, X10
3716 VPADDD X10, X9, X9
3717 VPXOR X9, X8, X8
3718 VPSRLD $0x0c, X8, X13
3719 VPSLLD $0x14, X8, X8
3720 VPXOR X13, X8, X8
3721 VPADDD X15, X7, X7
3722 VPADDD X8, X7, X7
3723 VPXOR X7, X10, X10
3724 VPSHUFB X4, X10, X10
3725 VPADDD X10, X9, X9
3726 VPXOR X9, X8, X8
3727 VPSRLD $0x07, X8, X13
3728 VPSLLD $0x19, X8, X8
3729 VPXOR X13, X8, X8
3730
3731 // Round 10 diagonal step part 3: undiagonalize.
3732 VPSHUFD $0x93, X8, X8
3733 VPSHUFD $0x4e, X9, X9
3734 VPSHUFD $0x39, X10, X10
3735
3736 // Round 11 column step.
3737 VPSHUFD $0x08, X0, X14
3738 VPSHUFD $0x80, X1, X13
3739 VPBLENDW $0xf0, X13, X14, X14
3740 VMOVDQU permuted_blake_consts<>+0(SB), X15
3741 VPXOR X14, X15, X15
3742 VPADDD X15, X7, X7
3743 VPSHUFD $0x0d, X0, X14
3744 VPSHUFD $0xd0, X1, X13
3745 VPBLENDW $0xf0, X13, X14, X14
3746 VMOVDQU permuted_blake_consts<>+16(SB), X15
3747 VPXOR X14, X15, X15
3748 VPADDD X8, X7, X7
3749 VPXOR X7, X10, X10
3750 VPSHUFB X5, X10, X10
3751 VPADDD X10, X9, X9
3752 VPXOR X9, X8, X8
3753 VPSRLD $0x0c, X8, X13
3754 VPSLLD $0x14, X8, X8
3755 VPXOR X13, X8, X8
3756 VPADDD X15, X7, X7
3757 VPADDD X8, X7, X7
3758 VPXOR X7, X10, X10
3759 VPSHUFB X4, X10, X10
3760 VPADDD X10, X9, X9
3761 VPXOR X9, X8, X8
3762 VPSRLD $0x07, X8, X13
3763 VPSLLD $0x19, X8, X8
3764 VPXOR X13, X8, X8
3765
3766 // Round 11 diagonal step part 1: diagonalize.
3767 VPSHUFD $0x39, X8, X8
3768 VPSHUFD $0x4e, X9, X9
3769 VPSHUFD $0x93, X10, X10
3770
3771 // Round 11 diagonal step part 2: column step.
3772 VPSHUFD $0x08, X2, X14
3773 VPSHUFD $0x80, X3, X13
3774 VPBLENDW $0xf0, X13, X14, X14
3775 VMOVDQU permuted_blake_consts<>+32(SB), X15
3776 VPXOR X14, X15, X15
3777 VPADDD X15, X7, X7
3778 VPSHUFD $0x0d, X2, X14
3779 VPSHUFD $0xd0, X3, X13
3780 VPBLENDW $0xf0, X13, X14, X14
3781 VMOVDQU permuted_blake_consts<>+48(SB), X15
3782 VPXOR X14, X15, X15
3783 VPADDD X8, X7, X7
3784 VPXOR X7, X10, X10
3785 VPSHUFB X5, X10, X10
3786 VPADDD X10, X9, X9
3787 VPXOR X9, X8, X8
3788 VPSRLD $0x0c, X8, X13
3789 VPSLLD $0x14, X8, X8
3790 VPXOR X13, X8, X8
3791 VPADDD X15, X7, X7
3792 VPADDD X8, X7, X7
3793 VPXOR X7, X10, X10
3794 VPSHUFB X4, X10, X10
3795 VPADDD X10, X9, X9
3796 VPXOR X9, X8, X8
3797 VPSRLD $0x07, X8, X13
3798 VPSLLD $0x19, X8, X8
3799 VPXOR X13, X8, X8
3800
3801 // Round 11 diagonal step part 3: undiagonalize.
3802 VPSHUFD $0x93, X8, X8
3803 VPSHUFD $0x4e, X9, X9
3804 VPSHUFD $0x39, X10, X10
3805
3806 // Round 12 column step.
3807 VPSHUFD $0x00, X1, X14
3808 VPSHUFD $0x10, X2, X13
3809 VPBLENDW $0x30, X13, X14, X14
3810 VPSHUFD $0x42, X3, X13
3811 VPBLENDW $0xc3, X13, X14, X14
3812 VMOVDQU permuted_blake_consts<>+64(SB), X15
3813 VPXOR X14, X15, X15
3814 VPADDD X15, X7, X7
3815 VPSHUFD $0x80, X1, X14
3816 VPSHUFD $0x02, X2, X13
3817 VPBLENDW $0x0f, X13, X14, X14
3818 VPSHUFD $0x30, X3, X13
3819 VPBLENDW $0x30, X13, X14, X14
3820 VMOVDQU permuted_blake_consts<>+80(SB), X15
3821 VPXOR X14, X15, X15
3822 VPADDD X8, X7, X7
3823 VPXOR X7, X10, X10
3824 VPSHUFB X5, X10, X10
3825 VPADDD X10, X9, X9
3826 VPXOR X9, X8, X8
3827 VPSRLD $0x0c, X8, X13
3828 VPSLLD $0x14, X8, X8
3829 VPXOR X13, X8, X8
3830 VPADDD X15, X7, X7
3831 VPADDD X8, X7, X7
3832 VPXOR X7, X10, X10
3833 VPSHUFB X4, X10, X10
3834 VPADDD X10, X9, X9
3835 VPXOR X9, X8, X8
3836 VPSRLD $0x07, X8, X13
3837 VPSLLD $0x19, X8, X8
3838 VPXOR X13, X8, X8
3839
3840 // Round 12 diagonal step part 1: diagonalize.
3841 VPSHUFD $0x39, X8, X8
3842 VPSHUFD $0x4e, X9, X9
3843 VPSHUFD $0x93, X10, X10
3844
3845 // Round 12 diagonal step part 2: column step.
3846 VPSHUFD $0x01, X0, X14
3847 VPSHUFD $0x40, X1, X13
3848 VPBLENDW $0xc0, X13, X14, X14
3849 VPSHUFD $0x30, X2, X13
3850 VPBLENDW $0x30, X13, X14, X14
3851 VMOVDQU permuted_blake_consts<>+96(SB), X15
3852 VPXOR X14, X15, X15
3853 VPADDD X15, X7, X7
3854 VPSHUFD $0xc8, X0, X14
3855 VPSHUFD $0x30, X1, X13
3856 VPBLENDW $0x30, X13, X14, X14
3857 VPSHUFD $0x00, X3, X13
3858 VPBLENDW $0x03, X13, X14, X14
3859 VMOVDQU permuted_blake_consts<>+112(SB), X15
3860 VPXOR X14, X15, X15
3861 VPADDD X8, X7, X7
3862 VPXOR X7, X10, X10
3863 VPSHUFB X5, X10, X10
3864 VPADDD X10, X9, X9
3865 VPXOR X9, X8, X8
3866 VPSRLD $0x0c, X8, X13
3867 VPSLLD $0x14, X8, X8
3868 VPXOR X13, X8, X8
3869 VPADDD X15, X7, X7
3870 VPADDD X8, X7, X7
3871 VPXOR X7, X10, X10
3872 VPSHUFB X4, X10, X10
3873 VPADDD X10, X9, X9
3874 VPXOR X9, X8, X8
3875 VPSRLD $0x07, X8, X13
3876 VPSLLD $0x19, X8, X8
3877 VPXOR X13, X8, X8
3878
3879 // Round 12 diagonal step part 3: undiagonalize.
3880 VPSHUFD $0x93, X8, X8
3881 VPSHUFD $0x4e, X9, X9
3882 VPSHUFD $0x39, X10, X10
3883
3884 // Round 13 column step.
3885 VPSHUFD $0x10, X1, X14
3886 VPSHUFD $0x03, X2, X13
3887 VPBLENDW $0x03, X13, X14, X14
3888 VPSHUFD $0xc0, X3, X13
3889 VPBLENDW $0xcc, X13, X14, X14
3890 VMOVDQU permuted_blake_consts<>+128(SB), X15
3891 VPXOR X14, X15, X15
3892 VPADDD X15, X7, X7
3893 VPSHUFD $0x20, X0, X14
3894 VPSHUFD $0x00, X2, X13
3895 VPBLENDW $0x03, X13, X14, X14
3896 VPSHUFD $0x40, X3, X13
3897 VPBLENDW $0xc0, X13, X14, X14
3898 VMOVDQU permuted_blake_consts<>+144(SB), X15
3899 VPXOR X14, X15, X15
3900 VPADDD X8, X7, X7
3901 VPXOR X7, X10, X10
3902 VPSHUFB X5, X10, X10
3903 VPADDD X10, X9, X9
3904 VPXOR X9, X8, X8
3905 VPSRLD $0x0c, X8, X13
3906 VPSLLD $0x14, X8, X8
3907 VPXOR X13, X8, X8
3908 VPADDD X15, X7, X7
3909 VPADDD X8, X7, X7
3910 VPXOR X7, X10, X10
3911 VPSHUFB X4, X10, X10
3912 VPADDD X10, X9, X9
3913 VPXOR X9, X8, X8
3914 VPSRLD $0x07, X8, X13
3915 VPSLLD $0x19, X8, X8
3916 VPXOR X13, X8, X8
3917
3918 // Round 13 diagonal step part 1: diagonalize.
3919 VPSHUFD $0x39, X8, X8
3920 VPSHUFD $0x4e, X9, X9
3921 VPSHUFD $0x93, X10, X10
3922
3923 // Round 13 diagonal step part 2: column step.
3924 VPSHUFD $0x0c, X0, X14
3925 VPSHUFD $0x30, X1, X13
3926 VPBLENDW $0x30, X13, X14, X14
3927 VPSHUFD $0x42, X2, X13
3928 VPBLENDW $0xc3, X13, X14, X14
3929 VMOVDQU permuted_blake_consts<>+160(SB), X15
3930 VPXOR X14, X15, X15
3931 VPADDD X15, X7, X7
3932 VPSHUFD $0x10, X0, X14
3933 VPSHUFD $0x08, X1, X13
3934 VPBLENDW $0xcc, X13, X14, X14
3935 VPSHUFD $0x02, X3, X13
3936 VPBLENDW $0x03, X13, X14, X14
3937 VMOVDQU permuted_blake_consts<>+176(SB), X15
3938 VPXOR X14, X15, X15
3939 VPADDD X8, X7, X7
3940 VPXOR X7, X10, X10
3941 VPSHUFB X5, X10, X10
3942 VPADDD X10, X9, X9
3943 VPXOR X9, X8, X8
3944 VPSRLD $0x0c, X8, X13
3945 VPSLLD $0x14, X8, X8
3946 VPXOR X13, X8, X8
3947 VPADDD X15, X7, X7
3948 VPADDD X8, X7, X7
3949 VPXOR X7, X10, X10
3950 VPSHUFB X4, X10, X10
3951 VPADDD X10, X9, X9
3952 VPXOR X9, X8, X8
3953 VPSRLD $0x07, X8, X13
3954 VPSLLD $0x19, X8, X8
3955 VPXOR X13, X8, X8
3956
3957 // Round 13 diagonal step part 3: undiagonalize.
3958 VPSHUFD $0x93, X8, X8
3959 VPSHUFD $0x4e, X9, X9
3960 VPSHUFD $0x39, X10, X10
3961
3962 // Round 14 column step.
3963 VPSHUFD $0x0c, X0, X14
3964 VPSHUFD $0x03, X1, X13
3965 VPBLENDW $0x03, X13, X14, X14
3966 VPSHUFD $0xc0, X2, X13
3967 VPBLENDW $0xc0, X13, X14, X14
3968 VPSHUFD $0x10, X3, X13
3969 VPBLENDW $0x30, X13, X14, X14
3970 VMOVDQU permuted_blake_consts<>+192(SB), X15
3971 VPXOR X14, X15, X15
3972 VPADDD X15, X7, X7
3973 VPSHUFD $0x04, X0, X14
3974 VPSHUFD $0x01, X2, X13
3975 VPBLENDW $0x03, X13, X14, X14
3976 VPSHUFD $0x80, X3, X13
3977 VPBLENDW $0xf0, X13, X14, X14
3978 VMOVDQU permuted_blake_consts<>+208(SB), X15
3979 VPXOR X14, X15, X15
3980 VPADDD X8, X7, X7
3981 VPXOR X7, X10, X10
3982 VPSHUFB X5, X10, X10
3983 VPADDD X10, X9, X9
3984 VPXOR X9, X8, X8
3985 VPSRLD $0x0c, X8, X13
3986 VPSLLD $0x14, X8, X8
3987 VPXOR X13, X8, X8
3988 VPADDD X15, X7, X7
3989 VPADDD X8, X7, X7
3990 VPXOR X7, X10, X10
3991 VPSHUFB X4, X10, X10
3992 VPADDD X10, X9, X9
3993 VPXOR X9, X8, X8
3994 VPSRLD $0x07, X8, X13
3995 VPSLLD $0x19, X8, X8
3996 VPXOR X13, X8, X8
3997
3998 // Round 14 diagonal step part 1: diagonalize.
3999 VPSHUFD $0x39, X8, X8
4000 VPSHUFD $0x4e, X9, X9
4001 VPSHUFD $0x93, X10, X10
4002
4003 // Round 14 diagonal step part 2: column step.
4004 VPSHUFD $0x02, X0, X14
4005 VPSHUFD $0x04, X1, X13
4006 VPBLENDW $0x3c, X13, X14, X14
4007 VPSHUFD $0xc0, X3, X13
4008 VPBLENDW $0xc0, X13, X14, X14
4009 VMOVDQU permuted_blake_consts<>+224(SB), X15
4010 VPXOR X14, X15, X15
4011 VPADDD X15, X7, X7
4012 VPSHUFD $0x00, X0, X14
4013 VPSHUFD $0x02, X1, X13
4014 VPBLENDW $0x03, X13, X14, X14
4015 VPSHUFD $0x08, X2, X13
4016 VPBLENDW $0xcc, X13, X14, X14
4017 VMOVDQU permuted_blake_consts<>+240(SB), X15
4018 VPXOR X14, X15, X15
4019 VPADDD X8, X7, X7
4020 VPXOR X7, X10, X10
4021 VPSHUFB X5, X10, X10
4022 VPADDD X10, X9, X9
4023 VPXOR X9, X8, X8
4024 VPSRLD $0x0c, X8, X13
4025 VPSLLD $0x14, X8, X8
4026 VPXOR X13, X8, X8
4027 VPADDD X15, X7, X7
4028 VPADDD X8, X7, X7
4029 VPXOR X7, X10, X10
4030 VPSHUFB X4, X10, X10
4031 VPADDD X10, X9, X9
4032 VPXOR X9, X8, X8
4033 VPSRLD $0x07, X8, X13
4034 VPSLLD $0x19, X8, X8
4035 VPXOR X13, X8, X8
4036
4037 // Round 14 diagonal step part 3: undiagonalize.
4038 VPSHUFD $0x93, X8, X8
4039 VPSHUFD $0x4e, X9, X9
4040 VPSHUFD $0x39, X10, X10
4041
4042 // Finally the chain value is defined as:
4043 // h'0 = h0^s0^v0^v8
4044 // h'1 = h1^s1^v1^v9
4045 // h'2 = h2^s2^v2^va
4046 // h'3 = h3^s3^v3^vb
4047 // h'4 = h4^s0^v4^vc
4048 // h'5 = h5^s1^v5^vd
4049 // h'6 = h6^s2^v6^ve
4050 // h'7 = h7^s3^v7^vf
4051 VPXOR X11, X7, X7
4052 VPXOR X6, X7, X7
4053 VPXOR X9, X7, X7
4054 VPXOR X12, X8, X8
4055 VPXOR X6, X8, X8
4056 VPXOR X10, X8, X8
4057
4058 // Either terminate the loop when there are no more full blocks
4059 // to compress or move the message pointer to the next block of
4060 // bytes to compress, increment the message bits counter
4061 // accordingly, and loop back around to compress it.
4062 DECQ BX
4063 JZ done
4064 LEAQ 64(DX), DX
4065 ADDQ $0x00000200, CX
4066 JMP compressLoop
4067
4068 done:
4069 // Output the resulting chain value.
4070 VMOVDQU X7, (AX)
4071 VMOVDQU X8, 16(AX)
4072 RET
4073