encodeblock_amd64.s raw
1 // Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.
2
3 //go:build !appengine && !noasm && gc && !noasm
4
5 #include "textflag.h"
6
7 // func _dummy_()
8 TEXT ·_dummy_(SB), $0 // no-op stub with a zero-byte frame; it only hosts the macro fix-up below
9 #ifdef GOAMD64_v4 // NOTE(review): presumably predefined by the toolchain for GOAMD64=v4 builds — confirm
10 #ifndef GOAMD64_v3 // do not redefine if v3 is already set
11 #define GOAMD64_v3
12 #endif
13 #endif // from here on GOAMD64_v3 is defined whenever GOAMD64_v4 is, so later `#ifdef GOAMD64_v3` sections (TZCNTQ rather than BSFQ) also apply to v4
14 RET // returns immediately
15
16 // func encodeBlockAsm(dst []byte, src []byte, tmp *[65536]byte) int
17 // Requires: BMI, SSE2
18 TEXT ·encodeBlockAsm(SB), $24-64
19 MOVQ tmp+48(FP), AX
20 MOVQ dst_base+0(FP), CX
21 MOVQ $0x00000200, DX
22 MOVQ AX, BX
23 PXOR X0, X0
24
25 zero_loop_encodeBlockAsm:
26 MOVOU X0, (BX)
27 MOVOU X0, 16(BX)
28 MOVOU X0, 32(BX)
29 MOVOU X0, 48(BX)
30 MOVOU X0, 64(BX)
31 MOVOU X0, 80(BX)
32 MOVOU X0, 96(BX)
33 MOVOU X0, 112(BX)
34 ADDQ $0x80, BX
35 DECQ DX
36 JNZ zero_loop_encodeBlockAsm
37 MOVL $0x00000000, 12(SP)
38 MOVQ src_len+32(FP), DX
39 LEAQ -9(DX), BX
40 LEAQ -8(DX), SI
41 MOVL SI, 8(SP)
42 SHRQ $0x05, DX
43 SUBL DX, BX
44 LEAQ (CX)(BX*1), BX
45 MOVQ BX, (SP)
46 MOVL $0x00000001, DX
47 MOVL DX, 16(SP)
48 MOVQ src_base+24(FP), BX
49
50 search_loop_encodeBlockAsm:
51 MOVL DX, SI
52 SUBL 12(SP), SI
53 SHRL $0x06, SI
54 LEAL 4(DX)(SI*1), SI
55 CMPL SI, 8(SP)
56 JAE emit_remainder_encodeBlockAsm
57 MOVQ (BX)(DX*1), DI
58 MOVL SI, 20(SP)
59 MOVQ $0x0000cf1bbcdcbf9b, R9
60 MOVQ DI, R10
61 MOVQ DI, R11
62 SHRQ $0x08, R11
63 SHLQ $0x10, R10
64 IMULQ R9, R10
65 SHRQ $0x32, R10
66 SHLQ $0x10, R11
67 IMULQ R9, R11
68 SHRQ $0x32, R11
69 MOVL (AX)(R10*4), SI
70 MOVL (AX)(R11*4), R8
71 MOVL DX, (AX)(R10*4)
72 LEAL 1(DX), R10
73 MOVL R10, (AX)(R11*4)
74 MOVQ DI, R10
75 SHRQ $0x10, R10
76 SHLQ $0x10, R10
77 IMULQ R9, R10
78 SHRQ $0x32, R10
79 MOVL DX, R9
80 SUBL 16(SP), R9
81 MOVL 1(BX)(R9*1), R11
82 MOVQ DI, R9
83 SHRQ $0x08, R9
84 CMPL R9, R11
85 JNE no_repeat_found_encodeBlockAsm
86 LEAL 1(DX), DI
87 MOVL 12(SP), R8
88 MOVL DI, SI
89 SUBL 16(SP), SI
90 JZ repeat_extend_back_end_encodeBlockAsm
91
92 repeat_extend_back_loop_encodeBlockAsm:
93 CMPL DI, R8
94 JBE repeat_extend_back_end_encodeBlockAsm
95 MOVB -1(BX)(SI*1), R9
96 MOVB -1(BX)(DI*1), R10
97 CMPB R9, R10
98 JNE repeat_extend_back_end_encodeBlockAsm
99 LEAL -1(DI), DI
100 DECL SI
101 JNZ repeat_extend_back_loop_encodeBlockAsm
102
103 repeat_extend_back_end_encodeBlockAsm:
104 MOVL DI, SI
105 SUBL 12(SP), SI
106 LEAQ 5(CX)(SI*1), SI
107 CMPQ SI, (SP)
108 JB repeat_dst_size_check_encodeBlockAsm
109 MOVQ $0x00000000, ret+56(FP)
110 RET
111
112 repeat_dst_size_check_encodeBlockAsm:
113 MOVL 12(SP), SI
114 CMPL SI, DI
115 JEQ emit_literal_done_repeat_emit_encodeBlockAsm
116 MOVL DI, R9
117 MOVL DI, 12(SP)
118 LEAQ (BX)(SI*1), R10
119 SUBL SI, R9
120 LEAL -1(R9), SI
121 CMPL SI, $0x3c
122 JB one_byte_repeat_emit_encodeBlockAsm
123 CMPL SI, $0x00000100
124 JB two_bytes_repeat_emit_encodeBlockAsm
125 CMPL SI, $0x00010000
126 JB three_bytes_repeat_emit_encodeBlockAsm
127 CMPL SI, $0x01000000
128 JB four_bytes_repeat_emit_encodeBlockAsm
129 MOVB $0xfc, (CX)
130 MOVL SI, 1(CX)
131 ADDQ $0x05, CX
132 JMP memmove_long_repeat_emit_encodeBlockAsm
133
134 four_bytes_repeat_emit_encodeBlockAsm:
135 MOVL SI, R11
136 SHRL $0x10, R11
137 MOVB $0xf8, (CX)
138 MOVW SI, 1(CX)
139 MOVB R11, 3(CX)
140 ADDQ $0x04, CX
141 JMP memmove_long_repeat_emit_encodeBlockAsm
142
143 three_bytes_repeat_emit_encodeBlockAsm:
144 MOVB $0xf4, (CX)
145 MOVW SI, 1(CX)
146 ADDQ $0x03, CX
147 JMP memmove_long_repeat_emit_encodeBlockAsm
148
149 two_bytes_repeat_emit_encodeBlockAsm:
150 MOVB $0xf0, (CX)
151 MOVB SI, 1(CX)
152 ADDQ $0x02, CX
153 CMPL SI, $0x40
154 JB memmove_repeat_emit_encodeBlockAsm
155 JMP memmove_long_repeat_emit_encodeBlockAsm
156
157 one_byte_repeat_emit_encodeBlockAsm:
158 SHLB $0x02, SI
159 MOVB SI, (CX)
160 ADDQ $0x01, CX
161
162 memmove_repeat_emit_encodeBlockAsm:
163 LEAQ (CX)(R9*1), SI
164
165 // genMemMoveShort
166 CMPQ R9, $0x08
167 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8
168 CMPQ R9, $0x10
169 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16
170 CMPQ R9, $0x20
171 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32
172 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64
173
174 emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8:
175 MOVQ (R10), R11
176 MOVQ R11, (CX)
177 JMP memmove_end_copy_repeat_emit_encodeBlockAsm
178
179 emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16:
180 MOVQ (R10), R11
181 MOVQ -8(R10)(R9*1), R10
182 MOVQ R11, (CX)
183 MOVQ R10, -8(CX)(R9*1)
184 JMP memmove_end_copy_repeat_emit_encodeBlockAsm
185
186 emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32:
187 MOVOU (R10), X0
188 MOVOU -16(R10)(R9*1), X1
189 MOVOU X0, (CX)
190 MOVOU X1, -16(CX)(R9*1)
191 JMP memmove_end_copy_repeat_emit_encodeBlockAsm
192
193 emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64:
194 MOVOU (R10), X0
195 MOVOU 16(R10), X1
196 MOVOU -32(R10)(R9*1), X2
197 MOVOU -16(R10)(R9*1), X3
198 MOVOU X0, (CX)
199 MOVOU X1, 16(CX)
200 MOVOU X2, -32(CX)(R9*1)
201 MOVOU X3, -16(CX)(R9*1)
202
203 memmove_end_copy_repeat_emit_encodeBlockAsm:
204 MOVQ SI, CX
205 JMP emit_literal_done_repeat_emit_encodeBlockAsm
206
207 memmove_long_repeat_emit_encodeBlockAsm:
208 LEAQ (CX)(R9*1), SI
209
210 // genMemMoveLong
211 MOVOU (R10), X0
212 MOVOU 16(R10), X1
213 MOVOU -32(R10)(R9*1), X2
214 MOVOU -16(R10)(R9*1), X3
215 MOVQ R9, R12
216 SHRQ $0x05, R12
217 MOVQ CX, R11
218 ANDL $0x0000001f, R11
219 MOVQ $0x00000040, R13
220 SUBQ R11, R13
221 DECQ R12
222 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
223 LEAQ -32(R10)(R13*1), R11
224 LEAQ -32(CX)(R13*1), R14
225
226 emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back:
227 MOVOU (R11), X4
228 MOVOU 16(R11), X5
229 MOVOA X4, (R14)
230 MOVOA X5, 16(R14)
231 ADDQ $0x20, R14
232 ADDQ $0x20, R11
233 ADDQ $0x20, R13
234 DECQ R12
235 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back
236
237 emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32:
238 MOVOU -32(R10)(R13*1), X4
239 MOVOU -16(R10)(R13*1), X5
240 MOVOA X4, -32(CX)(R13*1)
241 MOVOA X5, -16(CX)(R13*1)
242 ADDQ $0x20, R13
243 CMPQ R9, R13
244 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
245 MOVOU X0, (CX)
246 MOVOU X1, 16(CX)
247 MOVOU X2, -32(CX)(R9*1)
248 MOVOU X3, -16(CX)(R9*1)
249 MOVQ SI, CX
250
251 emit_literal_done_repeat_emit_encodeBlockAsm:
252 ADDL $0x05, DX
253 MOVL DX, SI
254 SUBL 16(SP), SI
255 MOVQ src_len+32(FP), R9
256 SUBL DX, R9
257 LEAQ (BX)(DX*1), R10
258 LEAQ (BX)(SI*1), SI
259
260 // matchLen
261 XORL R12, R12
262
263 matchlen_loopback_16_repeat_extend_encodeBlockAsm:
264 CMPL R9, $0x10
265 JB matchlen_match8_repeat_extend_encodeBlockAsm
266 MOVQ (R10)(R12*1), R11
267 MOVQ 8(R10)(R12*1), R13
268 XORQ (SI)(R12*1), R11
269 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm
270 XORQ 8(SI)(R12*1), R13
271 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm
272 LEAL -16(R9), R9
273 LEAL 16(R12), R12
274 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm
275
276 matchlen_bsf_16repeat_extend_encodeBlockAsm:
277 #ifdef GOAMD64_v3
278 TZCNTQ R13, R13
279
280 #else
281 BSFQ R13, R13
282
283 #endif
284 SARQ $0x03, R13
285 LEAL 8(R12)(R13*1), R12
286 JMP repeat_extend_forward_end_encodeBlockAsm
287
288 matchlen_match8_repeat_extend_encodeBlockAsm:
289 CMPL R9, $0x08
290 JB matchlen_match4_repeat_extend_encodeBlockAsm
291 MOVQ (R10)(R12*1), R11
292 XORQ (SI)(R12*1), R11
293 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm
294 LEAL -8(R9), R9
295 LEAL 8(R12), R12
296 JMP matchlen_match4_repeat_extend_encodeBlockAsm
297
298 matchlen_bsf_8_repeat_extend_encodeBlockAsm:
299 #ifdef GOAMD64_v3
300 TZCNTQ R11, R11
301
302 #else
303 BSFQ R11, R11
304
305 #endif
306 SARQ $0x03, R11
307 LEAL (R12)(R11*1), R12
308 JMP repeat_extend_forward_end_encodeBlockAsm
309
310 matchlen_match4_repeat_extend_encodeBlockAsm:
311 CMPL R9, $0x04
312 JB matchlen_match2_repeat_extend_encodeBlockAsm
313 MOVL (R10)(R12*1), R11
314 CMPL (SI)(R12*1), R11
315 JNE matchlen_match2_repeat_extend_encodeBlockAsm
316 LEAL -4(R9), R9
317 LEAL 4(R12), R12
318
319 matchlen_match2_repeat_extend_encodeBlockAsm:
320 CMPL R9, $0x01
321 JE matchlen_match1_repeat_extend_encodeBlockAsm
322 JB repeat_extend_forward_end_encodeBlockAsm
323 MOVW (R10)(R12*1), R11
324 CMPW (SI)(R12*1), R11
325 JNE matchlen_match1_repeat_extend_encodeBlockAsm
326 LEAL 2(R12), R12
327 SUBL $0x02, R9
328 JZ repeat_extend_forward_end_encodeBlockAsm
329
330 matchlen_match1_repeat_extend_encodeBlockAsm:
331 MOVB (R10)(R12*1), R11
332 CMPB (SI)(R12*1), R11
333 JNE repeat_extend_forward_end_encodeBlockAsm
334 LEAL 1(R12), R12
335
336 repeat_extend_forward_end_encodeBlockAsm:
337 ADDL R12, DX
338 MOVL DX, SI
339 SUBL DI, SI
340 MOVL 16(SP), DI
341 TESTL R8, R8
342 JZ repeat_as_copy_encodeBlockAsm
343
344 // emitRepeat
345 emit_repeat_again_match_repeat_encodeBlockAsm:
346 MOVL SI, R8
347 LEAL -4(SI), SI
348 CMPL R8, $0x08
349 JBE repeat_two_match_repeat_encodeBlockAsm
350 CMPL R8, $0x0c
351 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm
352 CMPL DI, $0x00000800
353 JB repeat_two_offset_match_repeat_encodeBlockAsm
354
355 cant_repeat_two_offset_match_repeat_encodeBlockAsm:
356 CMPL SI, $0x00000104
357 JB repeat_three_match_repeat_encodeBlockAsm
358 CMPL SI, $0x00010100
359 JB repeat_four_match_repeat_encodeBlockAsm
360 CMPL SI, $0x0100ffff
361 JB repeat_five_match_repeat_encodeBlockAsm
362 LEAL -16842747(SI), SI
363 MOVL $0xfffb001d, (CX)
364 MOVB $0xff, 4(CX)
365 ADDQ $0x05, CX
366 JMP emit_repeat_again_match_repeat_encodeBlockAsm
367
368 repeat_five_match_repeat_encodeBlockAsm:
369 LEAL -65536(SI), SI
370 MOVL SI, DI
371 MOVW $0x001d, (CX)
372 MOVW SI, 2(CX)
373 SARL $0x10, DI
374 MOVB DI, 4(CX)
375 ADDQ $0x05, CX
376 JMP repeat_end_emit_encodeBlockAsm
377
378 repeat_four_match_repeat_encodeBlockAsm:
379 LEAL -256(SI), SI
380 MOVW $0x0019, (CX)
381 MOVW SI, 2(CX)
382 ADDQ $0x04, CX
383 JMP repeat_end_emit_encodeBlockAsm
384
385 repeat_three_match_repeat_encodeBlockAsm:
386 LEAL -4(SI), SI
387 MOVW $0x0015, (CX)
388 MOVB SI, 2(CX)
389 ADDQ $0x03, CX
390 JMP repeat_end_emit_encodeBlockAsm
391
392 repeat_two_match_repeat_encodeBlockAsm:
393 SHLL $0x02, SI
394 ORL $0x01, SI
395 MOVW SI, (CX)
396 ADDQ $0x02, CX
397 JMP repeat_end_emit_encodeBlockAsm
398
399 repeat_two_offset_match_repeat_encodeBlockAsm:
400 XORQ R8, R8
401 LEAL 1(R8)(SI*4), SI
402 MOVB DI, 1(CX)
403 SARL $0x08, DI
404 SHLL $0x05, DI
405 ORL DI, SI
406 MOVB SI, (CX)
407 ADDQ $0x02, CX
408 JMP repeat_end_emit_encodeBlockAsm
409
410 repeat_as_copy_encodeBlockAsm:
411 // emitCopy
412 CMPL DI, $0x00010000
413 JB two_byte_offset_repeat_as_copy_encodeBlockAsm
414 CMPL SI, $0x40
415 JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm
416 MOVB $0xff, (CX)
417 MOVL DI, 1(CX)
418 LEAL -64(SI), SI
419 ADDQ $0x05, CX
420 CMPL SI, $0x04
421 JB four_bytes_remain_repeat_as_copy_encodeBlockAsm
422
423 // emitRepeat
424 emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy:
425 MOVL SI, R8
426 LEAL -4(SI), SI
427 CMPL R8, $0x08
428 JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy
429 CMPL R8, $0x0c
430 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
431 CMPL DI, $0x00000800
432 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
433
434 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
435 CMPL SI, $0x00000104
436 JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy
437 CMPL SI, $0x00010100
438 JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy
439 CMPL SI, $0x0100ffff
440 JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy
441 LEAL -16842747(SI), SI
442 MOVL $0xfffb001d, (CX)
443 MOVB $0xff, 4(CX)
444 ADDQ $0x05, CX
445 JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy
446
447 repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy:
448 LEAL -65536(SI), SI
449 MOVL SI, DI
450 MOVW $0x001d, (CX)
451 MOVW SI, 2(CX)
452 SARL $0x10, DI
453 MOVB DI, 4(CX)
454 ADDQ $0x05, CX
455 JMP repeat_end_emit_encodeBlockAsm
456
457 repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy:
458 LEAL -256(SI), SI
459 MOVW $0x0019, (CX)
460 MOVW SI, 2(CX)
461 ADDQ $0x04, CX
462 JMP repeat_end_emit_encodeBlockAsm
463
464 repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy:
465 LEAL -4(SI), SI
466 MOVW $0x0015, (CX)
467 MOVB SI, 2(CX)
468 ADDQ $0x03, CX
469 JMP repeat_end_emit_encodeBlockAsm
470
471 repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy:
472 SHLL $0x02, SI
473 ORL $0x01, SI
474 MOVW SI, (CX)
475 ADDQ $0x02, CX
476 JMP repeat_end_emit_encodeBlockAsm
477
478 repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
479 XORQ R8, R8
480 LEAL 1(R8)(SI*4), SI
481 MOVB DI, 1(CX)
482 SARL $0x08, DI
483 SHLL $0x05, DI
484 ORL DI, SI
485 MOVB SI, (CX)
486 ADDQ $0x02, CX
487 JMP repeat_end_emit_encodeBlockAsm
488
489 four_bytes_remain_repeat_as_copy_encodeBlockAsm:
490 TESTL SI, SI
491 JZ repeat_end_emit_encodeBlockAsm
492 XORL R8, R8
493 LEAL -1(R8)(SI*4), SI
494 MOVB SI, (CX)
495 MOVL DI, 1(CX)
496 ADDQ $0x05, CX
497 JMP repeat_end_emit_encodeBlockAsm
498
499 two_byte_offset_repeat_as_copy_encodeBlockAsm:
500 CMPL SI, $0x40
501 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm
502 CMPL DI, $0x00000800
503 JAE long_offset_short_repeat_as_copy_encodeBlockAsm
504 MOVL $0x00000001, R8
505 LEAL 16(R8), R8
506 MOVB DI, 1(CX)
507 MOVL DI, R9
508 SHRL $0x08, R9
509 SHLL $0x05, R9
510 ORL R9, R8
511 MOVB R8, (CX)
512 ADDQ $0x02, CX
513 SUBL $0x08, SI
514
515 // emitRepeat
516 LEAL -4(SI), SI
517 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
518
519 emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
520 MOVL SI, R8
521 LEAL -4(SI), SI
522 CMPL R8, $0x08
523 JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
524 CMPL R8, $0x0c
525 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
526 CMPL DI, $0x00000800
527 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
528
529 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
530 CMPL SI, $0x00000104
531 JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
532 CMPL SI, $0x00010100
533 JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
534 CMPL SI, $0x0100ffff
535 JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
536 LEAL -16842747(SI), SI
537 MOVL $0xfffb001d, (CX)
538 MOVB $0xff, 4(CX)
539 ADDQ $0x05, CX
540 JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
541
542 repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
543 LEAL -65536(SI), SI
544 MOVL SI, DI
545 MOVW $0x001d, (CX)
546 MOVW SI, 2(CX)
547 SARL $0x10, DI
548 MOVB DI, 4(CX)
549 ADDQ $0x05, CX
550 JMP repeat_end_emit_encodeBlockAsm
551
552 repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
553 LEAL -256(SI), SI
554 MOVW $0x0019, (CX)
555 MOVW SI, 2(CX)
556 ADDQ $0x04, CX
557 JMP repeat_end_emit_encodeBlockAsm
558
559 repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
560 LEAL -4(SI), SI
561 MOVW $0x0015, (CX)
562 MOVB SI, 2(CX)
563 ADDQ $0x03, CX
564 JMP repeat_end_emit_encodeBlockAsm
565
566 repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
567 SHLL $0x02, SI
568 ORL $0x01, SI
569 MOVW SI, (CX)
570 ADDQ $0x02, CX
571 JMP repeat_end_emit_encodeBlockAsm
572
573 repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
574 XORQ R8, R8
575 LEAL 1(R8)(SI*4), SI
576 MOVB DI, 1(CX)
577 SARL $0x08, DI
578 SHLL $0x05, DI
579 ORL DI, SI
580 MOVB SI, (CX)
581 ADDQ $0x02, CX
582 JMP repeat_end_emit_encodeBlockAsm
583
584 long_offset_short_repeat_as_copy_encodeBlockAsm:
585 MOVB $0xee, (CX)
586 MOVW DI, 1(CX)
587 LEAL -60(SI), SI
588 ADDQ $0x03, CX
589
590 // emitRepeat
591 emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short:
592 MOVL SI, R8
593 LEAL -4(SI), SI
594 CMPL R8, $0x08
595 JBE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short
596 CMPL R8, $0x0c
597 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
598 CMPL DI, $0x00000800
599 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
600
601 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
602 CMPL SI, $0x00000104
603 JB repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short
604 CMPL SI, $0x00010100
605 JB repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short
606 CMPL SI, $0x0100ffff
607 JB repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short
608 LEAL -16842747(SI), SI
609 MOVL $0xfffb001d, (CX)
610 MOVB $0xff, 4(CX)
611 ADDQ $0x05, CX
612 JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short
613
614 repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short:
615 LEAL -65536(SI), SI
616 MOVL SI, DI
617 MOVW $0x001d, (CX)
618 MOVW SI, 2(CX)
619 SARL $0x10, DI
620 MOVB DI, 4(CX)
621 ADDQ $0x05, CX
622 JMP repeat_end_emit_encodeBlockAsm
623
624 repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short:
625 LEAL -256(SI), SI
626 MOVW $0x0019, (CX)
627 MOVW SI, 2(CX)
628 ADDQ $0x04, CX
629 JMP repeat_end_emit_encodeBlockAsm
630
631 repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short:
632 LEAL -4(SI), SI
633 MOVW $0x0015, (CX)
634 MOVB SI, 2(CX)
635 ADDQ $0x03, CX
636 JMP repeat_end_emit_encodeBlockAsm
637
638 repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short:
639 SHLL $0x02, SI
640 ORL $0x01, SI
641 MOVW SI, (CX)
642 ADDQ $0x02, CX
643 JMP repeat_end_emit_encodeBlockAsm
644
645 repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
646 XORQ R8, R8
647 LEAL 1(R8)(SI*4), SI
648 MOVB DI, 1(CX)
649 SARL $0x08, DI
650 SHLL $0x05, DI
651 ORL DI, SI
652 MOVB SI, (CX)
653 ADDQ $0x02, CX
654 JMP repeat_end_emit_encodeBlockAsm
655
656 two_byte_offset_short_repeat_as_copy_encodeBlockAsm:
657 MOVL SI, R8
658 SHLL $0x02, R8
659 CMPL SI, $0x0c
660 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm
661 CMPL DI, $0x00000800
662 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm
663 LEAL -15(R8), R8
664 MOVB DI, 1(CX)
665 SHRL $0x08, DI
666 SHLL $0x05, DI
667 ORL DI, R8
668 MOVB R8, (CX)
669 ADDQ $0x02, CX
670 JMP repeat_end_emit_encodeBlockAsm
671
672 emit_copy_three_repeat_as_copy_encodeBlockAsm:
673 LEAL -2(R8), R8
674 MOVB R8, (CX)
675 MOVW DI, 1(CX)
676 ADDQ $0x03, CX
677
678 repeat_end_emit_encodeBlockAsm:
679 MOVL DX, 12(SP)
680 JMP search_loop_encodeBlockAsm
681
682 no_repeat_found_encodeBlockAsm:
683 CMPL (BX)(SI*1), DI
684 JEQ candidate_match_encodeBlockAsm
685 SHRQ $0x08, DI
686 MOVL (AX)(R10*4), SI
687 LEAL 2(DX), R9
688 CMPL (BX)(R8*1), DI
689 JEQ candidate2_match_encodeBlockAsm
690 MOVL R9, (AX)(R10*4)
691 SHRQ $0x08, DI
692 CMPL (BX)(SI*1), DI
693 JEQ candidate3_match_encodeBlockAsm
694 MOVL 20(SP), DX
695 JMP search_loop_encodeBlockAsm
696
697 candidate3_match_encodeBlockAsm:
698 ADDL $0x02, DX
699 JMP candidate_match_encodeBlockAsm
700
701 candidate2_match_encodeBlockAsm:
702 MOVL R9, (AX)(R10*4)
703 INCL DX
704 MOVL R8, SI
705
706 candidate_match_encodeBlockAsm:
707 MOVL 12(SP), DI
708 TESTL SI, SI
709 JZ match_extend_back_end_encodeBlockAsm
710
711 match_extend_back_loop_encodeBlockAsm:
712 CMPL DX, DI
713 JBE match_extend_back_end_encodeBlockAsm
714 MOVB -1(BX)(SI*1), R8
715 MOVB -1(BX)(DX*1), R9
716 CMPB R8, R9
717 JNE match_extend_back_end_encodeBlockAsm
718 LEAL -1(DX), DX
719 DECL SI
720 JZ match_extend_back_end_encodeBlockAsm
721 JMP match_extend_back_loop_encodeBlockAsm
722
723 match_extend_back_end_encodeBlockAsm:
724 MOVL DX, DI
725 SUBL 12(SP), DI
726 LEAQ 5(CX)(DI*1), DI
727 CMPQ DI, (SP)
728 JB match_dst_size_check_encodeBlockAsm
729 MOVQ $0x00000000, ret+56(FP)
730 RET
731
732 match_dst_size_check_encodeBlockAsm:
733 MOVL DX, DI
734 MOVL 12(SP), R8
735 CMPL R8, DI
736 JEQ emit_literal_done_match_emit_encodeBlockAsm
737 MOVL DI, R9
738 MOVL DI, 12(SP)
739 LEAQ (BX)(R8*1), DI
740 SUBL R8, R9
741 LEAL -1(R9), R8
742 CMPL R8, $0x3c
743 JB one_byte_match_emit_encodeBlockAsm
744 CMPL R8, $0x00000100
745 JB two_bytes_match_emit_encodeBlockAsm
746 CMPL R8, $0x00010000
747 JB three_bytes_match_emit_encodeBlockAsm
748 CMPL R8, $0x01000000
749 JB four_bytes_match_emit_encodeBlockAsm
750 MOVB $0xfc, (CX)
751 MOVL R8, 1(CX)
752 ADDQ $0x05, CX
753 JMP memmove_long_match_emit_encodeBlockAsm
754
755 four_bytes_match_emit_encodeBlockAsm:
756 MOVL R8, R10
757 SHRL $0x10, R10
758 MOVB $0xf8, (CX)
759 MOVW R8, 1(CX)
760 MOVB R10, 3(CX)
761 ADDQ $0x04, CX
762 JMP memmove_long_match_emit_encodeBlockAsm
763
764 three_bytes_match_emit_encodeBlockAsm:
765 MOVB $0xf4, (CX)
766 MOVW R8, 1(CX)
767 ADDQ $0x03, CX
768 JMP memmove_long_match_emit_encodeBlockAsm
769
770 two_bytes_match_emit_encodeBlockAsm:
771 MOVB $0xf0, (CX)
772 MOVB R8, 1(CX)
773 ADDQ $0x02, CX
774 CMPL R8, $0x40
775 JB memmove_match_emit_encodeBlockAsm
776 JMP memmove_long_match_emit_encodeBlockAsm
777
778 one_byte_match_emit_encodeBlockAsm:
779 SHLB $0x02, R8
780 MOVB R8, (CX)
781 ADDQ $0x01, CX
782
783 memmove_match_emit_encodeBlockAsm:
784 LEAQ (CX)(R9*1), R8
785
786 // genMemMoveShort
787 CMPQ R9, $0x08
788 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8
789 CMPQ R9, $0x10
790 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
791 CMPQ R9, $0x20
792 JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
793 JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64
794
795 emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8:
796 MOVQ (DI), R10
797 MOVQ R10, (CX)
798 JMP memmove_end_copy_match_emit_encodeBlockAsm
799
800 emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
801 MOVQ (DI), R10
802 MOVQ -8(DI)(R9*1), DI
803 MOVQ R10, (CX)
804 MOVQ DI, -8(CX)(R9*1)
805 JMP memmove_end_copy_match_emit_encodeBlockAsm
806
807 emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
808 MOVOU (DI), X0
809 MOVOU -16(DI)(R9*1), X1
810 MOVOU X0, (CX)
811 MOVOU X1, -16(CX)(R9*1)
812 JMP memmove_end_copy_match_emit_encodeBlockAsm
813
814 emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
815 MOVOU (DI), X0
816 MOVOU 16(DI), X1
817 MOVOU -32(DI)(R9*1), X2
818 MOVOU -16(DI)(R9*1), X3
819 MOVOU X0, (CX)
820 MOVOU X1, 16(CX)
821 MOVOU X2, -32(CX)(R9*1)
822 MOVOU X3, -16(CX)(R9*1)
823
824 memmove_end_copy_match_emit_encodeBlockAsm:
825 MOVQ R8, CX
826 JMP emit_literal_done_match_emit_encodeBlockAsm
827
828 memmove_long_match_emit_encodeBlockAsm:
829 LEAQ (CX)(R9*1), R8
830
831 // genMemMoveLong
832 MOVOU (DI), X0
833 MOVOU 16(DI), X1
834 MOVOU -32(DI)(R9*1), X2
835 MOVOU -16(DI)(R9*1), X3
836 MOVQ R9, R11
837 SHRQ $0x05, R11
838 MOVQ CX, R10
839 ANDL $0x0000001f, R10
840 MOVQ $0x00000040, R12
841 SUBQ R10, R12
842 DECQ R11
843 JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
844 LEAQ -32(DI)(R12*1), R10
845 LEAQ -32(CX)(R12*1), R13
846
847 emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
848 MOVOU (R10), X4
849 MOVOU 16(R10), X5
850 MOVOA X4, (R13)
851 MOVOA X5, 16(R13)
852 ADDQ $0x20, R13
853 ADDQ $0x20, R10
854 ADDQ $0x20, R12
855 DECQ R11
856 JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back
857
858 emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
859 MOVOU -32(DI)(R12*1), X4
860 MOVOU -16(DI)(R12*1), X5
861 MOVOA X4, -32(CX)(R12*1)
862 MOVOA X5, -16(CX)(R12*1)
863 ADDQ $0x20, R12
864 CMPQ R9, R12
865 JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
866 MOVOU X0, (CX)
867 MOVOU X1, 16(CX)
868 MOVOU X2, -32(CX)(R9*1)
869 MOVOU X3, -16(CX)(R9*1)
870 MOVQ R8, CX
871
872 emit_literal_done_match_emit_encodeBlockAsm:
873 match_nolit_loop_encodeBlockAsm:
874 MOVL DX, DI
875 SUBL SI, DI
876 MOVL DI, 16(SP)
877 ADDL $0x04, DX
878 ADDL $0x04, SI
879 MOVQ src_len+32(FP), DI
880 SUBL DX, DI
881 LEAQ (BX)(DX*1), R8
882 LEAQ (BX)(SI*1), SI
883
884 // matchLen
885 XORL R10, R10
886
887 matchlen_loopback_16_match_nolit_encodeBlockAsm:
888 CMPL DI, $0x10
889 JB matchlen_match8_match_nolit_encodeBlockAsm
890 MOVQ (R8)(R10*1), R9
891 MOVQ 8(R8)(R10*1), R11
892 XORQ (SI)(R10*1), R9
893 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm
894 XORQ 8(SI)(R10*1), R11
895 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm
896 LEAL -16(DI), DI
897 LEAL 16(R10), R10
898 JMP matchlen_loopback_16_match_nolit_encodeBlockAsm
899
900 matchlen_bsf_16match_nolit_encodeBlockAsm:
901 #ifdef GOAMD64_v3
902 TZCNTQ R11, R11
903
904 #else
905 BSFQ R11, R11
906
907 #endif
908 SARQ $0x03, R11
909 LEAL 8(R10)(R11*1), R10
910 JMP match_nolit_end_encodeBlockAsm
911
912 matchlen_match8_match_nolit_encodeBlockAsm:
913 CMPL DI, $0x08
914 JB matchlen_match4_match_nolit_encodeBlockAsm
915 MOVQ (R8)(R10*1), R9
916 XORQ (SI)(R10*1), R9
917 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm
918 LEAL -8(DI), DI
919 LEAL 8(R10), R10
920 JMP matchlen_match4_match_nolit_encodeBlockAsm
921
922 matchlen_bsf_8_match_nolit_encodeBlockAsm:
923 #ifdef GOAMD64_v3
924 TZCNTQ R9, R9
925
926 #else
927 BSFQ R9, R9
928
929 #endif
930 SARQ $0x03, R9
931 LEAL (R10)(R9*1), R10
932 JMP match_nolit_end_encodeBlockAsm
933
934 matchlen_match4_match_nolit_encodeBlockAsm:
935 CMPL DI, $0x04
936 JB matchlen_match2_match_nolit_encodeBlockAsm
937 MOVL (R8)(R10*1), R9
938 CMPL (SI)(R10*1), R9
939 JNE matchlen_match2_match_nolit_encodeBlockAsm
940 LEAL -4(DI), DI
941 LEAL 4(R10), R10
942
943 matchlen_match2_match_nolit_encodeBlockAsm:
944 CMPL DI, $0x01
945 JE matchlen_match1_match_nolit_encodeBlockAsm
946 JB match_nolit_end_encodeBlockAsm
947 MOVW (R8)(R10*1), R9
948 CMPW (SI)(R10*1), R9
949 JNE matchlen_match1_match_nolit_encodeBlockAsm
950 LEAL 2(R10), R10
951 SUBL $0x02, DI
952 JZ match_nolit_end_encodeBlockAsm
953
954 matchlen_match1_match_nolit_encodeBlockAsm:
955 MOVB (R8)(R10*1), R9
956 CMPB (SI)(R10*1), R9
957 JNE match_nolit_end_encodeBlockAsm
958 LEAL 1(R10), R10
959
960 match_nolit_end_encodeBlockAsm:
961 ADDL R10, DX
962 MOVL 16(SP), SI
963 ADDL $0x04, R10
964 MOVL DX, 12(SP)
965
966 // emitCopy
967 CMPL SI, $0x00010000
968 JB two_byte_offset_match_nolit_encodeBlockAsm
969 CMPL R10, $0x40
970 JBE four_bytes_remain_match_nolit_encodeBlockAsm
971 MOVB $0xff, (CX)
972 MOVL SI, 1(CX)
973 LEAL -64(R10), R10
974 ADDQ $0x05, CX
975 CMPL R10, $0x04
976 JB four_bytes_remain_match_nolit_encodeBlockAsm
977
978 // emitRepeat
979 emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy:
980 MOVL R10, DI
981 LEAL -4(R10), R10
982 CMPL DI, $0x08
983 JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy
984 CMPL DI, $0x0c
985 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
986 CMPL SI, $0x00000800
987 JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
988
989 cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
990 CMPL R10, $0x00000104
991 JB repeat_three_match_nolit_encodeBlockAsm_emit_copy
992 CMPL R10, $0x00010100
993 JB repeat_four_match_nolit_encodeBlockAsm_emit_copy
994 CMPL R10, $0x0100ffff
995 JB repeat_five_match_nolit_encodeBlockAsm_emit_copy
996 LEAL -16842747(R10), R10
997 MOVL $0xfffb001d, (CX)
998 MOVB $0xff, 4(CX)
999 ADDQ $0x05, CX
1000 JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy
1001
1002 repeat_five_match_nolit_encodeBlockAsm_emit_copy:
1003 LEAL -65536(R10), R10
1004 MOVL R10, SI
1005 MOVW $0x001d, (CX)
1006 MOVW R10, 2(CX)
1007 SARL $0x10, SI
1008 MOVB SI, 4(CX)
1009 ADDQ $0x05, CX
1010 JMP match_nolit_emitcopy_end_encodeBlockAsm
1011
1012 repeat_four_match_nolit_encodeBlockAsm_emit_copy:
1013 LEAL -256(R10), R10
1014 MOVW $0x0019, (CX)
1015 MOVW R10, 2(CX)
1016 ADDQ $0x04, CX
1017 JMP match_nolit_emitcopy_end_encodeBlockAsm
1018
1019 repeat_three_match_nolit_encodeBlockAsm_emit_copy:
1020 LEAL -4(R10), R10
1021 MOVW $0x0015, (CX)
1022 MOVB R10, 2(CX)
1023 ADDQ $0x03, CX
1024 JMP match_nolit_emitcopy_end_encodeBlockAsm
1025
1026 repeat_two_match_nolit_encodeBlockAsm_emit_copy:
1027 SHLL $0x02, R10
1028 ORL $0x01, R10
1029 MOVW R10, (CX)
1030 ADDQ $0x02, CX
1031 JMP match_nolit_emitcopy_end_encodeBlockAsm
1032
1033 repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
1034 XORQ DI, DI
1035 LEAL 1(DI)(R10*4), R10
1036 MOVB SI, 1(CX)
1037 SARL $0x08, SI
1038 SHLL $0x05, SI
1039 ORL SI, R10
1040 MOVB R10, (CX)
1041 ADDQ $0x02, CX
1042 JMP match_nolit_emitcopy_end_encodeBlockAsm
1043
1044 four_bytes_remain_match_nolit_encodeBlockAsm:
1045 TESTL R10, R10
1046 JZ match_nolit_emitcopy_end_encodeBlockAsm
1047 XORL DI, DI
1048 LEAL -1(DI)(R10*4), R10
1049 MOVB R10, (CX)
1050 MOVL SI, 1(CX)
1051 ADDQ $0x05, CX
1052 JMP match_nolit_emitcopy_end_encodeBlockAsm
1053
1054 two_byte_offset_match_nolit_encodeBlockAsm:
1055 CMPL R10, $0x40
1056 JBE two_byte_offset_short_match_nolit_encodeBlockAsm
1057 CMPL SI, $0x00000800
1058 JAE long_offset_short_match_nolit_encodeBlockAsm
1059 MOVL $0x00000001, DI
1060 LEAL 16(DI), DI
1061 MOVB SI, 1(CX)
1062 MOVL SI, R8
1063 SHRL $0x08, R8
1064 SHLL $0x05, R8
1065 ORL R8, DI
1066 MOVB DI, (CX)
1067 ADDQ $0x02, CX
1068 SUBL $0x08, R10
1069
1070 // emitRepeat
1071 LEAL -4(R10), R10
1072 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
1073
1074 emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1075 MOVL R10, DI
1076 LEAL -4(R10), R10
1077 CMPL DI, $0x08
1078 JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b
1079 CMPL DI, $0x0c
1080 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
1081 CMPL SI, $0x00000800
1082 JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
1083
1084 cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1085 CMPL R10, $0x00000104
1086 JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b
1087 CMPL R10, $0x00010100
1088 JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b
1089 CMPL R10, $0x0100ffff
1090 JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b
1091 LEAL -16842747(R10), R10
1092 MOVL $0xfffb001d, (CX)
1093 MOVB $0xff, 4(CX)
1094 ADDQ $0x05, CX
1095 JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b
1096
1097 repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1098 LEAL -65536(R10), R10
1099 MOVL R10, SI
1100 MOVW $0x001d, (CX)
1101 MOVW R10, 2(CX)
1102 SARL $0x10, SI
1103 MOVB SI, 4(CX)
1104 ADDQ $0x05, CX
1105 JMP match_nolit_emitcopy_end_encodeBlockAsm
1106
1107 repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1108 LEAL -256(R10), R10
1109 MOVW $0x0019, (CX)
1110 MOVW R10, 2(CX)
1111 ADDQ $0x04, CX
1112 JMP match_nolit_emitcopy_end_encodeBlockAsm
1113
1114 repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1115 LEAL -4(R10), R10
1116 MOVW $0x0015, (CX)
1117 MOVB R10, 2(CX)
1118 ADDQ $0x03, CX
1119 JMP match_nolit_emitcopy_end_encodeBlockAsm
1120
1121 repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1122 SHLL $0x02, R10
1123 ORL $0x01, R10
1124 MOVW R10, (CX)
1125 ADDQ $0x02, CX
1126 JMP match_nolit_emitcopy_end_encodeBlockAsm
1127
1128 repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
1129 XORQ DI, DI
1130 LEAL 1(DI)(R10*4), R10
1131 MOVB SI, 1(CX)
1132 SARL $0x08, SI
1133 SHLL $0x05, SI
1134 ORL SI, R10
1135 MOVB R10, (CX)
1136 ADDQ $0x02, CX
1137 JMP match_nolit_emitcopy_end_encodeBlockAsm
1138
1139 long_offset_short_match_nolit_encodeBlockAsm:
1140 MOVB $0xee, (CX)
1141 MOVW SI, 1(CX)
1142 LEAL -60(R10), R10
1143 ADDQ $0x03, CX
1144
1145 // emitRepeat
1146 emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short:
1147 MOVL R10, DI
1148 LEAL -4(R10), R10
1149 CMPL DI, $0x08
1150 JBE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short
1151 CMPL DI, $0x0c
1152 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
1153 CMPL SI, $0x00000800
1154 JB repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
1155
1156 cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
1157 CMPL R10, $0x00000104
1158 JB repeat_three_match_nolit_encodeBlockAsm_emit_copy_short
1159 CMPL R10, $0x00010100
1160 JB repeat_four_match_nolit_encodeBlockAsm_emit_copy_short
1161 CMPL R10, $0x0100ffff
1162 JB repeat_five_match_nolit_encodeBlockAsm_emit_copy_short
1163 LEAL -16842747(R10), R10
1164 MOVL $0xfffb001d, (CX)
1165 MOVB $0xff, 4(CX)
1166 ADDQ $0x05, CX
1167 JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short
1168
1169 repeat_five_match_nolit_encodeBlockAsm_emit_copy_short:
1170 LEAL -65536(R10), R10
1171 MOVL R10, SI
1172 MOVW $0x001d, (CX)
1173 MOVW R10, 2(CX)
1174 SARL $0x10, SI
1175 MOVB SI, 4(CX)
1176 ADDQ $0x05, CX
1177 JMP match_nolit_emitcopy_end_encodeBlockAsm
1178
1179 repeat_four_match_nolit_encodeBlockAsm_emit_copy_short:
1180 LEAL -256(R10), R10
1181 MOVW $0x0019, (CX)
1182 MOVW R10, 2(CX)
1183 ADDQ $0x04, CX
1184 JMP match_nolit_emitcopy_end_encodeBlockAsm
1185
1186 repeat_three_match_nolit_encodeBlockAsm_emit_copy_short:
1187 LEAL -4(R10), R10
1188 MOVW $0x0015, (CX)
1189 MOVB R10, 2(CX)
1190 ADDQ $0x03, CX
1191 JMP match_nolit_emitcopy_end_encodeBlockAsm
1192
1193 repeat_two_match_nolit_encodeBlockAsm_emit_copy_short:
1194 SHLL $0x02, R10
1195 ORL $0x01, R10
1196 MOVW R10, (CX)
1197 ADDQ $0x02, CX
1198 JMP match_nolit_emitcopy_end_encodeBlockAsm
1199
1200 repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
1201 XORQ DI, DI
1202 LEAL 1(DI)(R10*4), R10
1203 MOVB SI, 1(CX)
1204 SARL $0x08, SI
1205 SHLL $0x05, SI
1206 ORL SI, R10
1207 MOVB R10, (CX)
1208 ADDQ $0x02, CX
1209 JMP match_nolit_emitcopy_end_encodeBlockAsm
1210
1211 two_byte_offset_short_match_nolit_encodeBlockAsm:
1212 MOVL R10, DI
1213 SHLL $0x02, DI
1214 CMPL R10, $0x0c
1215 JAE emit_copy_three_match_nolit_encodeBlockAsm
1216 CMPL SI, $0x00000800
1217 JAE emit_copy_three_match_nolit_encodeBlockAsm
1218 LEAL -15(DI), DI
1219 MOVB SI, 1(CX)
1220 SHRL $0x08, SI
1221 SHLL $0x05, SI
1222 ORL SI, DI
1223 MOVB DI, (CX)
1224 ADDQ $0x02, CX
1225 JMP match_nolit_emitcopy_end_encodeBlockAsm
1226
1227 emit_copy_three_match_nolit_encodeBlockAsm:
1228 LEAL -2(DI), DI
1229 MOVB DI, (CX)
1230 MOVW SI, 1(CX)
1231 ADDQ $0x03, CX
1232
// Reached after a copy/repeat has been written. DX is the current source
// offset, BX the source base pointer and CX the dst write pointer.
1233 match_nolit_emitcopy_end_encodeBlockAsm:
1234 CMPL DX, 8(SP) // 8(SP) was set to src_len-8 in the prologue
1235 JAE emit_remainder_encodeBlockAsm
1236 MOVQ -2(BX)(DX*1), DI // DI = 8 source bytes starting at offset DX-2
1237 CMPQ CX, (SP) // (SP) holds the dst limit computed in the prologue
1238 JB match_nolit_dst_ok_encodeBlockAsm
// dst pointer is at or past the limit: store 0 as the result and return.
1239 MOVQ $0x00000000, ret+56(FP)
1240 RET
1241
// Index the positions DX-2 and DX in the hash table and test whether the
// previous entry for DX is an immediate match.
// On entry DI holds the 8 source bytes loaded from offset DX-2 by the code
// that branches here; AX is used as the base of a table of 4-byte entries.
// Hash: (value << 16) * 0xcf1bbcdcbf9b >> 50. The left shift drops the two
// highest bytes, and the right shift leaves a 14-bit table index.
1242 match_nolit_dst_ok_encodeBlockAsm:
1243 MOVQ $0x0000cf1bbcdcbf9b, R9
1244 MOVQ DI, R8 // R8 = bytes at DX-2
1245 SHRQ $0x10, DI // DI = bytes at DX (drop the two leading bytes)
1246 MOVQ DI, SI
1247 SHLQ $0x10, R8
1248 IMULQ R9, R8
1249 SHRQ $0x32, R8 // R8 = hash of the bytes at DX-2
1250 SHLQ $0x10, SI
1251 IMULQ R9, SI
1252 SHRQ $0x32, SI // SI = hash of the bytes at DX
1253 LEAL -2(DX), R9
1254 LEAQ (AX)(SI*4), R10 // R10 = address of the table entry for DX
1255 MOVL (R10), SI // SI = previous entry (candidate offset)
1256 MOVL R9, (AX)(R8*4) // table[hash(DX-2)] = DX-2
1257 MOVL DX, (R10) // table[hash(DX)] = DX
1258 CMPL (BX)(SI*1), DI // compare 4 bytes at the candidate with 4 bytes at DX
1259 JEQ match_nolit_loop_encodeBlockAsm
// No match: advance one byte and go back to the search loop.
1260 INCL DX
1261 JMP search_loop_encodeBlockAsm
1262
// Check that the not-yet-emitted tail of the source fits in dst as literals.
// 12(SP) is the source offset from which pending literals start (it is used
// as the literal source offset right after this check).
1263 emit_remainder_encodeBlockAsm:
1264 MOVQ src_len+32(FP), AX
1265 SUBL 12(SP), AX // AX = number of pending literal bytes
1266 LEAQ 5(CX)(AX*1), AX // dst pointer + pending bytes + 5 (the largest literal header written below)
1267 CMPQ AX, (SP) // (SP) holds the dst limit computed in the prologue
1268 JB emit_remainder_ok_encodeBlockAsm
// Would reach the dst limit: store 0 as the result and return.
1269 MOVQ $0x00000000, ret+56(FP)
1270 RET
1271
// Write the literal header for the remaining source bytes [12(SP), src_len).
// After the setup: AX = pointer to the first literal byte, SI = literal
// length, DX = length-1. The header is a tag byte, optionally followed by
// 1..4 little-endian bytes of length-1:
//   length-1 < 60      -> tag = (length-1)<<2, no extra bytes
//   length-1 < 1<<8    -> tag 0xf0 (60<<2) + 1 byte
//   length-1 < 1<<16   -> tag 0xf4 (61<<2) + 2 bytes
//   length-1 < 1<<24   -> tag 0xf8 (62<<2) + 3 bytes
//   otherwise          -> tag 0xfc (63<<2) + 4 bytes
1272 emit_remainder_ok_encodeBlockAsm:
1273 MOVQ src_len+32(FP), AX
1274 MOVL 12(SP), DX
1275 CMPL DX, AX
1276 JEQ emit_literal_done_emit_remainder_encodeBlockAsm // nothing pending
1277 MOVL AX, SI
1278 MOVL AX, 12(SP) // everything up to src_len is now accounted for
1279 LEAQ (BX)(DX*1), AX // AX = source pointer of the first pending byte
1280 SUBL DX, SI // SI = literal length
1281 LEAL -1(SI), DX // DX = length-1, the value stored in the header
1282 CMPL DX, $0x3c
1283 JB one_byte_emit_remainder_encodeBlockAsm
1284 CMPL DX, $0x00000100
1285 JB two_bytes_emit_remainder_encodeBlockAsm
1286 CMPL DX, $0x00010000
1287 JB three_bytes_emit_remainder_encodeBlockAsm
1288 CMPL DX, $0x01000000
1289 JB four_bytes_emit_remainder_encodeBlockAsm
// 5-byte header: tag 0xfc followed by the 32-bit length-1.
1290 MOVB $0xfc, (CX)
1291 MOVL DX, 1(CX)
1292 ADDQ $0x05, CX
1293 JMP memmove_long_emit_remainder_encodeBlockAsm
1294
// 4-byte header: tag 0xf8 followed by the low 24 bits of length-1.
// BX (source base) is reused as scratch; the source pointer is already in AX.
1295 four_bytes_emit_remainder_encodeBlockAsm:
1296 MOVL DX, BX
1297 SHRL $0x10, BX
1298 MOVB $0xf8, (CX)
1299 MOVW DX, 1(CX)
1300 MOVB BL, 3(CX)
1301 ADDQ $0x04, CX
1302 JMP memmove_long_emit_remainder_encodeBlockAsm
1303
// 3-byte header: tag 0xf4 followed by the low 16 bits of length-1.
1304 three_bytes_emit_remainder_encodeBlockAsm:
1305 MOVB $0xf4, (CX)
1306 MOVW DX, 1(CX)
1307 ADDQ $0x03, CX
1308 JMP memmove_long_emit_remainder_encodeBlockAsm
1309
// 2-byte header: tag 0xf0 followed by length-1 as one byte.
// Lengths up to 64 use the short copy, longer ones the long copy.
1310 two_bytes_emit_remainder_encodeBlockAsm:
1311 MOVB $0xf0, (CX)
1312 MOVB DL, 1(CX)
1313 ADDQ $0x02, CX
1314 CMPL DX, $0x40
1315 JB memmove_emit_remainder_encodeBlockAsm
1316 JMP memmove_long_emit_remainder_encodeBlockAsm
1317
// 1-byte header: (length-1)<<2; falls through into the short copy.
1318 one_byte_emit_remainder_encodeBlockAsm:
1319 SHLB $0x02, DL
1320 MOVB DL, (CX)
1321 ADDQ $0x01, CX
1322
// Short literal copy: SI = length, AX = source pointer, CX = dst pointer.
// The header code that reaches this label does so only for lengths 1..64.
1323 memmove_emit_remainder_encodeBlockAsm:
1324 LEAQ (CX)(SI*1), DX // DX = dst pointer after the copy
1325 MOVL SI, BX // BX = length (BX is no longer used as the source base)
1326
1327 // genMemMoveShort
// Dispatch on the length. Each case loads a head piece and a tail piece that
// ends exactly at the last byte; the two pieces may overlap, which avoids a
// byte loop.
1328 CMPQ BX, $0x03
1329 JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2
1330 JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3
1331 CMPQ BX, $0x08
1332 JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7
1333 CMPQ BX, $0x10
1334 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16
1335 CMPQ BX, $0x20
1336 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32
1337 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64
1338
// 1 or 2 bytes: first byte and last byte (the same byte when length is 1).
1339 emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2:
1340 MOVB (AX), SI
1341 MOVB -1(AX)(BX*1), AL
1342 MOVB SI, (CX)
1343 MOVB AL, -1(CX)(BX*1)
1344 JMP memmove_end_copy_emit_remainder_encodeBlockAsm
1345
// Exactly 3 bytes: one 2-byte move plus one 1-byte move.
1346 emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3:
1347 MOVW (AX), SI
1348 MOVB 2(AX), AL
1349 MOVW SI, (CX)
1350 MOVB AL, 2(CX)
1351 JMP memmove_end_copy_emit_remainder_encodeBlockAsm
1352
// 4..7 bytes: two 4-byte moves (first 4 and last 4).
1353 emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7:
1354 MOVL (AX), SI
1355 MOVL -4(AX)(BX*1), AX
1356 MOVL SI, (CX)
1357 MOVL AX, -4(CX)(BX*1)
1358 JMP memmove_end_copy_emit_remainder_encodeBlockAsm
1359
// 8..16 bytes: two 8-byte moves (first 8 and last 8).
1360 emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16:
1361 MOVQ (AX), SI
1362 MOVQ -8(AX)(BX*1), AX
1363 MOVQ SI, (CX)
1364 MOVQ AX, -8(CX)(BX*1)
1365 JMP memmove_end_copy_emit_remainder_encodeBlockAsm
1366
// 17..32 bytes: two unaligned 16-byte moves (first 16 and last 16).
1367 emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32:
1368 MOVOU (AX), X0
1369 MOVOU -16(AX)(BX*1), X1
1370 MOVOU X0, (CX)
1371 MOVOU X1, -16(CX)(BX*1)
1372 JMP memmove_end_copy_emit_remainder_encodeBlockAsm
1373
// 33..64 bytes: four unaligned 16-byte moves (first 32 and last 32).
1374 emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64:
1375 MOVOU (AX), X0
1376 MOVOU 16(AX), X1
1377 MOVOU -32(AX)(BX*1), X2
1378 MOVOU -16(AX)(BX*1), X3
1379 MOVOU X0, (CX)
1380 MOVOU X1, 16(CX)
1381 MOVOU X2, -32(CX)(BX*1)
1382 MOVOU X3, -16(CX)(BX*1)
1383
// Advance the dst pointer past the copied literals.
1384 memmove_end_copy_emit_remainder_encodeBlockAsm:
1385 MOVQ DX, CX
1386 JMP emit_literal_done_emit_remainder_encodeBlockAsm
1387
// Long literal copy: SI = length, AX = source pointer, CX = dst pointer.
// The header code that reaches this label does so only for lengths above 64.
1388 memmove_long_emit_remainder_encodeBlockAsm:
1389 LEAQ (CX)(SI*1), DX // DX = dst pointer after the copy
1390 MOVL SI, BX // BX = length
1391
1392 // genMemMoveLong
// Keep the first 32 and the last 32 source bytes in X0..X3. They are stored
// last with unaligned moves and cover the edges the aligned loop leaves out.
1393 MOVOU (AX), X0
1394 MOVOU 16(AX), X1
1395 MOVOU -32(AX)(BX*1), X2
1396 MOVOU -16(AX)(BX*1), X3
1397 MOVQ BX, DI
1398 SHRQ $0x05, DI // DI = length / 32
1399 MOVQ CX, SI
1400 ANDL $0x0000001f, SI // SI = dst address modulo 32
1401 MOVQ $0x00000040, R8
1402 SUBQ SI, R8 // R8 = 64 - (dst mod 32), so CX+R8-32 is 32-byte aligned
// NOTE(review): DECQ leaves CF unchanged, so JA/JNA below combine ZF from
// DECQ with CF left by the preceding SUBQ/ADDQ — confirm against the
// generator that this is the intended loop-selection condition.
1403 DECQ DI
1404 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
1405 LEAQ -32(AX)(R8*1), SI
1406 LEAQ -32(CX)(R8*1), R9
1407
// Pointer-based variant: SI/R9 walk source/dst in 32-byte steps, R8 tracks
// the same position as an offset. Stores are aligned (MOVOA).
1408 emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back:
1409 MOVOU (SI), X4
1410 MOVOU 16(SI), X5
1411 MOVOA X4, (R9)
1412 MOVOA X5, 16(R9)
1413 ADDQ $0x20, R9
1414 ADDQ $0x20, SI
1415 ADDQ $0x20, R8
1416 DECQ DI
1417 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back
1418
// Offset-based loop: copy the 32 bytes ending at offset R8 with aligned
// stores, then step R8 by 32 for as long as length >= R8.
1419 emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32:
1420 MOVOU -32(AX)(R8*1), X4
1421 MOVOU -16(AX)(R8*1), X5
1422 MOVOA X4, -32(CX)(R8*1)
1423 MOVOA X5, -16(CX)(R8*1)
1424 ADDQ $0x20, R8
1425 CMPQ BX, R8
1426 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
// Write the saved head and tail (unaligned), then advance the dst pointer.
1427 MOVOU X0, (CX)
1428 MOVOU X1, 16(CX)
1429 MOVOU X2, -32(CX)(BX*1)
1430 MOVOU X3, -16(CX)(BX*1)
1431 MOVQ DX, CX
1432
// Function exit: the result is the number of bytes written, i.e. the
// distance between the dst write pointer (CX) and dst_base.
1433 emit_literal_done_emit_remainder_encodeBlockAsm:
1434 MOVQ dst_base+0(FP), AX
1435 SUBQ AX, CX
1436 MOVQ CX, ret+56(FP)
1437 RET
1438
1439 // func encodeBlockAsm4MB(dst []byte, src []byte, tmp *[65536]byte) int
1440 // Requires: BMI, SSE2
// Prologue: clears the tmp table and initialises the stack slots.
// Registers after the prologue:
//   AX = tmp (table base), BX = src_base, CX = dst write pointer, DX = 1
// Stack slots after the prologue:
//   (SP)   = dst_base + (src_len-9) - (src_len>>5), used as the dst limit
//   8(SP)  = src_len-8, used as the source limit
//   12(SP) = 0, start offset of the pending literals
//   16(SP) = 1 — presumably the initial repeat offset; it is subtracted from
//            the source position when probing for a repeat
1441 TEXT ·encodeBlockAsm4MB(SB), $24-64
1442 MOVQ tmp+48(FP), AX
1443 MOVQ dst_base+0(FP), CX
1444 MOVQ $0x00000200, DX // 0x200 iterations of 0x80 bytes = 65536 bytes
1445 MOVQ AX, BX
1446 PXOR X0, X0
1447
// Zero the whole tmp table, 128 bytes per iteration.
1448 zero_loop_encodeBlockAsm4MB:
1449 MOVOU X0, (BX)
1450 MOVOU X0, 16(BX)
1451 MOVOU X0, 32(BX)
1452 MOVOU X0, 48(BX)
1453 MOVOU X0, 64(BX)
1454 MOVOU X0, 80(BX)
1455 MOVOU X0, 96(BX)
1456 MOVOU X0, 112(BX)
1457 ADDQ $0x80, BX
1458 DECQ DX
1459 JNZ zero_loop_encodeBlockAsm4MB
1460 MOVL $0x00000000, 12(SP)
1461 MOVQ src_len+32(FP), DX
1462 LEAQ -9(DX), BX
1463 LEAQ -8(DX), SI
1464 MOVL SI, 8(SP)
1465 SHRQ $0x05, DX
1466 SUBL DX, BX // BX = (src_len-9) - (src_len>>5)
1467 LEAQ (CX)(BX*1), BX
1468 MOVQ BX, (SP)
1469 MOVL $0x00000001, DX
1470 MOVL DX, 16(SP)
1471 MOVQ src_base+24(FP), BX
1472
1473 search_loop_encodeBlockAsm4MB:
1474 MOVL DX, SI
1475 SUBL 12(SP), SI
1476 SHRL $0x06, SI
1477 LEAL 4(DX)(SI*1), SI
1478 CMPL SI, 8(SP)
1479 JAE emit_remainder_encodeBlockAsm4MB
1480 MOVQ (BX)(DX*1), DI
1481 MOVL SI, 20(SP)
1482 MOVQ $0x0000cf1bbcdcbf9b, R9
1483 MOVQ DI, R10
1484 MOVQ DI, R11
1485 SHRQ $0x08, R11
1486 SHLQ $0x10, R10
1487 IMULQ R9, R10
1488 SHRQ $0x32, R10
1489 SHLQ $0x10, R11
1490 IMULQ R9, R11
1491 SHRQ $0x32, R11
1492 MOVL (AX)(R10*4), SI
1493 MOVL (AX)(R11*4), R8
1494 MOVL DX, (AX)(R10*4)
1495 LEAL 1(DX), R10
1496 MOVL R10, (AX)(R11*4)
1497 MOVQ DI, R10
1498 SHRQ $0x10, R10
1499 SHLQ $0x10, R10
1500 IMULQ R9, R10
1501 SHRQ $0x32, R10
1502 MOVL DX, R9
1503 SUBL 16(SP), R9
1504 MOVL 1(BX)(R9*1), R11
1505 MOVQ DI, R9
1506 SHRQ $0x08, R9
1507 CMPL R9, R11
1508 JNE no_repeat_found_encodeBlockAsm4MB
1509 LEAL 1(DX), DI
1510 MOVL 12(SP), R8
1511 MOVL DI, SI
1512 SUBL 16(SP), SI
1513 JZ repeat_extend_back_end_encodeBlockAsm4MB
1514
1515 repeat_extend_back_loop_encodeBlockAsm4MB:
1516 CMPL DI, R8
1517 JBE repeat_extend_back_end_encodeBlockAsm4MB
1518 MOVB -1(BX)(SI*1), R9
1519 MOVB -1(BX)(DI*1), R10
1520 CMPB R9, R10
1521 JNE repeat_extend_back_end_encodeBlockAsm4MB
1522 LEAL -1(DI), DI
1523 DECL SI
1524 JNZ repeat_extend_back_loop_encodeBlockAsm4MB
1525
1526 repeat_extend_back_end_encodeBlockAsm4MB:
1527 MOVL DI, SI
1528 SUBL 12(SP), SI
1529 LEAQ 4(CX)(SI*1), SI
1530 CMPQ SI, (SP)
1531 JB repeat_dst_size_check_encodeBlockAsm4MB
1532 MOVQ $0x00000000, ret+56(FP)
1533 RET
1534
1535 repeat_dst_size_check_encodeBlockAsm4MB:
1536 MOVL 12(SP), SI
1537 CMPL SI, DI
1538 JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB
1539 MOVL DI, R9
1540 MOVL DI, 12(SP)
1541 LEAQ (BX)(SI*1), R10
1542 SUBL SI, R9
1543 LEAL -1(R9), SI
1544 CMPL SI, $0x3c
1545 JB one_byte_repeat_emit_encodeBlockAsm4MB
1546 CMPL SI, $0x00000100
1547 JB two_bytes_repeat_emit_encodeBlockAsm4MB
1548 CMPL SI, $0x00010000
1549 JB three_bytes_repeat_emit_encodeBlockAsm4MB
1550 MOVL SI, R11
1551 SHRL $0x10, R11
1552 MOVB $0xf8, (CX)
1553 MOVW SI, 1(CX)
1554 MOVB R11, 3(CX)
1555 ADDQ $0x04, CX
1556 JMP memmove_long_repeat_emit_encodeBlockAsm4MB
1557
1558 three_bytes_repeat_emit_encodeBlockAsm4MB:
1559 MOVB $0xf4, (CX)
1560 MOVW SI, 1(CX)
1561 ADDQ $0x03, CX
1562 JMP memmove_long_repeat_emit_encodeBlockAsm4MB
1563
1564 two_bytes_repeat_emit_encodeBlockAsm4MB:
1565 MOVB $0xf0, (CX)
1566 MOVB SI, 1(CX)
1567 ADDQ $0x02, CX
1568 CMPL SI, $0x40
1569 JB memmove_repeat_emit_encodeBlockAsm4MB
1570 JMP memmove_long_repeat_emit_encodeBlockAsm4MB
1571
1572 one_byte_repeat_emit_encodeBlockAsm4MB:
1573 SHLB $0x02, SI
1574 MOVB SI, (CX)
1575 ADDQ $0x01, CX
1576
1577 memmove_repeat_emit_encodeBlockAsm4MB:
1578 LEAQ (CX)(R9*1), SI
1579
1580 // genMemMoveShort
1581 CMPQ R9, $0x08
1582 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8
1583 CMPQ R9, $0x10
1584 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16
1585 CMPQ R9, $0x20
1586 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32
1587 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64
1588
1589 emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8:
1590 MOVQ (R10), R11
1591 MOVQ R11, (CX)
1592 JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
1593
1594 emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16:
1595 MOVQ (R10), R11
1596 MOVQ -8(R10)(R9*1), R10
1597 MOVQ R11, (CX)
1598 MOVQ R10, -8(CX)(R9*1)
1599 JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
1600
1601 emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32:
1602 MOVOU (R10), X0
1603 MOVOU -16(R10)(R9*1), X1
1604 MOVOU X0, (CX)
1605 MOVOU X1, -16(CX)(R9*1)
1606 JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB
1607
1608 emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64:
1609 MOVOU (R10), X0
1610 MOVOU 16(R10), X1
1611 MOVOU -32(R10)(R9*1), X2
1612 MOVOU -16(R10)(R9*1), X3
1613 MOVOU X0, (CX)
1614 MOVOU X1, 16(CX)
1615 MOVOU X2, -32(CX)(R9*1)
1616 MOVOU X3, -16(CX)(R9*1)
1617
1618 memmove_end_copy_repeat_emit_encodeBlockAsm4MB:
1619 MOVQ SI, CX
1620 JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB
1621
1622 memmove_long_repeat_emit_encodeBlockAsm4MB:
1623 LEAQ (CX)(R9*1), SI
1624
1625 // genMemMoveLong
1626 MOVOU (R10), X0
1627 MOVOU 16(R10), X1
1628 MOVOU -32(R10)(R9*1), X2
1629 MOVOU -16(R10)(R9*1), X3
1630 MOVQ R9, R12
1631 SHRQ $0x05, R12
1632 MOVQ CX, R11
1633 ANDL $0x0000001f, R11
1634 MOVQ $0x00000040, R13
1635 SUBQ R11, R13
1636 DECQ R12
1637 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
1638 LEAQ -32(R10)(R13*1), R11
1639 LEAQ -32(CX)(R13*1), R14
1640
1641 emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back:
1642 MOVOU (R11), X4
1643 MOVOU 16(R11), X5
1644 MOVOA X4, (R14)
1645 MOVOA X5, 16(R14)
1646 ADDQ $0x20, R14
1647 ADDQ $0x20, R11
1648 ADDQ $0x20, R13
1649 DECQ R12
1650 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back
1651
1652 emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
1653 MOVOU -32(R10)(R13*1), X4
1654 MOVOU -16(R10)(R13*1), X5
1655 MOVOA X4, -32(CX)(R13*1)
1656 MOVOA X5, -16(CX)(R13*1)
1657 ADDQ $0x20, R13
1658 CMPQ R9, R13
1659 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
1660 MOVOU X0, (CX)
1661 MOVOU X1, 16(CX)
1662 MOVOU X2, -32(CX)(R9*1)
1663 MOVOU X3, -16(CX)(R9*1)
1664 MOVQ SI, CX
1665
1666 emit_literal_done_repeat_emit_encodeBlockAsm4MB:
1667 ADDL $0x05, DX
1668 MOVL DX, SI
1669 SUBL 16(SP), SI
1670 MOVQ src_len+32(FP), R9
1671 SUBL DX, R9
1672 LEAQ (BX)(DX*1), R10
1673 LEAQ (BX)(SI*1), SI
1674
1675 // matchLen
// Counts how many leading bytes at R10 and SI are equal, comparing at most
// R9 bytes. R12 accumulates the matched length; R9 is the number of bytes
// still available. R11 and R13 are scratch.
1676 XORL R12, R12
1677
// 16 bytes per iteration while at least 16 bytes remain.
1678 matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB:
1679 CMPL R9, $0x10
1680 JB matchlen_match8_repeat_extend_encodeBlockAsm4MB
1681 MOVQ (R10)(R12*1), R11
1682 MOVQ 8(R10)(R12*1), R13
1683 XORQ (SI)(R12*1), R11 // non-zero result = difference within the first 8 bytes
1684 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB
1685 XORQ 8(SI)(R12*1), R13 // non-zero result = difference within the second 8 bytes
1686 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm4MB
1687 LEAL -16(R9), R9
1688 LEAL 16(R12), R12
1689 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB
1690
// Difference in the second 8 bytes: index of the lowest set bit divided by 8
// is the number of equal bytes in that word; add 8 for the first word.
1691 matchlen_bsf_16repeat_extend_encodeBlockAsm4MB:
1692 #ifdef GOAMD64_v3
1693 TZCNTQ R13, R13
1694
1695 #else
1696 BSFQ R13, R13
1697
1698 #endif
1699 SARQ $0x03, R13
1700 LEAL 8(R12)(R13*1), R12
1701 JMP repeat_extend_forward_end_encodeBlockAsm4MB
1702
// Fewer than 16 bytes remain: try a single 8-byte comparison.
1703 matchlen_match8_repeat_extend_encodeBlockAsm4MB:
1704 CMPL R9, $0x08
1705 JB matchlen_match4_repeat_extend_encodeBlockAsm4MB
1706 MOVQ (R10)(R12*1), R11
1707 XORQ (SI)(R12*1), R11
1708 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB
1709 LEAL -8(R9), R9
1710 LEAL 8(R12), R12
1711 JMP matchlen_match4_repeat_extend_encodeBlockAsm4MB
1712
// Difference within an 8-byte word held in R11: lowest set bit / 8 is the
// number of equal bytes in that word.
1713 matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB:
1714 #ifdef GOAMD64_v3
1715 TZCNTQ R11, R11
1716
1717 #else
1718 BSFQ R11, R11
1719
1720 #endif
1721 SARQ $0x03, R11
1722 LEAL (R12)(R11*1), R12
1723 JMP repeat_extend_forward_end_encodeBlockAsm4MB
1724
// 4-byte step: on equality advance by 4; on a mismatch or with fewer than
// 4 bytes left, continue with the 2-byte step without advancing.
1725 matchlen_match4_repeat_extend_encodeBlockAsm4MB:
1726 CMPL R9, $0x04
1727 JB matchlen_match2_repeat_extend_encodeBlockAsm4MB
1728 MOVL (R10)(R12*1), R11
1729 CMPL (SI)(R12*1), R11
1730 JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB
1731 LEAL -4(R9), R9
1732 LEAL 4(R12), R12
1733
// 2-byte step: exactly 1 byte left goes to the 1-byte step, 0 left is done.
1734 matchlen_match2_repeat_extend_encodeBlockAsm4MB:
1735 CMPL R9, $0x01
1736 JE matchlen_match1_repeat_extend_encodeBlockAsm4MB
1737 JB repeat_extend_forward_end_encodeBlockAsm4MB
1738 MOVW (R10)(R12*1), R11
1739 CMPW (SI)(R12*1), R11
1740 JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB
1741 LEAL 2(R12), R12
1742 SUBL $0x02, R9
1743 JZ repeat_extend_forward_end_encodeBlockAsm4MB
1744
// 1-byte step: count one more byte if it is equal.
1745 matchlen_match1_repeat_extend_encodeBlockAsm4MB:
1746 MOVB (R10)(R12*1), R11
1747 CMPB (SI)(R12*1), R11
1748 JNE repeat_extend_forward_end_encodeBlockAsm4MB
1749 LEAL 1(R12), R12
1750
1751 repeat_extend_forward_end_encodeBlockAsm4MB:
1752 ADDL R12, DX
1753 MOVL DX, SI
1754 SUBL DI, SI
1755 MOVL 16(SP), DI
1756 TESTL R8, R8
1757 JZ repeat_as_copy_encodeBlockAsm4MB
1758
1759 // emitRepeat
1760 MOVL SI, R8
1761 LEAL -4(SI), SI
1762 CMPL R8, $0x08
1763 JBE repeat_two_match_repeat_encodeBlockAsm4MB
1764 CMPL R8, $0x0c
1765 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB
1766 CMPL DI, $0x00000800
1767 JB repeat_two_offset_match_repeat_encodeBlockAsm4MB
1768
1769 cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB:
1770 CMPL SI, $0x00000104
1771 JB repeat_three_match_repeat_encodeBlockAsm4MB
1772 CMPL SI, $0x00010100
1773 JB repeat_four_match_repeat_encodeBlockAsm4MB
1774 LEAL -65536(SI), SI
1775 MOVL SI, DI
1776 MOVW $0x001d, (CX)
1777 MOVW SI, 2(CX)
1778 SARL $0x10, DI
1779 MOVB DI, 4(CX)
1780 ADDQ $0x05, CX
1781 JMP repeat_end_emit_encodeBlockAsm4MB
1782
1783 repeat_four_match_repeat_encodeBlockAsm4MB:
1784 LEAL -256(SI), SI
1785 MOVW $0x0019, (CX)
1786 MOVW SI, 2(CX)
1787 ADDQ $0x04, CX
1788 JMP repeat_end_emit_encodeBlockAsm4MB
1789
1790 repeat_three_match_repeat_encodeBlockAsm4MB:
1791 LEAL -4(SI), SI
1792 MOVW $0x0015, (CX)
1793 MOVB SI, 2(CX)
1794 ADDQ $0x03, CX
1795 JMP repeat_end_emit_encodeBlockAsm4MB
1796
1797 repeat_two_match_repeat_encodeBlockAsm4MB:
1798 SHLL $0x02, SI
1799 ORL $0x01, SI
1800 MOVW SI, (CX)
1801 ADDQ $0x02, CX
1802 JMP repeat_end_emit_encodeBlockAsm4MB
1803
1804 repeat_two_offset_match_repeat_encodeBlockAsm4MB:
1805 XORQ R8, R8
1806 LEAL 1(R8)(SI*4), SI
1807 MOVB DI, 1(CX)
1808 SARL $0x08, DI
1809 SHLL $0x05, DI
1810 ORL DI, SI
1811 MOVB SI, (CX)
1812 ADDQ $0x02, CX
1813 JMP repeat_end_emit_encodeBlockAsm4MB
1814
1815 repeat_as_copy_encodeBlockAsm4MB:
1816 // emitCopy
1817 CMPL DI, $0x00010000
1818 JB two_byte_offset_repeat_as_copy_encodeBlockAsm4MB
1819 CMPL SI, $0x40
1820 JBE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
1821 MOVB $0xff, (CX)
1822 MOVL DI, 1(CX)
1823 LEAL -64(SI), SI
1824 ADDQ $0x05, CX
1825 CMPL SI, $0x04
1826 JB four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB
1827
1828 // emitRepeat
1829 MOVL SI, R8
1830 LEAL -4(SI), SI
1831 CMPL R8, $0x08
1832 JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1833 CMPL R8, $0x0c
1834 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1835 CMPL DI, $0x00000800
1836 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1837
1838 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1839 CMPL SI, $0x00000104
1840 JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1841 CMPL SI, $0x00010100
1842 JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy
1843 LEAL -65536(SI), SI
1844 MOVL SI, DI
1845 MOVW $0x001d, (CX)
1846 MOVW SI, 2(CX)
1847 SARL $0x10, DI
1848 MOVB DI, 4(CX)
1849 ADDQ $0x05, CX
1850 JMP repeat_end_emit_encodeBlockAsm4MB
1851
1852 repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1853 LEAL -256(SI), SI
1854 MOVW $0x0019, (CX)
1855 MOVW SI, 2(CX)
1856 ADDQ $0x04, CX
1857 JMP repeat_end_emit_encodeBlockAsm4MB
1858
1859 repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1860 LEAL -4(SI), SI
1861 MOVW $0x0015, (CX)
1862 MOVB SI, 2(CX)
1863 ADDQ $0x03, CX
1864 JMP repeat_end_emit_encodeBlockAsm4MB
1865
1866 repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1867 SHLL $0x02, SI
1868 ORL $0x01, SI
1869 MOVW SI, (CX)
1870 ADDQ $0x02, CX
1871 JMP repeat_end_emit_encodeBlockAsm4MB
1872
1873 repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy:
1874 XORQ R8, R8
1875 LEAL 1(R8)(SI*4), SI
1876 MOVB DI, 1(CX)
1877 SARL $0x08, DI
1878 SHLL $0x05, DI
1879 ORL DI, SI
1880 MOVB SI, (CX)
1881 ADDQ $0x02, CX
1882 JMP repeat_end_emit_encodeBlockAsm4MB
1883
1884 four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB:
1885 TESTL SI, SI
1886 JZ repeat_end_emit_encodeBlockAsm4MB
1887 XORL R8, R8
1888 LEAL -1(R8)(SI*4), SI
1889 MOVB SI, (CX)
1890 MOVL DI, 1(CX)
1891 ADDQ $0x05, CX
1892 JMP repeat_end_emit_encodeBlockAsm4MB
1893
// emitCopy branch taken when the offset is below 65536.
// DI = offset (loaded from 16(SP) before emitCopy), SI = match length,
// CX = dst write pointer.
1894 two_byte_offset_repeat_as_copy_encodeBlockAsm4MB:
1895 CMPL SI, $0x40
1896 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB
1897 CMPL DI, $0x00000800
1898 JAE long_offset_short_repeat_as_copy_encodeBlockAsm4MB
// Length > 64 and offset < 2048: write a 2-byte copy, then continue with
// the repeat encoding for the rest of the length.
1899 MOVL $0x00000001, R8
1900 LEAL 16(R8), R8 // R8 = 0x11
1901 MOVB DI, 1(CX) // second byte = low 8 bits of the offset
1902 SHRL $0x08, DI
1903 SHLL $0x05, DI
1904 ORL DI, R8 // tag = 0x11 | (offset>>8)<<5
1905 MOVB R8, (CX)
1906 ADDQ $0x02, CX
1907 SUBL $0x08, SI // presumably the copy just written covers 8 bytes — TODO confirm against the format
1908
1909 // emitRepeat
1910 LEAL -4(SI), SI
1911 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
// NOTE(review): the eight instructions below follow an unconditional JMP and
// carry no label, so they cannot execute. This file is generated (header
// says DO NOT EDIT); any cleanup belongs in the generator, not here.
1912 MOVL SI, R8
1913 LEAL -4(SI), SI
1914 CMPL R8, $0x08
1915 JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
1916 CMPL R8, $0x0c
1917 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
1918 CMPL DI, $0x00000800
1919 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
1920
1921 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
1922 CMPL SI, $0x00000104
1923 JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
1924 CMPL SI, $0x00010100
1925 JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b
1926 LEAL -65536(SI), SI
1927 MOVL SI, DI
1928 MOVW $0x001d, (CX)
1929 MOVW SI, 2(CX)
1930 SARL $0x10, DI
1931 MOVB DI, 4(CX)
1932 ADDQ $0x05, CX
1933 JMP repeat_end_emit_encodeBlockAsm4MB
1934
1935 repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
1936 LEAL -256(SI), SI
1937 MOVW $0x0019, (CX)
1938 MOVW SI, 2(CX)
1939 ADDQ $0x04, CX
1940 JMP repeat_end_emit_encodeBlockAsm4MB
1941
1942 repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
1943 LEAL -4(SI), SI
1944 MOVW $0x0015, (CX)
1945 MOVB SI, 2(CX)
1946 ADDQ $0x03, CX
1947 JMP repeat_end_emit_encodeBlockAsm4MB
1948
1949 repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
1950 SHLL $0x02, SI
1951 ORL $0x01, SI
1952 MOVW SI, (CX)
1953 ADDQ $0x02, CX
1954 JMP repeat_end_emit_encodeBlockAsm4MB
1955
1956 repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b:
1957 XORQ R8, R8
1958 LEAL 1(R8)(SI*4), SI
1959 MOVB DI, 1(CX)
1960 SARL $0x08, DI
1961 SHLL $0x05, DI
1962 ORL DI, SI
1963 MOVB SI, (CX)
1964 ADDQ $0x02, CX
1965 JMP repeat_end_emit_encodeBlockAsm4MB
1966
1967 long_offset_short_repeat_as_copy_encodeBlockAsm4MB:
1968 MOVB $0xee, (CX)
1969 MOVW DI, 1(CX)
1970 LEAL -60(SI), SI
1971 ADDQ $0x03, CX
1972
1973 // emitRepeat
1974 MOVL SI, R8
1975 LEAL -4(SI), SI
1976 CMPL R8, $0x08
1977 JBE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1978 CMPL R8, $0x0c
1979 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1980 CMPL DI, $0x00000800
1981 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1982
1983 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
1984 CMPL SI, $0x00000104
1985 JB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1986 CMPL SI, $0x00010100
1987 JB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short
1988 LEAL -65536(SI), SI
1989 MOVL SI, DI
1990 MOVW $0x001d, (CX)
1991 MOVW SI, 2(CX)
1992 SARL $0x10, DI
1993 MOVB DI, 4(CX)
1994 ADDQ $0x05, CX
1995 JMP repeat_end_emit_encodeBlockAsm4MB
1996
1997 repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
1998 LEAL -256(SI), SI
1999 MOVW $0x0019, (CX)
2000 MOVW SI, 2(CX)
2001 ADDQ $0x04, CX
2002 JMP repeat_end_emit_encodeBlockAsm4MB
2003
2004 repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
2005 LEAL -4(SI), SI
2006 MOVW $0x0015, (CX)
2007 MOVB SI, 2(CX)
2008 ADDQ $0x03, CX
2009 JMP repeat_end_emit_encodeBlockAsm4MB
2010
2011 repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
2012 SHLL $0x02, SI
2013 ORL $0x01, SI
2014 MOVW SI, (CX)
2015 ADDQ $0x02, CX
2016 JMP repeat_end_emit_encodeBlockAsm4MB
2017
2018 repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short:
2019 XORQ R8, R8
2020 LEAL 1(R8)(SI*4), SI
2021 MOVB DI, 1(CX)
2022 SARL $0x08, DI
2023 SHLL $0x05, DI
2024 ORL DI, SI
2025 MOVB SI, (CX)
2026 ADDQ $0x02, CX
2027 JMP repeat_end_emit_encodeBlockAsm4MB
2028
2029 two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB:
2030 MOVL SI, R8
2031 SHLL $0x02, R8
2032 CMPL SI, $0x0c
2033 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
2034 CMPL DI, $0x00000800
2035 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB
2036 LEAL -15(R8), R8
2037 MOVB DI, 1(CX)
2038 SHRL $0x08, DI
2039 SHLL $0x05, DI
2040 ORL DI, R8
2041 MOVB R8, (CX)
2042 ADDQ $0x02, CX
2043 JMP repeat_end_emit_encodeBlockAsm4MB
2044
2045 emit_copy_three_repeat_as_copy_encodeBlockAsm4MB:
2046 LEAL -2(R8), R8
2047 MOVB R8, (CX)
2048 MOVW DI, 1(CX)
2049 ADDQ $0x03, CX
2050
2051 repeat_end_emit_encodeBlockAsm4MB:
2052 MOVL DX, 12(SP)
2053 JMP search_loop_encodeBlockAsm4MB
2054
2055 no_repeat_found_encodeBlockAsm4MB:
2056 CMPL (BX)(SI*1), DI
2057 JEQ candidate_match_encodeBlockAsm4MB
2058 SHRQ $0x08, DI
2059 MOVL (AX)(R10*4), SI
2060 LEAL 2(DX), R9
2061 CMPL (BX)(R8*1), DI
2062 JEQ candidate2_match_encodeBlockAsm4MB
2063 MOVL R9, (AX)(R10*4)
2064 SHRQ $0x08, DI
2065 CMPL (BX)(SI*1), DI
2066 JEQ candidate3_match_encodeBlockAsm4MB
2067 MOVL 20(SP), DX
2068 JMP search_loop_encodeBlockAsm4MB
2069
2070 candidate3_match_encodeBlockAsm4MB:
2071 ADDL $0x02, DX
2072 JMP candidate_match_encodeBlockAsm4MB
2073
2074 candidate2_match_encodeBlockAsm4MB:
2075 MOVL R9, (AX)(R10*4)
2076 INCL DX
2077 MOVL R8, SI
2078
2079 candidate_match_encodeBlockAsm4MB:
2080 MOVL 12(SP), DI
2081 TESTL SI, SI
2082 JZ match_extend_back_end_encodeBlockAsm4MB
2083
2084 match_extend_back_loop_encodeBlockAsm4MB:
2085 CMPL DX, DI
2086 JBE match_extend_back_end_encodeBlockAsm4MB
2087 MOVB -1(BX)(SI*1), R8
2088 MOVB -1(BX)(DX*1), R9
2089 CMPB R8, R9
2090 JNE match_extend_back_end_encodeBlockAsm4MB
2091 LEAL -1(DX), DX
2092 DECL SI
2093 JZ match_extend_back_end_encodeBlockAsm4MB
2094 JMP match_extend_back_loop_encodeBlockAsm4MB
2095
2096 match_extend_back_end_encodeBlockAsm4MB:
2097 MOVL DX, DI
2098 SUBL 12(SP), DI
2099 LEAQ 4(CX)(DI*1), DI
2100 CMPQ DI, (SP)
2101 JB match_dst_size_check_encodeBlockAsm4MB
2102 MOVQ $0x00000000, ret+56(FP)
2103 RET
2104
2105 match_dst_size_check_encodeBlockAsm4MB:
2106 MOVL DX, DI
2107 MOVL 12(SP), R8
2108 CMPL R8, DI
2109 JEQ emit_literal_done_match_emit_encodeBlockAsm4MB
2110 MOVL DI, R9
2111 MOVL DI, 12(SP)
2112 LEAQ (BX)(R8*1), DI
2113 SUBL R8, R9
2114 LEAL -1(R9), R8
2115 CMPL R8, $0x3c
2116 JB one_byte_match_emit_encodeBlockAsm4MB
2117 CMPL R8, $0x00000100
2118 JB two_bytes_match_emit_encodeBlockAsm4MB
2119 CMPL R8, $0x00010000
2120 JB three_bytes_match_emit_encodeBlockAsm4MB
2121 MOVL R8, R10
2122 SHRL $0x10, R10
2123 MOVB $0xf8, (CX)
2124 MOVW R8, 1(CX)
2125 MOVB R10, 3(CX)
2126 ADDQ $0x04, CX
2127 JMP memmove_long_match_emit_encodeBlockAsm4MB
2128
2129 three_bytes_match_emit_encodeBlockAsm4MB:
2130 MOVB $0xf4, (CX)
2131 MOVW R8, 1(CX)
2132 ADDQ $0x03, CX
2133 JMP memmove_long_match_emit_encodeBlockAsm4MB
2134
2135 two_bytes_match_emit_encodeBlockAsm4MB:
2136 MOVB $0xf0, (CX)
2137 MOVB R8, 1(CX)
2138 ADDQ $0x02, CX
2139 CMPL R8, $0x40
2140 JB memmove_match_emit_encodeBlockAsm4MB
2141 JMP memmove_long_match_emit_encodeBlockAsm4MB
2142
2143 one_byte_match_emit_encodeBlockAsm4MB:
2144 SHLB $0x02, R8
2145 MOVB R8, (CX)
2146 ADDQ $0x01, CX
2147
2148 memmove_match_emit_encodeBlockAsm4MB:
2149 LEAQ (CX)(R9*1), R8
2150
2151 // genMemMoveShort
2152 CMPQ R9, $0x08
2153 JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8
2154 CMPQ R9, $0x10
2155 JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16
2156 CMPQ R9, $0x20
2157 JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32
2158 JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64
2159
2160 emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8:
2161 MOVQ (DI), R10
2162 MOVQ R10, (CX)
2163 JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
2164
2165 emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16:
2166 MOVQ (DI), R10
2167 MOVQ -8(DI)(R9*1), DI
2168 MOVQ R10, (CX)
2169 MOVQ DI, -8(CX)(R9*1)
2170 JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
2171
2172 emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32:
2173 MOVOU (DI), X0
2174 MOVOU -16(DI)(R9*1), X1
2175 MOVOU X0, (CX)
2176 MOVOU X1, -16(CX)(R9*1)
2177 JMP memmove_end_copy_match_emit_encodeBlockAsm4MB
2178
2179 emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64:
2180 MOVOU (DI), X0
2181 MOVOU 16(DI), X1
2182 MOVOU -32(DI)(R9*1), X2
2183 MOVOU -16(DI)(R9*1), X3
2184 MOVOU X0, (CX)
2185 MOVOU X1, 16(CX)
2186 MOVOU X2, -32(CX)(R9*1)
2187 MOVOU X3, -16(CX)(R9*1)
2188
2189 memmove_end_copy_match_emit_encodeBlockAsm4MB:
2190 MOVQ R8, CX
2191 JMP emit_literal_done_match_emit_encodeBlockAsm4MB
2192
2193 memmove_long_match_emit_encodeBlockAsm4MB:
2194 LEAQ (CX)(R9*1), R8
2195
2196 // genMemMoveLong
2197 MOVOU (DI), X0
2198 MOVOU 16(DI), X1
2199 MOVOU -32(DI)(R9*1), X2
2200 MOVOU -16(DI)(R9*1), X3
2201 MOVQ R9, R11
2202 SHRQ $0x05, R11
2203 MOVQ CX, R10
2204 ANDL $0x0000001f, R10
2205 MOVQ $0x00000040, R12
2206 SUBQ R10, R12
2207 DECQ R11
2208 JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
2209 LEAQ -32(DI)(R12*1), R10
2210 LEAQ -32(CX)(R12*1), R13
2211
2212 emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back:
2213 MOVOU (R10), X4
2214 MOVOU 16(R10), X5
2215 MOVOA X4, (R13)
2216 MOVOA X5, 16(R13)
2217 ADDQ $0x20, R13
2218 ADDQ $0x20, R10
2219 ADDQ $0x20, R12
2220 DECQ R11
2221 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back
2222
2223 emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32:
2224 MOVOU -32(DI)(R12*1), X4
2225 MOVOU -16(DI)(R12*1), X5
2226 MOVOA X4, -32(CX)(R12*1)
2227 MOVOA X5, -16(CX)(R12*1)
2228 ADDQ $0x20, R12
2229 CMPQ R9, R12
2230 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32
2231 MOVOU X0, (CX)
2232 MOVOU X1, 16(CX)
2233 MOVOU X2, -32(CX)(R9*1)
2234 MOVOU X3, -16(CX)(R9*1)
2235 MOVQ R8, CX
2236
// match_nolit_loop (encodeBlockAsm4MB): measure how long two positions match.
// On entry DX and SI are two offsets; both are added to BX below (BX is
// loaded outside this section - presumably the src base pointer; confirm).
2237 emit_literal_done_match_emit_encodeBlockAsm4MB:
2238 match_nolit_loop_encodeBlockAsm4MB:
// 16(SP) = DX - SI, the distance between the two positions.
2239 MOVL DX, DI
2240 SUBL SI, DI
2241 MOVL DI, 16(SP)
// Both offsets skip 4 bytes before the comparison below starts.
2242 ADDL $0x04, DX
2243 ADDL $0x04, SI
// DI = src_len - DX bounds how many bytes matchLen may examine.
2244 MOVQ src_len+32(FP), DI
2245 SUBL DX, DI
// R8 = BX+DX and SI = BX+SI address the two byte runs being compared.
2246 LEAQ (BX)(DX*1), R8
2247 LEAQ (BX)(SI*1), SI
2248
2249 // matchLen
// R10 counts the equal bytes found so far; DI is the remaining budget.
2250 XORL R10, R10
2251
// While at least 16 bytes remain, XOR two 8-byte words from each side.
// A non-zero XOR result marks the word that contains the first difference.
2252 matchlen_loopback_16_match_nolit_encodeBlockAsm4MB:
2253 CMPL DI, $0x10
2254 JB matchlen_match8_match_nolit_encodeBlockAsm4MB
2255 MOVQ (R8)(R10*1), R9
2256 MOVQ 8(R8)(R10*1), R11
2257 XORQ (SI)(R10*1), R9
2258 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB
2259 XORQ 8(SI)(R10*1), R11
2260 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm4MB
2261 LEAL -16(DI), DI
2262 LEAL 16(R10), R10
2263 JMP matchlen_loopback_16_match_nolit_encodeBlockAsm4MB
2264
// Difference is in the second word: index of the lowest set bit, shifted
// right by 3, is the byte index inside that word; the first word (8 bytes)
// matched completely, hence the extra 8.
2265 matchlen_bsf_16match_nolit_encodeBlockAsm4MB:
2266 #ifdef GOAMD64_v3
2267 TZCNTQ R11, R11
2268
2269 #else
2270 BSFQ R11, R11
2271
2272 #endif
2273 SARQ $0x03, R11
2274 LEAL 8(R10)(R11*1), R10
2275 JMP match_nolit_end_encodeBlockAsm4MB
2276
// Fewer than 16 bytes remain: try one 8-byte word.
2277 matchlen_match8_match_nolit_encodeBlockAsm4MB:
2278 CMPL DI, $0x08
2279 JB matchlen_match4_match_nolit_encodeBlockAsm4MB
2280 MOVQ (R8)(R10*1), R9
2281 XORQ (SI)(R10*1), R9
2282 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB
2283 LEAL -8(DI), DI
2284 LEAL 8(R10), R10
2285 JMP matchlen_match4_match_nolit_encodeBlockAsm4MB
2286
// Difference is in the word held in R9: add its first differing byte index.
2287 matchlen_bsf_8_match_nolit_encodeBlockAsm4MB:
2288 #ifdef GOAMD64_v3
2289 TZCNTQ R9, R9
2290
2291 #else
2292 BSFQ R9, R9
2293
2294 #endif
2295 SARQ $0x03, R9
2296 LEAL (R10)(R9*1), R10
2297 JMP match_nolit_end_encodeBlockAsm4MB
2298
// Fewer than 8 bytes remain: compare 4 bytes. A mismatch is not resolved
// here; it falls to the narrower 2-byte and 1-byte compares below.
2299 matchlen_match4_match_nolit_encodeBlockAsm4MB:
2300 CMPL DI, $0x04
2301 JB matchlen_match2_match_nolit_encodeBlockAsm4MB
2302 MOVL (R8)(R10*1), R9
2303 CMPL (SI)(R10*1), R9
2304 JNE matchlen_match2_match_nolit_encodeBlockAsm4MB
2305 LEAL -4(DI), DI
2306 LEAL 4(R10), R10
2307
// The single CMPL feeds both branches: DI == 1 goes to the 1-byte compare,
// DI == 0 ends the scan, otherwise 2 bytes are compared.
2308 matchlen_match2_match_nolit_encodeBlockAsm4MB:
2309 CMPL DI, $0x01
2310 JE matchlen_match1_match_nolit_encodeBlockAsm4MB
2311 JB match_nolit_end_encodeBlockAsm4MB
2312 MOVW (R8)(R10*1), R9
2313 CMPW (SI)(R10*1), R9
2314 JNE matchlen_match1_match_nolit_encodeBlockAsm4MB
2315 LEAL 2(R10), R10
2316 SUBL $0x02, DI
2317 JZ match_nolit_end_encodeBlockAsm4MB
2318
// Final single-byte compare; falls through to match_nolit_end afterwards.
2319 matchlen_match1_match_nolit_encodeBlockAsm4MB:
2320 MOVB (R8)(R10*1), R9
2321 CMPB (SI)(R10*1), R9
2322 JNE match_nolit_end_encodeBlockAsm4MB
2323 LEAL 1(R10), R10
2324
// match_nolit_end (encodeBlockAsm4MB): R10 holds the byte count from matchLen.
2325 match_nolit_end_encodeBlockAsm4MB:
// DX advances past the matched bytes; SI = distance saved in 16(SP);
// R10 += 4 to include the 4 bytes skipped before matchLen; 12(SP) = DX.
2326 ADDL R10, DX
2327 MOVL 16(SP), SI
2328 ADDL $0x04, R10
2329 MOVL DX, 12(SP)
2330
2331 // emitCopy
// SI = offset, R10 = length, CX = output cursor. Offsets below 0x10000 take
// the two_byte_offset path; larger ones are written with a 32-bit offset.
2332 CMPL SI, $0x00010000
2333 JB two_byte_offset_match_nolit_encodeBlockAsm4MB
2334 CMPL R10, $0x40
2335 JBE four_bytes_remain_match_nolit_encodeBlockAsm4MB
// Length > 64: write tag byte 0xff plus the 32-bit offset (5 bytes) and
// take 64 off the length. A remainder below 4 is written as another
// 5-byte op; otherwise the rest is emitted by emitRepeat.
2336 MOVB $0xff, (CX)
2337 MOVL SI, 1(CX)
2338 LEAL -64(R10), R10
2339 ADDQ $0x05, CX
2340 CMPL R10, $0x04
2341 JB four_bytes_remain_match_nolit_encodeBlockAsm4MB
2342
2343 // emitRepeat
// DI keeps the full length, R10 becomes length-4. Length <= 8 uses the
// 2-byte form; length < 12 with offset < 2048 uses the 2-byte form that
// also carries the offset; everything else picks a form by size below.
2344 MOVL R10, DI
2345 LEAL -4(R10), R10
2346 CMPL DI, $0x08
2347 JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy
2348 CMPL DI, $0x0c
2349 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
2350 CMPL SI, $0x00000800
2351 JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy
2352
// R10 = length-4. Below 0x104 -> 3-byte form, below 0x10100 -> 4-byte form,
// otherwise the 5-byte form: tag word 0x001d followed by the 24-bit value
// R10-65536 (low 16 bits at offset 2, bits 16-23 at offset 4).
2353 cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
2354 CMPL R10, $0x00000104
2355 JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy
2356 CMPL R10, $0x00010100
2357 JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy
2358 LEAL -65536(R10), R10
2359 MOVL R10, SI
2360 MOVW $0x001d, (CX)
2361 MOVW R10, 2(CX)
2362 SARL $0x10, SI
2363 MOVB SI, 4(CX)
2364 ADDQ $0x05, CX
2365 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2366
// 4-byte form: tag word 0x0019 followed by the 16-bit value R10-256.
2367 repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy:
2368 LEAL -256(R10), R10
2369 MOVW $0x0019, (CX)
2370 MOVW R10, 2(CX)
2371 ADDQ $0x04, CX
2372 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2373
// 3-byte form: tag word 0x0015 followed by the 8-bit value R10-4.
2374 repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy:
2375 LEAL -4(R10), R10
2376 MOVW $0x0015, (CX)
2377 MOVB R10, 2(CX)
2378 ADDQ $0x03, CX
2379 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2380
// 2-byte form: the 16-bit value (R10<<2)|1 is stored as one word.
2381 repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy:
2382 SHLL $0x02, R10
2383 ORL $0x01, R10
2384 MOVW R10, (CX)
2385 ADDQ $0x02, CX
2386 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2387
// 2-byte form with offset: byte 0 = 1 + R10*4 + ((offset>>8)<<5),
// byte 1 = low 8 bits of the offset.
2388 repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy:
2389 XORQ DI, DI
2390 LEAL 1(DI)(R10*4), R10
2391 MOVB SI, 1(CX)
2392 SARL $0x08, SI
2393 SHLL $0x05, SI
2394 ORL SI, R10
2395 MOVB R10, (CX)
2396 ADDQ $0x02, CX
2397 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2398
// 5-byte op for the remaining length (nothing is written when it is 0):
// tag byte = R10*4 - 1, followed by the 32-bit offset.
2399 four_bytes_remain_match_nolit_encodeBlockAsm4MB:
2400 TESTL R10, R10
2401 JZ match_nolit_emitcopy_end_encodeBlockAsm4MB
2402 XORL DI, DI
2403 LEAL -1(DI)(R10*4), R10
2404 MOVB R10, (CX)
2405 MOVL SI, 1(CX)
2406 ADDQ $0x05, CX
2407 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2408
// Offset < 0x10000. Length <= 64 is handled by the short path. For longer
// matches an offset >= 2048 goes to long_offset_short; otherwise a 2-byte op
// with tag base 17 (1+16) is written here and 8 is taken off the length.
2409 two_byte_offset_match_nolit_encodeBlockAsm4MB:
2410 CMPL R10, $0x40
2411 JBE two_byte_offset_short_match_nolit_encodeBlockAsm4MB
2412 CMPL SI, $0x00000800
2413 JAE long_offset_short_match_nolit_encodeBlockAsm4MB
2414 MOVL $0x00000001, DI
2415 LEAL 16(DI), DI
2416 MOVB SI, 1(CX)
2417 SHRL $0x08, SI
2418 SHLL $0x05, SI
2419 ORL SI, DI
2420 MOVB DI, (CX)
2421 ADDQ $0x02, CX
2422 SUBL $0x08, R10
2423
2424 // emitRepeat
// The remainder always goes straight to the size-based selection
// (R10 = length-4), skipping the 2-byte forms.
2425 LEAL -4(R10), R10
2426 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
// NOTE: the eight instructions below follow an unconditional JMP and carry
// no label, so they are unreachable. The repeat_two_*_short_2b and
// repeat_two_offset_*_short_2b labels further down appear to be referenced
// only from here.
2427 MOVL R10, DI
2428 LEAL -4(R10), R10
2429 CMPL DI, $0x08
2430 JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
2431 CMPL DI, $0x0c
2432 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
2433 CMPL SI, $0x00000800
2434 JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
2435
// Same size-based selection as above (3-, 4- or 5-byte form).
2436 cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
2437 CMPL R10, $0x00000104
2438 JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
2439 CMPL R10, $0x00010100
2440 JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b
2441 LEAL -65536(R10), R10
2442 MOVL R10, SI
2443 MOVW $0x001d, (CX)
2444 MOVW R10, 2(CX)
2445 SARL $0x10, SI
2446 MOVB SI, 4(CX)
2447 ADDQ $0x05, CX
2448 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2449
2450 repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
2451 LEAL -256(R10), R10
2452 MOVW $0x0019, (CX)
2453 MOVW R10, 2(CX)
2454 ADDQ $0x04, CX
2455 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2456
2457 repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
2458 LEAL -4(R10), R10
2459 MOVW $0x0015, (CX)
2460 MOVB R10, 2(CX)
2461 ADDQ $0x03, CX
2462 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2463
2464 repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
2465 SHLL $0x02, R10
2466 ORL $0x01, R10
2467 MOVW R10, (CX)
2468 ADDQ $0x02, CX
2469 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2470
2471 repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b:
2472 XORQ DI, DI
2473 LEAL 1(DI)(R10*4), R10
2474 MOVB SI, 1(CX)
2475 SARL $0x08, SI
2476 SHLL $0x05, SI
2477 ORL SI, R10
2478 MOVB R10, (CX)
2479 ADDQ $0x02, CX
2480 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2481
// Length > 64 with 2048 <= offset < 0x10000: write tag byte 0xee plus the
// 16-bit offset (3 bytes), take 60 off the length, then emitRepeat the rest.
2482 long_offset_short_match_nolit_encodeBlockAsm4MB:
2483 MOVB $0xee, (CX)
2484 MOVW SI, 1(CX)
2485 LEAL -60(R10), R10
2486 ADDQ $0x03, CX
2487
2488 // emitRepeat
2489 MOVL R10, DI
2490 LEAL -4(R10), R10
2491 CMPL DI, $0x08
2492 JBE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short
2493 CMPL DI, $0x0c
2494 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
2495 CMPL SI, $0x00000800
2496 JB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short
2497
2498 cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2499 CMPL R10, $0x00000104
2500 JB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short
2501 CMPL R10, $0x00010100
2502 JB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short
2503 LEAL -65536(R10), R10
2504 MOVL R10, SI
2505 MOVW $0x001d, (CX)
2506 MOVW R10, 2(CX)
2507 SARL $0x10, SI
2508 MOVB SI, 4(CX)
2509 ADDQ $0x05, CX
2510 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2511
2512 repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2513 LEAL -256(R10), R10
2514 MOVW $0x0019, (CX)
2515 MOVW R10, 2(CX)
2516 ADDQ $0x04, CX
2517 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2518
2519 repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2520 LEAL -4(R10), R10
2521 MOVW $0x0015, (CX)
2522 MOVB R10, 2(CX)
2523 ADDQ $0x03, CX
2524 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2525
2526 repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2527 SHLL $0x02, R10
2528 ORL $0x01, R10
2529 MOVW R10, (CX)
2530 ADDQ $0x02, CX
2531 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2532
2533 repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short:
2534 XORQ DI, DI
2535 LEAL 1(DI)(R10*4), R10
2536 MOVB SI, 1(CX)
2537 SARL $0x08, SI
2538 SHLL $0x05, SI
2539 ORL SI, R10
2540 MOVB R10, (CX)
2541 ADDQ $0x02, CX
2542 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2543
// Length <= 64, offset < 0x10000. DI = length*4. With length < 12 and
// offset < 2048 a 2-byte op is written: byte 0 = length*4 - 15 combined with
// ((offset>>8)<<5), byte 1 = low offset byte. Otherwise the 3-byte op below.
2544 two_byte_offset_short_match_nolit_encodeBlockAsm4MB:
2545 MOVL R10, DI
2546 SHLL $0x02, DI
2547 CMPL R10, $0x0c
2548 JAE emit_copy_three_match_nolit_encodeBlockAsm4MB
2549 CMPL SI, $0x00000800
2550 JAE emit_copy_three_match_nolit_encodeBlockAsm4MB
2551 LEAL -15(DI), DI
2552 MOVB SI, 1(CX)
2553 SHRL $0x08, SI
2554 SHLL $0x05, SI
2555 ORL SI, DI
2556 MOVB DI, (CX)
2557 ADDQ $0x02, CX
2558 JMP match_nolit_emitcopy_end_encodeBlockAsm4MB
2559
// 3-byte op: tag byte = length*4 - 2, followed by the 16-bit offset.
// Execution then falls through to the label that follows this section.
2560 emit_copy_three_match_nolit_encodeBlockAsm4MB:
2561 LEAL -2(DI), DI
2562 MOVB DI, (CX)
2563 MOVW SI, 1(CX)
2564 ADDQ $0x03, CX
2565
// After a copy has been written: stop searching once DX reaches the limit
// in 8(SP), and give up (return 0) when the output cursor CX has reached
// the limit stored in (SP).
2566 match_nolit_emitcopy_end_encodeBlockAsm4MB:
2567 CMPL DX, 8(SP)
2568 JAE emit_remainder_encodeBlockAsm4MB
// DI = 8 bytes loaded from BX+DX-2 (used for both table updates below).
2569 MOVQ -2(BX)(DX*1), DI
2570 CMPQ CX, (SP)
2571 JB match_nolit_dst_ok_encodeBlockAsm4MB
2572 MOVQ $0x00000000, ret+56(FP)
2573 RET
2574
// Two table indices are computed with the same recipe: shift left by 16
// (discarding the top two bytes), multiply by the constant in R9, keep the
// top 14 bits (SHRQ $0x32). R8 is the index for the bytes at DX-2; SI is the
// index for the bytes at DX (DI is shifted right by 16 first).
// AX is the base of the table of 4-byte entries (loaded outside this view).
2575 match_nolit_dst_ok_encodeBlockAsm4MB:
2576 MOVQ $0x0000cf1bbcdcbf9b, R9
2577 MOVQ DI, R8
2578 SHRQ $0x10, DI
2579 MOVQ DI, SI
2580 SHLQ $0x10, R8
2581 IMULQ R9, R8
2582 SHRQ $0x32, R8
2583 SHLQ $0x10, SI
2584 IMULQ R9, SI
2585 SHRQ $0x32, SI
// Read the previous entry for DX into SI, then store DX-2 and DX.
2586 LEAL -2(DX), R9
2587 LEAQ (AX)(SI*4), R10
2588 MOVL (R10), SI
2589 MOVL R9, (AX)(R8*4)
2590 MOVL DX, (R10)
// If the 4 bytes at the previous entry equal the 4 bytes at DX, another
// match starts immediately; otherwise advance one byte and resume searching.
2591 CMPL (BX)(SI*1), DI
2592 JEQ match_nolit_loop_encodeBlockAsm4MB
2593 INCL DX
2594 JMP search_loop_encodeBlockAsm4MB
2595
// emit_remainder (encodeBlockAsm4MB): write the bytes from position 12(SP)
// to src_len as one final literal, then return the number of bytes written.
// Returns 0 when CX + (src_len - 12(SP)) + 4 is not below the limit in (SP).
2596 emit_remainder_encodeBlockAsm4MB:
2597 MOVQ src_len+32(FP), AX
2598 SUBL 12(SP), AX
2599 LEAQ 4(CX)(AX*1), AX
2600 CMPQ AX, (SP)
2601 JB emit_remainder_ok_encodeBlockAsm4MB
2602 MOVQ $0x00000000, ret+56(FP)
2603 RET
2604
// Nothing to emit when 12(SP) already equals src_len. Otherwise:
// AX = BX + 12(SP) (source of the literal), SI = length, DX = length-1.
// The header size is chosen from DX: < 60 one byte, < 0x100 two bytes,
// < 0x10000 three bytes, else four bytes (0xf8 + 24-bit length-1).
2605 emit_remainder_ok_encodeBlockAsm4MB:
2606 MOVQ src_len+32(FP), AX
2607 MOVL 12(SP), DX
2608 CMPL DX, AX
2609 JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB
2610 MOVL AX, SI
2611 MOVL AX, 12(SP)
2612 LEAQ (BX)(DX*1), AX
2613 SUBL DX, SI
2614 LEAL -1(SI), DX
2615 CMPL DX, $0x3c
2616 JB one_byte_emit_remainder_encodeBlockAsm4MB
2617 CMPL DX, $0x00000100
2618 JB two_bytes_emit_remainder_encodeBlockAsm4MB
2619 CMPL DX, $0x00010000
2620 JB three_bytes_emit_remainder_encodeBlockAsm4MB
2621 MOVL DX, BX
2622 SHRL $0x10, BX
2623 MOVB $0xf8, (CX)
2624 MOVW DX, 1(CX)
2625 MOVB BL, 3(CX)
2626 ADDQ $0x04, CX
2627 JMP memmove_long_emit_remainder_encodeBlockAsm4MB
2628
// Three-byte header: 0xf4 followed by the 16-bit length-1.
2629 three_bytes_emit_remainder_encodeBlockAsm4MB:
2630 MOVB $0xf4, (CX)
2631 MOVW DX, 1(CX)
2632 ADDQ $0x03, CX
2633 JMP memmove_long_emit_remainder_encodeBlockAsm4MB
2634
// Two-byte header: 0xf0 followed by the 8-bit length-1. Lengths up to 64
// use the short copy, longer ones the long copy.
2635 two_bytes_emit_remainder_encodeBlockAsm4MB:
2636 MOVB $0xf0, (CX)
2637 MOVB DL, 1(CX)
2638 ADDQ $0x02, CX
2639 CMPL DX, $0x40
2640 JB memmove_emit_remainder_encodeBlockAsm4MB
2641 JMP memmove_long_emit_remainder_encodeBlockAsm4MB
2642
// One-byte header: (length-1) << 2.
2643 one_byte_emit_remainder_encodeBlockAsm4MB:
2644 SHLB $0x02, DL
2645 MOVB DL, (CX)
2646 ADDQ $0x01, CX
2647
// Short copy (1..64 bytes). DX = CX + length is the cursor after the copy;
// BX is reused as the length from here on.
2648 memmove_emit_remainder_encodeBlockAsm4MB:
2649 LEAQ (CX)(SI*1), DX
2650 MOVL SI, BX
2651
2652 // genMemMoveShort
// Dispatch on length. Every case loads all of its data before storing and
// covers the range with a head piece and a (possibly overlapping) tail piece.
2653 CMPQ BX, $0x03
2654 JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2
2655 JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3
2656 CMPQ BX, $0x08
2657 JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7
2658 CMPQ BX, $0x10
2659 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16
2660 CMPQ BX, $0x20
2661 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32
2662 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64
2663
2664 emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2:
2665 MOVB (AX), SI
2666 MOVB -1(AX)(BX*1), AL
2667 MOVB SI, (CX)
2668 MOVB AL, -1(CX)(BX*1)
2669 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2670
2671 emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3:
2672 MOVW (AX), SI
2673 MOVB 2(AX), AL
2674 MOVW SI, (CX)
2675 MOVB AL, 2(CX)
2676 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2677
2678 emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7:
2679 MOVL (AX), SI
2680 MOVL -4(AX)(BX*1), AX
2681 MOVL SI, (CX)
2682 MOVL AX, -4(CX)(BX*1)
2683 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2684
2685 emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16:
2686 MOVQ (AX), SI
2687 MOVQ -8(AX)(BX*1), AX
2688 MOVQ SI, (CX)
2689 MOVQ AX, -8(CX)(BX*1)
2690 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2691
2692 emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32:
2693 MOVOU (AX), X0
2694 MOVOU -16(AX)(BX*1), X1
2695 MOVOU X0, (CX)
2696 MOVOU X1, -16(CX)(BX*1)
2697 JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB
2698
2699 emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64:
2700 MOVOU (AX), X0
2701 MOVOU 16(AX), X1
2702 MOVOU -32(AX)(BX*1), X2
2703 MOVOU -16(AX)(BX*1), X3
2704 MOVOU X0, (CX)
2705 MOVOU X1, 16(CX)
2706 MOVOU X2, -32(CX)(BX*1)
2707 MOVOU X3, -16(CX)(BX*1)
2708
// Advance the output cursor past the copied literal.
2709 memmove_end_copy_emit_remainder_encodeBlockAsm4MB:
2710 MOVQ DX, CX
2711 JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB
2712
// Long copy. DX = CX + length; BX is reused as the length.
2713 memmove_long_emit_remainder_encodeBlockAsm4MB:
2714 LEAQ (CX)(SI*1), DX
2715 MOVL SI, BX
2716
2717 // genMemMoveLong
// X0-X3 hold the first and last 32 source bytes; they are stored last with
// unaligned stores so the unaligned head and the tail are always correct.
2718 MOVOU (AX), X0
2719 MOVOU 16(AX), X1
2720 MOVOU -32(AX)(BX*1), X2
2721 MOVOU -16(AX)(BX*1), X3
// DI = length / 32. R8 = 64 - (CX & 31), so CX+R8-32 is a multiple of 32
// and the MOVOA stores in the loops below hit aligned addresses.
2722 MOVQ BX, DI
2723 SHRQ $0x05, DI
2724 MOVQ CX, SI
2725 ANDL $0x0000001f, SI
2726 MOVQ $0x00000040, R8
2727 SUBQ SI, R8
// DECQ updates ZF but leaves CF untouched, so the JA/JNA below combine the
// zero result of DECQ with the carry left by the preceding SUBQ/ADDQ.
2728 DECQ DI
2729 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
2730 LEAQ -32(AX)(R8*1), SI
2731 LEAQ -32(CX)(R8*1), R9
2732
// Copies 32 bytes per pass from SI to the aligned destination R9.
2733 emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back:
2734 MOVOU (SI), X4
2735 MOVOU 16(SI), X5
2736 MOVOA X4, (R9)
2737 MOVOA X5, 16(R9)
2738 ADDQ $0x20, R9
2739 ADDQ $0x20, SI
2740 ADDQ $0x20, R8
2741 DECQ DI
2742 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back
2743
// Copies 32 bytes per pass at running offset R8 while length >= R8.
2744 emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32:
2745 MOVOU -32(AX)(R8*1), X4
2746 MOVOU -16(AX)(R8*1), X5
2747 MOVOA X4, -32(CX)(R8*1)
2748 MOVOA X5, -16(CX)(R8*1)
2749 ADDQ $0x20, R8
2750 CMPQ BX, R8
2751 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32
// Store the saved head and tail, then move the cursor past the literal.
2752 MOVOU X0, (CX)
2753 MOVOU X1, 16(CX)
2754 MOVOU X2, -32(CX)(BX*1)
2755 MOVOU X3, -16(CX)(BX*1)
2756 MOVQ DX, CX
2757
// Return value = CX - dst_base, i.e. the number of bytes written to dst.
2758 emit_literal_done_emit_remainder_encodeBlockAsm4MB:
2759 MOVQ dst_base+0(FP), AX
2760 SUBQ AX, CX
2761 MOVQ CX, ret+56(FP)
2762 RET
2763
2764 // func encodeBlockAsm12B(dst []byte, src []byte, tmp *[16384]byte) int
2765 // Requires: BMI, SSE2
//
// Frame: $24 bytes of locals, $64 bytes of arguments and result.
// Registers set up here: AX = tmp (table of 4-byte entries), CX = output
// cursor (starts at dst_base), BX = src_base, DX = current src position.
// Stack slots initialised here:
//   (SP)   = dst_base + src_len - 9 - src_len/32  (output limit; the code
//            returns 0 when the cursor would reach it)
//   8(SP)  = src_len - 8                          (search limit)
//   12(SP) = 0                                    (start of pending literals)
//   16(SP) = 1                                    (current repeat offset)
2766 TEXT ·encodeBlockAsm12B(SB), $24-64
2767 MOVQ tmp+48(FP), AX
2768 MOVQ dst_base+0(FP), CX
2769 MOVQ $0x00000080, DX
2770 MOVQ AX, BX
2771 PXOR X0, X0
2772
// Zero the table: 0x80 iterations of 128 bytes each = 16384 bytes.
2773 zero_loop_encodeBlockAsm12B:
2774 MOVOU X0, (BX)
2775 MOVOU X0, 16(BX)
2776 MOVOU X0, 32(BX)
2777 MOVOU X0, 48(BX)
2778 MOVOU X0, 64(BX)
2779 MOVOU X0, 80(BX)
2780 MOVOU X0, 96(BX)
2781 MOVOU X0, 112(BX)
2782 ADDQ $0x80, BX
2783 DECQ DX
2784 JNZ zero_loop_encodeBlockAsm12B
2785 MOVL $0x00000000, 12(SP)
2786 MOVQ src_len+32(FP), DX
2787 LEAQ -9(DX), BX
2788 LEAQ -8(DX), SI
2789 MOVL SI, 8(SP)
2790 SHRQ $0x05, DX
2791 SUBL DX, BX
2792 LEAQ (CX)(BX*1), BX
2793 MOVQ BX, (SP)
// The search starts at position 1 with a repeat offset of 1.
2794 MOVL $0x00000001, DX
2795 MOVL DX, 16(SP)
2796 MOVQ src_base+24(FP), BX
2797
// search_loop (encodeBlockAsm12B): probe the table at the current position
// DX and test for a repeat of the previous offset.
2798 search_loop_encodeBlockAsm12B:
// Next position to try if nothing matches: DX + 4 + (DX - 12(SP)) / 32.
// It is saved in 20(SP); if it is at or past the limit in 8(SP), finish up.
2799 MOVL DX, SI
2800 SUBL 12(SP), SI
2801 SHRL $0x05, SI
2802 LEAL 4(DX)(SI*1), SI
2803 CMPL SI, 8(SP)
2804 JAE emit_remainder_encodeBlockAsm12B
// DI = the 8 source bytes at DX.
2805 MOVQ (BX)(DX*1), DI
2806 MOVL SI, 20(SP)
// Table index recipe: shift left by 24 (keeps the low 5 bytes), multiply by
// the constant in R9, keep the top 12 bits (SHRQ $0x34).
// R10 = index for the bytes at DX, R11 = index for the bytes at DX+1.
2807 MOVQ $0x000000cf1bbcdcbb, R9
2808 MOVQ DI, R10
2809 MOVQ DI, R11
2810 SHRQ $0x08, R11
2811 SHLQ $0x18, R10
2812 IMULQ R9, R10
2813 SHRQ $0x34, R10
2814 SHLQ $0x18, R11
2815 IMULQ R9, R11
2816 SHRQ $0x34, R11
// SI and R8 receive the previous entries (candidate positions); the table
// is then updated with DX and DX+1.
2817 MOVL (AX)(R10*4), SI
2818 MOVL (AX)(R11*4), R8
2819 MOVL DX, (AX)(R10*4)
2820 LEAL 1(DX), R10
2821 MOVL R10, (AX)(R11*4)
// R10 = index for the bytes at DX+2; it is used by the no-repeat path.
2822 MOVQ DI, R10
2823 SHRQ $0x10, R10
2824 SHLQ $0x18, R10
2825 IMULQ R9, R10
2826 SHRQ $0x34, R10
// Repeat check: compare the 4 bytes at DX+1 with the 4 bytes at
// DX+1-16(SP), i.e. at the current repeat offset.
2827 MOVL DX, R9
2828 SUBL 16(SP), R9
2829 MOVL 1(BX)(R9*1), R11
2830 MOVQ DI, R9
2831 SHRQ $0x08, R9
2832 CMPL R9, R11
2833 JNE no_repeat_found_encodeBlockAsm12B
// Repeat found. DI = DX+1 (match start), R8 = 12(SP) (replacing the second
// candidate), SI = DI - 16(SP) (start of the earlier copy). When SI is 0
// there is nothing before it to extend into.
2834 LEAL 1(DX), DI
2835 MOVL 12(SP), R8
2836 MOVL DI, SI
2837 SUBL 16(SP), SI
2838 JZ repeat_extend_back_end_encodeBlockAsm12B
2839
// Extend the repeat match backwards: while DI is above R8 (the value of
// 12(SP) loaded by the caller) and the bytes just before SI and DI are
// equal, step both back by one. The loop also ends when SI reaches 0.
2840 repeat_extend_back_loop_encodeBlockAsm12B:
2841 CMPL DI, R8
2842 JBE repeat_extend_back_end_encodeBlockAsm12B
2843 MOVB -1(BX)(SI*1), R9
2844 MOVB -1(BX)(DI*1), R10
2845 CMPB R9, R10
2846 JNE repeat_extend_back_end_encodeBlockAsm12B
2847 LEAL -1(DI), DI
2848 DECL SI
2849 JNZ repeat_extend_back_loop_encodeBlockAsm12B
2850
// Output space check before emitting the pending literals: return 0 unless
// CX + (DI - 12(SP)) + 3 is below the limit stored in (SP).
2851 repeat_extend_back_end_encodeBlockAsm12B:
2852 MOVL DI, SI
2853 SUBL 12(SP), SI
2854 LEAQ 3(CX)(SI*1), SI
2855 CMPQ SI, (SP)
2856 JB repeat_dst_size_check_encodeBlockAsm12B
2857 MOVQ $0x00000000, ret+56(FP)
2858 RET
2859
// Emit the pending literal bytes src[12(SP):DI] ahead of a repeat.
// Skipped when 12(SP) == DI. Otherwise 12(SP) is set to DI,
// R10 = BX + old 12(SP) (source), R9 = length, SI = length-1.
2860 repeat_dst_size_check_encodeBlockAsm12B:
2861 MOVL 12(SP), SI
2862 CMPL SI, DI
2863 JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B
2864 MOVL DI, R9
2865 MOVL DI, 12(SP)
2866 LEAQ (BX)(SI*1), R10
2867 SUBL SI, R9
2868 LEAL -1(R9), SI
// Header size from SI: < 60 one byte, < 0x100 two bytes, else three bytes.
// NOTE: the second JB below targets the instruction that follows it anyway,
// so the three-byte path is taken whether or not it branches.
2869 CMPL SI, $0x3c
2870 JB one_byte_repeat_emit_encodeBlockAsm12B
2871 CMPL SI, $0x00000100
2872 JB two_bytes_repeat_emit_encodeBlockAsm12B
2873 JB three_bytes_repeat_emit_encodeBlockAsm12B
2874
// Three-byte header: 0xf4 followed by the 16-bit length-1.
2875 three_bytes_repeat_emit_encodeBlockAsm12B:
2876 MOVB $0xf4, (CX)
2877 MOVW SI, 1(CX)
2878 ADDQ $0x03, CX
2879 JMP memmove_long_repeat_emit_encodeBlockAsm12B
2880
// Two-byte header: 0xf0 followed by the 8-bit length-1. Lengths up to 64
// use the short copy, longer ones the long copy.
2881 two_bytes_repeat_emit_encodeBlockAsm12B:
2882 MOVB $0xf0, (CX)
2883 MOVB SI, 1(CX)
2884 ADDQ $0x02, CX
2885 CMPL SI, $0x40
2886 JB memmove_repeat_emit_encodeBlockAsm12B
2887 JMP memmove_long_repeat_emit_encodeBlockAsm12B
2888
// One-byte header: (length-1) << 2.
2889 one_byte_repeat_emit_encodeBlockAsm12B:
2890 SHLB $0x02, SI
2891 MOVB SI, (CX)
2892 ADDQ $0x01, CX
2893
// Short copy (1..64 bytes). SI = CX + length is the cursor after the copy.
2894 memmove_repeat_emit_encodeBlockAsm12B:
2895 LEAQ (CX)(R9*1), SI
2896
2897 // genMemMoveShort
2898 CMPQ R9, $0x08
2899 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8
2900 CMPQ R9, $0x10
2901 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16
2902 CMPQ R9, $0x20
2903 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32
2904 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64
2905
// Length <= 8: always moves a full 8 bytes, even when fewer are needed;
// the cursor still advances by the real length only (CX = SI below).
2906 emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8:
2907 MOVQ (R10), R11
2908 MOVQ R11, (CX)
2909 JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
2910
// The remaining cases copy a head piece and a (possibly overlapping) tail
// piece, loading everything before storing.
2911 emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16:
2912 MOVQ (R10), R11
2913 MOVQ -8(R10)(R9*1), R10
2914 MOVQ R11, (CX)
2915 MOVQ R10, -8(CX)(R9*1)
2916 JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
2917
2918 emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32:
2919 MOVOU (R10), X0
2920 MOVOU -16(R10)(R9*1), X1
2921 MOVOU X0, (CX)
2922 MOVOU X1, -16(CX)(R9*1)
2923 JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B
2924
2925 emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64:
2926 MOVOU (R10), X0
2927 MOVOU 16(R10), X1
2928 MOVOU -32(R10)(R9*1), X2
2929 MOVOU -16(R10)(R9*1), X3
2930 MOVOU X0, (CX)
2931 MOVOU X1, 16(CX)
2932 MOVOU X2, -32(CX)(R9*1)
2933 MOVOU X3, -16(CX)(R9*1)
2934
// Advance the output cursor past the copied literal.
2935 memmove_end_copy_repeat_emit_encodeBlockAsm12B:
2936 MOVQ SI, CX
2937 JMP emit_literal_done_repeat_emit_encodeBlockAsm12B
2938
// Long copy. SI = CX + length.
2939 memmove_long_repeat_emit_encodeBlockAsm12B:
2940 LEAQ (CX)(R9*1), SI
2941
2942 // genMemMoveLong
// X0-X3 hold the first and last 32 source bytes; they are stored last with
// unaligned stores so the unaligned head and the tail are always correct.
2943 MOVOU (R10), X0
2944 MOVOU 16(R10), X1
2945 MOVOU -32(R10)(R9*1), X2
2946 MOVOU -16(R10)(R9*1), X3
// R12 = length / 32. R13 = 64 - (CX & 31), so CX+R13-32 is a multiple of 32
// and the MOVOA stores in the loops below hit aligned addresses.
2947 MOVQ R9, R12
2948 SHRQ $0x05, R12
2949 MOVQ CX, R11
2950 ANDL $0x0000001f, R11
2951 MOVQ $0x00000040, R13
2952 SUBQ R11, R13
// DECQ updates ZF but leaves CF untouched, so the JA/JNA below combine the
// zero result of DECQ with the carry left by the preceding SUBQ/ADDQ.
2953 DECQ R12
2954 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
2955 LEAQ -32(R10)(R13*1), R11
2956 LEAQ -32(CX)(R13*1), R14
2957
// Copies 32 bytes per pass from R11 to the aligned destination R14.
2958 emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back:
2959 MOVOU (R11), X4
2960 MOVOU 16(R11), X5
2961 MOVOA X4, (R14)
2962 MOVOA X5, 16(R14)
2963 ADDQ $0x20, R14
2964 ADDQ $0x20, R11
2965 ADDQ $0x20, R13
2966 DECQ R12
2967 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back
2968
// Copies 32 bytes per pass at running offset R13 while length >= R13.
2969 emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
2970 MOVOU -32(R10)(R13*1), X4
2971 MOVOU -16(R10)(R13*1), X5
2972 MOVOA X4, -32(CX)(R13*1)
2973 MOVOA X5, -16(CX)(R13*1)
2974 ADDQ $0x20, R13
2975 CMPQ R9, R13
2976 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
// Store the saved head and tail, then move the cursor past the literal.
2977 MOVOU X0, (CX)
2978 MOVOU X1, 16(CX)
2979 MOVOU X2, -32(CX)(R9*1)
2980 MOVOU X3, -16(CX)(R9*1)
2981 MOVQ SI, CX
2982
// Extend the repeat match forwards. The repeat check compared 4 bytes at
// DX+1, so comparison resumes at DX+5. SI = DX - 16(SP) is the matching
// earlier position; R9 = src_len - DX bounds the comparison.
2983 emit_literal_done_repeat_emit_encodeBlockAsm12B:
2984 ADDL $0x05, DX
2985 MOVL DX, SI
2986 SUBL 16(SP), SI
2987 MOVQ src_len+32(FP), R9
2988 SUBL DX, R9
// R10 = BX+DX and SI = BX+SI address the two byte runs being compared.
2989 LEAQ (BX)(DX*1), R10
2990 LEAQ (BX)(SI*1), SI
2991
2992 // matchLen
// R12 counts the equal bytes found so far; R9 is the remaining budget.
2993 XORL R12, R12
2994
// While at least 16 bytes remain, XOR two 8-byte words from each side.
// A non-zero XOR result marks the word that contains the first difference.
2995 matchlen_loopback_16_repeat_extend_encodeBlockAsm12B:
2996 CMPL R9, $0x10
2997 JB matchlen_match8_repeat_extend_encodeBlockAsm12B
2998 MOVQ (R10)(R12*1), R11
2999 MOVQ 8(R10)(R12*1), R13
3000 XORQ (SI)(R12*1), R11
3001 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B
3002 XORQ 8(SI)(R12*1), R13
3003 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm12B
3004 LEAL -16(R9), R9
3005 LEAL 16(R12), R12
3006 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm12B
3007
// Difference is in the second word: index of the lowest set bit, shifted
// right by 3, is the byte index inside that word; the first word (8 bytes)
// matched completely, hence the extra 8.
3008 matchlen_bsf_16repeat_extend_encodeBlockAsm12B:
3009 #ifdef GOAMD64_v3
3010 TZCNTQ R13, R13
3011
3012 #else
3013 BSFQ R13, R13
3014
3015 #endif
3016 SARQ $0x03, R13
3017 LEAL 8(R12)(R13*1), R12
3018 JMP repeat_extend_forward_end_encodeBlockAsm12B
3019
// Fewer than 16 bytes remain: try one 8-byte word.
3020 matchlen_match8_repeat_extend_encodeBlockAsm12B:
3021 CMPL R9, $0x08
3022 JB matchlen_match4_repeat_extend_encodeBlockAsm12B
3023 MOVQ (R10)(R12*1), R11
3024 XORQ (SI)(R12*1), R11
3025 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B
3026 LEAL -8(R9), R9
3027 LEAL 8(R12), R12
3028 JMP matchlen_match4_repeat_extend_encodeBlockAsm12B
3029
// Difference is in the word held in R11: add its first differing byte index.
3030 matchlen_bsf_8_repeat_extend_encodeBlockAsm12B:
3031 #ifdef GOAMD64_v3
3032 TZCNTQ R11, R11
3033
3034 #else
3035 BSFQ R11, R11
3036
3037 #endif
3038 SARQ $0x03, R11
3039 LEAL (R12)(R11*1), R12
3040 JMP repeat_extend_forward_end_encodeBlockAsm12B
3041
// Fewer than 8 bytes remain: compare 4 bytes. A mismatch is not resolved
// here; it falls to the narrower 2-byte and 1-byte compares below.
3042 matchlen_match4_repeat_extend_encodeBlockAsm12B:
3043 CMPL R9, $0x04
3044 JB matchlen_match2_repeat_extend_encodeBlockAsm12B
3045 MOVL (R10)(R12*1), R11
3046 CMPL (SI)(R12*1), R11
3047 JNE matchlen_match2_repeat_extend_encodeBlockAsm12B
3048 LEAL -4(R9), R9
3049 LEAL 4(R12), R12
3050
// The single CMPL feeds both branches: R9 == 1 goes to the 1-byte compare,
// R9 == 0 ends the scan, otherwise 2 bytes are compared.
3051 matchlen_match2_repeat_extend_encodeBlockAsm12B:
3052 CMPL R9, $0x01
3053 JE matchlen_match1_repeat_extend_encodeBlockAsm12B
3054 JB repeat_extend_forward_end_encodeBlockAsm12B
3055 MOVW (R10)(R12*1), R11
3056 CMPW (SI)(R12*1), R11
3057 JNE matchlen_match1_repeat_extend_encodeBlockAsm12B
3058 LEAL 2(R12), R12
3059 SUBL $0x02, R9
3060 JZ repeat_extend_forward_end_encodeBlockAsm12B
3061
// Final single-byte compare; falls through to the end label afterwards.
3062 matchlen_match1_repeat_extend_encodeBlockAsm12B:
3063 MOVB (R10)(R12*1), R11
3064 CMPB (SI)(R12*1), R11
3065 JNE repeat_extend_forward_end_encodeBlockAsm12B
3066 LEAL 1(R12), R12
3067
// Encode the repeat match. DX advances by the forward match length R12;
// SI = DX - DI is the total length (DI = back-extended start);
// DI = 16(SP) is the offset. R8 still holds the 12(SP) value read when the
// repeat was detected: when it is 0 the match is written as a normal copy
// (emitCopy), otherwise as a repeat (emitRepeat).
3068 repeat_extend_forward_end_encodeBlockAsm12B:
3069 ADDL R12, DX
3070 MOVL DX, SI
3071 SUBL DI, SI
3072 MOVL 16(SP), DI
3073 TESTL R8, R8
3074 JZ repeat_as_copy_encodeBlockAsm12B
3075
3076 // emitRepeat
// R8 keeps the full length, SI becomes length-4. Length <= 8 uses the
// 2-byte form; length < 12 with offset < 2048 uses the 2-byte form that
// also carries the offset; everything else picks a form by size below.
3077 MOVL SI, R8
3078 LEAL -4(SI), SI
3079 CMPL R8, $0x08
3080 JBE repeat_two_match_repeat_encodeBlockAsm12B
3081 CMPL R8, $0x0c
3082 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B
3083 CMPL DI, $0x00000800
3084 JB repeat_two_offset_match_repeat_encodeBlockAsm12B
3085
// SI = length-4. Below 0x104 -> 3-byte form; otherwise the 4-byte form:
// tag word 0x0019 followed by the 16-bit value SI-256.
3086 cant_repeat_two_offset_match_repeat_encodeBlockAsm12B:
3087 CMPL SI, $0x00000104
3088 JB repeat_three_match_repeat_encodeBlockAsm12B
3089 LEAL -256(SI), SI
3090 MOVW $0x0019, (CX)
3091 MOVW SI, 2(CX)
3092 ADDQ $0x04, CX
3093 JMP repeat_end_emit_encodeBlockAsm12B
3094
// 3-byte form: tag word 0x0015 followed by the 8-bit value SI-4.
3095 repeat_three_match_repeat_encodeBlockAsm12B:
3096 LEAL -4(SI), SI
3097 MOVW $0x0015, (CX)
3098 MOVB SI, 2(CX)
3099 ADDQ $0x03, CX
3100 JMP repeat_end_emit_encodeBlockAsm12B
3101
// 2-byte form: the 16-bit value (SI<<2)|1 is stored as one word.
3102 repeat_two_match_repeat_encodeBlockAsm12B:
3103 SHLL $0x02, SI
3104 ORL $0x01, SI
3105 MOVW SI, (CX)
3106 ADDQ $0x02, CX
3107 JMP repeat_end_emit_encodeBlockAsm12B
3108
// 2-byte form with offset: byte 0 = 1 + SI*4 + ((offset>>8)<<5),
// byte 1 = low 8 bits of the offset.
3109 repeat_two_offset_match_repeat_encodeBlockAsm12B:
3110 XORQ R8, R8
3111 LEAL 1(R8)(SI*4), SI
3112 MOVB DI, 1(CX)
3113 SARL $0x08, DI
3114 SHLL $0x05, DI
3115 ORL DI, SI
3116 MOVB SI, (CX)
3117 ADDQ $0x02, CX
3118 JMP repeat_end_emit_encodeBlockAsm12B
3119
// SI = length, DI = offset. Length <= 64 is handled by the short path.
// For longer matches an offset >= 2048 goes to long_offset_short; otherwise
// a 2-byte op with tag base 17 (1+16) is written here and 8 is taken off
// the length.
3120 repeat_as_copy_encodeBlockAsm12B:
3121 // emitCopy
3122 CMPL SI, $0x40
3123 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B
3124 CMPL DI, $0x00000800
3125 JAE long_offset_short_repeat_as_copy_encodeBlockAsm12B
3126 MOVL $0x00000001, R8
3127 LEAL 16(R8), R8
3128 MOVB DI, 1(CX)
3129 SHRL $0x08, DI
3130 SHLL $0x05, DI
3131 ORL DI, R8
3132 MOVB R8, (CX)
3133 ADDQ $0x02, CX
3134 SUBL $0x08, SI
3135
3136 // emitRepeat
// The remainder always goes straight to the size-based selection
// (SI = length-4), skipping the 2-byte forms.
3137 LEAL -4(SI), SI
3138 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
// NOTE: the eight instructions below follow an unconditional JMP and carry
// no label, so they are unreachable. The repeat_two_*_short_2b and
// repeat_two_offset_*_short_2b labels further down appear to be referenced
// only from here.
3139 MOVL SI, R8
3140 LEAL -4(SI), SI
3141 CMPL R8, $0x08
3142 JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
3143 CMPL R8, $0x0c
3144 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
3145 CMPL DI, $0x00000800
3146 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
3147
// Same size-based selection as above (3- or 4-byte form).
3148 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
3149 CMPL SI, $0x00000104
3150 JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b
3151 LEAL -256(SI), SI
3152 MOVW $0x0019, (CX)
3153 MOVW SI, 2(CX)
3154 ADDQ $0x04, CX
3155 JMP repeat_end_emit_encodeBlockAsm12B
3156
3157 repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
3158 LEAL -4(SI), SI
3159 MOVW $0x0015, (CX)
3160 MOVB SI, 2(CX)
3161 ADDQ $0x03, CX
3162 JMP repeat_end_emit_encodeBlockAsm12B
3163
3164 repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
3165 SHLL $0x02, SI
3166 ORL $0x01, SI
3167 MOVW SI, (CX)
3168 ADDQ $0x02, CX
3169 JMP repeat_end_emit_encodeBlockAsm12B
3170
3171 repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b:
3172 XORQ R8, R8
3173 LEAL 1(R8)(SI*4), SI
3174 MOVB DI, 1(CX)
3175 SARL $0x08, DI
3176 SHLL $0x05, DI
3177 ORL DI, SI
3178 MOVB SI, (CX)
3179 ADDQ $0x02, CX
3180 JMP repeat_end_emit_encodeBlockAsm12B
3181
// Length > 64 with offset >= 2048: write tag byte 0xee plus the 16-bit
// offset (3 bytes), take 60 off the length, then emitRepeat the rest.
3182 long_offset_short_repeat_as_copy_encodeBlockAsm12B:
3183 MOVB $0xee, (CX)
3184 MOVW DI, 1(CX)
3185 LEAL -60(SI), SI
3186 ADDQ $0x03, CX
3187
3188 // emitRepeat
3189 MOVL SI, R8
3190 LEAL -4(SI), SI
3191 CMPL R8, $0x08
3192 JBE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
3193 CMPL R8, $0x0c
3194 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
3195 CMPL DI, $0x00000800
3196 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
3197
3198 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
3199 CMPL SI, $0x00000104
3200 JB repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short
3201 LEAL -256(SI), SI
3202 MOVW $0x0019, (CX)
3203 MOVW SI, 2(CX)
3204 ADDQ $0x04, CX
3205 JMP repeat_end_emit_encodeBlockAsm12B
3206
3207 repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
3208 LEAL -4(SI), SI
3209 MOVW $0x0015, (CX)
3210 MOVB SI, 2(CX)
3211 ADDQ $0x03, CX
3212 JMP repeat_end_emit_encodeBlockAsm12B
3213
3214 repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
3215 SHLL $0x02, SI
3216 ORL $0x01, SI
3217 MOVW SI, (CX)
3218 ADDQ $0x02, CX
3219 JMP repeat_end_emit_encodeBlockAsm12B
3220
3221 repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short:
3222 XORQ R8, R8
3223 LEAL 1(R8)(SI*4), SI
3224 MOVB DI, 1(CX)
3225 SARL $0x08, DI
3226 SHLL $0x05, DI
3227 ORL DI, SI
3228 MOVB SI, (CX)
3229 ADDQ $0x02, CX
3230 JMP repeat_end_emit_encodeBlockAsm12B
3231
// Length <= 64. R8 = length*4. With length < 12 and offset < 2048 a 2-byte
// op is written: byte 0 = length*4 - 15 combined with ((offset>>8)<<5),
// byte 1 = low offset byte. Otherwise the 3-byte op below is used.
3232 two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B:
3233 MOVL SI, R8
3234 SHLL $0x02, R8
3235 CMPL SI, $0x0c
3236 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
3237 CMPL DI, $0x00000800
3238 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm12B
3239 LEAL -15(R8), R8
3240 MOVB DI, 1(CX)
3241 SHRL $0x08, DI
3242 SHLL $0x05, DI
3243 ORL DI, R8
3244 MOVB R8, (CX)
3245 ADDQ $0x02, CX
3246 JMP repeat_end_emit_encodeBlockAsm12B
3247
// 3-byte op: tag byte = length*4 - 2, followed by the 16-bit offset.
3248 emit_copy_three_repeat_as_copy_encodeBlockAsm12B:
3249 LEAL -2(R8), R8
3250 MOVB R8, (CX)
3251 MOVW DI, 1(CX)
3252 ADDQ $0x03, CX
3253
// Everything up to DX has now been emitted: record it in 12(SP) and resume
// the search.
3254 repeat_end_emit_encodeBlockAsm12B:
3255 MOVL DX, 12(SP)
3256 JMP search_loop_encodeBlockAsm12B
3257
// No repeat at DX+1: test the table candidates in turn.
// On entry SI = candidate for DX, R8 = candidate for DX+1, DI = the 8 bytes
// at DX, R10 = table index for DX+2 (all set up in search_loop).
3258 no_repeat_found_encodeBlockAsm12B:
// Candidate 1: the 4 bytes at SI against the 4 bytes at DX.
3259 CMPL (BX)(SI*1), DI
3260 JEQ candidate_match_encodeBlockAsm12B
// Candidate 2: shift DI to the bytes at DX+1 and compare with position R8.
// SI is reloaded with the table entry for DX+2; R9 = DX+2.
3261 SHRQ $0x08, DI
3262 MOVL (AX)(R10*4), SI
3263 LEAL 2(DX), R9
3264 CMPL (BX)(R8*1), DI
3265 JEQ candidate2_match_encodeBlockAsm12B
// Candidate 3: store DX+2 in the table, shift DI to the bytes at DX+2 and
// compare with the entry just read.
3266 MOVL R9, (AX)(R10*4)
3267 SHRQ $0x08, DI
3268 CMPL (BX)(SI*1), DI
3269 JEQ candidate3_match_encodeBlockAsm12B
// Nothing matched: jump ahead to the position saved in 20(SP).
3270 MOVL 20(SP), DX
3271 JMP search_loop_encodeBlockAsm12B
3272
// Match at DX+2; SI already holds its candidate.
3273 candidate3_match_encodeBlockAsm12B:
3274 ADDL $0x02, DX
3275 JMP candidate_match_encodeBlockAsm12B
3276
// Match at DX+1: still record DX+2 in the table, then continue with
// DX = DX+1 and SI = R8. Falls through to the label after this section.
3277 candidate2_match_encodeBlockAsm12B:
3278 MOVL R9, (AX)(R10*4)
3279 INCL DX
3280 MOVL R8, SI
3281
// A 4-byte match between position DX and candidate SI was found.
// DI = 12(SP). A candidate at position 0 cannot be extended backwards.
3282 candidate_match_encodeBlockAsm12B:
3283 MOVL 12(SP), DI
3284 TESTL SI, SI
3285 JZ match_extend_back_end_encodeBlockAsm12B
3286
// Extend backwards: while DX is above DI and the bytes just before SI and
// DX are equal, step both back by one. The loop also ends when SI reaches 0.
3287 match_extend_back_loop_encodeBlockAsm12B:
3288 CMPL DX, DI
3289 JBE match_extend_back_end_encodeBlockAsm12B
3290 MOVB -1(BX)(SI*1), R8
3291 MOVB -1(BX)(DX*1), R9
3292 CMPB R8, R9
3293 JNE match_extend_back_end_encodeBlockAsm12B
3294 LEAL -1(DX), DX
3295 DECL SI
3296 JZ match_extend_back_end_encodeBlockAsm12B
3297 JMP match_extend_back_loop_encodeBlockAsm12B
3298
// Output space check before emitting the pending literals: return 0 unless
// CX + (DX - 12(SP)) + 3 is below the limit stored in (SP).
3299 match_extend_back_end_encodeBlockAsm12B:
3300 MOVL DX, DI
3301 SUBL 12(SP), DI
3302 LEAQ 3(CX)(DI*1), DI
3303 CMPQ DI, (SP)
3304 JB match_dst_size_check_encodeBlockAsm12B
3305 MOVQ $0x00000000, ret+56(FP)
3306 RET
3307
// Emit the pending literal bytes src[12(SP):DX] ahead of a match.
// Skipped when 12(SP) == DX. Otherwise 12(SP) is set to DX,
// DI = BX + old 12(SP) (source), R9 = length, R8 = length-1.
3308 match_dst_size_check_encodeBlockAsm12B:
3309 MOVL DX, DI
3310 MOVL 12(SP), R8
3311 CMPL R8, DI
3312 JEQ emit_literal_done_match_emit_encodeBlockAsm12B
3313 MOVL DI, R9
3314 MOVL DI, 12(SP)
3315 LEAQ (BX)(R8*1), DI
3316 SUBL R8, R9
3317 LEAL -1(R9), R8
// Header size from R8: < 60 one byte, < 0x100 two bytes, else three bytes.
// NOTE: the second JB below targets the instruction that follows it anyway,
// so the three-byte path is taken whether or not it branches.
3318 CMPL R8, $0x3c
3319 JB one_byte_match_emit_encodeBlockAsm12B
3320 CMPL R8, $0x00000100
3321 JB two_bytes_match_emit_encodeBlockAsm12B
3322 JB three_bytes_match_emit_encodeBlockAsm12B
3323
// Three-byte header: 0xf4 followed by the 16-bit length-1.
3324 three_bytes_match_emit_encodeBlockAsm12B:
3325 MOVB $0xf4, (CX)
3326 MOVW R8, 1(CX)
3327 ADDQ $0x03, CX
3328 JMP memmove_long_match_emit_encodeBlockAsm12B
3329
// Two-byte header: 0xf0 followed by the 8-bit length-1. Lengths up to 64
// use the short copy, longer ones the long copy.
3330 two_bytes_match_emit_encodeBlockAsm12B:
3331 MOVB $0xf0, (CX)
3332 MOVB R8, 1(CX)
3333 ADDQ $0x02, CX
3334 CMPL R8, $0x40
3335 JB memmove_match_emit_encodeBlockAsm12B
3336 JMP memmove_long_match_emit_encodeBlockAsm12B
3337
// One-byte header: (length-1) << 2.
3338 one_byte_match_emit_encodeBlockAsm12B:
3339 SHLB $0x02, R8
3340 MOVB R8, (CX)
3341 ADDQ $0x01, CX
3342
// Short copy (1..64 bytes). R8 = CX + length is the cursor after the copy.
3343 memmove_match_emit_encodeBlockAsm12B:
3344 LEAQ (CX)(R9*1), R8
3345
3346 // genMemMoveShort
3347 CMPQ R9, $0x08
3348 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8
3349 CMPQ R9, $0x10
3350 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16
3351 CMPQ R9, $0x20
3352 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32
3353 JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64
3354
// Length <= 8: always moves a full 8 bytes, even when fewer are needed;
// the cursor still advances by the real length only (CX = R8 below).
3355 emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8:
3356 MOVQ (DI), R10
3357 MOVQ R10, (CX)
3358 JMP memmove_end_copy_match_emit_encodeBlockAsm12B
3359
// The remaining cases copy a head piece and a (possibly overlapping) tail
// piece, loading everything before storing.
3360 emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16:
3361 MOVQ (DI), R10
3362 MOVQ -8(DI)(R9*1), DI
3363 MOVQ R10, (CX)
3364 MOVQ DI, -8(CX)(R9*1)
3365 JMP memmove_end_copy_match_emit_encodeBlockAsm12B
3366
3367 emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32:
3368 MOVOU (DI), X0
3369 MOVOU -16(DI)(R9*1), X1
3370 MOVOU X0, (CX)
3371 MOVOU X1, -16(CX)(R9*1)
3372 JMP memmove_end_copy_match_emit_encodeBlockAsm12B
3373
3374 emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64:
3375 MOVOU (DI), X0
3376 MOVOU 16(DI), X1
3377 MOVOU -32(DI)(R9*1), X2
3378 MOVOU -16(DI)(R9*1), X3
3379 MOVOU X0, (CX)
3380 MOVOU X1, 16(CX)
3381 MOVOU X2, -32(CX)(R9*1)
3382 MOVOU X3, -16(CX)(R9*1)
3383
// Advance the output cursor past the copied literal.
3384 memmove_end_copy_match_emit_encodeBlockAsm12B:
3385 MOVQ R8, CX
3386 JMP emit_literal_done_match_emit_encodeBlockAsm12B
3387
// Long copy. R8 = CX + length.
3388 memmove_long_match_emit_encodeBlockAsm12B:
3389 LEAQ (CX)(R9*1), R8
3390
3391 // genMemMoveLong
// X0-X3 hold the first and last 32 source bytes; they are stored last with
// unaligned stores so the unaligned head and the tail are always correct.
3392 MOVOU (DI), X0
3393 MOVOU 16(DI), X1
3394 MOVOU -32(DI)(R9*1), X2
3395 MOVOU -16(DI)(R9*1), X3
// R11 = length / 32. R12 = 64 - (CX & 31), so CX+R12-32 is a multiple of 32
// and the MOVOA stores in the loops below hit aligned addresses.
3396 MOVQ R9, R11
3397 SHRQ $0x05, R11
3398 MOVQ CX, R10
3399 ANDL $0x0000001f, R10
3400 MOVQ $0x00000040, R12
3401 SUBQ R10, R12
// DECQ updates ZF but leaves CF untouched, so the JA/JNA below combine the
// zero result of DECQ with the carry left by the preceding SUBQ/ADDQ.
3402 DECQ R11
3403 JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
3404 LEAQ -32(DI)(R12*1), R10
3405 LEAQ -32(CX)(R12*1), R13
3406
// Copies 32 bytes per pass from R10 to the aligned destination R13.
3407 emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back:
3408 MOVOU (R10), X4
3409 MOVOU 16(R10), X5
3410 MOVOA X4, (R13)
3411 MOVOA X5, 16(R13)
3412 ADDQ $0x20, R13
3413 ADDQ $0x20, R10
3414 ADDQ $0x20, R12
3415 DECQ R11
3416 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back
3417
// Copies 32 bytes per pass at running offset R12 while length >= R12.
3418 emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32:
3419 MOVOU -32(DI)(R12*1), X4
3420 MOVOU -16(DI)(R12*1), X5
3421 MOVOA X4, -32(CX)(R12*1)
3422 MOVOA X5, -16(CX)(R12*1)
3423 ADDQ $0x20, R12
3424 CMPQ R9, R12
3425 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32
// Store the saved head and tail, then move the cursor past the literal.
3426 MOVOU X0, (CX)
3427 MOVOU X1, 16(CX)
3428 MOVOU X2, -32(CX)(R9*1)
3429 MOVOU X3, -16(CX)(R9*1)
3430 MOVQ R8, CX
3431
3432 emit_literal_done_match_emit_encodeBlockAsm12B:
3433 match_nolit_loop_encodeBlockAsm12B:
3434 MOVL DX, DI
3435 SUBL SI, DI
3436 MOVL DI, 16(SP)
3437 ADDL $0x04, DX
3438 ADDL $0x04, SI
3439 MOVQ src_len+32(FP), DI
3440 SUBL DX, DI
3441 LEAQ (BX)(DX*1), R8
3442 LEAQ (BX)(SI*1), SI
3443
3444 // matchLen
3445 XORL R10, R10
3446
3447 matchlen_loopback_16_match_nolit_encodeBlockAsm12B:
3448 CMPL DI, $0x10
3449 JB matchlen_match8_match_nolit_encodeBlockAsm12B
3450 MOVQ (R8)(R10*1), R9
3451 MOVQ 8(R8)(R10*1), R11
3452 XORQ (SI)(R10*1), R9
3453 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B
3454 XORQ 8(SI)(R10*1), R11
3455 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm12B
3456 LEAL -16(DI), DI
3457 LEAL 16(R10), R10
3458 JMP matchlen_loopback_16_match_nolit_encodeBlockAsm12B
3459
3460 matchlen_bsf_16match_nolit_encodeBlockAsm12B:
3461 #ifdef GOAMD64_v3
3462 TZCNTQ R11, R11
3463
3464 #else
3465 BSFQ R11, R11
3466
3467 #endif
3468 SARQ $0x03, R11
3469 LEAL 8(R10)(R11*1), R10
3470 JMP match_nolit_end_encodeBlockAsm12B
3471
3472 matchlen_match8_match_nolit_encodeBlockAsm12B:
3473 CMPL DI, $0x08
3474 JB matchlen_match4_match_nolit_encodeBlockAsm12B
3475 MOVQ (R8)(R10*1), R9
3476 XORQ (SI)(R10*1), R9
3477 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B
3478 LEAL -8(DI), DI
3479 LEAL 8(R10), R10
3480 JMP matchlen_match4_match_nolit_encodeBlockAsm12B
3481
3482 matchlen_bsf_8_match_nolit_encodeBlockAsm12B:
3483 #ifdef GOAMD64_v3
3484 TZCNTQ R9, R9
3485
3486 #else
3487 BSFQ R9, R9
3488
3489 #endif
3490 SARQ $0x03, R9
3491 LEAL (R10)(R9*1), R10
3492 JMP match_nolit_end_encodeBlockAsm12B
3493
3494 matchlen_match4_match_nolit_encodeBlockAsm12B:
3495 CMPL DI, $0x04
3496 JB matchlen_match2_match_nolit_encodeBlockAsm12B
3497 MOVL (R8)(R10*1), R9
3498 CMPL (SI)(R10*1), R9
3499 JNE matchlen_match2_match_nolit_encodeBlockAsm12B
3500 LEAL -4(DI), DI
3501 LEAL 4(R10), R10
3502
3503 matchlen_match2_match_nolit_encodeBlockAsm12B:
3504 CMPL DI, $0x01
3505 JE matchlen_match1_match_nolit_encodeBlockAsm12B
3506 JB match_nolit_end_encodeBlockAsm12B
3507 MOVW (R8)(R10*1), R9
3508 CMPW (SI)(R10*1), R9
3509 JNE matchlen_match1_match_nolit_encodeBlockAsm12B
3510 LEAL 2(R10), R10
3511 SUBL $0x02, DI
3512 JZ match_nolit_end_encodeBlockAsm12B
3513
3514 matchlen_match1_match_nolit_encodeBlockAsm12B:
3515 MOVB (R8)(R10*1), R9
3516 CMPB (SI)(R10*1), R9
3517 JNE match_nolit_end_encodeBlockAsm12B
3518 LEAL 1(R10), R10
3519
3520 match_nolit_end_encodeBlockAsm12B:
3521 ADDL R10, DX
3522 MOVL 16(SP), SI
3523 ADDL $0x04, R10
3524 MOVL DX, 12(SP)
3525
3526 // emitCopy
3527 CMPL R10, $0x40
3528 JBE two_byte_offset_short_match_nolit_encodeBlockAsm12B
3529 CMPL SI, $0x00000800
3530 JAE long_offset_short_match_nolit_encodeBlockAsm12B
3531 MOVL $0x00000001, DI
3532 LEAL 16(DI), DI
3533 MOVB SI, 1(CX)
3534 SHRL $0x08, SI
3535 SHLL $0x05, SI
3536 ORL SI, DI
3537 MOVB DI, (CX)
3538 ADDQ $0x02, CX
3539 SUBL $0x08, R10
3540
3541 // emitRepeat
3542 LEAL -4(R10), R10
3543 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
3544 MOVL R10, DI
3545 LEAL -4(R10), R10
3546 CMPL DI, $0x08
3547 JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
3548 CMPL DI, $0x0c
3549 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
3550 CMPL SI, $0x00000800
3551 JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
3552
3553 cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
3554 CMPL R10, $0x00000104
3555 JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b
3556 LEAL -256(R10), R10
3557 MOVW $0x0019, (CX)
3558 MOVW R10, 2(CX)
3559 ADDQ $0x04, CX
3560 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3561
3562 repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
3563 LEAL -4(R10), R10
3564 MOVW $0x0015, (CX)
3565 MOVB R10, 2(CX)
3566 ADDQ $0x03, CX
3567 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3568
3569 repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
3570 SHLL $0x02, R10
3571 ORL $0x01, R10
3572 MOVW R10, (CX)
3573 ADDQ $0x02, CX
3574 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3575
3576 repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b:
3577 XORQ DI, DI
3578 LEAL 1(DI)(R10*4), R10
3579 MOVB SI, 1(CX)
3580 SARL $0x08, SI
3581 SHLL $0x05, SI
3582 ORL SI, R10
3583 MOVB R10, (CX)
3584 ADDQ $0x02, CX
3585 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3586
3587 long_offset_short_match_nolit_encodeBlockAsm12B:
3588 MOVB $0xee, (CX)
3589 MOVW SI, 1(CX)
3590 LEAL -60(R10), R10
3591 ADDQ $0x03, CX
3592
3593 // emitRepeat
3594 MOVL R10, DI
3595 LEAL -4(R10), R10
3596 CMPL DI, $0x08
3597 JBE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short
3598 CMPL DI, $0x0c
3599 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
3600 CMPL SI, $0x00000800
3601 JB repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short
3602
3603 cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
3604 CMPL R10, $0x00000104
3605 JB repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short
3606 LEAL -256(R10), R10
3607 MOVW $0x0019, (CX)
3608 MOVW R10, 2(CX)
3609 ADDQ $0x04, CX
3610 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3611
3612 repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short:
3613 LEAL -4(R10), R10
3614 MOVW $0x0015, (CX)
3615 MOVB R10, 2(CX)
3616 ADDQ $0x03, CX
3617 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3618
3619 repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short:
3620 SHLL $0x02, R10
3621 ORL $0x01, R10
3622 MOVW R10, (CX)
3623 ADDQ $0x02, CX
3624 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3625
3626 repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short:
3627 XORQ DI, DI
3628 LEAL 1(DI)(R10*4), R10
3629 MOVB SI, 1(CX)
3630 SARL $0x08, SI
3631 SHLL $0x05, SI
3632 ORL SI, R10
3633 MOVB R10, (CX)
3634 ADDQ $0x02, CX
3635 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3636
3637 two_byte_offset_short_match_nolit_encodeBlockAsm12B:
3638 MOVL R10, DI
3639 SHLL $0x02, DI
3640 CMPL R10, $0x0c
3641 JAE emit_copy_three_match_nolit_encodeBlockAsm12B
3642 CMPL SI, $0x00000800
3643 JAE emit_copy_three_match_nolit_encodeBlockAsm12B
3644 LEAL -15(DI), DI
3645 MOVB SI, 1(CX)
3646 SHRL $0x08, SI
3647 SHLL $0x05, SI
3648 ORL SI, DI
3649 MOVB DI, (CX)
3650 ADDQ $0x02, CX
3651 JMP match_nolit_emitcopy_end_encodeBlockAsm12B
3652
3653 emit_copy_three_match_nolit_encodeBlockAsm12B:
3654 LEAL -2(DI), DI
3655 MOVB DI, (CX)
3656 MOVW SI, 1(CX)
3657 ADDQ $0x03, CX
3658
3659 match_nolit_emitcopy_end_encodeBlockAsm12B:
3660 CMPL DX, 8(SP)
3661 JAE emit_remainder_encodeBlockAsm12B
3662 MOVQ -2(BX)(DX*1), DI
3663 CMPQ CX, (SP)
3664 JB match_nolit_dst_ok_encodeBlockAsm12B
3665 MOVQ $0x00000000, ret+56(FP)
3666 RET
3667
3668 match_nolit_dst_ok_encodeBlockAsm12B:
3669 MOVQ $0x000000cf1bbcdcbb, R9
3670 MOVQ DI, R8
3671 SHRQ $0x10, DI
3672 MOVQ DI, SI
3673 SHLQ $0x18, R8
3674 IMULQ R9, R8
3675 SHRQ $0x34, R8
3676 SHLQ $0x18, SI
3677 IMULQ R9, SI
3678 SHRQ $0x34, SI
3679 LEAL -2(DX), R9
3680 LEAQ (AX)(SI*4), R10
3681 MOVL (R10), SI
3682 MOVL R9, (AX)(R8*4)
3683 MOVL DX, (R10)
3684 CMPL (BX)(SI*1), DI
3685 JEQ match_nolit_loop_encodeBlockAsm12B
3686 INCL DX
3687 JMP search_loop_encodeBlockAsm12B
3688
3689 emit_remainder_encodeBlockAsm12B:
3690 MOVQ src_len+32(FP), AX
3691 SUBL 12(SP), AX
3692 LEAQ 3(CX)(AX*1), AX
3693 CMPQ AX, (SP)
3694 JB emit_remainder_ok_encodeBlockAsm12B
3695 MOVQ $0x00000000, ret+56(FP)
3696 RET
3697
3698 emit_remainder_ok_encodeBlockAsm12B:
3699 MOVQ src_len+32(FP), AX
3700 MOVL 12(SP), DX
3701 CMPL DX, AX
3702 JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B
3703 MOVL AX, SI
3704 MOVL AX, 12(SP)
3705 LEAQ (BX)(DX*1), AX
3706 SUBL DX, SI
3707 LEAL -1(SI), DX
3708 CMPL DX, $0x3c
3709 JB one_byte_emit_remainder_encodeBlockAsm12B
3710 CMPL DX, $0x00000100
3711 JB two_bytes_emit_remainder_encodeBlockAsm12B
3712 JB three_bytes_emit_remainder_encodeBlockAsm12B
3713
3714 three_bytes_emit_remainder_encodeBlockAsm12B:
3715 MOVB $0xf4, (CX)
3716 MOVW DX, 1(CX)
3717 ADDQ $0x03, CX
3718 JMP memmove_long_emit_remainder_encodeBlockAsm12B
3719
3720 two_bytes_emit_remainder_encodeBlockAsm12B:
3721 MOVB $0xf0, (CX)
3722 MOVB DL, 1(CX)
3723 ADDQ $0x02, CX
3724 CMPL DX, $0x40
3725 JB memmove_emit_remainder_encodeBlockAsm12B
3726 JMP memmove_long_emit_remainder_encodeBlockAsm12B
3727
3728 one_byte_emit_remainder_encodeBlockAsm12B:
3729 SHLB $0x02, DL
3730 MOVB DL, (CX)
3731 ADDQ $0x01, CX
3732
3733 memmove_emit_remainder_encodeBlockAsm12B:
3734 LEAQ (CX)(SI*1), DX
3735 MOVL SI, BX
3736
3737 // genMemMoveShort
3738 CMPQ BX, $0x03
3739 JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2
3740 JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3
3741 CMPQ BX, $0x08
3742 JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7
3743 CMPQ BX, $0x10
3744 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16
3745 CMPQ BX, $0x20
3746 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32
3747 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64
3748
3749 emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2:
3750 MOVB (AX), SI
3751 MOVB -1(AX)(BX*1), AL
3752 MOVB SI, (CX)
3753 MOVB AL, -1(CX)(BX*1)
3754 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
3755
3756 emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3:
3757 MOVW (AX), SI
3758 MOVB 2(AX), AL
3759 MOVW SI, (CX)
3760 MOVB AL, 2(CX)
3761 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
3762
3763 emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7:
3764 MOVL (AX), SI
3765 MOVL -4(AX)(BX*1), AX
3766 MOVL SI, (CX)
3767 MOVL AX, -4(CX)(BX*1)
3768 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
3769
3770 emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16:
3771 MOVQ (AX), SI
3772 MOVQ -8(AX)(BX*1), AX
3773 MOVQ SI, (CX)
3774 MOVQ AX, -8(CX)(BX*1)
3775 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
3776
3777 emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32:
3778 MOVOU (AX), X0
3779 MOVOU -16(AX)(BX*1), X1
3780 MOVOU X0, (CX)
3781 MOVOU X1, -16(CX)(BX*1)
3782 JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B
3783
3784 emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64:
3785 MOVOU (AX), X0
3786 MOVOU 16(AX), X1
3787 MOVOU -32(AX)(BX*1), X2
3788 MOVOU -16(AX)(BX*1), X3
3789 MOVOU X0, (CX)
3790 MOVOU X1, 16(CX)
3791 MOVOU X2, -32(CX)(BX*1)
3792 MOVOU X3, -16(CX)(BX*1)
3793
3794 memmove_end_copy_emit_remainder_encodeBlockAsm12B:
3795 MOVQ DX, CX
3796 JMP emit_literal_done_emit_remainder_encodeBlockAsm12B
3797
3798 memmove_long_emit_remainder_encodeBlockAsm12B:
3799 LEAQ (CX)(SI*1), DX
3800 MOVL SI, BX
3801
3802 // genMemMoveLong
3803 MOVOU (AX), X0
3804 MOVOU 16(AX), X1
3805 MOVOU -32(AX)(BX*1), X2
3806 MOVOU -16(AX)(BX*1), X3
3807 MOVQ BX, DI
3808 SHRQ $0x05, DI
3809 MOVQ CX, SI
3810 ANDL $0x0000001f, SI
3811 MOVQ $0x00000040, R8
3812 SUBQ SI, R8
3813 DECQ DI
3814 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
3815 LEAQ -32(AX)(R8*1), SI
3816 LEAQ -32(CX)(R8*1), R9
3817
3818 emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back:
3819 MOVOU (SI), X4
3820 MOVOU 16(SI), X5
3821 MOVOA X4, (R9)
3822 MOVOA X5, 16(R9)
3823 ADDQ $0x20, R9
3824 ADDQ $0x20, SI
3825 ADDQ $0x20, R8
3826 DECQ DI
3827 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back
3828
3829 emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32:
3830 MOVOU -32(AX)(R8*1), X4
3831 MOVOU -16(AX)(R8*1), X5
3832 MOVOA X4, -32(CX)(R8*1)
3833 MOVOA X5, -16(CX)(R8*1)
3834 ADDQ $0x20, R8
3835 CMPQ BX, R8
3836 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32
3837 MOVOU X0, (CX)
3838 MOVOU X1, 16(CX)
3839 MOVOU X2, -32(CX)(BX*1)
3840 MOVOU X3, -16(CX)(BX*1)
3841 MOVQ DX, CX
3842
3843 emit_literal_done_emit_remainder_encodeBlockAsm12B:
3844 MOVQ dst_base+0(FP), AX
3845 SUBQ AX, CX
3846 MOVQ CX, ret+56(FP)
3847 RET
3848
3849 // func encodeBlockAsm10B(dst []byte, src []byte, tmp *[4096]byte) int
3850 // Requires: BMI, SSE2
3851 TEXT ·encodeBlockAsm10B(SB), $24-64
3852 MOVQ tmp+48(FP), AX
3853 MOVQ dst_base+0(FP), CX
3854 MOVQ $0x00000020, DX
3855 MOVQ AX, BX
3856 PXOR X0, X0
3857
3858 zero_loop_encodeBlockAsm10B:
3859 MOVOU X0, (BX)
3860 MOVOU X0, 16(BX)
3861 MOVOU X0, 32(BX)
3862 MOVOU X0, 48(BX)
3863 MOVOU X0, 64(BX)
3864 MOVOU X0, 80(BX)
3865 MOVOU X0, 96(BX)
3866 MOVOU X0, 112(BX)
3867 ADDQ $0x80, BX
3868 DECQ DX
3869 JNZ zero_loop_encodeBlockAsm10B
3870 MOVL $0x00000000, 12(SP)
3871 MOVQ src_len+32(FP), DX
3872 LEAQ -9(DX), BX
3873 LEAQ -8(DX), SI
3874 MOVL SI, 8(SP)
3875 SHRQ $0x05, DX
3876 SUBL DX, BX
3877 LEAQ (CX)(BX*1), BX
3878 MOVQ BX, (SP)
3879 MOVL $0x00000001, DX
3880 MOVL DX, 16(SP)
3881 MOVQ src_base+24(FP), BX
3882
3883 search_loop_encodeBlockAsm10B:
3884 MOVL DX, SI
3885 SUBL 12(SP), SI
3886 SHRL $0x05, SI
3887 LEAL 4(DX)(SI*1), SI
3888 CMPL SI, 8(SP)
3889 JAE emit_remainder_encodeBlockAsm10B
3890 MOVQ (BX)(DX*1), DI
3891 MOVL SI, 20(SP)
3892 MOVQ $0x9e3779b1, R9
3893 MOVQ DI, R10
3894 MOVQ DI, R11
3895 SHRQ $0x08, R11
3896 SHLQ $0x20, R10
3897 IMULQ R9, R10
3898 SHRQ $0x36, R10
3899 SHLQ $0x20, R11
3900 IMULQ R9, R11
3901 SHRQ $0x36, R11
3902 MOVL (AX)(R10*4), SI
3903 MOVL (AX)(R11*4), R8
3904 MOVL DX, (AX)(R10*4)
3905 LEAL 1(DX), R10
3906 MOVL R10, (AX)(R11*4)
3907 MOVQ DI, R10
3908 SHRQ $0x10, R10
3909 SHLQ $0x20, R10
3910 IMULQ R9, R10
3911 SHRQ $0x36, R10
3912 MOVL DX, R9
3913 SUBL 16(SP), R9
3914 MOVL 1(BX)(R9*1), R11
3915 MOVQ DI, R9
3916 SHRQ $0x08, R9
3917 CMPL R9, R11
3918 JNE no_repeat_found_encodeBlockAsm10B
3919 LEAL 1(DX), DI
3920 MOVL 12(SP), R8
3921 MOVL DI, SI
3922 SUBL 16(SP), SI
3923 JZ repeat_extend_back_end_encodeBlockAsm10B
3924
3925 repeat_extend_back_loop_encodeBlockAsm10B:
3926 CMPL DI, R8
3927 JBE repeat_extend_back_end_encodeBlockAsm10B
3928 MOVB -1(BX)(SI*1), R9
3929 MOVB -1(BX)(DI*1), R10
3930 CMPB R9, R10
3931 JNE repeat_extend_back_end_encodeBlockAsm10B
3932 LEAL -1(DI), DI
3933 DECL SI
3934 JNZ repeat_extend_back_loop_encodeBlockAsm10B
3935
3936 repeat_extend_back_end_encodeBlockAsm10B:
3937 MOVL DI, SI
3938 SUBL 12(SP), SI
3939 LEAQ 3(CX)(SI*1), SI
3940 CMPQ SI, (SP)
3941 JB repeat_dst_size_check_encodeBlockAsm10B
3942 MOVQ $0x00000000, ret+56(FP)
3943 RET
3944
3945 repeat_dst_size_check_encodeBlockAsm10B:
3946 MOVL 12(SP), SI
3947 CMPL SI, DI
3948 JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B
3949 MOVL DI, R9
3950 MOVL DI, 12(SP)
3951 LEAQ (BX)(SI*1), R10
3952 SUBL SI, R9
3953 LEAL -1(R9), SI
3954 CMPL SI, $0x3c
3955 JB one_byte_repeat_emit_encodeBlockAsm10B
3956 CMPL SI, $0x00000100
3957 JB two_bytes_repeat_emit_encodeBlockAsm10B
3958 JB three_bytes_repeat_emit_encodeBlockAsm10B
3959
3960 three_bytes_repeat_emit_encodeBlockAsm10B:
3961 MOVB $0xf4, (CX)
3962 MOVW SI, 1(CX)
3963 ADDQ $0x03, CX
3964 JMP memmove_long_repeat_emit_encodeBlockAsm10B
3965
3966 two_bytes_repeat_emit_encodeBlockAsm10B:
3967 MOVB $0xf0, (CX)
3968 MOVB SI, 1(CX)
3969 ADDQ $0x02, CX
3970 CMPL SI, $0x40
3971 JB memmove_repeat_emit_encodeBlockAsm10B
3972 JMP memmove_long_repeat_emit_encodeBlockAsm10B
3973
3974 one_byte_repeat_emit_encodeBlockAsm10B:
3975 SHLB $0x02, SI
3976 MOVB SI, (CX)
3977 ADDQ $0x01, CX
3978
3979 memmove_repeat_emit_encodeBlockAsm10B:
3980 LEAQ (CX)(R9*1), SI
3981
3982 // genMemMoveShort
3983 CMPQ R9, $0x08
3984 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8
3985 CMPQ R9, $0x10
3986 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16
3987 CMPQ R9, $0x20
3988 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32
3989 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64
3990
3991 emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8:
3992 MOVQ (R10), R11
3993 MOVQ R11, (CX)
3994 JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
3995
3996 emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16:
3997 MOVQ (R10), R11
3998 MOVQ -8(R10)(R9*1), R10
3999 MOVQ R11, (CX)
4000 MOVQ R10, -8(CX)(R9*1)
4001 JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
4002
4003 emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32:
4004 MOVOU (R10), X0
4005 MOVOU -16(R10)(R9*1), X1
4006 MOVOU X0, (CX)
4007 MOVOU X1, -16(CX)(R9*1)
4008 JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B
4009
4010 emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64:
4011 MOVOU (R10), X0
4012 MOVOU 16(R10), X1
4013 MOVOU -32(R10)(R9*1), X2
4014 MOVOU -16(R10)(R9*1), X3
4015 MOVOU X0, (CX)
4016 MOVOU X1, 16(CX)
4017 MOVOU X2, -32(CX)(R9*1)
4018 MOVOU X3, -16(CX)(R9*1)
4019
4020 memmove_end_copy_repeat_emit_encodeBlockAsm10B:
4021 MOVQ SI, CX
4022 JMP emit_literal_done_repeat_emit_encodeBlockAsm10B
4023
4024 memmove_long_repeat_emit_encodeBlockAsm10B:
4025 LEAQ (CX)(R9*1), SI
4026
4027 // genMemMoveLong
4028 MOVOU (R10), X0
4029 MOVOU 16(R10), X1
4030 MOVOU -32(R10)(R9*1), X2
4031 MOVOU -16(R10)(R9*1), X3
4032 MOVQ R9, R12
4033 SHRQ $0x05, R12
4034 MOVQ CX, R11
4035 ANDL $0x0000001f, R11
4036 MOVQ $0x00000040, R13
4037 SUBQ R11, R13
4038 DECQ R12
4039 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
4040 LEAQ -32(R10)(R13*1), R11
4041 LEAQ -32(CX)(R13*1), R14
4042
4043 emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back:
4044 MOVOU (R11), X4
4045 MOVOU 16(R11), X5
4046 MOVOA X4, (R14)
4047 MOVOA X5, 16(R14)
4048 ADDQ $0x20, R14
4049 ADDQ $0x20, R11
4050 ADDQ $0x20, R13
4051 DECQ R12
4052 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back
4053
4054 emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
4055 MOVOU -32(R10)(R13*1), X4
4056 MOVOU -16(R10)(R13*1), X5
4057 MOVOA X4, -32(CX)(R13*1)
4058 MOVOA X5, -16(CX)(R13*1)
4059 ADDQ $0x20, R13
4060 CMPQ R9, R13
4061 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
4062 MOVOU X0, (CX)
4063 MOVOU X1, 16(CX)
4064 MOVOU X2, -32(CX)(R9*1)
4065 MOVOU X3, -16(CX)(R9*1)
4066 MOVQ SI, CX
4067
4068 emit_literal_done_repeat_emit_encodeBlockAsm10B:
4069 ADDL $0x05, DX
4070 MOVL DX, SI
4071 SUBL 16(SP), SI
4072 MOVQ src_len+32(FP), R9
4073 SUBL DX, R9
4074 LEAQ (BX)(DX*1), R10
4075 LEAQ (BX)(SI*1), SI
4076
4077 // matchLen
4078 XORL R12, R12
4079
4080 matchlen_loopback_16_repeat_extend_encodeBlockAsm10B:
4081 CMPL R9, $0x10
4082 JB matchlen_match8_repeat_extend_encodeBlockAsm10B
4083 MOVQ (R10)(R12*1), R11
4084 MOVQ 8(R10)(R12*1), R13
4085 XORQ (SI)(R12*1), R11
4086 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B
4087 XORQ 8(SI)(R12*1), R13
4088 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm10B
4089 LEAL -16(R9), R9
4090 LEAL 16(R12), R12
4091 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm10B
4092
4093 matchlen_bsf_16repeat_extend_encodeBlockAsm10B:
4094 #ifdef GOAMD64_v3
4095 TZCNTQ R13, R13
4096
4097 #else
4098 BSFQ R13, R13
4099
4100 #endif
4101 SARQ $0x03, R13
4102 LEAL 8(R12)(R13*1), R12
4103 JMP repeat_extend_forward_end_encodeBlockAsm10B
4104
4105 matchlen_match8_repeat_extend_encodeBlockAsm10B:
4106 CMPL R9, $0x08
4107 JB matchlen_match4_repeat_extend_encodeBlockAsm10B
4108 MOVQ (R10)(R12*1), R11
4109 XORQ (SI)(R12*1), R11
4110 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B
4111 LEAL -8(R9), R9
4112 LEAL 8(R12), R12
4113 JMP matchlen_match4_repeat_extend_encodeBlockAsm10B
4114
4115 matchlen_bsf_8_repeat_extend_encodeBlockAsm10B:
4116 #ifdef GOAMD64_v3
4117 TZCNTQ R11, R11
4118
4119 #else
4120 BSFQ R11, R11
4121
4122 #endif
4123 SARQ $0x03, R11
4124 LEAL (R12)(R11*1), R12
4125 JMP repeat_extend_forward_end_encodeBlockAsm10B
4126
4127 matchlen_match4_repeat_extend_encodeBlockAsm10B:
4128 CMPL R9, $0x04
4129 JB matchlen_match2_repeat_extend_encodeBlockAsm10B
4130 MOVL (R10)(R12*1), R11
4131 CMPL (SI)(R12*1), R11
4132 JNE matchlen_match2_repeat_extend_encodeBlockAsm10B
4133 LEAL -4(R9), R9
4134 LEAL 4(R12), R12
4135
4136 matchlen_match2_repeat_extend_encodeBlockAsm10B:
4137 CMPL R9, $0x01
4138 JE matchlen_match1_repeat_extend_encodeBlockAsm10B
4139 JB repeat_extend_forward_end_encodeBlockAsm10B
4140 MOVW (R10)(R12*1), R11
4141 CMPW (SI)(R12*1), R11
4142 JNE matchlen_match1_repeat_extend_encodeBlockAsm10B
4143 LEAL 2(R12), R12
4144 SUBL $0x02, R9
4145 JZ repeat_extend_forward_end_encodeBlockAsm10B
4146
4147 matchlen_match1_repeat_extend_encodeBlockAsm10B:
4148 MOVB (R10)(R12*1), R11
4149 CMPB (SI)(R12*1), R11
4150 JNE repeat_extend_forward_end_encodeBlockAsm10B
4151 LEAL 1(R12), R12
4152
4153 repeat_extend_forward_end_encodeBlockAsm10B:
4154 ADDL R12, DX
4155 MOVL DX, SI
4156 SUBL DI, SI
4157 MOVL 16(SP), DI
4158 TESTL R8, R8
4159 JZ repeat_as_copy_encodeBlockAsm10B
4160
4161 // emitRepeat
4162 MOVL SI, R8
4163 LEAL -4(SI), SI
4164 CMPL R8, $0x08
4165 JBE repeat_two_match_repeat_encodeBlockAsm10B
4166 CMPL R8, $0x0c
4167 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B
4168 CMPL DI, $0x00000800
4169 JB repeat_two_offset_match_repeat_encodeBlockAsm10B
4170
4171 cant_repeat_two_offset_match_repeat_encodeBlockAsm10B:
4172 CMPL SI, $0x00000104
4173 JB repeat_three_match_repeat_encodeBlockAsm10B
4174 LEAL -256(SI), SI
4175 MOVW $0x0019, (CX)
4176 MOVW SI, 2(CX)
4177 ADDQ $0x04, CX
4178 JMP repeat_end_emit_encodeBlockAsm10B
4179
4180 repeat_three_match_repeat_encodeBlockAsm10B:
4181 LEAL -4(SI), SI
4182 MOVW $0x0015, (CX)
4183 MOVB SI, 2(CX)
4184 ADDQ $0x03, CX
4185 JMP repeat_end_emit_encodeBlockAsm10B
4186
4187 repeat_two_match_repeat_encodeBlockAsm10B:
4188 SHLL $0x02, SI
4189 ORL $0x01, SI
4190 MOVW SI, (CX)
4191 ADDQ $0x02, CX
4192 JMP repeat_end_emit_encodeBlockAsm10B
4193
4194 repeat_two_offset_match_repeat_encodeBlockAsm10B:
4195 XORQ R8, R8
4196 LEAL 1(R8)(SI*4), SI
4197 MOVB DI, 1(CX)
4198 SARL $0x08, DI
4199 SHLL $0x05, DI
4200 ORL DI, SI
4201 MOVB SI, (CX)
4202 ADDQ $0x02, CX
4203 JMP repeat_end_emit_encodeBlockAsm10B
4204
4205 repeat_as_copy_encodeBlockAsm10B:
4206 // emitCopy
4207 CMPL SI, $0x40
4208 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B
4209 CMPL DI, $0x00000800
4210 JAE long_offset_short_repeat_as_copy_encodeBlockAsm10B
4211 MOVL $0x00000001, R8
4212 LEAL 16(R8), R8
4213 MOVB DI, 1(CX)
4214 SHRL $0x08, DI
4215 SHLL $0x05, DI
4216 ORL DI, R8
4217 MOVB R8, (CX)
4218 ADDQ $0x02, CX
4219 SUBL $0x08, SI
4220
4221 // emitRepeat
4222 LEAL -4(SI), SI
4223 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
4224 MOVL SI, R8
4225 LEAL -4(SI), SI
4226 CMPL R8, $0x08
4227 JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
4228 CMPL R8, $0x0c
4229 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
4230 CMPL DI, $0x00000800
4231 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
4232
4233 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
4234 CMPL SI, $0x00000104
4235 JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b
4236 LEAL -256(SI), SI
4237 MOVW $0x0019, (CX)
4238 MOVW SI, 2(CX)
4239 ADDQ $0x04, CX
4240 JMP repeat_end_emit_encodeBlockAsm10B
4241
4242 repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
4243 LEAL -4(SI), SI
4244 MOVW $0x0015, (CX)
4245 MOVB SI, 2(CX)
4246 ADDQ $0x03, CX
4247 JMP repeat_end_emit_encodeBlockAsm10B
4248
4249 repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
4250 SHLL $0x02, SI
4251 ORL $0x01, SI
4252 MOVW SI, (CX)
4253 ADDQ $0x02, CX
4254 JMP repeat_end_emit_encodeBlockAsm10B
4255
4256 repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b:
4257 XORQ R8, R8
4258 LEAL 1(R8)(SI*4), SI
4259 MOVB DI, 1(CX)
4260 SARL $0x08, DI
4261 SHLL $0x05, DI
4262 ORL DI, SI
4263 MOVB SI, (CX)
4264 ADDQ $0x02, CX
4265 JMP repeat_end_emit_encodeBlockAsm10B
4266
4267 long_offset_short_repeat_as_copy_encodeBlockAsm10B:
4268 MOVB $0xee, (CX)
4269 MOVW DI, 1(CX)
4270 LEAL -60(SI), SI
4271 ADDQ $0x03, CX
4272
4273 // emitRepeat
4274 MOVL SI, R8
4275 LEAL -4(SI), SI
4276 CMPL R8, $0x08
4277 JBE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
4278 CMPL R8, $0x0c
4279 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
4280 CMPL DI, $0x00000800
4281 JB repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
4282
4283 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
4284 CMPL SI, $0x00000104
4285 JB repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short
4286 LEAL -256(SI), SI
4287 MOVW $0x0019, (CX)
4288 MOVW SI, 2(CX)
4289 ADDQ $0x04, CX
4290 JMP repeat_end_emit_encodeBlockAsm10B
4291
4292 repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
4293 LEAL -4(SI), SI
4294 MOVW $0x0015, (CX)
4295 MOVB SI, 2(CX)
4296 ADDQ $0x03, CX
4297 JMP repeat_end_emit_encodeBlockAsm10B
4298
4299 repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
4300 SHLL $0x02, SI
4301 ORL $0x01, SI
4302 MOVW SI, (CX)
4303 ADDQ $0x02, CX
4304 JMP repeat_end_emit_encodeBlockAsm10B
4305
4306 repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short:
4307 XORQ R8, R8
4308 LEAL 1(R8)(SI*4), SI
4309 MOVB DI, 1(CX)
4310 SARL $0x08, DI
4311 SHLL $0x05, DI
4312 ORL DI, SI
4313 MOVB SI, (CX)
4314 ADDQ $0x02, CX
4315 JMP repeat_end_emit_encodeBlockAsm10B
4316
4317 two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B:
4318 MOVL SI, R8
4319 SHLL $0x02, R8
4320 CMPL SI, $0x0c
4321 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
4322 CMPL DI, $0x00000800
4323 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm10B
4324 LEAL -15(R8), R8
4325 MOVB DI, 1(CX)
4326 SHRL $0x08, DI
4327 SHLL $0x05, DI
4328 ORL DI, R8
4329 MOVB R8, (CX)
4330 ADDQ $0x02, CX
4331 JMP repeat_end_emit_encodeBlockAsm10B
4332
4333 emit_copy_three_repeat_as_copy_encodeBlockAsm10B:
4334 LEAL -2(R8), R8
4335 MOVB R8, (CX)
4336 MOVW DI, 1(CX)
4337 ADDQ $0x03, CX
4338
4339 repeat_end_emit_encodeBlockAsm10B:
4340 MOVL DX, 12(SP)
4341 JMP search_loop_encodeBlockAsm10B
4342
4343 no_repeat_found_encodeBlockAsm10B:
4344 CMPL (BX)(SI*1), DI
4345 JEQ candidate_match_encodeBlockAsm10B
4346 SHRQ $0x08, DI
4347 MOVL (AX)(R10*4), SI
4348 LEAL 2(DX), R9
4349 CMPL (BX)(R8*1), DI
4350 JEQ candidate2_match_encodeBlockAsm10B
4351 MOVL R9, (AX)(R10*4)
4352 SHRQ $0x08, DI
4353 CMPL (BX)(SI*1), DI
4354 JEQ candidate3_match_encodeBlockAsm10B
4355 MOVL 20(SP), DX
4356 JMP search_loop_encodeBlockAsm10B
4357
4358 candidate3_match_encodeBlockAsm10B:
4359 ADDL $0x02, DX
4360 JMP candidate_match_encodeBlockAsm10B
4361
4362 candidate2_match_encodeBlockAsm10B:
4363 MOVL R9, (AX)(R10*4)
4364 INCL DX
4365 MOVL R8, SI
4366
4367 candidate_match_encodeBlockAsm10B:
4368 MOVL 12(SP), DI
4369 TESTL SI, SI
4370 JZ match_extend_back_end_encodeBlockAsm10B
4371
4372 match_extend_back_loop_encodeBlockAsm10B:
4373 CMPL DX, DI
4374 JBE match_extend_back_end_encodeBlockAsm10B
4375 MOVB -1(BX)(SI*1), R8
4376 MOVB -1(BX)(DX*1), R9
4377 CMPB R8, R9
4378 JNE match_extend_back_end_encodeBlockAsm10B
4379 LEAL -1(DX), DX
4380 DECL SI
4381 JZ match_extend_back_end_encodeBlockAsm10B
4382 JMP match_extend_back_loop_encodeBlockAsm10B
4383
4384 match_extend_back_end_encodeBlockAsm10B:
4385 MOVL DX, DI
4386 SUBL 12(SP), DI
4387 LEAQ 3(CX)(DI*1), DI
4388 CMPQ DI, (SP)
4389 JB match_dst_size_check_encodeBlockAsm10B
4390 MOVQ $0x00000000, ret+56(FP)
4391 RET
4392
4393 match_dst_size_check_encodeBlockAsm10B:
4394 MOVL DX, DI
4395 MOVL 12(SP), R8
4396 CMPL R8, DI
4397 JEQ emit_literal_done_match_emit_encodeBlockAsm10B
4398 MOVL DI, R9
4399 MOVL DI, 12(SP)
4400 LEAQ (BX)(R8*1), DI
4401 SUBL R8, R9
4402 LEAL -1(R9), R8
4403 CMPL R8, $0x3c
4404 JB one_byte_match_emit_encodeBlockAsm10B
4405 CMPL R8, $0x00000100
4406 JB two_bytes_match_emit_encodeBlockAsm10B
4407 JB three_bytes_match_emit_encodeBlockAsm10B
4408
4409 three_bytes_match_emit_encodeBlockAsm10B:
4410 MOVB $0xf4, (CX)
4411 MOVW R8, 1(CX)
4412 ADDQ $0x03, CX
4413 JMP memmove_long_match_emit_encodeBlockAsm10B
4414
4415 two_bytes_match_emit_encodeBlockAsm10B:
4416 MOVB $0xf0, (CX)
4417 MOVB R8, 1(CX)
4418 ADDQ $0x02, CX
4419 CMPL R8, $0x40
4420 JB memmove_match_emit_encodeBlockAsm10B
4421 JMP memmove_long_match_emit_encodeBlockAsm10B
4422
4423 one_byte_match_emit_encodeBlockAsm10B:
4424 SHLB $0x02, R8
4425 MOVB R8, (CX)
4426 ADDQ $0x01, CX
4427
4428 memmove_match_emit_encodeBlockAsm10B:
4429 LEAQ (CX)(R9*1), R8
4430
4431 // genMemMoveShort
4432 CMPQ R9, $0x08
4433 JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8
4434 CMPQ R9, $0x10
4435 JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16
4436 CMPQ R9, $0x20
4437 JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32
4438 JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64
4439
4440 emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8:
4441 MOVQ (DI), R10
4442 MOVQ R10, (CX)
4443 JMP memmove_end_copy_match_emit_encodeBlockAsm10B
4444
4445 emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16:
4446 MOVQ (DI), R10
4447 MOVQ -8(DI)(R9*1), DI
4448 MOVQ R10, (CX)
4449 MOVQ DI, -8(CX)(R9*1)
4450 JMP memmove_end_copy_match_emit_encodeBlockAsm10B
4451
4452 emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32:
4453 MOVOU (DI), X0
4454 MOVOU -16(DI)(R9*1), X1
4455 MOVOU X0, (CX)
4456 MOVOU X1, -16(CX)(R9*1)
4457 JMP memmove_end_copy_match_emit_encodeBlockAsm10B
4458
4459 emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64:
4460 MOVOU (DI), X0
4461 MOVOU 16(DI), X1
4462 MOVOU -32(DI)(R9*1), X2
4463 MOVOU -16(DI)(R9*1), X3
4464 MOVOU X0, (CX)
4465 MOVOU X1, 16(CX)
4466 MOVOU X2, -32(CX)(R9*1)
4467 MOVOU X3, -16(CX)(R9*1)
4468
4469 memmove_end_copy_match_emit_encodeBlockAsm10B:
4470 MOVQ R8, CX
4471 JMP emit_literal_done_match_emit_encodeBlockAsm10B
4472
4473 memmove_long_match_emit_encodeBlockAsm10B:
4474 LEAQ (CX)(R9*1), R8
4475
4476 // genMemMoveLong
4477 MOVOU (DI), X0
4478 MOVOU 16(DI), X1
4479 MOVOU -32(DI)(R9*1), X2
4480 MOVOU -16(DI)(R9*1), X3
4481 MOVQ R9, R11
4482 SHRQ $0x05, R11
4483 MOVQ CX, R10
4484 ANDL $0x0000001f, R10
4485 MOVQ $0x00000040, R12
4486 SUBQ R10, R12
4487 DECQ R11
4488 JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
4489 LEAQ -32(DI)(R12*1), R10
4490 LEAQ -32(CX)(R12*1), R13
4491
4492 emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back:
4493 MOVOU (R10), X4
4494 MOVOU 16(R10), X5
4495 MOVOA X4, (R13)
4496 MOVOA X5, 16(R13)
4497 ADDQ $0x20, R13
4498 ADDQ $0x20, R10
4499 ADDQ $0x20, R12
4500 DECQ R11
4501 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back
4502
4503 emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32:
4504 MOVOU -32(DI)(R12*1), X4
4505 MOVOU -16(DI)(R12*1), X5
4506 MOVOA X4, -32(CX)(R12*1)
4507 MOVOA X5, -16(CX)(R12*1)
4508 ADDQ $0x20, R12
4509 CMPQ R9, R12
4510 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32
4511 MOVOU X0, (CX)
4512 MOVOU X1, 16(CX)
4513 MOVOU X2, -32(CX)(R9*1)
4514 MOVOU X3, -16(CX)(R9*1)
4515 MOVQ R8, CX
4516
4517 emit_literal_done_match_emit_encodeBlockAsm10B:
4518 match_nolit_loop_encodeBlockAsm10B:
4519 MOVL DX, DI
4520 SUBL SI, DI
4521 MOVL DI, 16(SP)
4522 ADDL $0x04, DX
4523 ADDL $0x04, SI
4524 MOVQ src_len+32(FP), DI
4525 SUBL DX, DI
4526 LEAQ (BX)(DX*1), R8
4527 LEAQ (BX)(SI*1), SI
4528
4529 // matchLen
4530 XORL R10, R10
4531
4532 matchlen_loopback_16_match_nolit_encodeBlockAsm10B:
4533 CMPL DI, $0x10
4534 JB matchlen_match8_match_nolit_encodeBlockAsm10B
4535 MOVQ (R8)(R10*1), R9
4536 MOVQ 8(R8)(R10*1), R11
4537 XORQ (SI)(R10*1), R9
4538 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B
4539 XORQ 8(SI)(R10*1), R11
4540 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm10B
4541 LEAL -16(DI), DI
4542 LEAL 16(R10), R10
4543 JMP matchlen_loopback_16_match_nolit_encodeBlockAsm10B
4544
4545 matchlen_bsf_16match_nolit_encodeBlockAsm10B:
4546 #ifdef GOAMD64_v3
4547 TZCNTQ R11, R11
4548
4549 #else
4550 BSFQ R11, R11
4551
4552 #endif
4553 SARQ $0x03, R11
4554 LEAL 8(R10)(R11*1), R10
4555 JMP match_nolit_end_encodeBlockAsm10B
4556
4557 matchlen_match8_match_nolit_encodeBlockAsm10B:
4558 CMPL DI, $0x08
4559 JB matchlen_match4_match_nolit_encodeBlockAsm10B
4560 MOVQ (R8)(R10*1), R9
4561 XORQ (SI)(R10*1), R9
4562 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B
4563 LEAL -8(DI), DI
4564 LEAL 8(R10), R10
4565 JMP matchlen_match4_match_nolit_encodeBlockAsm10B
4566
4567 matchlen_bsf_8_match_nolit_encodeBlockAsm10B:
4568 #ifdef GOAMD64_v3
4569 TZCNTQ R9, R9
4570
4571 #else
4572 BSFQ R9, R9
4573
4574 #endif
4575 SARQ $0x03, R9
4576 LEAL (R10)(R9*1), R10
4577 JMP match_nolit_end_encodeBlockAsm10B
4578
4579 matchlen_match4_match_nolit_encodeBlockAsm10B:
4580 CMPL DI, $0x04
4581 JB matchlen_match2_match_nolit_encodeBlockAsm10B
4582 MOVL (R8)(R10*1), R9
4583 CMPL (SI)(R10*1), R9
4584 JNE matchlen_match2_match_nolit_encodeBlockAsm10B
4585 LEAL -4(DI), DI
4586 LEAL 4(R10), R10
4587
4588 matchlen_match2_match_nolit_encodeBlockAsm10B:
4589 CMPL DI, $0x01
4590 JE matchlen_match1_match_nolit_encodeBlockAsm10B
4591 JB match_nolit_end_encodeBlockAsm10B
4592 MOVW (R8)(R10*1), R9
4593 CMPW (SI)(R10*1), R9
4594 JNE matchlen_match1_match_nolit_encodeBlockAsm10B
4595 LEAL 2(R10), R10
4596 SUBL $0x02, DI
4597 JZ match_nolit_end_encodeBlockAsm10B
4598
4599 matchlen_match1_match_nolit_encodeBlockAsm10B:
4600 MOVB (R8)(R10*1), R9
4601 CMPB (SI)(R10*1), R9
4602 JNE match_nolit_end_encodeBlockAsm10B
4603 LEAL 1(R10), R10
4604
4605 match_nolit_end_encodeBlockAsm10B:
4606 ADDL R10, DX
4607 MOVL 16(SP), SI
4608 ADDL $0x04, R10
4609 MOVL DX, 12(SP)
4610
4611 // emitCopy
4612 CMPL R10, $0x40
4613 JBE two_byte_offset_short_match_nolit_encodeBlockAsm10B
4614 CMPL SI, $0x00000800
4615 JAE long_offset_short_match_nolit_encodeBlockAsm10B
4616 MOVL $0x00000001, DI
4617 LEAL 16(DI), DI
4618 MOVB SI, 1(CX)
4619 SHRL $0x08, SI
4620 SHLL $0x05, SI
4621 ORL SI, DI
4622 MOVB DI, (CX)
4623 ADDQ $0x02, CX
4624 SUBL $0x08, R10
4625
4626 // emitRepeat
4627 LEAL -4(R10), R10
4628 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
4629 MOVL R10, DI
4630 LEAL -4(R10), R10
4631 CMPL DI, $0x08
4632 JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
4633 CMPL DI, $0x0c
4634 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
4635 CMPL SI, $0x00000800
4636 JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
4637
4638 cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
4639 CMPL R10, $0x00000104
4640 JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b
4641 LEAL -256(R10), R10
4642 MOVW $0x0019, (CX)
4643 MOVW R10, 2(CX)
4644 ADDQ $0x04, CX
4645 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4646
4647 repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
4648 LEAL -4(R10), R10
4649 MOVW $0x0015, (CX)
4650 MOVB R10, 2(CX)
4651 ADDQ $0x03, CX
4652 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4653
4654 repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
4655 SHLL $0x02, R10
4656 ORL $0x01, R10
4657 MOVW R10, (CX)
4658 ADDQ $0x02, CX
4659 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4660
4661 repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b:
4662 XORQ DI, DI
4663 LEAL 1(DI)(R10*4), R10
4664 MOVB SI, 1(CX)
4665 SARL $0x08, SI
4666 SHLL $0x05, SI
4667 ORL SI, R10
4668 MOVB R10, (CX)
4669 ADDQ $0x02, CX
4670 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4671
4672 long_offset_short_match_nolit_encodeBlockAsm10B:
4673 MOVB $0xee, (CX)
4674 MOVW SI, 1(CX)
4675 LEAL -60(R10), R10
4676 ADDQ $0x03, CX
4677
4678 // emitRepeat
4679 MOVL R10, DI
4680 LEAL -4(R10), R10
4681 CMPL DI, $0x08
4682 JBE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short
4683 CMPL DI, $0x0c
4684 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
4685 CMPL SI, $0x00000800
4686 JB repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short
4687
4688 cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
4689 CMPL R10, $0x00000104
4690 JB repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short
4691 LEAL -256(R10), R10
4692 MOVW $0x0019, (CX)
4693 MOVW R10, 2(CX)
4694 ADDQ $0x04, CX
4695 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4696
4697 repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short:
4698 LEAL -4(R10), R10
4699 MOVW $0x0015, (CX)
4700 MOVB R10, 2(CX)
4701 ADDQ $0x03, CX
4702 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4703
4704 repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short:
4705 SHLL $0x02, R10
4706 ORL $0x01, R10
4707 MOVW R10, (CX)
4708 ADDQ $0x02, CX
4709 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4710
4711 repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short:
4712 XORQ DI, DI
4713 LEAL 1(DI)(R10*4), R10
4714 MOVB SI, 1(CX)
4715 SARL $0x08, SI
4716 SHLL $0x05, SI
4717 ORL SI, R10
4718 MOVB R10, (CX)
4719 ADDQ $0x02, CX
4720 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4721
4722 two_byte_offset_short_match_nolit_encodeBlockAsm10B:
4723 MOVL R10, DI
4724 SHLL $0x02, DI
4725 CMPL R10, $0x0c
4726 JAE emit_copy_three_match_nolit_encodeBlockAsm10B
4727 CMPL SI, $0x00000800
4728 JAE emit_copy_three_match_nolit_encodeBlockAsm10B
4729 LEAL -15(DI), DI
4730 MOVB SI, 1(CX)
4731 SHRL $0x08, SI
4732 SHLL $0x05, SI
4733 ORL SI, DI
4734 MOVB DI, (CX)
4735 ADDQ $0x02, CX
4736 JMP match_nolit_emitcopy_end_encodeBlockAsm10B
4737
4738 emit_copy_three_match_nolit_encodeBlockAsm10B:
4739 LEAL -2(DI), DI
4740 MOVB DI, (CX)
4741 MOVW SI, 1(CX)
4742 ADDQ $0x03, CX
4743
4744 match_nolit_emitcopy_end_encodeBlockAsm10B:
4745 CMPL DX, 8(SP)
4746 JAE emit_remainder_encodeBlockAsm10B
4747 MOVQ -2(BX)(DX*1), DI
4748 CMPQ CX, (SP)
4749 JB match_nolit_dst_ok_encodeBlockAsm10B
4750 MOVQ $0x00000000, ret+56(FP)
4751 RET
4752
4753 match_nolit_dst_ok_encodeBlockAsm10B:
4754 MOVQ $0x9e3779b1, R9
4755 MOVQ DI, R8
4756 SHRQ $0x10, DI
4757 MOVQ DI, SI
4758 SHLQ $0x20, R8
4759 IMULQ R9, R8
4760 SHRQ $0x36, R8
4761 SHLQ $0x20, SI
4762 IMULQ R9, SI
4763 SHRQ $0x36, SI
4764 LEAL -2(DX), R9
4765 LEAQ (AX)(SI*4), R10
4766 MOVL (R10), SI
4767 MOVL R9, (AX)(R8*4)
4768 MOVL DX, (R10)
4769 CMPL (BX)(SI*1), DI
4770 JEQ match_nolit_loop_encodeBlockAsm10B
4771 INCL DX
4772 JMP search_loop_encodeBlockAsm10B
4773
4774 emit_remainder_encodeBlockAsm10B:
4775 MOVQ src_len+32(FP), AX
4776 SUBL 12(SP), AX
4777 LEAQ 3(CX)(AX*1), AX
4778 CMPQ AX, (SP)
4779 JB emit_remainder_ok_encodeBlockAsm10B
4780 MOVQ $0x00000000, ret+56(FP)
4781 RET
4782
4783 emit_remainder_ok_encodeBlockAsm10B:
4784 MOVQ src_len+32(FP), AX
4785 MOVL 12(SP), DX
4786 CMPL DX, AX
4787 JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B
4788 MOVL AX, SI
4789 MOVL AX, 12(SP)
4790 LEAQ (BX)(DX*1), AX
4791 SUBL DX, SI
4792 LEAL -1(SI), DX
4793 CMPL DX, $0x3c
4794 JB one_byte_emit_remainder_encodeBlockAsm10B
4795 CMPL DX, $0x00000100
4796 JB two_bytes_emit_remainder_encodeBlockAsm10B
4797 JB three_bytes_emit_remainder_encodeBlockAsm10B
4798
4799 three_bytes_emit_remainder_encodeBlockAsm10B:
4800 MOVB $0xf4, (CX)
4801 MOVW DX, 1(CX)
4802 ADDQ $0x03, CX
4803 JMP memmove_long_emit_remainder_encodeBlockAsm10B
4804
4805 two_bytes_emit_remainder_encodeBlockAsm10B:
4806 MOVB $0xf0, (CX)
4807 MOVB DL, 1(CX)
4808 ADDQ $0x02, CX
4809 CMPL DX, $0x40
4810 JB memmove_emit_remainder_encodeBlockAsm10B
4811 JMP memmove_long_emit_remainder_encodeBlockAsm10B
4812
4813 one_byte_emit_remainder_encodeBlockAsm10B:
4814 SHLB $0x02, DL
4815 MOVB DL, (CX)
4816 ADDQ $0x01, CX
4817
4818 memmove_emit_remainder_encodeBlockAsm10B:
4819 LEAQ (CX)(SI*1), DX
4820 MOVL SI, BX
4821
4822 // genMemMoveShort
4823 CMPQ BX, $0x03
4824 JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2
4825 JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3
4826 CMPQ BX, $0x08
4827 JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7
4828 CMPQ BX, $0x10
4829 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16
4830 CMPQ BX, $0x20
4831 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32
4832 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64
4833
4834 emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2:
4835 MOVB (AX), SI
4836 MOVB -1(AX)(BX*1), AL
4837 MOVB SI, (CX)
4838 MOVB AL, -1(CX)(BX*1)
4839 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
4840
4841 emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3:
4842 MOVW (AX), SI
4843 MOVB 2(AX), AL
4844 MOVW SI, (CX)
4845 MOVB AL, 2(CX)
4846 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
4847
4848 emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7:
4849 MOVL (AX), SI
4850 MOVL -4(AX)(BX*1), AX
4851 MOVL SI, (CX)
4852 MOVL AX, -4(CX)(BX*1)
4853 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
4854
4855 emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16:
4856 MOVQ (AX), SI
4857 MOVQ -8(AX)(BX*1), AX
4858 MOVQ SI, (CX)
4859 MOVQ AX, -8(CX)(BX*1)
4860 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
4861
4862 emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32:
4863 MOVOU (AX), X0
4864 MOVOU -16(AX)(BX*1), X1
4865 MOVOU X0, (CX)
4866 MOVOU X1, -16(CX)(BX*1)
4867 JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B
4868
4869 emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64:
4870 MOVOU (AX), X0
4871 MOVOU 16(AX), X1
4872 MOVOU -32(AX)(BX*1), X2
4873 MOVOU -16(AX)(BX*1), X3
4874 MOVOU X0, (CX)
4875 MOVOU X1, 16(CX)
4876 MOVOU X2, -32(CX)(BX*1)
4877 MOVOU X3, -16(CX)(BX*1)
4878
4879 memmove_end_copy_emit_remainder_encodeBlockAsm10B:
4880 MOVQ DX, CX
4881 JMP emit_literal_done_emit_remainder_encodeBlockAsm10B
4882
4883 memmove_long_emit_remainder_encodeBlockAsm10B:
4884 LEAQ (CX)(SI*1), DX
4885 MOVL SI, BX
4886
4887 // genMemMoveLong
4888 MOVOU (AX), X0
4889 MOVOU 16(AX), X1
4890 MOVOU -32(AX)(BX*1), X2
4891 MOVOU -16(AX)(BX*1), X3
4892 MOVQ BX, DI
4893 SHRQ $0x05, DI
4894 MOVQ CX, SI
4895 ANDL $0x0000001f, SI
4896 MOVQ $0x00000040, R8
4897 SUBQ SI, R8
4898 DECQ DI
4899 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
4900 LEAQ -32(AX)(R8*1), SI
4901 LEAQ -32(CX)(R8*1), R9
4902
4903 emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back:
4904 MOVOU (SI), X4
4905 MOVOU 16(SI), X5
4906 MOVOA X4, (R9)
4907 MOVOA X5, 16(R9)
4908 ADDQ $0x20, R9
4909 ADDQ $0x20, SI
4910 ADDQ $0x20, R8
4911 DECQ DI
4912 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back
4913
4914 emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32:
4915 MOVOU -32(AX)(R8*1), X4
4916 MOVOU -16(AX)(R8*1), X5
4917 MOVOA X4, -32(CX)(R8*1)
4918 MOVOA X5, -16(CX)(R8*1)
4919 ADDQ $0x20, R8
4920 CMPQ BX, R8
4921 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
4922 MOVOU X0, (CX)
4923 MOVOU X1, 16(CX)
4924 MOVOU X2, -32(CX)(BX*1)
4925 MOVOU X3, -16(CX)(BX*1)
4926 MOVQ DX, CX
4927
4928 emit_literal_done_emit_remainder_encodeBlockAsm10B:
4929 MOVQ dst_base+0(FP), AX
4930 SUBQ AX, CX
4931 MOVQ CX, ret+56(FP)
4932 RET
4933
4934 // func encodeBlockAsm8B(dst []byte, src []byte, tmp *[1024]byte) int
4935 // Requires: BMI, SSE2
4936 TEXT ·encodeBlockAsm8B(SB), $24-64
4937 MOVQ tmp+48(FP), AX
4938 MOVQ dst_base+0(FP), CX
4939 MOVQ $0x00000008, DX
4940 MOVQ AX, BX
4941 PXOR X0, X0
4942
4943 zero_loop_encodeBlockAsm8B:
4944 MOVOU X0, (BX)
4945 MOVOU X0, 16(BX)
4946 MOVOU X0, 32(BX)
4947 MOVOU X0, 48(BX)
4948 MOVOU X0, 64(BX)
4949 MOVOU X0, 80(BX)
4950 MOVOU X0, 96(BX)
4951 MOVOU X0, 112(BX)
4952 ADDQ $0x80, BX
4953 DECQ DX
4954 JNZ zero_loop_encodeBlockAsm8B
4955 MOVL $0x00000000, 12(SP)
4956 MOVQ src_len+32(FP), DX
4957 LEAQ -9(DX), BX
4958 LEAQ -8(DX), SI
4959 MOVL SI, 8(SP)
4960 SHRQ $0x05, DX
4961 SUBL DX, BX
4962 LEAQ (CX)(BX*1), BX
4963 MOVQ BX, (SP)
4964 MOVL $0x00000001, DX
4965 MOVL DX, 16(SP)
4966 MOVQ src_base+24(FP), BX
4967
4968 search_loop_encodeBlockAsm8B:
4969 MOVL DX, SI
4970 SUBL 12(SP), SI
4971 SHRL $0x04, SI
4972 LEAL 4(DX)(SI*1), SI
4973 CMPL SI, 8(SP)
4974 JAE emit_remainder_encodeBlockAsm8B
4975 MOVQ (BX)(DX*1), DI
4976 MOVL SI, 20(SP)
4977 MOVQ $0x9e3779b1, R9
4978 MOVQ DI, R10
4979 MOVQ DI, R11
4980 SHRQ $0x08, R11
4981 SHLQ $0x20, R10
4982 IMULQ R9, R10
4983 SHRQ $0x38, R10
4984 SHLQ $0x20, R11
4985 IMULQ R9, R11
4986 SHRQ $0x38, R11
4987 MOVL (AX)(R10*4), SI
4988 MOVL (AX)(R11*4), R8
4989 MOVL DX, (AX)(R10*4)
4990 LEAL 1(DX), R10
4991 MOVL R10, (AX)(R11*4)
4992 MOVQ DI, R10
4993 SHRQ $0x10, R10
4994 SHLQ $0x20, R10
4995 IMULQ R9, R10
4996 SHRQ $0x38, R10
4997 MOVL DX, R9
4998 SUBL 16(SP), R9
4999 MOVL 1(BX)(R9*1), R11
5000 MOVQ DI, R9
5001 SHRQ $0x08, R9
5002 CMPL R9, R11
5003 JNE no_repeat_found_encodeBlockAsm8B
5004 LEAL 1(DX), DI
5005 MOVL 12(SP), R8
5006 MOVL DI, SI
5007 SUBL 16(SP), SI
5008 JZ repeat_extend_back_end_encodeBlockAsm8B
5009
5010 repeat_extend_back_loop_encodeBlockAsm8B:
5011 CMPL DI, R8
5012 JBE repeat_extend_back_end_encodeBlockAsm8B
5013 MOVB -1(BX)(SI*1), R9
5014 MOVB -1(BX)(DI*1), R10
5015 CMPB R9, R10
5016 JNE repeat_extend_back_end_encodeBlockAsm8B
5017 LEAL -1(DI), DI
5018 DECL SI
5019 JNZ repeat_extend_back_loop_encodeBlockAsm8B
5020
5021 repeat_extend_back_end_encodeBlockAsm8B:
5022 MOVL DI, SI
5023 SUBL 12(SP), SI
5024 LEAQ 3(CX)(SI*1), SI
5025 CMPQ SI, (SP)
5026 JB repeat_dst_size_check_encodeBlockAsm8B
5027 MOVQ $0x00000000, ret+56(FP)
5028 RET
5029
5030 repeat_dst_size_check_encodeBlockAsm8B:
5031 MOVL 12(SP), SI
5032 CMPL SI, DI
5033 JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B
5034 MOVL DI, R9
5035 MOVL DI, 12(SP)
5036 LEAQ (BX)(SI*1), R10
5037 SUBL SI, R9
5038 LEAL -1(R9), SI
5039 CMPL SI, $0x3c
5040 JB one_byte_repeat_emit_encodeBlockAsm8B
5041 CMPL SI, $0x00000100
5042 JB two_bytes_repeat_emit_encodeBlockAsm8B
5043 JB three_bytes_repeat_emit_encodeBlockAsm8B
5044
5045 three_bytes_repeat_emit_encodeBlockAsm8B:
5046 MOVB $0xf4, (CX)
5047 MOVW SI, 1(CX)
5048 ADDQ $0x03, CX
5049 JMP memmove_long_repeat_emit_encodeBlockAsm8B
5050
5051 two_bytes_repeat_emit_encodeBlockAsm8B:
5052 MOVB $0xf0, (CX)
5053 MOVB SI, 1(CX)
5054 ADDQ $0x02, CX
5055 CMPL SI, $0x40
5056 JB memmove_repeat_emit_encodeBlockAsm8B
5057 JMP memmove_long_repeat_emit_encodeBlockAsm8B
5058
5059 one_byte_repeat_emit_encodeBlockAsm8B:
5060 SHLB $0x02, SI
5061 MOVB SI, (CX)
5062 ADDQ $0x01, CX
5063
5064 memmove_repeat_emit_encodeBlockAsm8B:
5065 LEAQ (CX)(R9*1), SI
5066
5067 // genMemMoveShort
5068 CMPQ R9, $0x08
5069 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8
5070 CMPQ R9, $0x10
5071 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16
5072 CMPQ R9, $0x20
5073 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32
5074 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64
5075
5076 emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8:
5077 MOVQ (R10), R11
5078 MOVQ R11, (CX)
5079 JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
5080
5081 emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16:
5082 MOVQ (R10), R11
5083 MOVQ -8(R10)(R9*1), R10
5084 MOVQ R11, (CX)
5085 MOVQ R10, -8(CX)(R9*1)
5086 JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
5087
5088 emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32:
5089 MOVOU (R10), X0
5090 MOVOU -16(R10)(R9*1), X1
5091 MOVOU X0, (CX)
5092 MOVOU X1, -16(CX)(R9*1)
5093 JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B
5094
5095 emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64:
5096 MOVOU (R10), X0
5097 MOVOU 16(R10), X1
5098 MOVOU -32(R10)(R9*1), X2
5099 MOVOU -16(R10)(R9*1), X3
5100 MOVOU X0, (CX)
5101 MOVOU X1, 16(CX)
5102 MOVOU X2, -32(CX)(R9*1)
5103 MOVOU X3, -16(CX)(R9*1)
5104
5105 memmove_end_copy_repeat_emit_encodeBlockAsm8B:
5106 MOVQ SI, CX
5107 JMP emit_literal_done_repeat_emit_encodeBlockAsm8B
5108
5109 memmove_long_repeat_emit_encodeBlockAsm8B:
5110 LEAQ (CX)(R9*1), SI
5111
5112 // genMemMoveLong
5113 MOVOU (R10), X0
5114 MOVOU 16(R10), X1
5115 MOVOU -32(R10)(R9*1), X2
5116 MOVOU -16(R10)(R9*1), X3
5117 MOVQ R9, R12
5118 SHRQ $0x05, R12
5119 MOVQ CX, R11
5120 ANDL $0x0000001f, R11
5121 MOVQ $0x00000040, R13
5122 SUBQ R11, R13
5123 DECQ R12
5124 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
5125 LEAQ -32(R10)(R13*1), R11
5126 LEAQ -32(CX)(R13*1), R14
5127
5128 emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back:
5129 MOVOU (R11), X4
5130 MOVOU 16(R11), X5
5131 MOVOA X4, (R14)
5132 MOVOA X5, 16(R14)
5133 ADDQ $0x20, R14
5134 ADDQ $0x20, R11
5135 ADDQ $0x20, R13
5136 DECQ R12
5137 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back
5138
5139 emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
5140 MOVOU -32(R10)(R13*1), X4
5141 MOVOU -16(R10)(R13*1), X5
5142 MOVOA X4, -32(CX)(R13*1)
5143 MOVOA X5, -16(CX)(R13*1)
5144 ADDQ $0x20, R13
5145 CMPQ R9, R13
5146 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
5147 MOVOU X0, (CX)
5148 MOVOU X1, 16(CX)
5149 MOVOU X2, -32(CX)(R9*1)
5150 MOVOU X3, -16(CX)(R9*1)
5151 MOVQ SI, CX
5152
5153 emit_literal_done_repeat_emit_encodeBlockAsm8B:
5154 ADDL $0x05, DX
5155 MOVL DX, SI
5156 SUBL 16(SP), SI
5157 MOVQ src_len+32(FP), R9
5158 SUBL DX, R9
5159 LEAQ (BX)(DX*1), R10
5160 LEAQ (BX)(SI*1), SI
5161
5162 // matchLen
5163 XORL R12, R12
5164
5165 matchlen_loopback_16_repeat_extend_encodeBlockAsm8B:
5166 CMPL R9, $0x10
5167 JB matchlen_match8_repeat_extend_encodeBlockAsm8B
5168 MOVQ (R10)(R12*1), R11
5169 MOVQ 8(R10)(R12*1), R13
5170 XORQ (SI)(R12*1), R11
5171 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B
5172 XORQ 8(SI)(R12*1), R13
5173 JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm8B
5174 LEAL -16(R9), R9
5175 LEAL 16(R12), R12
5176 JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm8B
5177
5178 matchlen_bsf_16repeat_extend_encodeBlockAsm8B:
5179 #ifdef GOAMD64_v3
5180 TZCNTQ R13, R13
5181
5182 #else
5183 BSFQ R13, R13
5184
5185 #endif
5186 SARQ $0x03, R13
5187 LEAL 8(R12)(R13*1), R12
5188 JMP repeat_extend_forward_end_encodeBlockAsm8B
5189
5190 matchlen_match8_repeat_extend_encodeBlockAsm8B:
5191 CMPL R9, $0x08
5192 JB matchlen_match4_repeat_extend_encodeBlockAsm8B
5193 MOVQ (R10)(R12*1), R11
5194 XORQ (SI)(R12*1), R11
5195 JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B
5196 LEAL -8(R9), R9
5197 LEAL 8(R12), R12
5198 JMP matchlen_match4_repeat_extend_encodeBlockAsm8B
5199
5200 matchlen_bsf_8_repeat_extend_encodeBlockAsm8B:
5201 #ifdef GOAMD64_v3
5202 TZCNTQ R11, R11
5203
5204 #else
5205 BSFQ R11, R11
5206
5207 #endif
5208 SARQ $0x03, R11
5209 LEAL (R12)(R11*1), R12
5210 JMP repeat_extend_forward_end_encodeBlockAsm8B
5211
5212 matchlen_match4_repeat_extend_encodeBlockAsm8B:
5213 CMPL R9, $0x04
5214 JB matchlen_match2_repeat_extend_encodeBlockAsm8B
5215 MOVL (R10)(R12*1), R11
5216 CMPL (SI)(R12*1), R11
5217 JNE matchlen_match2_repeat_extend_encodeBlockAsm8B
5218 LEAL -4(R9), R9
5219 LEAL 4(R12), R12
5220
5221 matchlen_match2_repeat_extend_encodeBlockAsm8B:
5222 CMPL R9, $0x01
5223 JE matchlen_match1_repeat_extend_encodeBlockAsm8B
5224 JB repeat_extend_forward_end_encodeBlockAsm8B
5225 MOVW (R10)(R12*1), R11
5226 CMPW (SI)(R12*1), R11
5227 JNE matchlen_match1_repeat_extend_encodeBlockAsm8B
5228 LEAL 2(R12), R12
5229 SUBL $0x02, R9
5230 JZ repeat_extend_forward_end_encodeBlockAsm8B
5231
5232 matchlen_match1_repeat_extend_encodeBlockAsm8B:
5233 MOVB (R10)(R12*1), R11
5234 CMPB (SI)(R12*1), R11
5235 JNE repeat_extend_forward_end_encodeBlockAsm8B
5236 LEAL 1(R12), R12
5237
5238 repeat_extend_forward_end_encodeBlockAsm8B:
5239 ADDL R12, DX
5240 MOVL DX, SI
5241 SUBL DI, SI
5242 MOVL 16(SP), DI
5243 TESTL R8, R8
5244 JZ repeat_as_copy_encodeBlockAsm8B
5245
5246 // emitRepeat
5247 MOVL SI, DI
5248 LEAL -4(SI), SI
5249 CMPL DI, $0x08
5250 JBE repeat_two_match_repeat_encodeBlockAsm8B
5251 CMPL DI, $0x0c
5252 JAE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B
5253
5254 cant_repeat_two_offset_match_repeat_encodeBlockAsm8B:
5255 CMPL SI, $0x00000104
5256 JB repeat_three_match_repeat_encodeBlockAsm8B
5257 LEAL -256(SI), SI
5258 MOVW $0x0019, (CX)
5259 MOVW SI, 2(CX)
5260 ADDQ $0x04, CX
5261 JMP repeat_end_emit_encodeBlockAsm8B
5262
5263 repeat_three_match_repeat_encodeBlockAsm8B:
5264 LEAL -4(SI), SI
5265 MOVW $0x0015, (CX)
5266 MOVB SI, 2(CX)
5267 ADDQ $0x03, CX
5268 JMP repeat_end_emit_encodeBlockAsm8B
5269
5270 repeat_two_match_repeat_encodeBlockAsm8B:
5271 SHLL $0x02, SI
5272 ORL $0x01, SI
5273 MOVW SI, (CX)
5274 ADDQ $0x02, CX
5275 JMP repeat_end_emit_encodeBlockAsm8B
5276 XORQ R8, R8
5277 LEAL 1(R8)(SI*4), SI
5278 MOVB DI, 1(CX)
5279 SARL $0x08, DI
5280 SHLL $0x05, DI
5281 ORL DI, SI
5282 MOVB SI, (CX)
5283 ADDQ $0x02, CX
5284 JMP repeat_end_emit_encodeBlockAsm8B
5285
5286 repeat_as_copy_encodeBlockAsm8B:
5287 // emitCopy
5288 CMPL SI, $0x40
5289 JBE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B
5290 CMPL DI, $0x00000800
5291 JAE long_offset_short_repeat_as_copy_encodeBlockAsm8B
5292 MOVL $0x00000001, R8
5293 LEAL 16(R8), R8
5294 MOVB DI, 1(CX)
5295 SHRL $0x08, DI
5296 SHLL $0x05, DI
5297 ORL DI, R8
5298 MOVB R8, (CX)
5299 ADDQ $0x02, CX
5300 SUBL $0x08, SI
5301
5302 // emitRepeat
5303 LEAL -4(SI), SI
5304 JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
5305 MOVL SI, DI
5306 LEAL -4(SI), SI
5307 CMPL DI, $0x08
5308 JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
5309 CMPL DI, $0x0c
5310 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
5311
5312 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
5313 CMPL SI, $0x00000104
5314 JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
5315 LEAL -256(SI), SI
5316 MOVW $0x0019, (CX)
5317 MOVW SI, 2(CX)
5318 ADDQ $0x04, CX
5319 JMP repeat_end_emit_encodeBlockAsm8B
5320
5321 repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
5322 LEAL -4(SI), SI
5323 MOVW $0x0015, (CX)
5324 MOVB SI, 2(CX)
5325 ADDQ $0x03, CX
5326 JMP repeat_end_emit_encodeBlockAsm8B
5327
5328 repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
5329 SHLL $0x02, SI
5330 ORL $0x01, SI
5331 MOVW SI, (CX)
5332 ADDQ $0x02, CX
5333 JMP repeat_end_emit_encodeBlockAsm8B
5334 XORQ R8, R8
5335 LEAL 1(R8)(SI*4), SI
5336 MOVB DI, 1(CX)
5337 SARL $0x08, DI
5338 SHLL $0x05, DI
5339 ORL DI, SI
5340 MOVB SI, (CX)
5341 ADDQ $0x02, CX
5342 JMP repeat_end_emit_encodeBlockAsm8B
5343
5344 long_offset_short_repeat_as_copy_encodeBlockAsm8B:
5345 MOVB $0xee, (CX)
5346 MOVW DI, 1(CX)
5347 LEAL -60(SI), SI
5348 ADDQ $0x03, CX
5349
5350 // emitRepeat
5351 MOVL SI, DI
5352 LEAL -4(SI), SI
5353 CMPL DI, $0x08
5354 JBE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
5355 CMPL DI, $0x0c
5356 JAE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
5357
5358 cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
5359 CMPL SI, $0x00000104
5360 JB repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
5361 LEAL -256(SI), SI
5362 MOVW $0x0019, (CX)
5363 MOVW SI, 2(CX)
5364 ADDQ $0x04, CX
5365 JMP repeat_end_emit_encodeBlockAsm8B
5366
5367 repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
5368 LEAL -4(SI), SI
5369 MOVW $0x0015, (CX)
5370 MOVB SI, 2(CX)
5371 ADDQ $0x03, CX
5372 JMP repeat_end_emit_encodeBlockAsm8B
5373
5374 repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
5375 SHLL $0x02, SI
5376 ORL $0x01, SI
5377 MOVW SI, (CX)
5378 ADDQ $0x02, CX
5379 JMP repeat_end_emit_encodeBlockAsm8B
5380 XORQ R8, R8
5381 LEAL 1(R8)(SI*4), SI
5382 MOVB DI, 1(CX)
5383 SARL $0x08, DI
5384 SHLL $0x05, DI
5385 ORL DI, SI
5386 MOVB SI, (CX)
5387 ADDQ $0x02, CX
5388 JMP repeat_end_emit_encodeBlockAsm8B
5389
5390 two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B:
5391 MOVL SI, R8
5392 SHLL $0x02, R8
5393 CMPL SI, $0x0c
5394 JAE emit_copy_three_repeat_as_copy_encodeBlockAsm8B
5395 LEAL -15(R8), R8
5396 MOVB DI, 1(CX)
5397 SHRL $0x08, DI
5398 SHLL $0x05, DI
5399 ORL DI, R8
5400 MOVB R8, (CX)
5401 ADDQ $0x02, CX
5402 JMP repeat_end_emit_encodeBlockAsm8B
5403
5404 emit_copy_three_repeat_as_copy_encodeBlockAsm8B:
5405 LEAL -2(R8), R8
5406 MOVB R8, (CX)
5407 MOVW DI, 1(CX)
5408 ADDQ $0x03, CX
5409
5410 repeat_end_emit_encodeBlockAsm8B:
5411 MOVL DX, 12(SP)
5412 JMP search_loop_encodeBlockAsm8B
5413
5414 no_repeat_found_encodeBlockAsm8B:
5415 CMPL (BX)(SI*1), DI
5416 JEQ candidate_match_encodeBlockAsm8B
5417 SHRQ $0x08, DI
5418 MOVL (AX)(R10*4), SI
5419 LEAL 2(DX), R9
5420 CMPL (BX)(R8*1), DI
5421 JEQ candidate2_match_encodeBlockAsm8B
5422 MOVL R9, (AX)(R10*4)
5423 SHRQ $0x08, DI
5424 CMPL (BX)(SI*1), DI
5425 JEQ candidate3_match_encodeBlockAsm8B
5426 MOVL 20(SP), DX
5427 JMP search_loop_encodeBlockAsm8B
5428
5429 candidate3_match_encodeBlockAsm8B:
5430 ADDL $0x02, DX
5431 JMP candidate_match_encodeBlockAsm8B
5432
5433 candidate2_match_encodeBlockAsm8B:
5434 MOVL R9, (AX)(R10*4)
5435 INCL DX
5436 MOVL R8, SI
5437
5438 candidate_match_encodeBlockAsm8B:
5439 MOVL 12(SP), DI
5440 TESTL SI, SI
5441 JZ match_extend_back_end_encodeBlockAsm8B
5442
5443 match_extend_back_loop_encodeBlockAsm8B:
5444 CMPL DX, DI
5445 JBE match_extend_back_end_encodeBlockAsm8B
5446 MOVB -1(BX)(SI*1), R8
5447 MOVB -1(BX)(DX*1), R9
5448 CMPB R8, R9
5449 JNE match_extend_back_end_encodeBlockAsm8B
5450 LEAL -1(DX), DX
5451 DECL SI
5452 JZ match_extend_back_end_encodeBlockAsm8B
5453 JMP match_extend_back_loop_encodeBlockAsm8B
5454
5455 match_extend_back_end_encodeBlockAsm8B:
5456 MOVL DX, DI
5457 SUBL 12(SP), DI
5458 LEAQ 3(CX)(DI*1), DI
5459 CMPQ DI, (SP)
5460 JB match_dst_size_check_encodeBlockAsm8B
5461 MOVQ $0x00000000, ret+56(FP)
5462 RET
5463
5464 match_dst_size_check_encodeBlockAsm8B:
5465 MOVL DX, DI
5466 MOVL 12(SP), R8
5467 CMPL R8, DI
5468 JEQ emit_literal_done_match_emit_encodeBlockAsm8B
5469 MOVL DI, R9
5470 MOVL DI, 12(SP)
5471 LEAQ (BX)(R8*1), DI
5472 SUBL R8, R9
5473 LEAL -1(R9), R8
5474 CMPL R8, $0x3c
5475 JB one_byte_match_emit_encodeBlockAsm8B
5476 CMPL R8, $0x00000100
5477 JB two_bytes_match_emit_encodeBlockAsm8B
5478 JB three_bytes_match_emit_encodeBlockAsm8B
5479
5480 three_bytes_match_emit_encodeBlockAsm8B:
5481 MOVB $0xf4, (CX)
5482 MOVW R8, 1(CX)
5483 ADDQ $0x03, CX
5484 JMP memmove_long_match_emit_encodeBlockAsm8B
5485
5486 two_bytes_match_emit_encodeBlockAsm8B:
5487 MOVB $0xf0, (CX)
5488 MOVB R8, 1(CX)
5489 ADDQ $0x02, CX
5490 CMPL R8, $0x40
5491 JB memmove_match_emit_encodeBlockAsm8B
5492 JMP memmove_long_match_emit_encodeBlockAsm8B
5493
5494 one_byte_match_emit_encodeBlockAsm8B:
5495 SHLB $0x02, R8
5496 MOVB R8, (CX)
5497 ADDQ $0x01, CX
5498
5499 memmove_match_emit_encodeBlockAsm8B:
5500 LEAQ (CX)(R9*1), R8
5501
5502 // genMemMoveShort
5503 CMPQ R9, $0x08
5504 JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8
5505 CMPQ R9, $0x10
5506 JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16
5507 CMPQ R9, $0x20
5508 JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32
5509 JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64
5510
5511 emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8:
5512 MOVQ (DI), R10
5513 MOVQ R10, (CX)
5514 JMP memmove_end_copy_match_emit_encodeBlockAsm8B
5515
5516 emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16:
5517 MOVQ (DI), R10
5518 MOVQ -8(DI)(R9*1), DI
5519 MOVQ R10, (CX)
5520 MOVQ DI, -8(CX)(R9*1)
5521 JMP memmove_end_copy_match_emit_encodeBlockAsm8B
5522
5523 emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32:
5524 MOVOU (DI), X0
5525 MOVOU -16(DI)(R9*1), X1
5526 MOVOU X0, (CX)
5527 MOVOU X1, -16(CX)(R9*1)
5528 JMP memmove_end_copy_match_emit_encodeBlockAsm8B
5529
5530 emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64:
5531 MOVOU (DI), X0
5532 MOVOU 16(DI), X1
5533 MOVOU -32(DI)(R9*1), X2
5534 MOVOU -16(DI)(R9*1), X3
5535 MOVOU X0, (CX)
5536 MOVOU X1, 16(CX)
5537 MOVOU X2, -32(CX)(R9*1)
5538 MOVOU X3, -16(CX)(R9*1)
5539
5540 memmove_end_copy_match_emit_encodeBlockAsm8B:
5541 MOVQ R8, CX
5542 JMP emit_literal_done_match_emit_encodeBlockAsm8B
5543
5544 memmove_long_match_emit_encodeBlockAsm8B:
5545 LEAQ (CX)(R9*1), R8
5546
5547 // genMemMoveLong
5548 MOVOU (DI), X0
5549 MOVOU 16(DI), X1
5550 MOVOU -32(DI)(R9*1), X2
5551 MOVOU -16(DI)(R9*1), X3
5552 MOVQ R9, R11
5553 SHRQ $0x05, R11
5554 MOVQ CX, R10
5555 ANDL $0x0000001f, R10
5556 MOVQ $0x00000040, R12
5557 SUBQ R10, R12
5558 DECQ R11
5559 JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
5560 LEAQ -32(DI)(R12*1), R10
5561 LEAQ -32(CX)(R12*1), R13
5562
5563 emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back:
5564 MOVOU (R10), X4
5565 MOVOU 16(R10), X5
5566 MOVOA X4, (R13)
5567 MOVOA X5, 16(R13)
5568 ADDQ $0x20, R13
5569 ADDQ $0x20, R10
5570 ADDQ $0x20, R12
5571 DECQ R11
5572 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back
5573
5574 emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
5575 MOVOU -32(DI)(R12*1), X4
5576 MOVOU -16(DI)(R12*1), X5
5577 MOVOA X4, -32(CX)(R12*1)
5578 MOVOA X5, -16(CX)(R12*1)
5579 ADDQ $0x20, R12
5580 CMPQ R9, R12
5581 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
5582 MOVOU X0, (CX)
5583 MOVOU X1, 16(CX)
5584 MOVOU X2, -32(CX)(R9*1)
5585 MOVOU X3, -16(CX)(R9*1)
5586 MOVQ R8, CX
5587
5588 emit_literal_done_match_emit_encodeBlockAsm8B:
5589 match_nolit_loop_encodeBlockAsm8B:
5590 MOVL DX, DI
5591 SUBL SI, DI
5592 MOVL DI, 16(SP)
5593 ADDL $0x04, DX
5594 ADDL $0x04, SI
5595 MOVQ src_len+32(FP), DI
5596 SUBL DX, DI
5597 LEAQ (BX)(DX*1), R8
5598 LEAQ (BX)(SI*1), SI
5599
5600 // matchLen
5601 XORL R10, R10
5602
5603 matchlen_loopback_16_match_nolit_encodeBlockAsm8B:
5604 CMPL DI, $0x10
5605 JB matchlen_match8_match_nolit_encodeBlockAsm8B
5606 MOVQ (R8)(R10*1), R9
5607 MOVQ 8(R8)(R10*1), R11
5608 XORQ (SI)(R10*1), R9
5609 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B
5610 XORQ 8(SI)(R10*1), R11
5611 JNZ matchlen_bsf_16match_nolit_encodeBlockAsm8B
5612 LEAL -16(DI), DI
5613 LEAL 16(R10), R10
5614 JMP matchlen_loopback_16_match_nolit_encodeBlockAsm8B
5615
5616 matchlen_bsf_16match_nolit_encodeBlockAsm8B:
5617 #ifdef GOAMD64_v3
5618 TZCNTQ R11, R11
5619
5620 #else
5621 BSFQ R11, R11
5622
5623 #endif
5624 SARQ $0x03, R11
5625 LEAL 8(R10)(R11*1), R10
5626 JMP match_nolit_end_encodeBlockAsm8B
5627
5628 matchlen_match8_match_nolit_encodeBlockAsm8B:
5629 CMPL DI, $0x08
5630 JB matchlen_match4_match_nolit_encodeBlockAsm8B
5631 MOVQ (R8)(R10*1), R9
5632 XORQ (SI)(R10*1), R9
5633 JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B
5634 LEAL -8(DI), DI
5635 LEAL 8(R10), R10
5636 JMP matchlen_match4_match_nolit_encodeBlockAsm8B
5637
5638 matchlen_bsf_8_match_nolit_encodeBlockAsm8B:
5639 #ifdef GOAMD64_v3
5640 TZCNTQ R9, R9
5641
5642 #else
5643 BSFQ R9, R9
5644
5645 #endif
5646 SARQ $0x03, R9
5647 LEAL (R10)(R9*1), R10
5648 JMP match_nolit_end_encodeBlockAsm8B
5649
5650 matchlen_match4_match_nolit_encodeBlockAsm8B:
5651 CMPL DI, $0x04
5652 JB matchlen_match2_match_nolit_encodeBlockAsm8B
5653 MOVL (R8)(R10*1), R9
5654 CMPL (SI)(R10*1), R9
5655 JNE matchlen_match2_match_nolit_encodeBlockAsm8B
5656 LEAL -4(DI), DI
5657 LEAL 4(R10), R10
5658
5659 matchlen_match2_match_nolit_encodeBlockAsm8B:
5660 CMPL DI, $0x01
5661 JE matchlen_match1_match_nolit_encodeBlockAsm8B
5662 JB match_nolit_end_encodeBlockAsm8B
5663 MOVW (R8)(R10*1), R9
5664 CMPW (SI)(R10*1), R9
5665 JNE matchlen_match1_match_nolit_encodeBlockAsm8B
5666 LEAL 2(R10), R10
5667 SUBL $0x02, DI
5668 JZ match_nolit_end_encodeBlockAsm8B
5669
5670 matchlen_match1_match_nolit_encodeBlockAsm8B:
5671 MOVB (R8)(R10*1), R9
5672 CMPB (SI)(R10*1), R9
5673 JNE match_nolit_end_encodeBlockAsm8B
5674 LEAL 1(R10), R10
5675
5676 match_nolit_end_encodeBlockAsm8B:
5677 ADDL R10, DX
5678 MOVL 16(SP), SI
5679 ADDL $0x04, R10
5680 MOVL DX, 12(SP)
5681
5682 // emitCopy
5683 CMPL R10, $0x40
5684 JBE two_byte_offset_short_match_nolit_encodeBlockAsm8B
5685 CMPL SI, $0x00000800
5686 JAE long_offset_short_match_nolit_encodeBlockAsm8B
5687 MOVL $0x00000001, DI
5688 LEAL 16(DI), DI
5689 MOVB SI, 1(CX)
5690 SHRL $0x08, SI
5691 SHLL $0x05, SI
5692 ORL SI, DI
5693 MOVB DI, (CX)
5694 ADDQ $0x02, CX
5695 SUBL $0x08, R10
5696
5697 // emitRepeat
5698 LEAL -4(R10), R10
5699 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
5700 MOVL R10, SI
5701 LEAL -4(R10), R10
5702 CMPL SI, $0x08
5703 JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
5704 CMPL SI, $0x0c
5705 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
5706
5707 cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
5708 CMPL R10, $0x00000104
5709 JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
5710 LEAL -256(R10), R10
5711 MOVW $0x0019, (CX)
5712 MOVW R10, 2(CX)
5713 ADDQ $0x04, CX
5714 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5715
5716 repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
5717 LEAL -4(R10), R10
5718 MOVW $0x0015, (CX)
5719 MOVB R10, 2(CX)
5720 ADDQ $0x03, CX
5721 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5722
5723 repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
5724 SHLL $0x02, R10
5725 ORL $0x01, R10
5726 MOVW R10, (CX)
5727 ADDQ $0x02, CX
5728 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5729 XORQ DI, DI
5730 LEAL 1(DI)(R10*4), R10
5731 MOVB SI, 1(CX)
5732 SARL $0x08, SI
5733 SHLL $0x05, SI
5734 ORL SI, R10
5735 MOVB R10, (CX)
5736 ADDQ $0x02, CX
5737 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5738
5739 long_offset_short_match_nolit_encodeBlockAsm8B:
5740 MOVB $0xee, (CX)
5741 MOVW SI, 1(CX)
5742 LEAL -60(R10), R10
5743 ADDQ $0x03, CX
5744
5745 // emitRepeat
5746 MOVL R10, SI
5747 LEAL -4(R10), R10
5748 CMPL SI, $0x08
5749 JBE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short
5750 CMPL SI, $0x0c
5751 JAE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short
5752
5753 cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short:
5754 CMPL R10, $0x00000104
5755 JB repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short
5756 LEAL -256(R10), R10
5757 MOVW $0x0019, (CX)
5758 MOVW R10, 2(CX)
5759 ADDQ $0x04, CX
5760 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5761
5762 repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short:
5763 LEAL -4(R10), R10
5764 MOVW $0x0015, (CX)
5765 MOVB R10, 2(CX)
5766 ADDQ $0x03, CX
5767 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5768
5769 repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short:
5770 SHLL $0x02, R10
5771 ORL $0x01, R10
5772 MOVW R10, (CX)
5773 ADDQ $0x02, CX
5774 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5775 XORQ DI, DI
5776 LEAL 1(DI)(R10*4), R10
5777 MOVB SI, 1(CX)
5778 SARL $0x08, SI
5779 SHLL $0x05, SI
5780 ORL SI, R10
5781 MOVB R10, (CX)
5782 ADDQ $0x02, CX
5783 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5784
5785 two_byte_offset_short_match_nolit_encodeBlockAsm8B:
5786 MOVL R10, DI
5787 SHLL $0x02, DI
5788 CMPL R10, $0x0c
5789 JAE emit_copy_three_match_nolit_encodeBlockAsm8B
5790 LEAL -15(DI), DI
5791 MOVB SI, 1(CX)
5792 SHRL $0x08, SI
5793 SHLL $0x05, SI
5794 ORL SI, DI
5795 MOVB DI, (CX)
5796 ADDQ $0x02, CX
5797 JMP match_nolit_emitcopy_end_encodeBlockAsm8B
5798
5799 emit_copy_three_match_nolit_encodeBlockAsm8B:
5800 LEAL -2(DI), DI
5801 MOVB DI, (CX)
5802 MOVW SI, 1(CX)
5803 ADDQ $0x03, CX
5804
5805 match_nolit_emitcopy_end_encodeBlockAsm8B:
5806 CMPL DX, 8(SP)
5807 JAE emit_remainder_encodeBlockAsm8B
5808 MOVQ -2(BX)(DX*1), DI
5809 CMPQ CX, (SP)
5810 JB match_nolit_dst_ok_encodeBlockAsm8B
5811 MOVQ $0x00000000, ret+56(FP)
5812 RET
5813
5814 match_nolit_dst_ok_encodeBlockAsm8B:
5815 MOVQ $0x9e3779b1, R9
5816 MOVQ DI, R8
5817 SHRQ $0x10, DI
5818 MOVQ DI, SI
5819 SHLQ $0x20, R8
5820 IMULQ R9, R8
5821 SHRQ $0x38, R8
5822 SHLQ $0x20, SI
5823 IMULQ R9, SI
5824 SHRQ $0x38, SI
5825 LEAL -2(DX), R9
5826 LEAQ (AX)(SI*4), R10
5827 MOVL (R10), SI
5828 MOVL R9, (AX)(R8*4)
5829 MOVL DX, (R10)
5830 CMPL (BX)(SI*1), DI
5831 JEQ match_nolit_loop_encodeBlockAsm8B
5832 INCL DX
5833 JMP search_loop_encodeBlockAsm8B
5834
5835 emit_remainder_encodeBlockAsm8B:
5836 MOVQ src_len+32(FP), AX
5837 SUBL 12(SP), AX
5838 LEAQ 3(CX)(AX*1), AX
5839 CMPQ AX, (SP)
5840 JB emit_remainder_ok_encodeBlockAsm8B
5841 MOVQ $0x00000000, ret+56(FP)
5842 RET
5843
5844 emit_remainder_ok_encodeBlockAsm8B:
5845 MOVQ src_len+32(FP), AX
5846 MOVL 12(SP), DX
5847 CMPL DX, AX
5848 JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B
5849 MOVL AX, SI
5850 MOVL AX, 12(SP)
5851 LEAQ (BX)(DX*1), AX
5852 SUBL DX, SI
5853 LEAL -1(SI), DX
5854 CMPL DX, $0x3c
5855 JB one_byte_emit_remainder_encodeBlockAsm8B
5856 CMPL DX, $0x00000100
5857 JB two_bytes_emit_remainder_encodeBlockAsm8B
5858 JB three_bytes_emit_remainder_encodeBlockAsm8B
5859
5860 three_bytes_emit_remainder_encodeBlockAsm8B:
5861 MOVB $0xf4, (CX)
5862 MOVW DX, 1(CX)
5863 ADDQ $0x03, CX
5864 JMP memmove_long_emit_remainder_encodeBlockAsm8B
5865
5866 two_bytes_emit_remainder_encodeBlockAsm8B:
5867 MOVB $0xf0, (CX)
5868 MOVB DL, 1(CX)
5869 ADDQ $0x02, CX
5870 CMPL DX, $0x40
5871 JB memmove_emit_remainder_encodeBlockAsm8B
5872 JMP memmove_long_emit_remainder_encodeBlockAsm8B
5873
5874 one_byte_emit_remainder_encodeBlockAsm8B:
5875 SHLB $0x02, DL
5876 MOVB DL, (CX)
5877 ADDQ $0x01, CX
5878
5879 memmove_emit_remainder_encodeBlockAsm8B:
5880 LEAQ (CX)(SI*1), DX
5881 MOVL SI, BX
5882
5883 // genMemMoveShort
5884 CMPQ BX, $0x03
5885 JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2
5886 JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3
5887 CMPQ BX, $0x08
5888 JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7
5889 CMPQ BX, $0x10
5890 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16
5891 CMPQ BX, $0x20
5892 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32
5893 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64
5894
5895 emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2:
5896 MOVB (AX), SI
5897 MOVB -1(AX)(BX*1), AL
5898 MOVB SI, (CX)
5899 MOVB AL, -1(CX)(BX*1)
5900 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
5901
5902 emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3:
5903 MOVW (AX), SI
5904 MOVB 2(AX), AL
5905 MOVW SI, (CX)
5906 MOVB AL, 2(CX)
5907 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
5908
5909 emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7:
5910 MOVL (AX), SI
5911 MOVL -4(AX)(BX*1), AX
5912 MOVL SI, (CX)
5913 MOVL AX, -4(CX)(BX*1)
5914 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
5915
5916 emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16:
5917 MOVQ (AX), SI
5918 MOVQ -8(AX)(BX*1), AX
5919 MOVQ SI, (CX)
5920 MOVQ AX, -8(CX)(BX*1)
5921 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
5922
5923 emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32:
5924 MOVOU (AX), X0
5925 MOVOU -16(AX)(BX*1), X1
5926 MOVOU X0, (CX)
5927 MOVOU X1, -16(CX)(BX*1)
5928 JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B
5929
5930 emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64:
5931 MOVOU (AX), X0
5932 MOVOU 16(AX), X1
5933 MOVOU -32(AX)(BX*1), X2
5934 MOVOU -16(AX)(BX*1), X3
5935 MOVOU X0, (CX)
5936 MOVOU X1, 16(CX)
5937 MOVOU X2, -32(CX)(BX*1)
5938 MOVOU X3, -16(CX)(BX*1)
5939
5940 memmove_end_copy_emit_remainder_encodeBlockAsm8B:
5941 MOVQ DX, CX
5942 JMP emit_literal_done_emit_remainder_encodeBlockAsm8B
5943
5944 memmove_long_emit_remainder_encodeBlockAsm8B:
5945 LEAQ (CX)(SI*1), DX
5946 MOVL SI, BX
5947
5948 // genMemMoveLong
5949 MOVOU (AX), X0
5950 MOVOU 16(AX), X1
5951 MOVOU -32(AX)(BX*1), X2
5952 MOVOU -16(AX)(BX*1), X3
5953 MOVQ BX, DI
5954 SHRQ $0x05, DI
5955 MOVQ CX, SI
5956 ANDL $0x0000001f, SI
5957 MOVQ $0x00000040, R8
5958 SUBQ SI, R8
5959 DECQ DI
5960 JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
5961 LEAQ -32(AX)(R8*1), SI
5962 LEAQ -32(CX)(R8*1), R9
5963
5964 emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back:
5965 MOVOU (SI), X4
5966 MOVOU 16(SI), X5
5967 MOVOA X4, (R9)
5968 MOVOA X5, 16(R9)
5969 ADDQ $0x20, R9
5970 ADDQ $0x20, SI
5971 ADDQ $0x20, R8
5972 DECQ DI
5973 JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back
5974
5975 emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32:
5976 MOVOU -32(AX)(R8*1), X4
5977 MOVOU -16(AX)(R8*1), X5
5978 MOVOA X4, -32(CX)(R8*1)
5979 MOVOA X5, -16(CX)(R8*1)
5980 ADDQ $0x20, R8
5981 CMPQ BX, R8
5982 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
5983 MOVOU X0, (CX)
5984 MOVOU X1, 16(CX)
5985 MOVOU X2, -32(CX)(BX*1)
5986 MOVOU X3, -16(CX)(BX*1)
5987 MOVQ DX, CX
5988
5989 emit_literal_done_emit_remainder_encodeBlockAsm8B:
5990 MOVQ dst_base+0(FP), AX
5991 SUBQ AX, CX
5992 MOVQ CX, ret+56(FP)
5993 RET
5994
5995 // func encodeBetterBlockAsm(dst []byte, src []byte, tmp *[589824]byte) int
5996 // Requires: BMI, SSE2
5997 TEXT ·encodeBetterBlockAsm(SB), $24-64
5998 MOVQ tmp+48(FP), AX
5999 MOVQ dst_base+0(FP), CX
6000 MOVQ $0x00001200, DX
6001 MOVQ AX, BX
6002 PXOR X0, X0
6003
6004 zero_loop_encodeBetterBlockAsm:
6005 MOVOU X0, (BX)
6006 MOVOU X0, 16(BX)
6007 MOVOU X0, 32(BX)
6008 MOVOU X0, 48(BX)
6009 MOVOU X0, 64(BX)
6010 MOVOU X0, 80(BX)
6011 MOVOU X0, 96(BX)
6012 MOVOU X0, 112(BX)
6013 ADDQ $0x80, BX
6014 DECQ DX
6015 JNZ zero_loop_encodeBetterBlockAsm
6016 MOVL $0x00000000, 12(SP)
6017 MOVQ src_len+32(FP), DX
6018 LEAQ -6(DX), BX
6019 LEAQ -8(DX), SI
6020 MOVL SI, 8(SP)
6021 SHRQ $0x05, DX
6022 SUBL DX, BX
6023 LEAQ (CX)(BX*1), BX
6024 MOVQ BX, (SP)
6025 MOVL $0x00000001, DX
6026 MOVL $0x00000000, 16(SP)
6027 MOVQ src_base+24(FP), BX
6028
6029 search_loop_encodeBetterBlockAsm:
6030 MOVL DX, SI
6031 SUBL 12(SP), SI
6032 SHRL $0x07, SI
6033 CMPL SI, $0x63
6034 JBE check_maxskip_ok_encodeBetterBlockAsm
6035 LEAL 100(DX), SI
6036 JMP check_maxskip_cont_encodeBetterBlockAsm
6037
6038 check_maxskip_ok_encodeBetterBlockAsm:
6039 LEAL 1(DX)(SI*1), SI
6040
6041 check_maxskip_cont_encodeBetterBlockAsm:
6042 CMPL SI, 8(SP)
6043 JAE emit_remainder_encodeBetterBlockAsm
6044 MOVQ (BX)(DX*1), DI
6045 MOVL SI, 20(SP)
6046 MOVQ $0x00cf1bbcdcbfa563, R9
6047 MOVQ $0x9e3779b1, SI
6048 MOVQ DI, R10
6049 MOVQ DI, R11
6050 SHLQ $0x08, R10
6051 IMULQ R9, R10
6052 SHRQ $0x2f, R10
6053 SHLQ $0x20, R11
6054 IMULQ SI, R11
6055 SHRQ $0x32, R11
6056 MOVL (AX)(R10*4), SI
6057 MOVL 524288(AX)(R11*4), R8
6058 MOVL DX, (AX)(R10*4)
6059 MOVL DX, 524288(AX)(R11*4)
6060 MOVQ (BX)(SI*1), R10
6061 MOVQ (BX)(R8*1), R11
6062 CMPQ R10, DI
6063 JEQ candidate_match_encodeBetterBlockAsm
6064 CMPQ R11, DI
6065 JNE no_short_found_encodeBetterBlockAsm
6066 MOVL R8, SI
6067 JMP candidate_match_encodeBetterBlockAsm
6068
6069 no_short_found_encodeBetterBlockAsm:
6070 CMPL R10, DI
6071 JEQ candidate_match_encodeBetterBlockAsm
6072 CMPL R11, DI
6073 JEQ candidateS_match_encodeBetterBlockAsm
6074 MOVL 20(SP), DX
6075 JMP search_loop_encodeBetterBlockAsm
6076
6077 candidateS_match_encodeBetterBlockAsm:
6078 SHRQ $0x08, DI
6079 MOVQ DI, R10
6080 SHLQ $0x08, R10
6081 IMULQ R9, R10
6082 SHRQ $0x2f, R10
6083 MOVL (AX)(R10*4), SI
6084 INCL DX
6085 MOVL DX, (AX)(R10*4)
6086 CMPL (BX)(SI*1), DI
6087 JEQ candidate_match_encodeBetterBlockAsm
6088 DECL DX
6089 MOVL R8, SI
6090
6091 candidate_match_encodeBetterBlockAsm:
6092 MOVL 12(SP), DI
6093 TESTL SI, SI
6094 JZ match_extend_back_end_encodeBetterBlockAsm
6095
6096 match_extend_back_loop_encodeBetterBlockAsm:
6097 CMPL DX, DI
6098 JBE match_extend_back_end_encodeBetterBlockAsm
6099 MOVB -1(BX)(SI*1), R8
6100 MOVB -1(BX)(DX*1), R9
6101 CMPB R8, R9
6102 JNE match_extend_back_end_encodeBetterBlockAsm
6103 LEAL -1(DX), DX
6104 DECL SI
6105 JZ match_extend_back_end_encodeBetterBlockAsm
6106 JMP match_extend_back_loop_encodeBetterBlockAsm
6107
6108 match_extend_back_end_encodeBetterBlockAsm:
6109 MOVL DX, DI
6110 SUBL 12(SP), DI
6111 LEAQ 5(CX)(DI*1), DI
6112 CMPQ DI, (SP)
6113 JB match_dst_size_check_encodeBetterBlockAsm
6114 MOVQ $0x00000000, ret+56(FP)
6115 RET
6116
6117 match_dst_size_check_encodeBetterBlockAsm:
6118 MOVL DX, DI
6119 ADDL $0x04, DX
6120 ADDL $0x04, SI
6121 MOVQ src_len+32(FP), R8
6122 SUBL DX, R8
6123 LEAQ (BX)(DX*1), R9
6124 LEAQ (BX)(SI*1), R10
6125
6126 // matchLen
6127 XORL R12, R12
6128
6129 matchlen_loopback_16_match_nolit_encodeBetterBlockAsm:
6130 CMPL R8, $0x10
6131 JB matchlen_match8_match_nolit_encodeBetterBlockAsm
6132 MOVQ (R9)(R12*1), R11
6133 MOVQ 8(R9)(R12*1), R13
6134 XORQ (R10)(R12*1), R11
6135 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm
6136 XORQ 8(R10)(R12*1), R13
6137 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm
6138 LEAL -16(R8), R8
6139 LEAL 16(R12), R12
6140 JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm
6141
6142 matchlen_bsf_16match_nolit_encodeBetterBlockAsm:
6143 #ifdef GOAMD64_v3
6144 TZCNTQ R13, R13
6145
6146 #else
6147 BSFQ R13, R13
6148
6149 #endif
6150 SARQ $0x03, R13
6151 LEAL 8(R12)(R13*1), R12
6152 JMP match_nolit_end_encodeBetterBlockAsm
6153
6154 matchlen_match8_match_nolit_encodeBetterBlockAsm:
6155 CMPL R8, $0x08
6156 JB matchlen_match4_match_nolit_encodeBetterBlockAsm
6157 MOVQ (R9)(R12*1), R11
6158 XORQ (R10)(R12*1), R11
6159 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm
6160 LEAL -8(R8), R8
6161 LEAL 8(R12), R12
6162 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm
6163
6164 matchlen_bsf_8_match_nolit_encodeBetterBlockAsm:
6165 #ifdef GOAMD64_v3
6166 TZCNTQ R11, R11
6167
6168 #else
6169 BSFQ R11, R11
6170
6171 #endif
6172 SARQ $0x03, R11
6173 LEAL (R12)(R11*1), R12
6174 JMP match_nolit_end_encodeBetterBlockAsm
6175
6176 matchlen_match4_match_nolit_encodeBetterBlockAsm:
6177 CMPL R8, $0x04
6178 JB matchlen_match2_match_nolit_encodeBetterBlockAsm
6179 MOVL (R9)(R12*1), R11
6180 CMPL (R10)(R12*1), R11
6181 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm
6182 LEAL -4(R8), R8
6183 LEAL 4(R12), R12
6184
6185 matchlen_match2_match_nolit_encodeBetterBlockAsm:
6186 CMPL R8, $0x01
6187 JE matchlen_match1_match_nolit_encodeBetterBlockAsm
6188 JB match_nolit_end_encodeBetterBlockAsm
6189 MOVW (R9)(R12*1), R11
6190 CMPW (R10)(R12*1), R11
6191 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm
6192 LEAL 2(R12), R12
6193 SUBL $0x02, R8
6194 JZ match_nolit_end_encodeBetterBlockAsm
6195
6196 matchlen_match1_match_nolit_encodeBetterBlockAsm:
6197 MOVB (R9)(R12*1), R11
6198 CMPB (R10)(R12*1), R11
6199 JNE match_nolit_end_encodeBetterBlockAsm
6200 LEAL 1(R12), R12
6201
6202 match_nolit_end_encodeBetterBlockAsm:
6203 MOVL DX, R8
6204 SUBL SI, R8
6205
6206 // Check if repeat
6207 CMPL 16(SP), R8
6208 JEQ match_is_repeat_encodeBetterBlockAsm
6209 CMPL R12, $0x01
6210 JA match_length_ok_encodeBetterBlockAsm
6211 CMPL R8, $0x0000ffff
6212 JBE match_length_ok_encodeBetterBlockAsm
6213 MOVL 20(SP), DX
6214 INCL DX
6215 JMP search_loop_encodeBetterBlockAsm
6216
6217 match_length_ok_encodeBetterBlockAsm:
6218 MOVL R8, 16(SP)
6219 MOVL 12(SP), SI
6220 CMPL SI, DI
6221 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm
6222 MOVL DI, R9
6223 MOVL DI, 12(SP)
6224 LEAQ (BX)(SI*1), R10
6225 SUBL SI, R9
6226 LEAL -1(R9), SI
6227 CMPL SI, $0x3c
6228 JB one_byte_match_emit_encodeBetterBlockAsm
6229 CMPL SI, $0x00000100
6230 JB two_bytes_match_emit_encodeBetterBlockAsm
6231 CMPL SI, $0x00010000
6232 JB three_bytes_match_emit_encodeBetterBlockAsm
6233 CMPL SI, $0x01000000
6234 JB four_bytes_match_emit_encodeBetterBlockAsm
6235 MOVB $0xfc, (CX)
6236 MOVL SI, 1(CX)
6237 ADDQ $0x05, CX
6238 JMP memmove_long_match_emit_encodeBetterBlockAsm
6239
6240 four_bytes_match_emit_encodeBetterBlockAsm:
6241 MOVL SI, R11
6242 SHRL $0x10, R11
6243 MOVB $0xf8, (CX)
6244 MOVW SI, 1(CX)
6245 MOVB R11, 3(CX)
6246 ADDQ $0x04, CX
6247 JMP memmove_long_match_emit_encodeBetterBlockAsm
6248
6249 three_bytes_match_emit_encodeBetterBlockAsm:
6250 MOVB $0xf4, (CX)
6251 MOVW SI, 1(CX)
6252 ADDQ $0x03, CX
6253 JMP memmove_long_match_emit_encodeBetterBlockAsm
6254
6255 two_bytes_match_emit_encodeBetterBlockAsm:
6256 MOVB $0xf0, (CX)
6257 MOVB SI, 1(CX)
6258 ADDQ $0x02, CX
6259 CMPL SI, $0x40
6260 JB memmove_match_emit_encodeBetterBlockAsm
6261 JMP memmove_long_match_emit_encodeBetterBlockAsm
6262
6263 one_byte_match_emit_encodeBetterBlockAsm:
6264 SHLB $0x02, SI
6265 MOVB SI, (CX)
6266 ADDQ $0x01, CX
6267
6268 memmove_match_emit_encodeBetterBlockAsm:
6269 LEAQ (CX)(R9*1), SI
6270
6271 // genMemMoveShort
6272 CMPQ R9, $0x04
6273 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4
6274 CMPQ R9, $0x08
6275 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7
6276 CMPQ R9, $0x10
6277 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16
6278 CMPQ R9, $0x20
6279 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32
6280 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64
6281
6282 emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4:
6283 MOVL (R10), R11
6284 MOVL R11, (CX)
6285 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
6286
6287 emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7:
6288 MOVL (R10), R11
6289 MOVL -4(R10)(R9*1), R10
6290 MOVL R11, (CX)
6291 MOVL R10, -4(CX)(R9*1)
6292 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
6293
6294 emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16:
6295 MOVQ (R10), R11
6296 MOVQ -8(R10)(R9*1), R10
6297 MOVQ R11, (CX)
6298 MOVQ R10, -8(CX)(R9*1)
6299 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
6300
6301 emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32:
6302 MOVOU (R10), X0
6303 MOVOU -16(R10)(R9*1), X1
6304 MOVOU X0, (CX)
6305 MOVOU X1, -16(CX)(R9*1)
6306 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm
6307
6308 emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64:
6309 MOVOU (R10), X0
6310 MOVOU 16(R10), X1
6311 MOVOU -32(R10)(R9*1), X2
6312 MOVOU -16(R10)(R9*1), X3
6313 MOVOU X0, (CX)
6314 MOVOU X1, 16(CX)
6315 MOVOU X2, -32(CX)(R9*1)
6316 MOVOU X3, -16(CX)(R9*1)
6317
6318 memmove_end_copy_match_emit_encodeBetterBlockAsm:
6319 MOVQ SI, CX
6320 JMP emit_literal_done_match_emit_encodeBetterBlockAsm
6321
6322 memmove_long_match_emit_encodeBetterBlockAsm:
6323 LEAQ (CX)(R9*1), SI
6324
6325 // genMemMoveLong
6326 MOVOU (R10), X0
6327 MOVOU 16(R10), X1
6328 MOVOU -32(R10)(R9*1), X2
6329 MOVOU -16(R10)(R9*1), X3
6330 MOVQ R9, R13
6331 SHRQ $0x05, R13
6332 MOVQ CX, R11
6333 ANDL $0x0000001f, R11
6334 MOVQ $0x00000040, R14
6335 SUBQ R11, R14
6336 DECQ R13
6337 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
6338 LEAQ -32(R10)(R14*1), R11
6339 LEAQ -32(CX)(R14*1), R15
6340
6341 emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back:
6342 MOVOU (R11), X4
6343 MOVOU 16(R11), X5
6344 MOVOA X4, (R15)
6345 MOVOA X5, 16(R15)
6346 ADDQ $0x20, R15
6347 ADDQ $0x20, R11
6348 ADDQ $0x20, R14
6349 DECQ R13
6350 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back
6351
6352 emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32:
6353 MOVOU -32(R10)(R14*1), X4
6354 MOVOU -16(R10)(R14*1), X5
6355 MOVOA X4, -32(CX)(R14*1)
6356 MOVOA X5, -16(CX)(R14*1)
6357 ADDQ $0x20, R14
6358 CMPQ R9, R14
6359 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32
6360 MOVOU X0, (CX)
6361 MOVOU X1, 16(CX)
6362 MOVOU X2, -32(CX)(R9*1)
6363 MOVOU X3, -16(CX)(R9*1)
6364 MOVQ SI, CX
6365
6366 emit_literal_done_match_emit_encodeBetterBlockAsm:
6367 ADDL R12, DX
6368 ADDL $0x04, R12
6369 MOVL DX, 12(SP)
6370
6371 // emitCopy
6372 CMPL R8, $0x00010000
6373 JB two_byte_offset_match_nolit_encodeBetterBlockAsm
6374 CMPL R12, $0x40
6375 JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm
6376 MOVB $0xff, (CX)
6377 MOVL R8, 1(CX)
6378 LEAL -64(R12), R12
6379 ADDQ $0x05, CX
6380 CMPL R12, $0x04
6381 JB four_bytes_remain_match_nolit_encodeBetterBlockAsm
6382
6383 // emitRepeat
6384 emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy:
6385 MOVL R12, SI
6386 LEAL -4(R12), R12
6387 CMPL SI, $0x08
6388 JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy
6389 CMPL SI, $0x0c
6390 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
6391 CMPL R8, $0x00000800
6392 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy
6393
6394 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
6395 CMPL R12, $0x00000104
6396 JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy
6397 CMPL R12, $0x00010100
6398 JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy
6399 CMPL R12, $0x0100ffff
6400 JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy
6401 LEAL -16842747(R12), R12
6402 MOVL $0xfffb001d, (CX)
6403 MOVB $0xff, 4(CX)
6404 ADDQ $0x05, CX
6405 JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy
6406
6407 repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy:
6408 LEAL -65536(R12), R12
6409 MOVL R12, R8
6410 MOVW $0x001d, (CX)
6411 MOVW R12, 2(CX)
6412 SARL $0x10, R8
6413 MOVB R8, 4(CX)
6414 ADDQ $0x05, CX
6415 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6416
6417 repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy:
6418 LEAL -256(R12), R12
6419 MOVW $0x0019, (CX)
6420 MOVW R12, 2(CX)
6421 ADDQ $0x04, CX
6422 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6423
6424 repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy:
6425 LEAL -4(R12), R12
6426 MOVW $0x0015, (CX)
6427 MOVB R12, 2(CX)
6428 ADDQ $0x03, CX
6429 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6430
6431 repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy:
6432 SHLL $0x02, R12
6433 ORL $0x01, R12
6434 MOVW R12, (CX)
6435 ADDQ $0x02, CX
6436 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6437
6438 repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy:
6439 XORQ SI, SI
6440 LEAL 1(SI)(R12*4), R12
6441 MOVB R8, 1(CX)
6442 SARL $0x08, R8
6443 SHLL $0x05, R8
6444 ORL R8, R12
6445 MOVB R12, (CX)
6446 ADDQ $0x02, CX
6447 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6448
6449 four_bytes_remain_match_nolit_encodeBetterBlockAsm:
6450 TESTL R12, R12
6451 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm
6452 XORL SI, SI
6453 LEAL -1(SI)(R12*4), R12
6454 MOVB R12, (CX)
6455 MOVL R8, 1(CX)
6456 ADDQ $0x05, CX
6457 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6458
6459 two_byte_offset_match_nolit_encodeBetterBlockAsm:
6460 CMPL R12, $0x40
6461 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm
6462 CMPL R8, $0x00000800
6463 JAE long_offset_short_match_nolit_encodeBetterBlockAsm
6464 MOVL $0x00000001, SI
6465 LEAL 16(SI), SI
6466 MOVB R8, 1(CX)
6467 MOVL R8, R9
6468 SHRL $0x08, R9
6469 SHLL $0x05, R9
6470 ORL R9, SI
6471 MOVB SI, (CX)
6472 ADDQ $0x02, CX
6473 SUBL $0x08, R12
6474
6475 // emitRepeat
6476 LEAL -4(R12), R12
6477 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6478
6479 emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6480 MOVL R12, SI
6481 LEAL -4(R12), R12
6482 CMPL SI, $0x08
6483 JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6484 CMPL SI, $0x0c
6485 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6486 CMPL R8, $0x00000800
6487 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6488
6489 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6490 CMPL R12, $0x00000104
6491 JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6492 CMPL R12, $0x00010100
6493 JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6494 CMPL R12, $0x0100ffff
6495 JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6496 LEAL -16842747(R12), R12
6497 MOVL $0xfffb001d, (CX)
6498 MOVB $0xff, 4(CX)
6499 ADDQ $0x05, CX
6500 JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b
6501
6502 repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6503 LEAL -65536(R12), R12
6504 MOVL R12, R8
6505 MOVW $0x001d, (CX)
6506 MOVW R12, 2(CX)
6507 SARL $0x10, R8
6508 MOVB R8, 4(CX)
6509 ADDQ $0x05, CX
6510 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6511
6512 repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6513 LEAL -256(R12), R12
6514 MOVW $0x0019, (CX)
6515 MOVW R12, 2(CX)
6516 ADDQ $0x04, CX
6517 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6518
6519 repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6520 LEAL -4(R12), R12
6521 MOVW $0x0015, (CX)
6522 MOVB R12, 2(CX)
6523 ADDQ $0x03, CX
6524 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6525
6526 repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6527 SHLL $0x02, R12
6528 ORL $0x01, R12
6529 MOVW R12, (CX)
6530 ADDQ $0x02, CX
6531 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6532
6533 repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b:
6534 XORQ SI, SI
6535 LEAL 1(SI)(R12*4), R12
6536 MOVB R8, 1(CX)
6537 SARL $0x08, R8
6538 SHLL $0x05, R8
6539 ORL R8, R12
6540 MOVB R12, (CX)
6541 ADDQ $0x02, CX
6542 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6543
6544 long_offset_short_match_nolit_encodeBetterBlockAsm:
6545 MOVB $0xee, (CX)
6546 MOVW R8, 1(CX)
6547 LEAL -60(R12), R12
6548 ADDQ $0x03, CX
6549
6550 // emitRepeat
6551 emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6552 MOVL R12, SI
6553 LEAL -4(R12), R12
6554 CMPL SI, $0x08
6555 JBE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short
6556 CMPL SI, $0x0c
6557 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
6558 CMPL R8, $0x00000800
6559 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short
6560
6561 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6562 CMPL R12, $0x00000104
6563 JB repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short
6564 CMPL R12, $0x00010100
6565 JB repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short
6566 CMPL R12, $0x0100ffff
6567 JB repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short
6568 LEAL -16842747(R12), R12
6569 MOVL $0xfffb001d, (CX)
6570 MOVB $0xff, 4(CX)
6571 ADDQ $0x05, CX
6572 JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short
6573
6574 repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6575 LEAL -65536(R12), R12
6576 MOVL R12, R8
6577 MOVW $0x001d, (CX)
6578 MOVW R12, 2(CX)
6579 SARL $0x10, R8
6580 MOVB R8, 4(CX)
6581 ADDQ $0x05, CX
6582 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6583
6584 repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6585 LEAL -256(R12), R12
6586 MOVW $0x0019, (CX)
6587 MOVW R12, 2(CX)
6588 ADDQ $0x04, CX
6589 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6590
6591 repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6592 LEAL -4(R12), R12
6593 MOVW $0x0015, (CX)
6594 MOVB R12, 2(CX)
6595 ADDQ $0x03, CX
6596 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6597
6598 repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6599 SHLL $0x02, R12
6600 ORL $0x01, R12
6601 MOVW R12, (CX)
6602 ADDQ $0x02, CX
6603 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6604
6605 repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short:
6606 XORQ SI, SI
6607 LEAL 1(SI)(R12*4), R12
6608 MOVB R8, 1(CX)
6609 SARL $0x08, R8
6610 SHLL $0x05, R8
6611 ORL R8, R12
6612 MOVB R12, (CX)
6613 ADDQ $0x02, CX
6614 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6615
6616 two_byte_offset_short_match_nolit_encodeBetterBlockAsm:
6617 MOVL R12, SI
6618 SHLL $0x02, SI
6619 CMPL R12, $0x0c
6620 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm
6621 CMPL R8, $0x00000800
6622 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm
6623 LEAL -15(SI), SI
6624 MOVB R8, 1(CX)
6625 SHRL $0x08, R8
6626 SHLL $0x05, R8
6627 ORL R8, SI
6628 MOVB SI, (CX)
6629 ADDQ $0x02, CX
6630 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6631
6632 emit_copy_three_match_nolit_encodeBetterBlockAsm:
6633 LEAL -2(SI), SI
6634 MOVB SI, (CX)
6635 MOVW R8, 1(CX)
6636 ADDQ $0x03, CX
6637 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6638
6639 match_is_repeat_encodeBetterBlockAsm:
6640 MOVL 12(SP), SI
6641 CMPL SI, DI
6642 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
6643 MOVL DI, R9
6644 MOVL DI, 12(SP)
6645 LEAQ (BX)(SI*1), R10
6646 SUBL SI, R9
6647 LEAL -1(R9), SI
6648 CMPL SI, $0x3c
6649 JB one_byte_match_emit_repeat_encodeBetterBlockAsm
6650 CMPL SI, $0x00000100
6651 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm
6652 CMPL SI, $0x00010000
6653 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm
6654 CMPL SI, $0x01000000
6655 JB four_bytes_match_emit_repeat_encodeBetterBlockAsm
6656 MOVB $0xfc, (CX)
6657 MOVL SI, 1(CX)
6658 ADDQ $0x05, CX
6659 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
6660
6661 four_bytes_match_emit_repeat_encodeBetterBlockAsm:
6662 MOVL SI, R11
6663 SHRL $0x10, R11
6664 MOVB $0xf8, (CX)
6665 MOVW SI, 1(CX)
6666 MOVB R11, 3(CX)
6667 ADDQ $0x04, CX
6668 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
6669
6670 three_bytes_match_emit_repeat_encodeBetterBlockAsm:
6671 MOVB $0xf4, (CX)
6672 MOVW SI, 1(CX)
6673 ADDQ $0x03, CX
6674 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
6675
6676 two_bytes_match_emit_repeat_encodeBetterBlockAsm:
6677 MOVB $0xf0, (CX)
6678 MOVB SI, 1(CX)
6679 ADDQ $0x02, CX
6680 CMPL SI, $0x40
6681 JB memmove_match_emit_repeat_encodeBetterBlockAsm
6682 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm
6683
6684 one_byte_match_emit_repeat_encodeBetterBlockAsm:
6685 SHLB $0x02, SI
6686 MOVB SI, (CX)
6687 ADDQ $0x01, CX
6688
6689 memmove_match_emit_repeat_encodeBetterBlockAsm:
6690 LEAQ (CX)(R9*1), SI
6691
6692 // genMemMoveShort
6693 CMPQ R9, $0x04
6694 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4
6695 CMPQ R9, $0x08
6696 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7
6697 CMPQ R9, $0x10
6698 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16
6699 CMPQ R9, $0x20
6700 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32
6701 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64
6702
6703 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4:
6704 MOVL (R10), R11
6705 MOVL R11, (CX)
6706 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
6707
6708 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7:
6709 MOVL (R10), R11
6710 MOVL -4(R10)(R9*1), R10
6711 MOVL R11, (CX)
6712 MOVL R10, -4(CX)(R9*1)
6713 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
6714
6715 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16:
6716 MOVQ (R10), R11
6717 MOVQ -8(R10)(R9*1), R10
6718 MOVQ R11, (CX)
6719 MOVQ R10, -8(CX)(R9*1)
6720 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
6721
6722 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32:
6723 MOVOU (R10), X0
6724 MOVOU -16(R10)(R9*1), X1
6725 MOVOU X0, (CX)
6726 MOVOU X1, -16(CX)(R9*1)
6727 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm
6728
6729 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64:
6730 MOVOU (R10), X0
6731 MOVOU 16(R10), X1
6732 MOVOU -32(R10)(R9*1), X2
6733 MOVOU -16(R10)(R9*1), X3
6734 MOVOU X0, (CX)
6735 MOVOU X1, 16(CX)
6736 MOVOU X2, -32(CX)(R9*1)
6737 MOVOU X3, -16(CX)(R9*1)
6738
6739 memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm:
6740 MOVQ SI, CX
6741 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm
6742
6743 memmove_long_match_emit_repeat_encodeBetterBlockAsm:
6744 LEAQ (CX)(R9*1), SI
6745
6746 // genMemMoveLong
6747 MOVOU (R10), X0
6748 MOVOU 16(R10), X1
6749 MOVOU -32(R10)(R9*1), X2
6750 MOVOU -16(R10)(R9*1), X3
6751 MOVQ R9, R13
6752 SHRQ $0x05, R13
6753 MOVQ CX, R11
6754 ANDL $0x0000001f, R11
6755 MOVQ $0x00000040, R14
6756 SUBQ R11, R14
6757 DECQ R13
6758 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
6759 LEAQ -32(R10)(R14*1), R11
6760 LEAQ -32(CX)(R14*1), R15
6761
6762 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back:
6763 MOVOU (R11), X4
6764 MOVOU 16(R11), X5
6765 MOVOA X4, (R15)
6766 MOVOA X5, 16(R15)
6767 ADDQ $0x20, R15
6768 ADDQ $0x20, R11
6769 ADDQ $0x20, R14
6770 DECQ R13
6771 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back
6772
6773 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32:
6774 MOVOU -32(R10)(R14*1), X4
6775 MOVOU -16(R10)(R14*1), X5
6776 MOVOA X4, -32(CX)(R14*1)
6777 MOVOA X5, -16(CX)(R14*1)
6778 ADDQ $0x20, R14
6779 CMPQ R9, R14
6780 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32
6781 MOVOU X0, (CX)
6782 MOVOU X1, 16(CX)
6783 MOVOU X2, -32(CX)(R9*1)
6784 MOVOU X3, -16(CX)(R9*1)
6785 MOVQ SI, CX
6786
6787 emit_literal_done_match_emit_repeat_encodeBetterBlockAsm:
6788 ADDL R12, DX
6789 ADDL $0x04, R12
6790 MOVL DX, 12(SP)
6791
6792 // emitRepeat
6793 emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm:
6794 MOVL R12, SI
6795 LEAL -4(R12), R12
6796 CMPL SI, $0x08
6797 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm
6798 CMPL SI, $0x0c
6799 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
6800 CMPL R8, $0x00000800
6801 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm
6802
6803 cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
6804 CMPL R12, $0x00000104
6805 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm
6806 CMPL R12, $0x00010100
6807 JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm
6808 CMPL R12, $0x0100ffff
6809 JB repeat_five_match_nolit_repeat_encodeBetterBlockAsm
6810 LEAL -16842747(R12), R12
6811 MOVL $0xfffb001d, (CX)
6812 MOVB $0xff, 4(CX)
6813 ADDQ $0x05, CX
6814 JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm
6815
6816 repeat_five_match_nolit_repeat_encodeBetterBlockAsm:
6817 LEAL -65536(R12), R12
6818 MOVL R12, R8
6819 MOVW $0x001d, (CX)
6820 MOVW R12, 2(CX)
6821 SARL $0x10, R8
6822 MOVB R8, 4(CX)
6823 ADDQ $0x05, CX
6824 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6825
6826 repeat_four_match_nolit_repeat_encodeBetterBlockAsm:
6827 LEAL -256(R12), R12
6828 MOVW $0x0019, (CX)
6829 MOVW R12, 2(CX)
6830 ADDQ $0x04, CX
6831 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6832
6833 repeat_three_match_nolit_repeat_encodeBetterBlockAsm:
6834 LEAL -4(R12), R12
6835 MOVW $0x0015, (CX)
6836 MOVB R12, 2(CX)
6837 ADDQ $0x03, CX
6838 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6839
6840 repeat_two_match_nolit_repeat_encodeBetterBlockAsm:
6841 SHLL $0x02, R12
6842 ORL $0x01, R12
6843 MOVW R12, (CX)
6844 ADDQ $0x02, CX
6845 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm
6846
6847 repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm:
6848 XORQ SI, SI
6849 LEAL 1(SI)(R12*4), R12
6850 MOVB R8, 1(CX)
6851 SARL $0x08, R8
6852 SHLL $0x05, R8
6853 ORL R8, R12
6854 MOVB R12, (CX)
6855 ADDQ $0x02, CX
6856
6857 match_nolit_emitcopy_end_encodeBetterBlockAsm:
6858 CMPL DX, 8(SP)
6859 JAE emit_remainder_encodeBetterBlockAsm
6860 CMPQ CX, (SP)
6861 JB match_nolit_dst_ok_encodeBetterBlockAsm
6862 MOVQ $0x00000000, ret+56(FP)
6863 RET
6864
6865 match_nolit_dst_ok_encodeBetterBlockAsm:
6866 MOVQ $0x00cf1bbcdcbfa563, SI
6867 MOVQ $0x9e3779b1, R8
6868 LEAQ 1(DI), DI
6869 LEAQ -2(DX), R9
6870 MOVQ (BX)(DI*1), R10
6871 MOVQ 1(BX)(DI*1), R11
6872 MOVQ (BX)(R9*1), R12
6873 MOVQ 1(BX)(R9*1), R13
6874 SHLQ $0x08, R10
6875 IMULQ SI, R10
6876 SHRQ $0x2f, R10
6877 SHLQ $0x20, R11
6878 IMULQ R8, R11
6879 SHRQ $0x32, R11
6880 SHLQ $0x08, R12
6881 IMULQ SI, R12
6882 SHRQ $0x2f, R12
6883 SHLQ $0x20, R13
6884 IMULQ R8, R13
6885 SHRQ $0x32, R13
6886 LEAQ 1(DI), R8
6887 LEAQ 1(R9), R14
6888 MOVL DI, (AX)(R10*4)
6889 MOVL R9, (AX)(R12*4)
6890 MOVL R8, 524288(AX)(R11*4)
6891 MOVL R14, 524288(AX)(R13*4)
6892 LEAQ 1(R9)(DI*1), R8
6893 SHRQ $0x01, R8
6894 ADDQ $0x01, DI
6895 SUBQ $0x01, R9
6896
6897 index_loop_encodeBetterBlockAsm:
6898 CMPQ R8, R9
6899 JAE search_loop_encodeBetterBlockAsm
6900 MOVQ (BX)(DI*1), R10
6901 MOVQ (BX)(R8*1), R11
6902 SHLQ $0x08, R10
6903 IMULQ SI, R10
6904 SHRQ $0x2f, R10
6905 SHLQ $0x08, R11
6906 IMULQ SI, R11
6907 SHRQ $0x2f, R11
6908 MOVL DI, (AX)(R10*4)
6909 MOVL R8, (AX)(R11*4)
6910 ADDQ $0x02, DI
6911 ADDQ $0x02, R8
6912 JMP index_loop_encodeBetterBlockAsm
6913
// Final stage: the src bytes from offset 12(SP) up to src_len have not been
// emitted yet. Check that dst has room for them before writing anything.
6914 emit_remainder_encodeBetterBlockAsm:
// AX = src_len - 12(SP): number of bytes still pending (32-bit subtract).
6915 MOVQ src_len+32(FP), AX
6916 SUBL 12(SP), AX
// AX = CX + pending + 5: where the output cursor CX would end if the largest
// literal header used by the code that follows (5 bytes) were needed.
6917 LEAQ 5(CX)(AX*1), AX
// (SP) holds a dst limit pointer that is set up earlier in this function,
// outside this excerpt. Continue only if the worst-case end is below it.
6918 CMPQ AX, (SP)
6919 JB emit_remainder_ok_encodeBetterBlockAsm
// Not enough room: return 0.
6920 MOVQ $0x00000000, ret+56(FP)
6921 RET
6922
// Writes the header of the final literal run at the output cursor CX and
// picks the copy routine for the payload. The header size depends on n-1,
// where n is the number of pending bytes. The tag values written here
// (0xf0, 0xf4, 0xf8, 0xfc = 60<<2 .. 63<<2, or (n-1)<<2 for short runs)
// look like the Snappy/S2 literal-tag layout -- confirm against the format
// description.
6923 emit_remainder_ok_encodeBetterBlockAsm:
// AX = src_len, DX = 12(SP) (offset of the first pending byte).
6924 MOVQ src_len+32(FP), AX
6925 MOVL 12(SP), DX
// Nothing pending: skip straight to computing the return value.
6926 CMPL DX, AX
6927 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm
6928 MOVL AX, SI
// Record that everything up to src_len is now accounted for.
6929 MOVL AX, 12(SP)
// AX = BX + DX: address of the first pending byte.
// NOTE(review): BX is presumably src_base, loaded before this excerpt.
6930 LEAQ (BX)(DX*1), AX
// SI = n (pending byte count), DX = n-1.
6931 SUBL DX, SI
6932 LEAL -1(SI), DX
// Select the header form by the magnitude of n-1.
6933 CMPL DX, $0x3c
6934 JB one_byte_emit_remainder_encodeBetterBlockAsm
6935 CMPL DX, $0x00000100
6936 JB two_bytes_emit_remainder_encodeBetterBlockAsm
6937 CMPL DX, $0x00010000
6938 JB three_bytes_emit_remainder_encodeBetterBlockAsm
6939 CMPL DX, $0x01000000
6940 JB four_bytes_emit_remainder_encodeBetterBlockAsm
// n-1 >= 1<<24: tag 0xfc followed by n-1 as 4 bytes (5-byte header).
6941 MOVB $0xfc, (CX)
6942 MOVL DX, 1(CX)
6943 ADDQ $0x05, CX
6944 JMP memmove_long_emit_remainder_encodeBetterBlockAsm
6945
// n-1 < 1<<24: tag 0xf8 followed by n-1 as 3 bytes (4-byte header).
6946 four_bytes_emit_remainder_encodeBetterBlockAsm:
// BX is reused as scratch for bits 16..23 of n-1. The copy routine jumped
// to below reloads BX from SI and reads the source through AX.
6947 MOVL DX, BX
6948 SHRL $0x10, BX
6949 MOVB $0xf8, (CX)
6950 MOVW DX, 1(CX)
6951 MOVB BL, 3(CX)
6952 ADDQ $0x04, CX
6953 JMP memmove_long_emit_remainder_encodeBetterBlockAsm
6954
// n-1 < 1<<16: tag 0xf4 followed by n-1 as 2 bytes (3-byte header).
6955 three_bytes_emit_remainder_encodeBetterBlockAsm:
6956 MOVB $0xf4, (CX)
6957 MOVW DX, 1(CX)
6958 ADDQ $0x03, CX
6959 JMP memmove_long_emit_remainder_encodeBetterBlockAsm
6960
// n-1 < 256: tag 0xf0 followed by n-1 as 1 byte (2-byte header).
6961 two_bytes_emit_remainder_encodeBetterBlockAsm:
6962 MOVB $0xf0, (CX)
6963 MOVB DL, 1(CX)
6964 ADDQ $0x02, CX
// n <= 64 still fits the short copy routine; anything longer uses the long one.
6965 CMPL DX, $0x40
6966 JB memmove_emit_remainder_encodeBetterBlockAsm
6967 JMP memmove_long_emit_remainder_encodeBetterBlockAsm
6968
// n-1 < 60: the header is the single byte (n-1)<<2. Execution then falls
// through into the short copy routine that follows this label's code.
6969 one_byte_emit_remainder_encodeBetterBlockAsm:
6970 SHLB $0x02, DL
6971 MOVB DL, (CX)
6972 ADDQ $0x01, CX
6973
// Short copy of the final literal payload: SI bytes from (AX) to (CX).
// The paths that reach this label have 1 <= SI <= 64. Each size class loads
// the first and last chunk of the range (they may overlap) before storing
// either, so no loop is needed.
6974 memmove_emit_remainder_encodeBetterBlockAsm:
// DX = CX + SI: output cursor after the copy. BX = length.
6975 LEAQ (CX)(SI*1), DX
6976 MOVL SI, BX
6977
6978 // genMemMoveShort
// Dispatch on length: <3, ==3, 4..7, 8..16, 17..32, otherwise 33..64.
6979 CMPQ BX, $0x03
6980 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2
6981 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3
6982 CMPQ BX, $0x08
6983 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7
6984 CMPQ BX, $0x10
6985 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16
6986 CMPQ BX, $0x20
6987 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32
6988 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64
6989
// 1 or 2 bytes: first byte and last byte (the same byte when the length is 1).
6990 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2:
6991 MOVB (AX), SI
6992 MOVB -1(AX)(BX*1), AL
6993 MOVB SI, (CX)
6994 MOVB AL, -1(CX)(BX*1)
6995 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
6996
// Exactly 3 bytes: one 2-byte move plus one 1-byte move.
6997 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3:
6998 MOVW (AX), SI
6999 MOVB 2(AX), AL
7000 MOVW SI, (CX)
7001 MOVB AL, 2(CX)
7002 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
7003
// 4..7 bytes: two 4-byte moves covering the head and the tail.
7004 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7:
7005 MOVL (AX), SI
7006 MOVL -4(AX)(BX*1), AX
7007 MOVL SI, (CX)
7008 MOVL AX, -4(CX)(BX*1)
7009 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
7010
// 8..16 bytes: two 8-byte moves covering the head and the tail.
7011 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16:
7012 MOVQ (AX), SI
7013 MOVQ -8(AX)(BX*1), AX
7014 MOVQ SI, (CX)
7015 MOVQ AX, -8(CX)(BX*1)
7016 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
7017
// 17..32 bytes: two unaligned 16-byte moves covering the head and the tail.
7018 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32:
7019 MOVOU (AX), X0
7020 MOVOU -16(AX)(BX*1), X1
7021 MOVOU X0, (CX)
7022 MOVOU X1, -16(CX)(BX*1)
7023 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm
7024
// 33..64 bytes: first 32 and last 32 bytes as four unaligned 16-byte moves.
7025 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64:
7026 MOVOU (AX), X0
7027 MOVOU 16(AX), X1
7028 MOVOU -32(AX)(BX*1), X2
7029 MOVOU -16(AX)(BX*1), X3
7030 MOVOU X0, (CX)
7031 MOVOU X1, 16(CX)
7032 MOVOU X2, -32(CX)(BX*1)
7033 MOVOU X3, -16(CX)(BX*1)
7034
// Advance the output cursor past the copied bytes.
7035 memmove_end_copy_emit_remainder_encodeBetterBlockAsm:
7036 MOVQ DX, CX
7037 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm
7038
// Long copy of the final literal payload: SI bytes from (AX) to (CX).
// The jumps to this label visible in this function occur only when the
// length is at least 65 bytes.
// Strategy: keep the first and last 32 source bytes in X0..X3, copy the
// middle in 32-byte blocks using aligned stores, then store the saved head
// and tail with unaligned stores.
7039 memmove_long_emit_remainder_encodeBetterBlockAsm:
// DX = CX + SI: output cursor after the copy. BX = length.
7040 LEAQ (CX)(SI*1), DX
7041 MOVL SI, BX
7042
7043 // genMemMoveLong
// Save the head (X0, X1) and tail (X2, X3) of the source range.
7044 MOVOU (AX), X0
7045 MOVOU 16(AX), X1
7046 MOVOU -32(AX)(BX*1), X2
7047 MOVOU -16(AX)(BX*1), X3
// DI = length / 32.
7048 MOVQ BX, DI
7049 SHRQ $0x05, DI
// SI = CX & 31 and R8 = 64 - SI, so CX + R8 - 32 is the next 32-byte aligned
// dst address; this is what makes the MOVOA stores below legal.
7050 MOVQ CX, SI
7051 ANDL $0x0000001f, SI
7052 MOVQ $0x00000040, R8
7053 SUBQ SI, R8
// NOTE(review): DECQ updates ZF but not CF. CF still comes from the SUBQ
// above, which cannot borrow because SI <= 31, so this JA is taken whenever
// DI-1 != 0. With lengths >= 65 (DI >= 2) it is always taken.
7054 DECQ DI
7055 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
7056 LEAQ -32(AX)(R8*1), SI
7057 LEAQ -32(CX)(R8*1), R9
7058
// Pointer-walking variant of the block copy. It is entered only when the JA
// above falls through.
7059 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back:
7060 MOVOU (SI), X4
7061 MOVOU 16(SI), X5
7062 MOVOA X4, (R9)
7063 MOVOA X5, 16(R9)
7064 ADDQ $0x20, R9
7065 ADDQ $0x20, SI
7066 ADDQ $0x20, R8
7067 DECQ DI
7068 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back
7069
// Indexed block copy: move the 32 bytes at offsets [R8-32, R8), then advance
// R8 by 32, and repeat while the length BX is still >= the advanced R8.
7070 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32:
7071 MOVOU -32(AX)(R8*1), X4
7072 MOVOU -16(AX)(R8*1), X5
7073 MOVOA X4, -32(CX)(R8*1)
7074 MOVOA X5, -16(CX)(R8*1)
7075 ADDQ $0x20, R8
7076 CMPQ BX, R8
7077 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32
// Store the saved head and tail. They cover the bytes before the first
// aligned block and after the last full block.
7078 MOVOU X0, (CX)
7079 MOVOU X1, 16(CX)
7080 MOVOU X2, -32(CX)(BX*1)
7081 MOVOU X3, -16(CX)(BX*1)
// Advance the output cursor past the copied bytes.
7082 MOVQ DX, CX
7083
// Successful exit: return the distance between the output cursor CX and the
// start of dst, i.e. CX - dst_base.
7084 emit_literal_done_emit_remainder_encodeBetterBlockAsm:
7085 MOVQ dst_base+0(FP), AX
7086 SUBQ AX, CX
7087 MOVQ CX, ret+56(FP)
7088 RET
7089
7090 // func encodeBetterBlockAsm4MB(dst []byte, src []byte, tmp *[589824]byte) int
7091 // Requires: BMI, SSE2
// Prologue notes (this covers only the setup; the body continues below):
//   AX     = tmp (table memory), CX = dst_base (output cursor), BX = src_base
//   DX     = current position in src, starting at 1
//   (SP)   = dst limit pointer: dst_base + (src_len - 6) - (src_len >> 5)
//   8(SP)  = src_len - 8, compared against the next position in the search loop
//   12(SP) = 0 initially; subtracted from the position and used as the start
//            of the pending literals later in the function
//   16(SP) = 0 initially; compared with and overwritten by the match offset
//            in the "Check if repeat" code later in the function
// The frame is $24-64: 24 bytes of locals, 64 bytes of arguments plus result.
7092 TEXT ·encodeBetterBlockAsm4MB(SB), $24-64
7093 MOVQ tmp+48(FP), AX
7094 MOVQ dst_base+0(FP), CX
// 0x1200 iterations of 128 bytes each = 589824 bytes, the whole tmp array.
7095 MOVQ $0x00001200, DX
7096 MOVQ AX, BX
7097 PXOR X0, X0
7098
// Zero tmp, 8 x 16 bytes per iteration.
7099 zero_loop_encodeBetterBlockAsm4MB:
7100 MOVOU X0, (BX)
7101 MOVOU X0, 16(BX)
7102 MOVOU X0, 32(BX)
7103 MOVOU X0, 48(BX)
7104 MOVOU X0, 64(BX)
7105 MOVOU X0, 80(BX)
7106 MOVOU X0, 96(BX)
7107 MOVOU X0, 112(BX)
7108 ADDQ $0x80, BX
7109 DECQ DX
7110 JNZ zero_loop_encodeBetterBlockAsm4MB
7111 MOVL $0x00000000, 12(SP)
7112 MOVQ src_len+32(FP), DX
// BX = src_len - 6, SI = src_len - 8 (stored as a 32-bit value in 8(SP)).
7113 LEAQ -6(DX), BX
7114 LEAQ -8(DX), SI
7115 MOVL SI, 8(SP)
// BX -= src_len / 32 (32-bit subtract), then turn it into a dst pointer.
7116 SHRQ $0x05, DX
7117 SUBL DX, BX
7118 LEAQ (CX)(BX*1), BX
7119 MOVQ BX, (SP)
7120 MOVL $0x00000001, DX
7121 MOVL $0x00000000, 16(SP)
7122 MOVQ src_base+24(FP), BX
7123
// One probe of the match search at position DX.
// Registers on entry: AX = tmp tables, BX = src_base, DX = position.
// tmp holds two tables of 32-bit positions:
//   (AX)        indexed by a 17-bit hash of the low 7 loaded bytes
//   524288(AX)  indexed by a 14-bit hash of the low 4 loaded bytes
// 524288 = (1<<17) * 4, so the second table starts right after the first.
7124 search_loop_encodeBetterBlockAsm4MB:
// SI = (DX - 12(SP)) >> 7 is the extra step; it grows with the distance from
// the position stored in 12(SP).
7125 MOVL DX, SI
7126 SUBL 12(SP), SI
7127 SHRL $0x07, SI
// If the extra step exceeds 99, the next position is DX + 100.
7128 CMPL SI, $0x63
7129 JBE check_maxskip_ok_encodeBetterBlockAsm4MB
7130 LEAL 100(DX), SI
7131 JMP check_maxskip_cont_encodeBetterBlockAsm4MB
7132
// Otherwise the next position is DX + 1 + extra step.
7133 check_maxskip_ok_encodeBetterBlockAsm4MB:
7134 LEAL 1(DX)(SI*1), SI
7135
7136 check_maxskip_cont_encodeBetterBlockAsm4MB:
// Stop searching once the next position reaches the limit in 8(SP).
7137 CMPL SI, 8(SP)
7138 JAE emit_remainder_encodeBetterBlockAsm4MB
// DI = 8 bytes of src at DX. The next position is parked in 20(SP).
7139 MOVQ (BX)(DX*1), DI
7140 MOVL SI, 20(SP)
7141 MOVQ $0x00cf1bbcdcbfa563, R9
7142 MOVQ $0x9e3779b1, SI
7143 MOVQ DI, R10
7144 MOVQ DI, R11
// R10 = ((DI << 8) * R9) >> 47: the shift drops the top byte, and the result
// is a 17-bit index.
7145 SHLQ $0x08, R10
7146 IMULQ R9, R10
7147 SHRQ $0x2f, R10
// R11 = ((DI << 32) * 0x9e3779b1) >> 50: only the low 4 bytes contribute,
// and the result is a 14-bit index.
7148 SHLQ $0x20, R11
7149 IMULQ SI, R11
7150 SHRQ $0x32, R11
// Fetch the previous positions (SI from the first table, R8 from the second)
// and record the current position in both slots.
7151 MOVL (AX)(R10*4), SI
7152 MOVL 524288(AX)(R11*4), R8
7153 MOVL DX, (AX)(R10*4)
7154 MOVL DX, 524288(AX)(R11*4)
// Load 8 bytes at each candidate and look for a full 8-byte match first.
7155 MOVQ (BX)(SI*1), R10
7156 MOVQ (BX)(R8*1), R11
7157 CMPQ R10, DI
7158 JEQ candidate_match_encodeBetterBlockAsm4MB
7159 CMPQ R11, DI
7160 JNE no_short_found_encodeBetterBlockAsm4MB
// The second-table candidate matched all 8 bytes: use it as the candidate.
7161 MOVL R8, SI
7162 JMP candidate_match_encodeBetterBlockAsm4MB
7163
// No 8-byte match: accept a match on the low 4 bytes instead.
7164 no_short_found_encodeBetterBlockAsm4MB:
7165 CMPL R10, DI
7166 JEQ candidate_match_encodeBetterBlockAsm4MB
7167 CMPL R11, DI
7168 JEQ candidateS_match_encodeBetterBlockAsm4MB
// Nothing matched: continue from the position parked in 20(SP).
7169 MOVL 20(SP), DX
7170 JMP search_loop_encodeBetterBlockAsm4MB
7171
7172 candidateS_match_encodeBetterBlockAsm4MB:
7173 SHRQ $0x08, DI
7174 MOVQ DI, R10
7175 SHLQ $0x08, R10
7176 IMULQ R9, R10
7177 SHRQ $0x2f, R10
7178 MOVL (AX)(R10*4), SI
7179 INCL DX
7180 MOVL DX, (AX)(R10*4)
7181 CMPL (BX)(SI*1), DI
7182 JEQ candidate_match_encodeBetterBlockAsm4MB
7183 DECL DX
7184 MOVL R8, SI
7185
7186 candidate_match_encodeBetterBlockAsm4MB:
7187 MOVL 12(SP), DI
7188 TESTL SI, SI
7189 JZ match_extend_back_end_encodeBetterBlockAsm4MB
7190
7191 match_extend_back_loop_encodeBetterBlockAsm4MB:
7192 CMPL DX, DI
7193 JBE match_extend_back_end_encodeBetterBlockAsm4MB
7194 MOVB -1(BX)(SI*1), R8
7195 MOVB -1(BX)(DX*1), R9
7196 CMPB R8, R9
7197 JNE match_extend_back_end_encodeBetterBlockAsm4MB
7198 LEAL -1(DX), DX
7199 DECL SI
7200 JZ match_extend_back_end_encodeBetterBlockAsm4MB
7201 JMP match_extend_back_loop_encodeBetterBlockAsm4MB
7202
7203 match_extend_back_end_encodeBetterBlockAsm4MB:
7204 MOVL DX, DI
7205 SUBL 12(SP), DI
7206 LEAQ 4(CX)(DI*1), DI
7207 CMPQ DI, (SP)
7208 JB match_dst_size_check_encodeBetterBlockAsm4MB
7209 MOVQ $0x00000000, ret+56(FP)
7210 RET
7211
7212 match_dst_size_check_encodeBetterBlockAsm4MB:
7213 MOVL DX, DI
7214 ADDL $0x04, DX
7215 ADDL $0x04, SI
7216 MOVQ src_len+32(FP), R8
7217 SUBL DX, R8
7218 LEAQ (BX)(DX*1), R9
7219 LEAQ (BX)(SI*1), R10
7220
7221 // matchLen
// Counts how many leading bytes at (R9) and (R10) are equal, looking at no
// more than R8 bytes. The count is accumulated in R12 and R8 is decremented
// as bytes are consumed. The instructions just before this region set
// R8 = src_len - DX, R9 = BX + DX and R10 = BX + SI.
7222 XORL R12, R12
7223
// Compare 16 bytes per iteration while at least 16 remain.
7224 matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB:
7225 CMPL R8, $0x10
7226 JB matchlen_match8_match_nolit_encodeBetterBlockAsm4MB
7227 MOVQ (R9)(R12*1), R11
7228 MOVQ 8(R9)(R12*1), R13
// A non-zero XOR means the two 8-byte words differ somewhere.
7229 XORQ (R10)(R12*1), R11
7230 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB
7231 XORQ 8(R10)(R12*1), R13
7232 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB
7233 LEAL -16(R8), R8
7234 LEAL 16(R12), R12
7235 JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB
7236
// Mismatch in the second word: the index of the lowest set bit divided by 8
// is the number of equal bytes in that word (loads are little-endian).
// Add it plus the 8 bytes of the first word.
7237 matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB:
7238 #ifdef GOAMD64_v3
7239 TZCNTQ R13, R13
7240
7241 #else
7242 BSFQ R13, R13
7243
7244 #endif
7245 SARQ $0x03, R13
7246 LEAL 8(R12)(R13*1), R12
7247 JMP match_nolit_end_encodeBetterBlockAsm4MB
7248
// Fewer than 16 bytes remain: try one 8-byte compare.
7249 matchlen_match8_match_nolit_encodeBetterBlockAsm4MB:
7250 CMPL R8, $0x08
7251 JB matchlen_match4_match_nolit_encodeBetterBlockAsm4MB
7252 MOVQ (R9)(R12*1), R11
7253 XORQ (R10)(R12*1), R11
7254 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB
7255 LEAL -8(R8), R8
7256 LEAL 8(R12), R12
7257 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm4MB
7258
// Mismatch inside the word held in R11: add the number of equal low bytes.
7259 matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB:
7260 #ifdef GOAMD64_v3
7261 TZCNTQ R11, R11
7262
7263 #else
7264 BSFQ R11, R11
7265
7266 #endif
7267 SARQ $0x03, R11
7268 LEAL (R12)(R11*1), R12
7269 JMP match_nolit_end_encodeBetterBlockAsm4MB
7270
// Try a 4-byte compare. On a mismatch the 2-byte and 1-byte steps below
// narrow the result down.
7271 matchlen_match4_match_nolit_encodeBetterBlockAsm4MB:
7272 CMPL R8, $0x04
7273 JB matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
7274 MOVL (R9)(R12*1), R11
7275 CMPL (R10)(R12*1), R11
7276 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB
7277 LEAL -4(R8), R8
7278 LEAL 4(R12), R12
7279
// Exactly 1 byte left goes to the 1-byte step, 0 bytes left means done,
// otherwise compare 2 bytes.
7280 matchlen_match2_match_nolit_encodeBetterBlockAsm4MB:
7281 CMPL R8, $0x01
7282 JE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
7283 JB match_nolit_end_encodeBetterBlockAsm4MB
7284 MOVW (R9)(R12*1), R11
7285 CMPW (R10)(R12*1), R11
7286 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB
7287 LEAL 2(R12), R12
7288 SUBL $0x02, R8
7289 JZ match_nolit_end_encodeBetterBlockAsm4MB
7290
// Final single-byte compare. Execution then continues at the end label that
// follows this region.
7291 matchlen_match1_match_nolit_encodeBetterBlockAsm4MB:
7292 MOVB (R9)(R12*1), R11
7293 CMPB (R10)(R12*1), R11
7294 JNE match_nolit_end_encodeBetterBlockAsm4MB
7295 LEAL 1(R12), R12
7296
7297 match_nolit_end_encodeBetterBlockAsm4MB:
7298 MOVL DX, R8
7299 SUBL SI, R8
7300
7301 // Check if repeat
7302 CMPL 16(SP), R8
7303 JEQ match_is_repeat_encodeBetterBlockAsm4MB
7304 CMPL R12, $0x01
7305 JA match_length_ok_encodeBetterBlockAsm4MB
7306 CMPL R8, $0x0000ffff
7307 JBE match_length_ok_encodeBetterBlockAsm4MB
7308 MOVL 20(SP), DX
7309 INCL DX
7310 JMP search_loop_encodeBetterBlockAsm4MB
7311
7312 match_length_ok_encodeBetterBlockAsm4MB:
7313 MOVL R8, 16(SP)
7314 MOVL 12(SP), SI
7315 CMPL SI, DI
7316 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB
7317 MOVL DI, R9
7318 MOVL DI, 12(SP)
7319 LEAQ (BX)(SI*1), R10
7320 SUBL SI, R9
7321 LEAL -1(R9), SI
7322 CMPL SI, $0x3c
7323 JB one_byte_match_emit_encodeBetterBlockAsm4MB
7324 CMPL SI, $0x00000100
7325 JB two_bytes_match_emit_encodeBetterBlockAsm4MB
7326 CMPL SI, $0x00010000
7327 JB three_bytes_match_emit_encodeBetterBlockAsm4MB
7328 MOVL SI, R11
7329 SHRL $0x10, R11
7330 MOVB $0xf8, (CX)
7331 MOVW SI, 1(CX)
7332 MOVB R11, 3(CX)
7333 ADDQ $0x04, CX
7334 JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
7335
7336 three_bytes_match_emit_encodeBetterBlockAsm4MB:
7337 MOVB $0xf4, (CX)
7338 MOVW SI, 1(CX)
7339 ADDQ $0x03, CX
7340 JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
7341
7342 two_bytes_match_emit_encodeBetterBlockAsm4MB:
7343 MOVB $0xf0, (CX)
7344 MOVB SI, 1(CX)
7345 ADDQ $0x02, CX
7346 CMPL SI, $0x40
7347 JB memmove_match_emit_encodeBetterBlockAsm4MB
7348 JMP memmove_long_match_emit_encodeBetterBlockAsm4MB
7349
7350 one_byte_match_emit_encodeBetterBlockAsm4MB:
7351 SHLB $0x02, SI
7352 MOVB SI, (CX)
7353 ADDQ $0x01, CX
7354
7355 memmove_match_emit_encodeBetterBlockAsm4MB:
7356 LEAQ (CX)(R9*1), SI
7357
7358 // genMemMoveShort
7359 CMPQ R9, $0x04
7360 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4
7361 CMPQ R9, $0x08
7362 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7
7363 CMPQ R9, $0x10
7364 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16
7365 CMPQ R9, $0x20
7366 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32
7367 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64
7368
7369 emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4:
7370 MOVL (R10), R11
7371 MOVL R11, (CX)
7372 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
7373
7374 emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7:
7375 MOVL (R10), R11
7376 MOVL -4(R10)(R9*1), R10
7377 MOVL R11, (CX)
7378 MOVL R10, -4(CX)(R9*1)
7379 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
7380
7381 emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16:
7382 MOVQ (R10), R11
7383 MOVQ -8(R10)(R9*1), R10
7384 MOVQ R11, (CX)
7385 MOVQ R10, -8(CX)(R9*1)
7386 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
7387
7388 emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32:
7389 MOVOU (R10), X0
7390 MOVOU -16(R10)(R9*1), X1
7391 MOVOU X0, (CX)
7392 MOVOU X1, -16(CX)(R9*1)
7393 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB
7394
7395 emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64:
7396 MOVOU (R10), X0
7397 MOVOU 16(R10), X1
7398 MOVOU -32(R10)(R9*1), X2
7399 MOVOU -16(R10)(R9*1), X3
7400 MOVOU X0, (CX)
7401 MOVOU X1, 16(CX)
7402 MOVOU X2, -32(CX)(R9*1)
7403 MOVOU X3, -16(CX)(R9*1)
7404
7405 memmove_end_copy_match_emit_encodeBetterBlockAsm4MB:
7406 MOVQ SI, CX
7407 JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB
7408
7409 memmove_long_match_emit_encodeBetterBlockAsm4MB:
7410 LEAQ (CX)(R9*1), SI
7411
7412 // genMemMoveLong
7413 MOVOU (R10), X0
7414 MOVOU 16(R10), X1
7415 MOVOU -32(R10)(R9*1), X2
7416 MOVOU -16(R10)(R9*1), X3
7417 MOVQ R9, R13
7418 SHRQ $0x05, R13
7419 MOVQ CX, R11
7420 ANDL $0x0000001f, R11
7421 MOVQ $0x00000040, R14
7422 SUBQ R11, R14
7423 DECQ R13
7424 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
7425 LEAQ -32(R10)(R14*1), R11
7426 LEAQ -32(CX)(R14*1), R15
7427
7428 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back:
7429 MOVOU (R11), X4
7430 MOVOU 16(R11), X5
7431 MOVOA X4, (R15)
7432 MOVOA X5, 16(R15)
7433 ADDQ $0x20, R15
7434 ADDQ $0x20, R11
7435 ADDQ $0x20, R14
7436 DECQ R13
7437 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back
7438
7439 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
7440 MOVOU -32(R10)(R14*1), X4
7441 MOVOU -16(R10)(R14*1), X5
7442 MOVOA X4, -32(CX)(R14*1)
7443 MOVOA X5, -16(CX)(R14*1)
7444 ADDQ $0x20, R14
7445 CMPQ R9, R14
7446 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
7447 MOVOU X0, (CX)
7448 MOVOU X1, 16(CX)
7449 MOVOU X2, -32(CX)(R9*1)
7450 MOVOU X3, -16(CX)(R9*1)
7451 MOVQ SI, CX
7452
7453 emit_literal_done_match_emit_encodeBetterBlockAsm4MB:
7454 ADDL R12, DX
7455 ADDL $0x04, R12
7456 MOVL DX, 12(SP)
7457
7458 // emitCopy
7459 CMPL R8, $0x00010000
7460 JB two_byte_offset_match_nolit_encodeBetterBlockAsm4MB
7461 CMPL R12, $0x40
7462 JBE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
7463 MOVB $0xff, (CX)
7464 MOVL R8, 1(CX)
7465 LEAL -64(R12), R12
7466 ADDQ $0x05, CX
7467 CMPL R12, $0x04
7468 JB four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB
7469
7470 // emitRepeat
7471 MOVL R12, SI
7472 LEAL -4(R12), R12
7473 CMPL SI, $0x08
7474 JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy
7475 CMPL SI, $0x0c
7476 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
7477 CMPL R8, $0x00000800
7478 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy
7479
7480 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
7481 CMPL R12, $0x00000104
7482 JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy
7483 CMPL R12, $0x00010100
7484 JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy
7485 LEAL -65536(R12), R12
7486 MOVL R12, R8
7487 MOVW $0x001d, (CX)
7488 MOVW R12, 2(CX)
7489 SARL $0x10, R8
7490 MOVB R8, 4(CX)
7491 ADDQ $0x05, CX
7492 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7493
7494 repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
7495 LEAL -256(R12), R12
7496 MOVW $0x0019, (CX)
7497 MOVW R12, 2(CX)
7498 ADDQ $0x04, CX
7499 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7500
7501 repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
7502 LEAL -4(R12), R12
7503 MOVW $0x0015, (CX)
7504 MOVB R12, 2(CX)
7505 ADDQ $0x03, CX
7506 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7507
7508 repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
7509 SHLL $0x02, R12
7510 ORL $0x01, R12
7511 MOVW R12, (CX)
7512 ADDQ $0x02, CX
7513 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7514
7515 repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy:
7516 XORQ SI, SI
7517 LEAL 1(SI)(R12*4), R12
7518 MOVB R8, 1(CX)
7519 SARL $0x08, R8
7520 SHLL $0x05, R8
7521 ORL R8, R12
7522 MOVB R12, (CX)
7523 ADDQ $0x02, CX
7524 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7525
7526 four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB:
7527 TESTL R12, R12
7528 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7529 XORL SI, SI
7530 LEAL -1(SI)(R12*4), R12
7531 MOVB R12, (CX)
7532 MOVL R8, 1(CX)
7533 ADDQ $0x05, CX
7534 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7535
// Copy emission for offsets below 0x10000 (the emitCopy code above branches
// here on that condition). R8 = match offset, R12 = length still to encode,
// CX = output cursor.
7536 two_byte_offset_match_nolit_encodeBetterBlockAsm4MB:
// Lengths up to 64 are handled by the short form.
7537 CMPL R12, $0x40
7538 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB
// Offsets of 2048 or more take the long-offset path.
7539 CMPL R8, $0x00000800
7540 JAE long_offset_short_match_nolit_encodeBetterBlockAsm4MB
// Offset < 2048 and length > 64: emit a 2-byte element first.
// The tag byte is 0x11 (1 + 16) with (offset >> 8) << 5 ORed into its top
// bits; the second byte is the low 8 bits of the offset.
// NOTE(review): presumably this tag encodes a copy of length 8, which would
// agree with the SUBL $0x08 below -- confirm against the format description.
7541 MOVL $0x00000001, SI
7542 LEAL 16(SI), SI
7543 MOVB R8, 1(CX)
7544 SHRL $0x08, R8
7545 SHLL $0x05, R8
7546 ORL R8, SI
7547 MOVB SI, (CX)
7548 ADDQ $0x02, CX
7549 SUBL $0x08, R12
7550
7551 // emitRepeat
// More than 56 bytes remain here (the length was > 64 before subtracting 8),
// so control jumps directly to the longer repeat forms.
7552 LEAL -4(R12), R12
7553 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
// NOTE(review): the instructions from here to the next label are
// unreachable. They follow an unconditional JMP and no label precedes them.
// They are presumably left in place by the code generator.
7554 MOVL R12, SI
7555 LEAL -4(R12), R12
7556 CMPL SI, $0x08
7557 JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
7558 CMPL SI, $0x0c
7559 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
7560 CMPL R8, $0x00000800
7561 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
7562
7563 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
7564 CMPL R12, $0x00000104
7565 JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
7566 CMPL R12, $0x00010100
7567 JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b
7568 LEAL -65536(R12), R12
7569 MOVL R12, R8
7570 MOVW $0x001d, (CX)
7571 MOVW R12, 2(CX)
7572 SARL $0x10, R8
7573 MOVB R8, 4(CX)
7574 ADDQ $0x05, CX
7575 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7576
7577 repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
7578 LEAL -256(R12), R12
7579 MOVW $0x0019, (CX)
7580 MOVW R12, 2(CX)
7581 ADDQ $0x04, CX
7582 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7583
7584 repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
7585 LEAL -4(R12), R12
7586 MOVW $0x0015, (CX)
7587 MOVB R12, 2(CX)
7588 ADDQ $0x03, CX
7589 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7590
7591 repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
7592 SHLL $0x02, R12
7593 ORL $0x01, R12
7594 MOVW R12, (CX)
7595 ADDQ $0x02, CX
7596 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7597
7598 repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b:
7599 XORQ SI, SI
7600 LEAL 1(SI)(R12*4), R12
7601 MOVB R8, 1(CX)
7602 SARL $0x08, R8
7603 SHLL $0x05, R8
7604 ORL R8, R12
7605 MOVB R12, (CX)
7606 ADDQ $0x02, CX
7607 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7608
7609 long_offset_short_match_nolit_encodeBetterBlockAsm4MB:
7610 MOVB $0xee, (CX)
7611 MOVW R8, 1(CX)
7612 LEAL -60(R12), R12
7613 ADDQ $0x03, CX
7614
7615 // emitRepeat
7616 MOVL R12, SI
7617 LEAL -4(R12), R12
7618 CMPL SI, $0x08
7619 JBE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
7620 CMPL SI, $0x0c
7621 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
7622 CMPL R8, $0x00000800
7623 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
7624
7625 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
7626 CMPL R12, $0x00000104
7627 JB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
7628 CMPL R12, $0x00010100
7629 JB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short
7630 LEAL -65536(R12), R12
7631 MOVL R12, R8
7632 MOVW $0x001d, (CX)
7633 MOVW R12, 2(CX)
7634 SARL $0x10, R8
7635 MOVB R8, 4(CX)
7636 ADDQ $0x05, CX
7637 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7638
7639 repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
7640 LEAL -256(R12), R12
7641 MOVW $0x0019, (CX)
7642 MOVW R12, 2(CX)
7643 ADDQ $0x04, CX
7644 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7645
7646 repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
7647 LEAL -4(R12), R12
7648 MOVW $0x0015, (CX)
7649 MOVB R12, 2(CX)
7650 ADDQ $0x03, CX
7651 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7652
7653 repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
7654 SHLL $0x02, R12
7655 ORL $0x01, R12
7656 MOVW R12, (CX)
7657 ADDQ $0x02, CX
7658 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7659
7660 repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short:
7661 XORQ SI, SI
7662 LEAL 1(SI)(R12*4), R12
7663 MOVB R8, 1(CX)
7664 SARL $0x08, R8
7665 SHLL $0x05, R8
7666 ORL R8, R12
7667 MOVB R12, (CX)
7668 ADDQ $0x02, CX
7669 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7670
7671 two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB:
7672 MOVL R12, SI
7673 SHLL $0x02, SI
7674 CMPL R12, $0x0c
7675 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
7676 CMPL R8, $0x00000800
7677 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB
7678 LEAL -15(SI), SI
7679 MOVB R8, 1(CX)
7680 SHRL $0x08, R8
7681 SHLL $0x05, R8
7682 ORL R8, SI
7683 MOVB SI, (CX)
7684 ADDQ $0x02, CX
7685 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7686
7687 emit_copy_three_match_nolit_encodeBetterBlockAsm4MB:
7688 LEAL -2(SI), SI
7689 MOVB SI, (CX)
7690 MOVW R8, 1(CX)
7691 ADDQ $0x03, CX
7692 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7693
7694 match_is_repeat_encodeBetterBlockAsm4MB:
7695 MOVL 12(SP), SI
7696 CMPL SI, DI
7697 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
7698 MOVL DI, R9
7699 MOVL DI, 12(SP)
7700 LEAQ (BX)(SI*1), R10
7701 SUBL SI, R9
7702 LEAL -1(R9), SI
7703 CMPL SI, $0x3c
7704 JB one_byte_match_emit_repeat_encodeBetterBlockAsm4MB
7705 CMPL SI, $0x00000100
7706 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
7707 CMPL SI, $0x00010000
7708 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB
7709 MOVL SI, R11
7710 SHRL $0x10, R11
7711 MOVB $0xf8, (CX)
7712 MOVW SI, 1(CX)
7713 MOVB R11, 3(CX)
7714 ADDQ $0x04, CX
7715 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
7716
7717 three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
7718 MOVB $0xf4, (CX)
7719 MOVW SI, 1(CX)
7720 ADDQ $0x03, CX
7721 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
7722
7723 two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB:
7724 MOVB $0xf0, (CX)
7725 MOVB SI, 1(CX)
7726 ADDQ $0x02, CX
7727 CMPL SI, $0x40
7728 JB memmove_match_emit_repeat_encodeBetterBlockAsm4MB
7729 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB
7730
7731 one_byte_match_emit_repeat_encodeBetterBlockAsm4MB:
7732 SHLB $0x02, SI
7733 MOVB SI, (CX)
7734 ADDQ $0x01, CX
7735
7736 memmove_match_emit_repeat_encodeBetterBlockAsm4MB:
7737 LEAQ (CX)(R9*1), SI
7738
7739 // genMemMoveShort
7740 CMPQ R9, $0x04
7741 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4
7742 CMPQ R9, $0x08
7743 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7
7744 CMPQ R9, $0x10
7745 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16
7746 CMPQ R9, $0x20
7747 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32
7748 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64
7749
7750 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4:
7751 MOVL (R10), R11
7752 MOVL R11, (CX)
7753 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
7754
7755 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7:
7756 MOVL (R10), R11
7757 MOVL -4(R10)(R9*1), R10
7758 MOVL R11, (CX)
7759 MOVL R10, -4(CX)(R9*1)
7760 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
7761
7762 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16:
7763 MOVQ (R10), R11
7764 MOVQ -8(R10)(R9*1), R10
7765 MOVQ R11, (CX)
7766 MOVQ R10, -8(CX)(R9*1)
7767 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
7768
7769 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32:
7770 MOVOU (R10), X0
7771 MOVOU -16(R10)(R9*1), X1
7772 MOVOU X0, (CX)
7773 MOVOU X1, -16(CX)(R9*1)
7774 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB
7775
7776 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64:
7777 MOVOU (R10), X0
7778 MOVOU 16(R10), X1
7779 MOVOU -32(R10)(R9*1), X2
7780 MOVOU -16(R10)(R9*1), X3
7781 MOVOU X0, (CX)
7782 MOVOU X1, 16(CX)
7783 MOVOU X2, -32(CX)(R9*1)
7784 MOVOU X3, -16(CX)(R9*1)
7785
7786 memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB:
7787 MOVQ SI, CX
7788 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB
7789
7790 memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB:
7791 LEAQ (CX)(R9*1), SI
7792
7793 // genMemMoveLong
7794 MOVOU (R10), X0
7795 MOVOU 16(R10), X1
7796 MOVOU -32(R10)(R9*1), X2
7797 MOVOU -16(R10)(R9*1), X3
7798 MOVQ R9, R13
7799 SHRQ $0x05, R13
7800 MOVQ CX, R11
7801 ANDL $0x0000001f, R11
7802 MOVQ $0x00000040, R14
7803 SUBQ R11, R14
7804 DECQ R13
7805 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
7806 LEAQ -32(R10)(R14*1), R11
7807 LEAQ -32(CX)(R14*1), R15
7808
7809 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back:
7810 MOVOU (R11), X4
7811 MOVOU 16(R11), X5
7812 MOVOA X4, (R15)
7813 MOVOA X5, 16(R15)
7814 ADDQ $0x20, R15
7815 ADDQ $0x20, R11
7816 ADDQ $0x20, R14
7817 DECQ R13
7818 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back
7819
7820 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
7821 MOVOU -32(R10)(R14*1), X4
7822 MOVOU -16(R10)(R14*1), X5
7823 MOVOA X4, -32(CX)(R14*1)
7824 MOVOA X5, -16(CX)(R14*1)
7825 ADDQ $0x20, R14
7826 CMPQ R9, R14
7827 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
7828 MOVOU X0, (CX)
7829 MOVOU X1, 16(CX)
7830 MOVOU X2, -32(CX)(R9*1)
7831 MOVOU X3, -16(CX)(R9*1)
7832 MOVQ SI, CX
7833
7834 emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB:
7835 ADDL R12, DX
7836 ADDL $0x04, R12
7837 MOVL DX, 12(SP)
7838
7839 // emitRepeat
7840 MOVL R12, SI
7841 LEAL -4(R12), R12
7842 CMPL SI, $0x08
7843 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB
7844 CMPL SI, $0x0c
7845 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
7846 CMPL R8, $0x00000800
7847 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB
7848
7849 cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
7850 CMPL R12, $0x00000104
7851 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB
7852 CMPL R12, $0x00010100
7853 JB repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB
7854 LEAL -65536(R12), R12
7855 MOVL R12, R8
7856 MOVW $0x001d, (CX)
7857 MOVW R12, 2(CX)
7858 SARL $0x10, R8
7859 MOVB R8, 4(CX)
7860 ADDQ $0x05, CX
7861 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7862
7863 repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB:
7864 LEAL -256(R12), R12
7865 MOVW $0x0019, (CX)
7866 MOVW R12, 2(CX)
7867 ADDQ $0x04, CX
7868 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7869
7870 repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB:
7871 LEAL -4(R12), R12
7872 MOVW $0x0015, (CX)
7873 MOVB R12, 2(CX)
7874 ADDQ $0x03, CX
7875 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7876
7877 repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB:
7878 SHLL $0x02, R12
7879 ORL $0x01, R12
7880 MOVW R12, (CX)
7881 ADDQ $0x02, CX
7882 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB
7883
7884 repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB:
7885 XORQ SI, SI
7886 LEAL 1(SI)(R12*4), R12
7887 MOVB R8, 1(CX)
7888 SARL $0x08, R8
7889 SHLL $0x05, R8
7890 ORL R8, R12
7891 MOVB R12, (CX)
7892 ADDQ $0x02, CX
7893
7894 match_nolit_emitcopy_end_encodeBetterBlockAsm4MB:
7895 CMPL DX, 8(SP)
7896 JAE emit_remainder_encodeBetterBlockAsm4MB
7897 CMPQ CX, (SP)
7898 JB match_nolit_dst_ok_encodeBetterBlockAsm4MB
7899 MOVQ $0x00000000, ret+56(FP)
7900 RET
7901
7902 match_nolit_dst_ok_encodeBetterBlockAsm4MB:
7903 MOVQ $0x00cf1bbcdcbfa563, SI
7904 MOVQ $0x9e3779b1, R8
7905 LEAQ 1(DI), DI
7906 LEAQ -2(DX), R9
7907 MOVQ (BX)(DI*1), R10
7908 MOVQ 1(BX)(DI*1), R11
7909 MOVQ (BX)(R9*1), R12
7910 MOVQ 1(BX)(R9*1), R13
7911 SHLQ $0x08, R10
7912 IMULQ SI, R10
7913 SHRQ $0x2f, R10
7914 SHLQ $0x20, R11
7915 IMULQ R8, R11
7916 SHRQ $0x32, R11
7917 SHLQ $0x08, R12
7918 IMULQ SI, R12
7919 SHRQ $0x2f, R12
7920 SHLQ $0x20, R13
7921 IMULQ R8, R13
7922 SHRQ $0x32, R13
7923 LEAQ 1(DI), R8
7924 LEAQ 1(R9), R14
7925 MOVL DI, (AX)(R10*4)
7926 MOVL R9, (AX)(R12*4)
7927 MOVL R8, 524288(AX)(R11*4)
7928 MOVL R14, 524288(AX)(R13*4)
7929 LEAQ 1(R9)(DI*1), R8
7930 SHRQ $0x01, R8
7931 ADDQ $0x01, DI
7932 SUBQ $0x01, R9
7933
7934 index_loop_encodeBetterBlockAsm4MB:
7935 CMPQ R8, R9
7936 JAE search_loop_encodeBetterBlockAsm4MB
7937 MOVQ (BX)(DI*1), R10
7938 MOVQ (BX)(R8*1), R11
7939 SHLQ $0x08, R10
7940 IMULQ SI, R10
7941 SHRQ $0x2f, R10
7942 SHLQ $0x08, R11
7943 IMULQ SI, R11
7944 SHRQ $0x2f, R11
7945 MOVL DI, (AX)(R10*4)
7946 MOVL R8, (AX)(R11*4)
7947 ADDQ $0x02, DI
7948 ADDQ $0x02, R8
7949 JMP index_loop_encodeBetterBlockAsm4MB
7950
7951 emit_remainder_encodeBetterBlockAsm4MB:
7952 MOVQ src_len+32(FP), AX
7953 SUBL 12(SP), AX
7954 LEAQ 4(CX)(AX*1), AX
7955 CMPQ AX, (SP)
7956 JB emit_remainder_ok_encodeBetterBlockAsm4MB
7957 MOVQ $0x00000000, ret+56(FP)
7958 RET
7959
7960 emit_remainder_ok_encodeBetterBlockAsm4MB:
7961 MOVQ src_len+32(FP), AX
7962 MOVL 12(SP), DX
7963 CMPL DX, AX
7964 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
7965 MOVL AX, SI
7966 MOVL AX, 12(SP)
7967 LEAQ (BX)(DX*1), AX
7968 SUBL DX, SI
7969 LEAL -1(SI), DX
7970 CMPL DX, $0x3c
7971 JB one_byte_emit_remainder_encodeBetterBlockAsm4MB
7972 CMPL DX, $0x00000100
7973 JB two_bytes_emit_remainder_encodeBetterBlockAsm4MB
7974 CMPL DX, $0x00010000
7975 JB three_bytes_emit_remainder_encodeBetterBlockAsm4MB
7976 MOVL DX, BX
7977 SHRL $0x10, BX
7978 MOVB $0xf8, (CX)
7979 MOVW DX, 1(CX)
7980 MOVB BL, 3(CX)
7981 ADDQ $0x04, CX
7982 JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
7983
7984 three_bytes_emit_remainder_encodeBetterBlockAsm4MB:
7985 MOVB $0xf4, (CX)
7986 MOVW DX, 1(CX)
7987 ADDQ $0x03, CX
7988 JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
7989
7990 two_bytes_emit_remainder_encodeBetterBlockAsm4MB:
7991 MOVB $0xf0, (CX)
7992 MOVB DL, 1(CX)
7993 ADDQ $0x02, CX
7994 CMPL DX, $0x40
7995 JB memmove_emit_remainder_encodeBetterBlockAsm4MB
7996 JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB
7997
7998 one_byte_emit_remainder_encodeBetterBlockAsm4MB:
7999 SHLB $0x02, DL
8000 MOVB DL, (CX)
8001 ADDQ $0x01, CX
8002
8003 memmove_emit_remainder_encodeBetterBlockAsm4MB:
8004 LEAQ (CX)(SI*1), DX
8005 MOVL SI, BX
8006
8007 // genMemMoveShort
8008 CMPQ BX, $0x03
8009 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2
8010 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3
8011 CMPQ BX, $0x08
8012 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7
8013 CMPQ BX, $0x10
8014 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16
8015 CMPQ BX, $0x20
8016 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32
8017 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64
8018
8019 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2:
8020 MOVB (AX), SI
8021 MOVB -1(AX)(BX*1), AL
8022 MOVB SI, (CX)
8023 MOVB AL, -1(CX)(BX*1)
8024 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
8025
8026 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3:
8027 MOVW (AX), SI
8028 MOVB 2(AX), AL
8029 MOVW SI, (CX)
8030 MOVB AL, 2(CX)
8031 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
8032
8033 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7:
8034 MOVL (AX), SI
8035 MOVL -4(AX)(BX*1), AX
8036 MOVL SI, (CX)
8037 MOVL AX, -4(CX)(BX*1)
8038 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
8039
8040 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16:
8041 MOVQ (AX), SI
8042 MOVQ -8(AX)(BX*1), AX
8043 MOVQ SI, (CX)
8044 MOVQ AX, -8(CX)(BX*1)
8045 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
8046
8047 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32:
8048 MOVOU (AX), X0
8049 MOVOU -16(AX)(BX*1), X1
8050 MOVOU X0, (CX)
8051 MOVOU X1, -16(CX)(BX*1)
8052 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB
8053
8054 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64:
8055 MOVOU (AX), X0
8056 MOVOU 16(AX), X1
8057 MOVOU -32(AX)(BX*1), X2
8058 MOVOU -16(AX)(BX*1), X3
8059 MOVOU X0, (CX)
8060 MOVOU X1, 16(CX)
8061 MOVOU X2, -32(CX)(BX*1)
8062 MOVOU X3, -16(CX)(BX*1)
8063
8064 memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB:
8065 MOVQ DX, CX
8066 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB
8067
8068 memmove_long_emit_remainder_encodeBetterBlockAsm4MB:
8069 LEAQ (CX)(SI*1), DX
8070 MOVL SI, BX
8071
8072 // genMemMoveLong
8073 MOVOU (AX), X0
8074 MOVOU 16(AX), X1
8075 MOVOU -32(AX)(BX*1), X2
8076 MOVOU -16(AX)(BX*1), X3
8077 MOVQ BX, DI
8078 SHRQ $0x05, DI
8079 MOVQ CX, SI
8080 ANDL $0x0000001f, SI
8081 MOVQ $0x00000040, R8
8082 SUBQ SI, R8
8083 DECQ DI
8084 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
8085 LEAQ -32(AX)(R8*1), SI
8086 LEAQ -32(CX)(R8*1), R9
8087
8088 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back:
8089 MOVOU (SI), X4
8090 MOVOU 16(SI), X5
8091 MOVOA X4, (R9)
8092 MOVOA X5, 16(R9)
8093 ADDQ $0x20, R9
8094 ADDQ $0x20, SI
8095 ADDQ $0x20, R8
8096 DECQ DI
8097 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back
8098
8099 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32:
8100 MOVOU -32(AX)(R8*1), X4
8101 MOVOU -16(AX)(R8*1), X5
8102 MOVOA X4, -32(CX)(R8*1)
8103 MOVOA X5, -16(CX)(R8*1)
8104 ADDQ $0x20, R8
8105 CMPQ BX, R8
8106 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32
8107 MOVOU X0, (CX)
8108 MOVOU X1, 16(CX)
8109 MOVOU X2, -32(CX)(BX*1)
8110 MOVOU X3, -16(CX)(BX*1)
8111 MOVQ DX, CX
8112
8113 emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB:
8114 MOVQ dst_base+0(FP), AX
8115 SUBQ AX, CX
8116 MOVQ CX, ret+56(FP)
8117 RET
8118
8119 // func encodeBetterBlockAsm12B(dst []byte, src []byte, tmp *[81920]byte) int
8120 // Requires: BMI, SSE2
8121 TEXT ·encodeBetterBlockAsm12B(SB), $24-64
8122 MOVQ tmp+48(FP), AX
8123 MOVQ dst_base+0(FP), CX
8124 MOVQ $0x00000280, DX
8125 MOVQ AX, BX
8126 PXOR X0, X0
8127
8128 zero_loop_encodeBetterBlockAsm12B:
8129 MOVOU X0, (BX)
8130 MOVOU X0, 16(BX)
8131 MOVOU X0, 32(BX)
8132 MOVOU X0, 48(BX)
8133 MOVOU X0, 64(BX)
8134 MOVOU X0, 80(BX)
8135 MOVOU X0, 96(BX)
8136 MOVOU X0, 112(BX)
8137 ADDQ $0x80, BX
8138 DECQ DX
8139 JNZ zero_loop_encodeBetterBlockAsm12B
8140 MOVL $0x00000000, 12(SP)
8141 MOVQ src_len+32(FP), DX
8142 LEAQ -6(DX), BX
8143 LEAQ -8(DX), SI
8144 MOVL SI, 8(SP)
8145 SHRQ $0x05, DX
8146 SUBL DX, BX
8147 LEAQ (CX)(BX*1), BX
8148 MOVQ BX, (SP)
8149 MOVL $0x00000001, DX
8150 MOVL $0x00000000, 16(SP)
8151 MOVQ src_base+24(FP), BX
8152
8153 search_loop_encodeBetterBlockAsm12B:
8154 MOVL DX, SI
8155 SUBL 12(SP), SI
8156 SHRL $0x06, SI
8157 LEAL 1(DX)(SI*1), SI
8158 CMPL SI, 8(SP)
8159 JAE emit_remainder_encodeBetterBlockAsm12B
8160 MOVQ (BX)(DX*1), DI
8161 MOVL SI, 20(SP)
8162 MOVQ $0x0000cf1bbcdcbf9b, R9
8163 MOVQ $0x9e3779b1, SI
8164 MOVQ DI, R10
8165 MOVQ DI, R11
8166 SHLQ $0x10, R10
8167 IMULQ R9, R10
8168 SHRQ $0x32, R10
8169 SHLQ $0x20, R11
8170 IMULQ SI, R11
8171 SHRQ $0x34, R11
8172 MOVL (AX)(R10*4), SI
8173 MOVL 65536(AX)(R11*4), R8
8174 MOVL DX, (AX)(R10*4)
8175 MOVL DX, 65536(AX)(R11*4)
8176 MOVQ (BX)(SI*1), R10
8177 MOVQ (BX)(R8*1), R11
8178 CMPQ R10, DI
8179 JEQ candidate_match_encodeBetterBlockAsm12B
8180 CMPQ R11, DI
8181 JNE no_short_found_encodeBetterBlockAsm12B
8182 MOVL R8, SI
8183 JMP candidate_match_encodeBetterBlockAsm12B
8184
8185 no_short_found_encodeBetterBlockAsm12B:
8186 CMPL R10, DI
8187 JEQ candidate_match_encodeBetterBlockAsm12B
8188 CMPL R11, DI
8189 JEQ candidateS_match_encodeBetterBlockAsm12B
8190 MOVL 20(SP), DX
8191 JMP search_loop_encodeBetterBlockAsm12B
8192
8193 candidateS_match_encodeBetterBlockAsm12B:
8194 SHRQ $0x08, DI
8195 MOVQ DI, R10
8196 SHLQ $0x10, R10
8197 IMULQ R9, R10
8198 SHRQ $0x32, R10
8199 MOVL (AX)(R10*4), SI
8200 INCL DX
8201 MOVL DX, (AX)(R10*4)
8202 CMPL (BX)(SI*1), DI
8203 JEQ candidate_match_encodeBetterBlockAsm12B
8204 DECL DX
8205 MOVL R8, SI
8206
8207 candidate_match_encodeBetterBlockAsm12B:
8208 MOVL 12(SP), DI
8209 TESTL SI, SI
8210 JZ match_extend_back_end_encodeBetterBlockAsm12B
8211
8212 match_extend_back_loop_encodeBetterBlockAsm12B:
8213 CMPL DX, DI
8214 JBE match_extend_back_end_encodeBetterBlockAsm12B
8215 MOVB -1(BX)(SI*1), R8
8216 MOVB -1(BX)(DX*1), R9
8217 CMPB R8, R9
8218 JNE match_extend_back_end_encodeBetterBlockAsm12B
8219 LEAL -1(DX), DX
8220 DECL SI
8221 JZ match_extend_back_end_encodeBetterBlockAsm12B
8222 JMP match_extend_back_loop_encodeBetterBlockAsm12B
8223
8224 match_extend_back_end_encodeBetterBlockAsm12B:
8225 MOVL DX, DI
8226 SUBL 12(SP), DI
8227 LEAQ 3(CX)(DI*1), DI
8228 CMPQ DI, (SP)
8229 JB match_dst_size_check_encodeBetterBlockAsm12B
8230 MOVQ $0x00000000, ret+56(FP)
8231 RET
8232
8233 match_dst_size_check_encodeBetterBlockAsm12B:
8234 MOVL DX, DI
8235 ADDL $0x04, DX
8236 ADDL $0x04, SI
8237 MOVQ src_len+32(FP), R8
8238 SUBL DX, R8
8239 LEAQ (BX)(DX*1), R9
8240 LEAQ (BX)(SI*1), R10
8241
8242 // matchLen
8243 XORL R12, R12
8244
8245 matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B:
8246 CMPL R8, $0x10
8247 JB matchlen_match8_match_nolit_encodeBetterBlockAsm12B
8248 MOVQ (R9)(R12*1), R11
8249 MOVQ 8(R9)(R12*1), R13
8250 XORQ (R10)(R12*1), R11
8251 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B
8252 XORQ 8(R10)(R12*1), R13
8253 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B
8254 LEAL -16(R8), R8
8255 LEAL 16(R12), R12
8256 JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B
8257
8258 matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B:
8259 #ifdef GOAMD64_v3
8260 TZCNTQ R13, R13
8261
8262 #else
8263 BSFQ R13, R13
8264
8265 #endif
8266 SARQ $0x03, R13
8267 LEAL 8(R12)(R13*1), R12
8268 JMP match_nolit_end_encodeBetterBlockAsm12B
8269
8270 matchlen_match8_match_nolit_encodeBetterBlockAsm12B:
8271 CMPL R8, $0x08
8272 JB matchlen_match4_match_nolit_encodeBetterBlockAsm12B
8273 MOVQ (R9)(R12*1), R11
8274 XORQ (R10)(R12*1), R11
8275 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B
8276 LEAL -8(R8), R8
8277 LEAL 8(R12), R12
8278 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm12B
8279
8280 matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B:
8281 #ifdef GOAMD64_v3
8282 TZCNTQ R11, R11
8283
8284 #else
8285 BSFQ R11, R11
8286
8287 #endif
8288 SARQ $0x03, R11
8289 LEAL (R12)(R11*1), R12
8290 JMP match_nolit_end_encodeBetterBlockAsm12B
8291
8292 matchlen_match4_match_nolit_encodeBetterBlockAsm12B:
8293 CMPL R8, $0x04
8294 JB matchlen_match2_match_nolit_encodeBetterBlockAsm12B
8295 MOVL (R9)(R12*1), R11
8296 CMPL (R10)(R12*1), R11
8297 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B
8298 LEAL -4(R8), R8
8299 LEAL 4(R12), R12
8300
8301 matchlen_match2_match_nolit_encodeBetterBlockAsm12B:
8302 CMPL R8, $0x01
8303 JE matchlen_match1_match_nolit_encodeBetterBlockAsm12B
8304 JB match_nolit_end_encodeBetterBlockAsm12B
8305 MOVW (R9)(R12*1), R11
8306 CMPW (R10)(R12*1), R11
8307 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B
8308 LEAL 2(R12), R12
8309 SUBL $0x02, R8
8310 JZ match_nolit_end_encodeBetterBlockAsm12B
8311
8312 matchlen_match1_match_nolit_encodeBetterBlockAsm12B:
8313 MOVB (R9)(R12*1), R11
8314 CMPB (R10)(R12*1), R11
8315 JNE match_nolit_end_encodeBetterBlockAsm12B
8316 LEAL 1(R12), R12
8317
8318 match_nolit_end_encodeBetterBlockAsm12B:
8319 MOVL DX, R8
8320 SUBL SI, R8
8321
8322 // Check if repeat
8323 CMPL 16(SP), R8
8324 JEQ match_is_repeat_encodeBetterBlockAsm12B
8325 MOVL R8, 16(SP)
8326 MOVL 12(SP), SI
8327 CMPL SI, DI
8328 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B
8329 MOVL DI, R9
8330 MOVL DI, 12(SP)
8331 LEAQ (BX)(SI*1), R10
8332 SUBL SI, R9
8333 LEAL -1(R9), SI
8334 CMPL SI, $0x3c
8335 JB one_byte_match_emit_encodeBetterBlockAsm12B
8336 CMPL SI, $0x00000100
8337 JB two_bytes_match_emit_encodeBetterBlockAsm12B
8338 JB three_bytes_match_emit_encodeBetterBlockAsm12B
8339
8340 three_bytes_match_emit_encodeBetterBlockAsm12B:
8341 MOVB $0xf4, (CX)
8342 MOVW SI, 1(CX)
8343 ADDQ $0x03, CX
8344 JMP memmove_long_match_emit_encodeBetterBlockAsm12B
8345
8346 two_bytes_match_emit_encodeBetterBlockAsm12B:
8347 MOVB $0xf0, (CX)
8348 MOVB SI, 1(CX)
8349 ADDQ $0x02, CX
8350 CMPL SI, $0x40
8351 JB memmove_match_emit_encodeBetterBlockAsm12B
8352 JMP memmove_long_match_emit_encodeBetterBlockAsm12B
8353
8354 one_byte_match_emit_encodeBetterBlockAsm12B:
8355 SHLB $0x02, SI
8356 MOVB SI, (CX)
8357 ADDQ $0x01, CX
8358
8359 memmove_match_emit_encodeBetterBlockAsm12B:
8360 LEAQ (CX)(R9*1), SI
8361
8362 // genMemMoveShort
8363 CMPQ R9, $0x04
8364 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4
8365 CMPQ R9, $0x08
8366 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7
8367 CMPQ R9, $0x10
8368 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16
8369 CMPQ R9, $0x20
8370 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32
8371 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64
8372
8373 emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4:
8374 MOVL (R10), R11
8375 MOVL R11, (CX)
8376 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
8377
8378 emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7:
8379 MOVL (R10), R11
8380 MOVL -4(R10)(R9*1), R10
8381 MOVL R11, (CX)
8382 MOVL R10, -4(CX)(R9*1)
8383 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
8384
8385 emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16:
8386 MOVQ (R10), R11
8387 MOVQ -8(R10)(R9*1), R10
8388 MOVQ R11, (CX)
8389 MOVQ R10, -8(CX)(R9*1)
8390 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
8391
8392 emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32:
8393 MOVOU (R10), X0
8394 MOVOU -16(R10)(R9*1), X1
8395 MOVOU X0, (CX)
8396 MOVOU X1, -16(CX)(R9*1)
8397 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B
8398
8399 emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64:
8400 MOVOU (R10), X0
8401 MOVOU 16(R10), X1
8402 MOVOU -32(R10)(R9*1), X2
8403 MOVOU -16(R10)(R9*1), X3
8404 MOVOU X0, (CX)
8405 MOVOU X1, 16(CX)
8406 MOVOU X2, -32(CX)(R9*1)
8407 MOVOU X3, -16(CX)(R9*1)
8408
8409 memmove_end_copy_match_emit_encodeBetterBlockAsm12B:
8410 MOVQ SI, CX
8411 JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B
8412
8413 memmove_long_match_emit_encodeBetterBlockAsm12B:
8414 LEAQ (CX)(R9*1), SI
8415
8416 // genMemMoveLong
8417 MOVOU (R10), X0
8418 MOVOU 16(R10), X1
8419 MOVOU -32(R10)(R9*1), X2
8420 MOVOU -16(R10)(R9*1), X3
8421 MOVQ R9, R13
8422 SHRQ $0x05, R13
8423 MOVQ CX, R11
8424 ANDL $0x0000001f, R11
8425 MOVQ $0x00000040, R14
8426 SUBQ R11, R14
8427 DECQ R13
8428 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
8429 LEAQ -32(R10)(R14*1), R11
8430 LEAQ -32(CX)(R14*1), R15
8431
8432 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back:
8433 MOVOU (R11), X4
8434 MOVOU 16(R11), X5
8435 MOVOA X4, (R15)
8436 MOVOA X5, 16(R15)
8437 ADDQ $0x20, R15
8438 ADDQ $0x20, R11
8439 ADDQ $0x20, R14
8440 DECQ R13
8441 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back
8442
8443 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
8444 MOVOU -32(R10)(R14*1), X4
8445 MOVOU -16(R10)(R14*1), X5
8446 MOVOA X4, -32(CX)(R14*1)
8447 MOVOA X5, -16(CX)(R14*1)
8448 ADDQ $0x20, R14
8449 CMPQ R9, R14
8450 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
8451 MOVOU X0, (CX)
8452 MOVOU X1, 16(CX)
8453 MOVOU X2, -32(CX)(R9*1)
8454 MOVOU X3, -16(CX)(R9*1)
8455 MOVQ SI, CX
8456
8457 emit_literal_done_match_emit_encodeBetterBlockAsm12B:
8458 ADDL R12, DX
8459 ADDL $0x04, R12
8460 MOVL DX, 12(SP)
8461
8462 // emitCopy
8463 CMPL R12, $0x40
8464 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B
8465 CMPL R8, $0x00000800
8466 JAE long_offset_short_match_nolit_encodeBetterBlockAsm12B
8467 MOVL $0x00000001, SI
8468 LEAL 16(SI), SI
8469 MOVB R8, 1(CX)
8470 SHRL $0x08, R8
8471 SHLL $0x05, R8
8472 ORL R8, SI
8473 MOVB SI, (CX)
8474 ADDQ $0x02, CX
8475 SUBL $0x08, R12
8476
8477 // emitRepeat
8478 LEAL -4(R12), R12
8479 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
8480 MOVL R12, SI
8481 LEAL -4(R12), R12
8482 CMPL SI, $0x08
8483 JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
8484 CMPL SI, $0x0c
8485 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
8486 CMPL R8, $0x00000800
8487 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
8488
8489 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
8490 CMPL R12, $0x00000104
8491 JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b
8492 LEAL -256(R12), R12
8493 MOVW $0x0019, (CX)
8494 MOVW R12, 2(CX)
8495 ADDQ $0x04, CX
8496 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8497
8498 repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
8499 LEAL -4(R12), R12
8500 MOVW $0x0015, (CX)
8501 MOVB R12, 2(CX)
8502 ADDQ $0x03, CX
8503 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8504
8505 repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
8506 SHLL $0x02, R12
8507 ORL $0x01, R12
8508 MOVW R12, (CX)
8509 ADDQ $0x02, CX
8510 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8511
8512 repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b:
8513 XORQ SI, SI
8514 LEAL 1(SI)(R12*4), R12
8515 MOVB R8, 1(CX)
8516 SARL $0x08, R8
8517 SHLL $0x05, R8
8518 ORL R8, R12
8519 MOVB R12, (CX)
8520 ADDQ $0x02, CX
8521 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8522
8523 long_offset_short_match_nolit_encodeBetterBlockAsm12B:
8524 MOVB $0xee, (CX)
8525 MOVW R8, 1(CX)
8526 LEAL -60(R12), R12
8527 ADDQ $0x03, CX
8528
8529 // emitRepeat
8530 MOVL R12, SI
8531 LEAL -4(R12), R12
8532 CMPL SI, $0x08
8533 JBE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
8534 CMPL SI, $0x0c
8535 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
8536 CMPL R8, $0x00000800
8537 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
8538
8539 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
8540 CMPL R12, $0x00000104
8541 JB repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short
8542 LEAL -256(R12), R12
8543 MOVW $0x0019, (CX)
8544 MOVW R12, 2(CX)
8545 ADDQ $0x04, CX
8546 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8547
8548 repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
8549 LEAL -4(R12), R12
8550 MOVW $0x0015, (CX)
8551 MOVB R12, 2(CX)
8552 ADDQ $0x03, CX
8553 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8554
8555 repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
8556 SHLL $0x02, R12
8557 ORL $0x01, R12
8558 MOVW R12, (CX)
8559 ADDQ $0x02, CX
8560 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8561
8562 repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short:
8563 XORQ SI, SI
8564 LEAL 1(SI)(R12*4), R12
8565 MOVB R8, 1(CX)
8566 SARL $0x08, R8
8567 SHLL $0x05, R8
8568 ORL R8, R12
8569 MOVB R12, (CX)
8570 ADDQ $0x02, CX
8571 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8572
8573 two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B:
8574 MOVL R12, SI
8575 SHLL $0x02, SI
8576 CMPL R12, $0x0c
8577 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
8578 CMPL R8, $0x00000800
8579 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm12B
8580 LEAL -15(SI), SI
8581 MOVB R8, 1(CX)
8582 SHRL $0x08, R8
8583 SHLL $0x05, R8
8584 ORL R8, SI
8585 MOVB SI, (CX)
8586 ADDQ $0x02, CX
8587 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8588
8589 emit_copy_three_match_nolit_encodeBetterBlockAsm12B:
8590 LEAL -2(SI), SI
8591 MOVB SI, (CX)
8592 MOVW R8, 1(CX)
8593 ADDQ $0x03, CX
8594 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8595
8596 match_is_repeat_encodeBetterBlockAsm12B:
8597 MOVL 12(SP), SI
8598 CMPL SI, DI
8599 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
8600 MOVL DI, R9
8601 MOVL DI, 12(SP)
8602 LEAQ (BX)(SI*1), R10
8603 SUBL SI, R9
8604 LEAL -1(R9), SI
8605 CMPL SI, $0x3c
8606 JB one_byte_match_emit_repeat_encodeBetterBlockAsm12B
8607 CMPL SI, $0x00000100
8608 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm12B
8609 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm12B
8610
8611 three_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
8612 MOVB $0xf4, (CX)
8613 MOVW SI, 1(CX)
8614 ADDQ $0x03, CX
8615 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
8616
8617 two_bytes_match_emit_repeat_encodeBetterBlockAsm12B:
8618 MOVB $0xf0, (CX)
8619 MOVB SI, 1(CX)
8620 ADDQ $0x02, CX
8621 CMPL SI, $0x40
8622 JB memmove_match_emit_repeat_encodeBetterBlockAsm12B
8623 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B
8624
8625 one_byte_match_emit_repeat_encodeBetterBlockAsm12B:
8626 SHLB $0x02, SI
8627 MOVB SI, (CX)
8628 ADDQ $0x01, CX
8629
8630 memmove_match_emit_repeat_encodeBetterBlockAsm12B:
8631 LEAQ (CX)(R9*1), SI
8632
8633 // genMemMoveShort
8634 CMPQ R9, $0x04
8635 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4
8636 CMPQ R9, $0x08
8637 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7
8638 CMPQ R9, $0x10
8639 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16
8640 CMPQ R9, $0x20
8641 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32
8642 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64
8643
8644 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4:
8645 MOVL (R10), R11
8646 MOVL R11, (CX)
8647 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
8648
8649 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7:
8650 MOVL (R10), R11
8651 MOVL -4(R10)(R9*1), R10
8652 MOVL R11, (CX)
8653 MOVL R10, -4(CX)(R9*1)
8654 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
8655
8656 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16:
8657 MOVQ (R10), R11
8658 MOVQ -8(R10)(R9*1), R10
8659 MOVQ R11, (CX)
8660 MOVQ R10, -8(CX)(R9*1)
8661 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
8662
8663 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32:
8664 MOVOU (R10), X0
8665 MOVOU -16(R10)(R9*1), X1
8666 MOVOU X0, (CX)
8667 MOVOU X1, -16(CX)(R9*1)
8668 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B
8669
8670 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64:
8671 MOVOU (R10), X0
8672 MOVOU 16(R10), X1
8673 MOVOU -32(R10)(R9*1), X2
8674 MOVOU -16(R10)(R9*1), X3
8675 MOVOU X0, (CX)
8676 MOVOU X1, 16(CX)
8677 MOVOU X2, -32(CX)(R9*1)
8678 MOVOU X3, -16(CX)(R9*1)
8679
8680 memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B:
8681 MOVQ SI, CX
8682 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B
8683
8684 memmove_long_match_emit_repeat_encodeBetterBlockAsm12B:
8685 LEAQ (CX)(R9*1), SI
8686
8687 // genMemMoveLong
8688 MOVOU (R10), X0
8689 MOVOU 16(R10), X1
8690 MOVOU -32(R10)(R9*1), X2
8691 MOVOU -16(R10)(R9*1), X3
8692 MOVQ R9, R13
8693 SHRQ $0x05, R13
8694 MOVQ CX, R11
8695 ANDL $0x0000001f, R11
8696 MOVQ $0x00000040, R14
8697 SUBQ R11, R14
8698 DECQ R13
8699 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
8700 LEAQ -32(R10)(R14*1), R11
8701 LEAQ -32(CX)(R14*1), R15
8702
8703 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back:
8704 MOVOU (R11), X4
8705 MOVOU 16(R11), X5
8706 MOVOA X4, (R15)
8707 MOVOA X5, 16(R15)
8708 ADDQ $0x20, R15
8709 ADDQ $0x20, R11
8710 ADDQ $0x20, R14
8711 DECQ R13
8712 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back
8713
8714 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
8715 MOVOU -32(R10)(R14*1), X4
8716 MOVOU -16(R10)(R14*1), X5
8717 MOVOA X4, -32(CX)(R14*1)
8718 MOVOA X5, -16(CX)(R14*1)
8719 ADDQ $0x20, R14
8720 CMPQ R9, R14
8721 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
8722 MOVOU X0, (CX)
8723 MOVOU X1, 16(CX)
8724 MOVOU X2, -32(CX)(R9*1)
8725 MOVOU X3, -16(CX)(R9*1)
8726 MOVQ SI, CX
8727
8728 emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B:
8729 ADDL R12, DX
8730 ADDL $0x04, R12
8731 MOVL DX, 12(SP)
8732
8733 // emitRepeat
8734 MOVL R12, SI
8735 LEAL -4(R12), R12
8736 CMPL SI, $0x08
8737 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B
8738 CMPL SI, $0x0c
8739 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
8740 CMPL R8, $0x00000800
8741 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B
8742
8743 cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
8744 CMPL R12, $0x00000104
8745 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B
8746 LEAL -256(R12), R12
8747 MOVW $0x0019, (CX)
8748 MOVW R12, 2(CX)
8749 ADDQ $0x04, CX
8750 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8751
8752 repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B:
8753 LEAL -4(R12), R12
8754 MOVW $0x0015, (CX)
8755 MOVB R12, 2(CX)
8756 ADDQ $0x03, CX
8757 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8758
8759 repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B:
8760 SHLL $0x02, R12
8761 ORL $0x01, R12
8762 MOVW R12, (CX)
8763 ADDQ $0x02, CX
8764 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B
8765
8766 repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B:
8767 XORQ SI, SI
8768 LEAL 1(SI)(R12*4), R12
8769 MOVB R8, 1(CX)
8770 SARL $0x08, R8
8771 SHLL $0x05, R8
8772 ORL R8, R12
8773 MOVB R12, (CX)
8774 ADDQ $0x02, CX
8775
8776 match_nolit_emitcopy_end_encodeBetterBlockAsm12B:
8777 CMPL DX, 8(SP)
8778 JAE emit_remainder_encodeBetterBlockAsm12B
8779 CMPQ CX, (SP)
8780 JB match_nolit_dst_ok_encodeBetterBlockAsm12B
8781 MOVQ $0x00000000, ret+56(FP)
8782 RET
8783
8784 match_nolit_dst_ok_encodeBetterBlockAsm12B:
8785 MOVQ $0x0000cf1bbcdcbf9b, SI
8786 MOVQ $0x9e3779b1, R8
8787 LEAQ 1(DI), DI
8788 LEAQ -2(DX), R9
8789 MOVQ (BX)(DI*1), R10
8790 MOVQ 1(BX)(DI*1), R11
8791 MOVQ (BX)(R9*1), R12
8792 MOVQ 1(BX)(R9*1), R13
8793 SHLQ $0x10, R10
8794 IMULQ SI, R10
8795 SHRQ $0x32, R10
8796 SHLQ $0x20, R11
8797 IMULQ R8, R11
8798 SHRQ $0x34, R11
8799 SHLQ $0x10, R12
8800 IMULQ SI, R12
8801 SHRQ $0x32, R12
8802 SHLQ $0x20, R13
8803 IMULQ R8, R13
8804 SHRQ $0x34, R13
8805 LEAQ 1(DI), R8
8806 LEAQ 1(R9), R14
8807 MOVL DI, (AX)(R10*4)
8808 MOVL R9, (AX)(R12*4)
8809 MOVL R8, 65536(AX)(R11*4)
8810 MOVL R14, 65536(AX)(R13*4)
8811 LEAQ 1(R9)(DI*1), R8
8812 SHRQ $0x01, R8
8813 ADDQ $0x01, DI
8814 SUBQ $0x01, R9
8815
8816 index_loop_encodeBetterBlockAsm12B:
8817 CMPQ R8, R9
8818 JAE search_loop_encodeBetterBlockAsm12B
8819 MOVQ (BX)(DI*1), R10
8820 MOVQ (BX)(R8*1), R11
8821 SHLQ $0x10, R10
8822 IMULQ SI, R10
8823 SHRQ $0x32, R10
8824 SHLQ $0x10, R11
8825 IMULQ SI, R11
8826 SHRQ $0x32, R11
8827 MOVL DI, (AX)(R10*4)
8828 MOVL R8, (AX)(R11*4)
8829 ADDQ $0x02, DI
8830 ADDQ $0x02, R8
8831 JMP index_loop_encodeBetterBlockAsm12B
8832
8833 emit_remainder_encodeBetterBlockAsm12B:
8834 MOVQ src_len+32(FP), AX
8835 SUBL 12(SP), AX
8836 LEAQ 3(CX)(AX*1), AX
8837 CMPQ AX, (SP)
8838 JB emit_remainder_ok_encodeBetterBlockAsm12B
8839 MOVQ $0x00000000, ret+56(FP)
8840 RET
8841
8842 emit_remainder_ok_encodeBetterBlockAsm12B:
8843 MOVQ src_len+32(FP), AX
8844 MOVL 12(SP), DX
8845 CMPL DX, AX
8846 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
8847 MOVL AX, SI
8848 MOVL AX, 12(SP)
8849 LEAQ (BX)(DX*1), AX
8850 SUBL DX, SI
8851 LEAL -1(SI), DX
8852 CMPL DX, $0x3c
8853 JB one_byte_emit_remainder_encodeBetterBlockAsm12B
8854 CMPL DX, $0x00000100
8855 JB two_bytes_emit_remainder_encodeBetterBlockAsm12B
8856 JB three_bytes_emit_remainder_encodeBetterBlockAsm12B
8857
8858 three_bytes_emit_remainder_encodeBetterBlockAsm12B:
8859 MOVB $0xf4, (CX)
8860 MOVW DX, 1(CX)
8861 ADDQ $0x03, CX
8862 JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B
8863
8864 two_bytes_emit_remainder_encodeBetterBlockAsm12B:
8865 MOVB $0xf0, (CX)
8866 MOVB DL, 1(CX)
8867 ADDQ $0x02, CX
8868 CMPL DX, $0x40
8869 JB memmove_emit_remainder_encodeBetterBlockAsm12B
8870 JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B
8871
8872 one_byte_emit_remainder_encodeBetterBlockAsm12B:
8873 SHLB $0x02, DL
8874 MOVB DL, (CX)
8875 ADDQ $0x01, CX
8876
8877 memmove_emit_remainder_encodeBetterBlockAsm12B:
8878 LEAQ (CX)(SI*1), DX
8879 MOVL SI, BX
8880
8881 // genMemMoveShort
8882 CMPQ BX, $0x03
8883 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2
8884 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3
8885 CMPQ BX, $0x08
8886 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7
8887 CMPQ BX, $0x10
8888 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16
8889 CMPQ BX, $0x20
8890 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32
8891 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64
8892
8893 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2:
8894 MOVB (AX), SI
8895 MOVB -1(AX)(BX*1), AL
8896 MOVB SI, (CX)
8897 MOVB AL, -1(CX)(BX*1)
8898 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
8899
8900 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3:
8901 MOVW (AX), SI
8902 MOVB 2(AX), AL
8903 MOVW SI, (CX)
8904 MOVB AL, 2(CX)
8905 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
8906
8907 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7:
8908 MOVL (AX), SI
8909 MOVL -4(AX)(BX*1), AX
8910 MOVL SI, (CX)
8911 MOVL AX, -4(CX)(BX*1)
8912 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
8913
8914 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16:
8915 MOVQ (AX), SI
8916 MOVQ -8(AX)(BX*1), AX
8917 MOVQ SI, (CX)
8918 MOVQ AX, -8(CX)(BX*1)
8919 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
8920
8921 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32:
8922 MOVOU (AX), X0
8923 MOVOU -16(AX)(BX*1), X1
8924 MOVOU X0, (CX)
8925 MOVOU X1, -16(CX)(BX*1)
8926 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B
8927
8928 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64:
8929 MOVOU (AX), X0
8930 MOVOU 16(AX), X1
8931 MOVOU -32(AX)(BX*1), X2
8932 MOVOU -16(AX)(BX*1), X3
8933 MOVOU X0, (CX)
8934 MOVOU X1, 16(CX)
8935 MOVOU X2, -32(CX)(BX*1)
8936 MOVOU X3, -16(CX)(BX*1)
8937
8938 memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B:
8939 MOVQ DX, CX
8940 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B
8941
8942 memmove_long_emit_remainder_encodeBetterBlockAsm12B:
8943 LEAQ (CX)(SI*1), DX
8944 MOVL SI, BX
8945
8946 // genMemMoveLong
8947 MOVOU (AX), X0
8948 MOVOU 16(AX), X1
8949 MOVOU -32(AX)(BX*1), X2
8950 MOVOU -16(AX)(BX*1), X3
8951 MOVQ BX, DI
8952 SHRQ $0x05, DI
8953 MOVQ CX, SI
8954 ANDL $0x0000001f, SI
8955 MOVQ $0x00000040, R8
8956 SUBQ SI, R8
8957 DECQ DI
8958 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
8959 LEAQ -32(AX)(R8*1), SI
8960 LEAQ -32(CX)(R8*1), R9
8961
8962 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back:
8963 MOVOU (SI), X4
8964 MOVOU 16(SI), X5
8965 MOVOA X4, (R9)
8966 MOVOA X5, 16(R9)
8967 ADDQ $0x20, R9
8968 ADDQ $0x20, SI
8969 ADDQ $0x20, R8
8970 DECQ DI
8971 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back
8972
8973 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
8974 MOVOU -32(AX)(R8*1), X4
8975 MOVOU -16(AX)(R8*1), X5
8976 MOVOA X4, -32(CX)(R8*1)
8977 MOVOA X5, -16(CX)(R8*1)
8978 ADDQ $0x20, R8
8979 CMPQ BX, R8
8980 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
8981 MOVOU X0, (CX)
8982 MOVOU X1, 16(CX)
8983 MOVOU X2, -32(CX)(BX*1)
8984 MOVOU X3, -16(CX)(BX*1)
8985 MOVQ DX, CX
8986
8987 emit_literal_done_emit_remainder_encodeBetterBlockAsm12B:
8988 MOVQ dst_base+0(FP), AX
8989 SUBQ AX, CX
8990 MOVQ CX, ret+56(FP)
8991 RET
8992
8993 // func encodeBetterBlockAsm10B(dst []byte, src []byte, tmp *[20480]byte) int
8994 // Requires: BMI, SSE2
//
// Encodes src into dst, using tmp as scratch space for two hash tables, and
// returns the number of bytes written to dst. Returns 0 as soon as the output
// cursor would reach the limit computed in the prologue.
//
// Scratch layout (tmp, 20480 bytes, zeroed on entry):
//   tmp+0     : 4096 x uint32, indexed by a 12-bit hash of the low 6 bytes at a position
//   tmp+16384 : 1024 x uint32, indexed by a 10-bit hash of the low 4 bytes at a position
// Entries hold src positions.
//
// Long-lived registers:
//   AX = tmp base            BX = src base
//   CX = dst write cursor    DX = current src position
//
// Stack locals (frame is 24 bytes):
//   0(SP)  (8 bytes) dst limit pointer
//   8(SP)  (4 bytes) src_len-8, the search-loop bound
//   12(SP) (4 bytes) src position of the first byte not yet emitted
//   16(SP) (4 bytes) offset of the previous match (0 initially)
//   20(SP) (4 bytes) src position to try next if the current one yields no candidate
//
// NOTE(review): the tag bytes written below presumably follow the S2/Snappy
// block format; the comments describe only the values the code stores.
8995 TEXT ·encodeBetterBlockAsm10B(SB), $24-64
8996 MOVQ tmp+48(FP), AX
8997 MOVQ dst_base+0(FP), CX
8998 MOVQ $0x000000a0, DX  // 160 iterations x 128 bytes = 20480 bytes to clear
8999 MOVQ AX, BX
9000 PXOR X0, X0
9001
9002 zero_loop_encodeBetterBlockAsm10B:  // zero the whole tmp area, 128 bytes per pass
9003 MOVOU X0, (BX)
9004 MOVOU X0, 16(BX)
9005 MOVOU X0, 32(BX)
9006 MOVOU X0, 48(BX)
9007 MOVOU X0, 64(BX)
9008 MOVOU X0, 80(BX)
9009 MOVOU X0, 96(BX)
9010 MOVOU X0, 112(BX)
9011 ADDQ $0x80, BX
9012 DECQ DX
9013 JNZ zero_loop_encodeBetterBlockAsm10B
9014 MOVL $0x00000000, 12(SP)  // nothing emitted yet: pending literals start at src[0]
9015 MOVQ src_len+32(FP), DX
9016 LEAQ -6(DX), BX
9017 LEAQ -8(DX), SI
9018 MOVL SI, 8(SP)  // search bound = src_len-8
9019 SHRQ $0x05, DX
9020 SUBL DX, BX  // BX = src_len - 6 - src_len/32
9021 LEAQ (CX)(BX*1), BX
9022 MOVQ BX, (SP)  // dst limit = dst_base + (src_len - 6 - src_len/32)
9023 MOVL $0x00000001, DX  // searching starts at src position 1
9024 MOVL $0x00000000, 16(SP)  // no previous match offset
9025 MOVQ src_base+24(FP), BX
9026
9027 search_loop_encodeBetterBlockAsm10B:
9028 MOVL DX, SI
9029 SUBL 12(SP), SI
9030 SHRL $0x05, SI
9031 LEAL 1(DX)(SI*1), SI  // next position = DX + 1 + (DX - pending start)/32
9032 CMPL SI, 8(SP)
9033 JAE emit_remainder_encodeBetterBlockAsm10B
9034 MOVQ (BX)(DX*1), DI  // DI = 8 bytes of src at the current position
9035 MOVL SI, 20(SP)
9036 MOVQ $0x0000cf1bbcdcbf9b, R9  // multiplier for the 6-byte hash
9037 MOVQ $0x9e3779b1, SI  // multiplier for the 4-byte hash
9038 MOVQ DI, R10
9039 MOVQ DI, R11
9040 SHLQ $0x10, R10  // drop the top 2 bytes: hash covers the low 6 bytes
9041 IMULQ R9, R10
9042 SHRQ $0x34, R10  // keep the top 12 bits as the table index
9043 SHLQ $0x20, R11  // drop the top 4 bytes: hash covers the low 4 bytes
9044 IMULQ SI, R11
9045 SHRQ $0x36, R11  // keep the top 10 bits as the table index
9046 MOVL (AX)(R10*4), SI  // SI = candidate from the 6-byte table
9047 MOVL 16384(AX)(R11*4), R8  // R8 = candidate from the 4-byte table
9048 MOVL DX, (AX)(R10*4)  // record the current position in both tables
9049 MOVL DX, 16384(AX)(R11*4)
9050 MOVQ (BX)(SI*1), R10
9051 MOVQ (BX)(R8*1), R11
9052 CMPQ R10, DI  // all 8 bytes equal at the 6-byte-table candidate?
9053 JEQ candidate_match_encodeBetterBlockAsm10B
9054 CMPQ R11, DI  // all 8 bytes equal at the 4-byte-table candidate?
9055 JNE no_short_found_encodeBetterBlockAsm10B
9056 MOVL R8, SI
9057 JMP candidate_match_encodeBetterBlockAsm10B
9058
9059 no_short_found_encodeBetterBlockAsm10B:  // no 8-byte match: retry with 4-byte compares
9060 CMPL R10, DI
9061 JEQ candidate_match_encodeBetterBlockAsm10B
9062 CMPL R11, DI
9063 JEQ candidateS_match_encodeBetterBlockAsm10B
9064 MOVL 20(SP), DX  // no candidate: advance to the precomputed next position
9065 JMP search_loop_encodeBetterBlockAsm10B
9066
9067 candidateS_match_encodeBetterBlockAsm10B:  // 4-byte-table hit: first probe the 6-byte table at position+1
9068 SHRQ $0x08, DI  // DI = bytes starting one position later
9069 MOVQ DI, R10
9070 SHLQ $0x10, R10
9071 IMULQ R9, R10
9072 SHRQ $0x34, R10
9073 MOVL (AX)(R10*4), SI
9074 INCL DX
9075 MOVL DX, (AX)(R10*4)
9076 CMPL (BX)(SI*1), DI  // 4-byte compare at the position+1 candidate
9077 JEQ candidate_match_encodeBetterBlockAsm10B
9078 DECL DX  // probe failed: restore the position and use the 4-byte-table candidate
9079 MOVL R8, SI
9080
9081 candidate_match_encodeBetterBlockAsm10B:  // here DX = match position, SI = candidate position
9082 MOVL 12(SP), DI
9083 TESTL SI, SI
9084 JZ match_extend_back_end_encodeBetterBlockAsm10B  // candidate at src[0] cannot be extended backwards
9085
9086 match_extend_back_loop_encodeBetterBlockAsm10B:  // step both positions back while the preceding bytes are equal
9087 CMPL DX, DI
9088 JBE match_extend_back_end_encodeBetterBlockAsm10B  // never back up into already-emitted input
9089 MOVB -1(BX)(SI*1), R8
9090 MOVB -1(BX)(DX*1), R9
9091 CMPB R8, R9
9092 JNE match_extend_back_end_encodeBetterBlockAsm10B
9093 LEAL -1(DX), DX
9094 DECL SI
9095 JZ match_extend_back_end_encodeBetterBlockAsm10B
9096 JMP match_extend_back_loop_encodeBetterBlockAsm10B
9097
9098 match_extend_back_end_encodeBetterBlockAsm10B:
9099 MOVL DX, DI
9100 SUBL 12(SP), DI  // DI = number of pending literal bytes before the match
9101 LEAQ 3(CX)(DI*1), DI
9102 CMPQ DI, (SP)  // would cursor + literals + 3 reach the dst limit?
9103 JB match_dst_size_check_encodeBetterBlockAsm10B
9104 MOVQ $0x00000000, ret+56(FP)  // yes: give up and return 0
9105 RET
9106
9107 match_dst_size_check_encodeBetterBlockAsm10B:
9108 MOVL DX, DI  // DI = match start position (kept for literal emission and indexing)
9109 ADDL $0x04, DX  // skip the first 4 bytes on both sides before measuring
9110 ADDL $0x04, SI
9111 MOVQ src_len+32(FP), R8
9112 SUBL DX, R8  // R8 = bytes left in src after DX
9113 LEAQ (BX)(DX*1), R9  // R9 = &src[DX]
9114 LEAQ (BX)(SI*1), R10  // R10 = &src[SI]
9115
9116 // matchLen
9117 XORL R12, R12  // R12 = count of additional equal bytes found so far
9118
9119 matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B:  // compare 16 bytes per pass while at least 16 remain
9120 CMPL R8, $0x10
9121 JB matchlen_match8_match_nolit_encodeBetterBlockAsm10B
9122 MOVQ (R9)(R12*1), R11
9123 MOVQ 8(R9)(R12*1), R13
9124 XORQ (R10)(R12*1), R11
9125 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B
9126 XORQ 8(R10)(R12*1), R13
9127 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B
9128 LEAL -16(R8), R8
9129 LEAL 16(R12), R12
9130 JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B
9131
9132 matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B:  // difference lies in the second 8 bytes
9133 #ifdef GOAMD64_v3
9134 TZCNTQ R13, R13
9135
9136 #else
9137 BSFQ R13, R13
9138
9139 #endif
9140 SARQ $0x03, R13  // index of lowest differing bit / 8 = number of equal bytes
9141 LEAL 8(R12)(R13*1), R12
9142 JMP match_nolit_end_encodeBetterBlockAsm10B
9143
9144 matchlen_match8_match_nolit_encodeBetterBlockAsm10B:
9145 CMPL R8, $0x08
9146 JB matchlen_match4_match_nolit_encodeBetterBlockAsm10B
9147 MOVQ (R9)(R12*1), R11
9148 XORQ (R10)(R12*1), R11
9149 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B
9150 LEAL -8(R8), R8
9151 LEAL 8(R12), R12
9152 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm10B
9153
9154 matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B:  // difference lies in the 8 bytes at offset R12
9155 #ifdef GOAMD64_v3
9156 TZCNTQ R11, R11
9157
9158 #else
9159 BSFQ R11, R11
9160
9161 #endif
9162 SARQ $0x03, R11  // index of lowest differing bit / 8 = number of equal bytes
9163 LEAL (R12)(R11*1), R12
9164 JMP match_nolit_end_encodeBetterBlockAsm10B
9165
9166 matchlen_match4_match_nolit_encodeBetterBlockAsm10B:
9167 CMPL R8, $0x04
9168 JB matchlen_match2_match_nolit_encodeBetterBlockAsm10B
9169 MOVL (R9)(R12*1), R11
9170 CMPL (R10)(R12*1), R11
9171 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B
9172 LEAL -4(R8), R8
9173 LEAL 4(R12), R12
9174
9175 matchlen_match2_match_nolit_encodeBetterBlockAsm10B:
9176 CMPL R8, $0x01
9177 JE matchlen_match1_match_nolit_encodeBetterBlockAsm10B  // exactly one byte left
9178 JB match_nolit_end_encodeBetterBlockAsm10B  // nothing left
9179 MOVW (R9)(R12*1), R11
9180 CMPW (R10)(R12*1), R11
9181 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm10B
9182 LEAL 2(R12), R12
9183 SUBL $0x02, R8
9184 JZ match_nolit_end_encodeBetterBlockAsm10B
9185
9186 matchlen_match1_match_nolit_encodeBetterBlockAsm10B:
9187 MOVB (R9)(R12*1), R11
9188 CMPB (R10)(R12*1), R11
9189 JNE match_nolit_end_encodeBetterBlockAsm10B
9190 LEAL 1(R12), R12
9191
9192 match_nolit_end_encodeBetterBlockAsm10B:
9193 MOVL DX, R8
9194 SUBL SI, R8  // R8 = match offset (distance between position and candidate)
9195
9196 // Check if repeat
9197 CMPL 16(SP), R8
9198 JEQ match_is_repeat_encodeBetterBlockAsm10B  // same offset as the previous match
9199 MOVL R8, 16(SP)  // remember the new offset
9200 MOVL 12(SP), SI
9201 CMPL SI, DI
9202 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B  // no pending literals before the match
9203 MOVL DI, R9
9204 MOVL DI, 12(SP)
9205 LEAQ (BX)(SI*1), R10  // R10 = source of the pending literals
9206 SUBL SI, R9  // R9 = literal length
9207 LEAL -1(R9), SI  // SI = length-1, the value stored in the literal header
9208 CMPL SI, $0x3c
9209 JB one_byte_match_emit_encodeBetterBlockAsm10B
9210 CMPL SI, $0x00000100
9211 JB two_bytes_match_emit_encodeBetterBlockAsm10B
9212 JB three_bytes_match_emit_encodeBetterBlockAsm10B  // same flags as the JB above, so never taken; execution falls through
9213
9214 three_bytes_match_emit_encodeBetterBlockAsm10B:  // header: 0xf4 followed by 16-bit length-1
9215 MOVB $0xf4, (CX)
9216 MOVW SI, 1(CX)
9217 ADDQ $0x03, CX
9218 JMP memmove_long_match_emit_encodeBetterBlockAsm10B
9219
9220 two_bytes_match_emit_encodeBetterBlockAsm10B:  // header: 0xf0 followed by 8-bit length-1
9221 MOVB $0xf0, (CX)
9222 MOVB SI, 1(CX)
9223 ADDQ $0x02, CX
9224 CMPL SI, $0x40
9225 JB memmove_match_emit_encodeBetterBlockAsm10B
9226 JMP memmove_long_match_emit_encodeBetterBlockAsm10B
9227
9228 one_byte_match_emit_encodeBetterBlockAsm10B:  // header: single byte (length-1)<<2
9229 SHLB $0x02, SI
9230 MOVB SI, (CX)
9231 ADDQ $0x01, CX
9232
9233 memmove_match_emit_encodeBetterBlockAsm10B:
9234 LEAQ (CX)(R9*1), SI  // SI = dst cursor after the literal bytes
9235
9236 // genMemMoveShort
9237 CMPQ R9, $0x04
9238 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4
9239 CMPQ R9, $0x08
9240 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7
9241 CMPQ R9, $0x10
9242 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16
9243 CMPQ R9, $0x20
9244 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32
9245 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64
9246
9247 emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4:  // length <= 4: always moves a fixed 4 bytes
9248 MOVL (R10), R11
9249 MOVL R11, (CX)
9250 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
9251
9252 emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7:  // two overlapping 4-byte moves (head and tail)
9253 MOVL (R10), R11
9254 MOVL -4(R10)(R9*1), R10
9255 MOVL R11, (CX)
9256 MOVL R10, -4(CX)(R9*1)
9257 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
9258
9259 emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16:  // two overlapping 8-byte moves
9260 MOVQ (R10), R11
9261 MOVQ -8(R10)(R9*1), R10
9262 MOVQ R11, (CX)
9263 MOVQ R10, -8(CX)(R9*1)
9264 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
9265
9266 emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32:  // two overlapping 16-byte moves
9267 MOVOU (R10), X0
9268 MOVOU -16(R10)(R9*1), X1
9269 MOVOU X0, (CX)
9270 MOVOU X1, -16(CX)(R9*1)
9271 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B
9272
9273 emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64:  // first 32 and last 32 bytes, possibly overlapping
9274 MOVOU (R10), X0
9275 MOVOU 16(R10), X1
9276 MOVOU -32(R10)(R9*1), X2
9277 MOVOU -16(R10)(R9*1), X3
9278 MOVOU X0, (CX)
9279 MOVOU X1, 16(CX)
9280 MOVOU X2, -32(CX)(R9*1)
9281 MOVOU X3, -16(CX)(R9*1)
9282
9283 memmove_end_copy_match_emit_encodeBetterBlockAsm10B:
9284 MOVQ SI, CX  // advance the cursor by the exact literal length
9285 JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B
9286
9287 memmove_long_match_emit_encodeBetterBlockAsm10B:
9288 LEAQ (CX)(R9*1), SI  // SI = dst cursor after the literal bytes
9289
9290 // genMemMoveLong
9291 MOVOU (R10), X0  // X0..X3 = first 32 and last 32 source bytes, stored last
9292 MOVOU 16(R10), X1
9293 MOVOU -32(R10)(R9*1), X2
9294 MOVOU -16(R10)(R9*1), X3
9295 MOVQ R9, R13
9296 SHRQ $0x05, R13  // R13 = length / 32
9297 MOVQ CX, R11
9298 ANDL $0x0000001f, R11
9299 MOVQ $0x00000040, R14
9300 SUBQ R11, R14  // R14 = 64 - (CX & 31), so CX+R14-32 is 32-byte aligned for the MOVOA stores
9301 DECQ R13
9302 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32  // NOTE(review): DECQ leaves CF untouched, so this tests ZF from DECQ plus CF from the SUBQ above
9303 LEAQ -32(R10)(R14*1), R11
9304 LEAQ -32(CX)(R14*1), R15
9305
9306 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back:
9307 MOVOU (R11), X4
9308 MOVOU 16(R11), X5
9309 MOVOA X4, (R15)
9310 MOVOA X5, 16(R15)
9311 ADDQ $0x20, R15
9312 ADDQ $0x20, R11
9313 ADDQ $0x20, R14
9314 DECQ R13
9315 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back
9316
9317 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:  // aligned 32-byte chunks while length >= R14
9318 MOVOU -32(R10)(R14*1), X4
9319 MOVOU -16(R10)(R14*1), X5
9320 MOVOA X4, -32(CX)(R14*1)
9321 MOVOA X5, -16(CX)(R14*1)
9322 ADDQ $0x20, R14
9323 CMPQ R9, R14
9324 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
9325 MOVOU X0, (CX)  // finally write the saved head and tail unaligned
9326 MOVOU X1, 16(CX)
9327 MOVOU X2, -32(CX)(R9*1)
9328 MOVOU X3, -16(CX)(R9*1)
9329 MOVQ SI, CX
9330
9331 emit_literal_done_match_emit_encodeBetterBlockAsm10B:
9332 ADDL R12, DX  // DX = src position just past the match
9333 ADDL $0x04, R12  // R12 = total match length (4 initial bytes + extension)
9334 MOVL DX, 12(SP)  // everything up to DX is now accounted for
9335
9336 // emitCopy
9337 CMPL R12, $0x40
9338 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B  // length <= 64
9339 CMPL R8, $0x00000800
9340 JAE long_offset_short_match_nolit_encodeBetterBlockAsm10B  // offset >= 2048
9341 MOVL $0x00000001, SI
9342 LEAL 16(SI), SI  // SI = 17, base tag value for this 2-byte form
9343 MOVB R8, 1(CX)  // low 8 bits of the offset
9344 SHRL $0x08, R8
9345 SHLL $0x05, R8
9346 ORL R8, SI  // offset bits 8 and up go into the tag from bit 5
9347 MOVB SI, (CX)
9348 ADDQ $0x02, CX
9349 SUBL $0x08, R12  // 8 bytes of the match are accounted for by the element just written
9350
9351 // emitRepeat
9352 LEAL -4(R12), R12
9353 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
9354 MOVL R12, SI  // unreachable: follows an unconditional JMP and has no label (through the JB below)
9355 LEAL -4(R12), R12
9356 CMPL SI, $0x08
9357 JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
9358 CMPL SI, $0x0c
9359 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
9360 CMPL R8, $0x00000800
9361 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
9362
9363 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
9364 CMPL R12, $0x00000104
9365 JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
9366 LEAL -256(R12), R12
9367 MOVW $0x0019, (CX)  // 4-byte form: word 0x0019 then 16-bit (R12-256)
9368 MOVW R12, 2(CX)
9369 ADDQ $0x04, CX
9370 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9371
9372 repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
9373 LEAL -4(R12), R12
9374 MOVW $0x0015, (CX)  // 3-byte form: word 0x0015 then 8-bit (R12-4)
9375 MOVB R12, 2(CX)
9376 ADDQ $0x03, CX
9377 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9378
9379 repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:  // in this function only referenced from the unreachable sequence above
9380 SHLL $0x02, R12
9381 ORL $0x01, R12
9382 MOVW R12, (CX)
9383 ADDQ $0x02, CX
9384 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9385
9386 repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:  // in this function only referenced from the unreachable sequence above
9387 XORQ SI, SI
9388 LEAL 1(SI)(R12*4), R12
9389 MOVB R8, 1(CX)
9390 SARL $0x08, R8
9391 SHLL $0x05, R8
9392 ORL R8, R12
9393 MOVB R12, (CX)
9394 ADDQ $0x02, CX
9395 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9396
9397 long_offset_short_match_nolit_encodeBetterBlockAsm10B:  // length > 64 and offset >= 2048
9398 MOVB $0xee, (CX)  // 3-byte element: tag 0xee then 16-bit offset
9399 MOVW R8, 1(CX)
9400 LEAL -60(R12), R12  // 60 bytes of the match are accounted for by this element
9401 ADDQ $0x03, CX
9402
9403 // emitRepeat
9404 MOVL R12, SI  // SI = remaining length, R12 = remaining length - 4
9405 LEAL -4(R12), R12
9406 CMPL SI, $0x08
9407 JBE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
9408 CMPL SI, $0x0c
9409 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
9410 CMPL R8, $0x00000800
9411 JB repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
9412
9413 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
9414 CMPL R12, $0x00000104
9415 JB repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
9416 LEAL -256(R12), R12
9417 MOVW $0x0019, (CX)  // 4-byte form: word 0x0019 then 16-bit (R12-256)
9418 MOVW R12, 2(CX)
9419 ADDQ $0x04, CX
9420 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9421
9422 repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
9423 LEAL -4(R12), R12
9424 MOVW $0x0015, (CX)  // 3-byte form: word 0x0015 then 8-bit (R12-4)
9425 MOVB R12, 2(CX)
9426 ADDQ $0x03, CX
9427 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9428
9429 repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:  // 2-byte form: 16-bit (R12<<2)|1
9430 SHLL $0x02, R12
9431 ORL $0x01, R12
9432 MOVW R12, (CX)
9433 ADDQ $0x02, CX
9434 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9435
9436 repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:  // 2-byte form carrying the offset
9437 XORQ SI, SI
9438 LEAL 1(SI)(R12*4), R12  // tag = 1 + R12*4
9439 MOVB R8, 1(CX)  // low 8 bits of the offset
9440 SARL $0x08, R8
9441 SHLL $0x05, R8
9442 ORL R8, R12  // offset bits 8 and up go into the tag from bit 5
9443 MOVB R12, (CX)
9444 ADDQ $0x02, CX
9445 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9446
9447 two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B:  // length <= 64
9448 MOVL R12, SI
9449 SHLL $0x02, SI  // SI = length << 2
9450 CMPL R12, $0x0c
9451 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B  // length >= 12 needs the 3-byte form
9452 CMPL R8, $0x00000800
9453 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm10B  // offset >= 2048 needs the 3-byte form
9454 LEAL -15(SI), SI  // tag = (length<<2) - 15
9455 MOVB R8, 1(CX)  // low 8 bits of the offset
9456 SHRL $0x08, R8
9457 SHLL $0x05, R8
9458 ORL R8, SI  // offset bits 8 and up go into the tag from bit 5
9459 MOVB SI, (CX)
9460 ADDQ $0x02, CX
9461 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9462
9463 emit_copy_three_match_nolit_encodeBetterBlockAsm10B:  // 3-byte element: tag (length<<2)-2 then 16-bit offset
9464 LEAL -2(SI), SI
9465 MOVB SI, (CX)
9466 MOVW R8, 1(CX)
9467 ADDQ $0x03, CX
9468 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9469
9470 match_is_repeat_encodeBetterBlockAsm10B:  // same offset as the previous match: emit literals, then a repeat element
9471 MOVL 12(SP), SI
9472 CMPL SI, DI
9473 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B  // no pending literals before the match
9474 MOVL DI, R9
9475 MOVL DI, 12(SP)
9476 LEAQ (BX)(SI*1), R10  // R10 = source of the pending literals
9477 SUBL SI, R9  // R9 = literal length
9478 LEAL -1(R9), SI  // SI = length-1, the value stored in the literal header
9479 CMPL SI, $0x3c
9480 JB one_byte_match_emit_repeat_encodeBetterBlockAsm10B
9481 CMPL SI, $0x00000100
9482 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm10B
9483 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm10B  // same flags as the JB above, so never taken; execution falls through
9484
9485 three_bytes_match_emit_repeat_encodeBetterBlockAsm10B:  // header: 0xf4 followed by 16-bit length-1
9486 MOVB $0xf4, (CX)
9487 MOVW SI, 1(CX)
9488 ADDQ $0x03, CX
9489 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
9490
9491 two_bytes_match_emit_repeat_encodeBetterBlockAsm10B:  // header: 0xf0 followed by 8-bit length-1
9492 MOVB $0xf0, (CX)
9493 MOVB SI, 1(CX)
9494 ADDQ $0x02, CX
9495 CMPL SI, $0x40
9496 JB memmove_match_emit_repeat_encodeBetterBlockAsm10B
9497 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B
9498
9499 one_byte_match_emit_repeat_encodeBetterBlockAsm10B:  // header: single byte (length-1)<<2
9500 SHLB $0x02, SI
9501 MOVB SI, (CX)
9502 ADDQ $0x01, CX
9503
9504 memmove_match_emit_repeat_encodeBetterBlockAsm10B:
9505 LEAQ (CX)(R9*1), SI  // SI = dst cursor after the literal bytes
9506
9507 // genMemMoveShort
9508 CMPQ R9, $0x04
9509 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4
9510 CMPQ R9, $0x08
9511 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7
9512 CMPQ R9, $0x10
9513 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16
9514 CMPQ R9, $0x20
9515 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32
9516 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64
9517
9518 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4:  // length <= 4: always moves a fixed 4 bytes
9519 MOVL (R10), R11
9520 MOVL R11, (CX)
9521 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
9522
9523 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7:  // two overlapping 4-byte moves (head and tail)
9524 MOVL (R10), R11
9525 MOVL -4(R10)(R9*1), R10
9526 MOVL R11, (CX)
9527 MOVL R10, -4(CX)(R9*1)
9528 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
9529
9530 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16:  // two overlapping 8-byte moves
9531 MOVQ (R10), R11
9532 MOVQ -8(R10)(R9*1), R10
9533 MOVQ R11, (CX)
9534 MOVQ R10, -8(CX)(R9*1)
9535 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
9536
9537 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32:  // two overlapping 16-byte moves
9538 MOVOU (R10), X0
9539 MOVOU -16(R10)(R9*1), X1
9540 MOVOU X0, (CX)
9541 MOVOU X1, -16(CX)(R9*1)
9542 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B
9543
9544 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64:  // first 32 and last 32 bytes, possibly overlapping
9545 MOVOU (R10), X0
9546 MOVOU 16(R10), X1
9547 MOVOU -32(R10)(R9*1), X2
9548 MOVOU -16(R10)(R9*1), X3
9549 MOVOU X0, (CX)
9550 MOVOU X1, 16(CX)
9551 MOVOU X2, -32(CX)(R9*1)
9552 MOVOU X3, -16(CX)(R9*1)
9553
9554 memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B:
9555 MOVQ SI, CX  // advance the cursor by the exact literal length
9556 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
9557
9558 memmove_long_match_emit_repeat_encodeBetterBlockAsm10B:
9559 LEAQ (CX)(R9*1), SI  // SI = dst cursor after the literal bytes
9560
9561 // genMemMoveLong
9562 MOVOU (R10), X0  // X0..X3 = first 32 and last 32 source bytes, stored last
9563 MOVOU 16(R10), X1
9564 MOVOU -32(R10)(R9*1), X2
9565 MOVOU -16(R10)(R9*1), X3
9566 MOVQ R9, R13
9567 SHRQ $0x05, R13  // R13 = length / 32
9568 MOVQ CX, R11
9569 ANDL $0x0000001f, R11
9570 MOVQ $0x00000040, R14
9571 SUBQ R11, R14  // R14 = 64 - (CX & 31), so CX+R14-32 is 32-byte aligned for the MOVOA stores
9572 DECQ R13
9573 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32  // NOTE(review): DECQ leaves CF untouched, so this tests ZF from DECQ plus CF from the SUBQ above
9574 LEAQ -32(R10)(R14*1), R11
9575 LEAQ -32(CX)(R14*1), R15
9576
9577 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back:
9578 MOVOU (R11), X4
9579 MOVOU 16(R11), X5
9580 MOVOA X4, (R15)
9581 MOVOA X5, 16(R15)
9582 ADDQ $0x20, R15
9583 ADDQ $0x20, R11
9584 ADDQ $0x20, R14
9585 DECQ R13
9586 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back
9587
9588 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:  // aligned 32-byte chunks while length >= R14
9589 MOVOU -32(R10)(R14*1), X4
9590 MOVOU -16(R10)(R14*1), X5
9591 MOVOA X4, -32(CX)(R14*1)
9592 MOVOA X5, -16(CX)(R14*1)
9593 ADDQ $0x20, R14
9594 CMPQ R9, R14
9595 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
9596 MOVOU X0, (CX)  // finally write the saved head and tail unaligned
9597 MOVOU X1, 16(CX)
9598 MOVOU X2, -32(CX)(R9*1)
9599 MOVOU X3, -16(CX)(R9*1)
9600 MOVQ SI, CX
9601
9602 emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B:
9603 ADDL R12, DX  // DX = src position just past the match
9604 ADDL $0x04, R12  // R12 = total match length (4 initial bytes + extension)
9605 MOVL DX, 12(SP)  // everything up to DX is now accounted for
9606
9607 // emitRepeat
9608 MOVL R12, SI  // SI = length, R12 = length - 4
9609 LEAL -4(R12), R12
9610 CMPL SI, $0x08
9611 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B
9612 CMPL SI, $0x0c
9613 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
9614 CMPL R8, $0x00000800
9615 JB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
9616
9617 cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
9618 CMPL R12, $0x00000104
9619 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B
9620 LEAL -256(R12), R12
9621 MOVW $0x0019, (CX)  // 4-byte form: word 0x0019 then 16-bit (R12-256)
9622 MOVW R12, 2(CX)
9623 ADDQ $0x04, CX
9624 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9625
9626 repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B:
9627 LEAL -4(R12), R12
9628 MOVW $0x0015, (CX)  // 3-byte form: word 0x0015 then 8-bit (R12-4)
9629 MOVB R12, 2(CX)
9630 ADDQ $0x03, CX
9631 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9632
9633 repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B:  // 2-byte form: 16-bit (R12<<2)|1
9634 SHLL $0x02, R12
9635 ORL $0x01, R12
9636 MOVW R12, (CX)
9637 ADDQ $0x02, CX
9638 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B
9639
9640 repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:  // 2-byte form carrying the offset
9641 XORQ SI, SI
9642 LEAL 1(SI)(R12*4), R12  // tag = 1 + R12*4
9643 MOVB R8, 1(CX)  // low 8 bits of the offset
9644 SARL $0x08, R8
9645 SHLL $0x05, R8
9646 ORL R8, R12  // offset bits 8 and up go into the tag from bit 5
9647 MOVB R12, (CX)
9648 ADDQ $0x02, CX
9649
9650 match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
9651 CMPL DX, 8(SP)
9652 JAE emit_remainder_encodeBetterBlockAsm10B  // past the search bound: flush the tail as literals
9653 CMPQ CX, (SP)
9654 JB match_nolit_dst_ok_encodeBetterBlockAsm10B
9655 MOVQ $0x00000000, ret+56(FP)  // dst cursor reached the limit: return 0
9656 RET
9657
9658 match_nolit_dst_ok_encodeBetterBlockAsm10B:  // index positions inside the match just emitted
9659 MOVQ $0x0000cf1bbcdcbf9b, SI  // multiplier for the 6-byte hash
9660 MOVQ $0x9e3779b1, R8  // multiplier for the 4-byte hash
9661 LEAQ 1(DI), DI  // DI = match start + 1
9662 LEAQ -2(DX), R9  // R9 = match end - 2
9663 MOVQ (BX)(DI*1), R10
9664 MOVQ 1(BX)(DI*1), R11
9665 MOVQ (BX)(R9*1), R12
9666 MOVQ 1(BX)(R9*1), R13
9667 SHLQ $0x10, R10
9668 IMULQ SI, R10
9669 SHRQ $0x34, R10
9670 SHLQ $0x20, R11
9671 IMULQ R8, R11
9672 SHRQ $0x36, R11
9673 SHLQ $0x10, R12
9674 IMULQ SI, R12
9675 SHRQ $0x34, R12
9676 SHLQ $0x20, R13
9677 IMULQ R8, R13
9678 SHRQ $0x36, R13
9679 LEAQ 1(DI), R8
9680 LEAQ 1(R9), R14
9681 MOVL DI, (AX)(R10*4)  // 6-byte table: start+1 and end-2
9682 MOVL R9, (AX)(R12*4)
9683 MOVL R8, 16384(AX)(R11*4)  // 4-byte table: start+2 and end-1
9684 MOVL R14, 16384(AX)(R13*4)
9685 LEAQ 1(R9)(DI*1), R8
9686 SHRQ $0x01, R8  // R8 = (start+1 + end-2 + 1) / 2, a midpoint
9687 ADDQ $0x01, DI
9688 SUBQ $0x01, R9
9689
9690 index_loop_encodeBetterBlockAsm10B:  // add every second position from DI and from R8 to the 6-byte table
9691 CMPQ R8, R9
9692 JAE search_loop_encodeBetterBlockAsm10B  // done indexing: resume searching at DX
9693 MOVQ (BX)(DI*1), R10
9694 MOVQ (BX)(R8*1), R11
9695 SHLQ $0x10, R10
9696 IMULQ SI, R10
9697 SHRQ $0x34, R10
9698 SHLQ $0x10, R11
9699 IMULQ SI, R11
9700 SHRQ $0x34, R11
9701 MOVL DI, (AX)(R10*4)
9702 MOVL R8, (AX)(R11*4)
9703 ADDQ $0x02, DI
9704 ADDQ $0x02, R8
9705 JMP index_loop_encodeBetterBlockAsm10B
9706
9707 emit_remainder_encodeBetterBlockAsm10B:  // emit src[12(SP):src_len] as one literal run and return
9708 MOVQ src_len+32(FP), AX
9709 SUBL 12(SP), AX  // AX = number of bytes still to emit
9710 LEAQ 3(CX)(AX*1), AX
9711 CMPQ AX, (SP)  // would cursor + remainder + 3 reach the dst limit?
9712 JB emit_remainder_ok_encodeBetterBlockAsm10B
9713 MOVQ $0x00000000, ret+56(FP)  // yes: return 0
9714 RET
9715
9716 emit_remainder_ok_encodeBetterBlockAsm10B:
9717 MOVQ src_len+32(FP), AX
9718 MOVL 12(SP), DX
9719 CMPL DX, AX
9720 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B  // nothing left to emit
9721 MOVL AX, SI
9722 MOVL AX, 12(SP)
9723 LEAQ (BX)(DX*1), AX  // AX = source of the remaining bytes (tmp pointer no longer needed)
9724 SUBL DX, SI  // SI = literal length
9725 LEAL -1(SI), DX  // DX = length-1, the value stored in the literal header
9726 CMPL DX, $0x3c
9727 JB one_byte_emit_remainder_encodeBetterBlockAsm10B
9728 CMPL DX, $0x00000100
9729 JB two_bytes_emit_remainder_encodeBetterBlockAsm10B
9730 JB three_bytes_emit_remainder_encodeBetterBlockAsm10B  // same flags as the JB above, so never taken; execution falls through
9731
9732 three_bytes_emit_remainder_encodeBetterBlockAsm10B:  // header: 0xf4 followed by 16-bit length-1
9733 MOVB $0xf4, (CX)
9734 MOVW DX, 1(CX)
9735 ADDQ $0x03, CX
9736 JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B
9737
9738 two_bytes_emit_remainder_encodeBetterBlockAsm10B:  // header: 0xf0 followed by 8-bit length-1
9739 MOVB $0xf0, (CX)
9740 MOVB DL, 1(CX)
9741 ADDQ $0x02, CX
9742 CMPL DX, $0x40
9743 JB memmove_emit_remainder_encodeBetterBlockAsm10B
9744 JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B
9745
9746 one_byte_emit_remainder_encodeBetterBlockAsm10B:  // header: single byte (length-1)<<2
9747 SHLB $0x02, DL
9748 MOVB DL, (CX)
9749 ADDQ $0x01, CX
9750
9751 memmove_emit_remainder_encodeBetterBlockAsm10B:
9752 LEAQ (CX)(SI*1), DX  // DX = dst cursor after the literal bytes
9753 MOVL SI, BX  // BX = length (src base no longer needed)
9754
9755 // genMemMoveShort
9756 CMPQ BX, $0x03
9757 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2
9758 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3
9759 CMPQ BX, $0x08
9760 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7
9761 CMPQ BX, $0x10
9762 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16
9763 CMPQ BX, $0x20
9764 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32
9765 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64
9766
9767 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2:  // first byte and last byte (the same byte when length is 1)
9768 MOVB (AX), SI
9769 MOVB -1(AX)(BX*1), AL
9770 MOVB SI, (CX)
9771 MOVB AL, -1(CX)(BX*1)
9772 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
9773
9774 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3:  // one 2-byte move plus one 1-byte move
9775 MOVW (AX), SI
9776 MOVB 2(AX), AL
9777 MOVW SI, (CX)
9778 MOVB AL, 2(CX)
9779 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
9780
9781 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7:  // two overlapping 4-byte moves (head and tail)
9782 MOVL (AX), SI
9783 MOVL -4(AX)(BX*1), AX
9784 MOVL SI, (CX)
9785 MOVL AX, -4(CX)(BX*1)
9786 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
9787
9788 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16:  // two overlapping 8-byte moves
9789 MOVQ (AX), SI
9790 MOVQ -8(AX)(BX*1), AX
9791 MOVQ SI, (CX)
9792 MOVQ AX, -8(CX)(BX*1)
9793 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
9794
9795 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32:  // two overlapping 16-byte moves
9796 MOVOU (AX), X0
9797 MOVOU -16(AX)(BX*1), X1
9798 MOVOU X0, (CX)
9799 MOVOU X1, -16(CX)(BX*1)
9800 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B
9801
9802 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64:  // first 32 and last 32 bytes, possibly overlapping
9803 MOVOU (AX), X0
9804 MOVOU 16(AX), X1
9805 MOVOU -32(AX)(BX*1), X2
9806 MOVOU -16(AX)(BX*1), X3
9807 MOVOU X0, (CX)
9808 MOVOU X1, 16(CX)
9809 MOVOU X2, -32(CX)(BX*1)
9810 MOVOU X3, -16(CX)(BX*1)
9811
9812 memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B:
9813 MOVQ DX, CX  // advance the cursor by the exact literal length
9814 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
9815
9816 memmove_long_emit_remainder_encodeBetterBlockAsm10B:
9817 LEAQ (CX)(SI*1), DX  // DX = dst cursor after the literal bytes
9818 MOVL SI, BX  // BX = length (src base no longer needed)
9819
9820 // genMemMoveLong
9821 MOVOU (AX), X0  // X0..X3 = first 32 and last 32 source bytes, stored last
9822 MOVOU 16(AX), X1
9823 MOVOU -32(AX)(BX*1), X2
9824 MOVOU -16(AX)(BX*1), X3
9825 MOVQ BX, DI
9826 SHRQ $0x05, DI  // DI = length / 32
9827 MOVQ CX, SI
9828 ANDL $0x0000001f, SI
9829 MOVQ $0x00000040, R8
9830 SUBQ SI, R8  // R8 = 64 - (CX & 31), so CX+R8-32 is 32-byte aligned for the MOVOA stores
9831 DECQ DI
9832 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32  // NOTE(review): DECQ leaves CF untouched, so this tests ZF from DECQ plus CF from the SUBQ above
9833 LEAQ -32(AX)(R8*1), SI
9834 LEAQ -32(CX)(R8*1), R9
9835
9836 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back:
9837 MOVOU (SI), X4
9838 MOVOU 16(SI), X5
9839 MOVOA X4, (R9)
9840 MOVOA X5, 16(R9)
9841 ADDQ $0x20, R9
9842 ADDQ $0x20, SI
9843 ADDQ $0x20, R8
9844 DECQ DI
9845 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back
9846
9847 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:  // aligned 32-byte chunks while length >= R8
9848 MOVOU -32(AX)(R8*1), X4
9849 MOVOU -16(AX)(R8*1), X5
9850 MOVOA X4, -32(CX)(R8*1)
9851 MOVOA X5, -16(CX)(R8*1)
9852 ADDQ $0x20, R8
9853 CMPQ BX, R8
9854 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
9855 MOVOU X0, (CX)  // finally write the saved head and tail unaligned
9856 MOVOU X1, 16(CX)
9857 MOVOU X2, -32(CX)(BX*1)
9858 MOVOU X3, -16(CX)(BX*1)
9859 MOVQ DX, CX
9860
9861 emit_literal_done_emit_remainder_encodeBetterBlockAsm10B:
9862 MOVQ dst_base+0(FP), AX
9863 SUBQ AX, CX  // result = dst cursor - dst base = bytes written
9864 MOVQ CX, ret+56(FP)
9865 RET
9866
9867 // func encodeBetterBlockAsm8B(dst []byte, src []byte, tmp *[5120]byte) int
9868 // Requires: BMI, SSE2. Writes an encoding of src to dst using tmp as two hash tables; returns the number of bytes written, or 0 when the dst limit computed below would be reached.
9869 TEXT ·encodeBetterBlockAsm8B(SB), $24-64 // 24-byte frame: 0(SP) dst limit, 8(SP) search limit, 12(SP) next-emit offset, 16(SP) previous match offset, 20(SP) next search position
9870 MOVQ tmp+48(FP), AX // AX = tmp, base of the hash tables
9871 MOVQ dst_base+0(FP), CX // CX = dst write cursor
9872 MOVQ $0x00000028, DX // 40 iterations x 128 bytes = 5120 bytes to clear
9873 MOVQ AX, BX // BX walks over tmp while clearing
9874 PXOR X0, X0 // X0 = 0
9875
9876 zero_loop_encodeBetterBlockAsm8B: // clear 128 bytes of tmp per iteration
9877 MOVOU X0, (BX)
9878 MOVOU X0, 16(BX)
9879 MOVOU X0, 32(BX)
9880 MOVOU X0, 48(BX)
9881 MOVOU X0, 64(BX)
9882 MOVOU X0, 80(BX)
9883 MOVOU X0, 96(BX)
9884 MOVOU X0, 112(BX)
9885 ADDQ $0x80, BX
9886 DECQ DX
9887 JNZ zero_loop_encodeBetterBlockAsm8B
9888 MOVL $0x00000000, 12(SP) // 12(SP) = src offset of the first byte not yet emitted
9889 MOVQ src_len+32(FP), DX
9890 LEAQ -6(DX), BX // BX = src_len - 6
9891 LEAQ -8(DX), SI // SI = src_len - 8
9892 MOVL SI, 8(SP) // 8(SP) = search limit; positions at or beyond it go to emit_remainder
9893 SHRQ $0x05, DX // DX = src_len / 32
9894 SUBL DX, BX // BX = src_len - 6 - src_len/32
9895 LEAQ (CX)(BX*1), BX
9896 MOVQ BX, (SP) // (SP) = dst limit (dst + BX); reaching it makes the function return 0
9897 MOVL $0x00000001, DX // DX = current src position, starting at 1
9898 MOVL $0x00000000, 16(SP) // 16(SP) = offset of the previous match (none yet)
9899 MOVQ src_base+24(FP), BX // BX = src base from here on
9900
9901 search_loop_encodeBetterBlockAsm8B: // probe both hash tables for a match at src[DX]
9902 MOVL DX, SI
9903 SUBL 12(SP), SI // SI = bytes between the last emit and DX
9904 SHRL $0x04, SI
9905 LEAL 1(DX)(SI*1), SI // SI = DX + 1 + (pending >> 4): the step grows with the pending literal run
9906 CMPL SI, 8(SP)
9907 JAE emit_remainder_encodeBetterBlockAsm8B // next position is at or past the search limit
9908 MOVQ (BX)(DX*1), DI // DI = 8 bytes at src[DX]
9909 MOVL SI, 20(SP) // save the next position for the no-match path
9910 MOVQ $0x0000cf1bbcdcbf9b, R9 // multiplier for the 6-byte hash
9911 MOVQ $0x9e3779b1, SI // multiplier for the 4-byte hash
9912 MOVQ DI, R10
9913 MOVQ DI, R11
9914 SHLQ $0x10, R10 // discard the top 2 bytes: hash covers the low 6 bytes
9915 IMULQ R9, R10
9916 SHRQ $0x36, R10 // top 10 bits: index into the 1024-entry table at tmp+0
9917 SHLQ $0x20, R11 // hash covers the low 4 bytes
9918 IMULQ SI, R11
9919 SHRQ $0x38, R11 // top 8 bits: index into the 256-entry table at tmp+4096
9920 MOVL (AX)(R10*4), SI // SI = candidate from the 6-byte-hash table
9921 MOVL 4096(AX)(R11*4), R8 // R8 = candidate from the 4-byte-hash table
9922 MOVL DX, (AX)(R10*4) // both table slots now hold DX
9923 MOVL DX, 4096(AX)(R11*4)
9924 MOVQ (BX)(SI*1), R10 // R10 = 8 bytes at the first candidate
9925 MOVQ (BX)(R8*1), R11 // R11 = 8 bytes at the second candidate
9926 CMPQ R10, DI
9927 JEQ candidate_match_encodeBetterBlockAsm8B // all 8 bytes equal at SI
9928 CMPQ R11, DI
9929 JNE no_short_found_encodeBetterBlockAsm8B
9930 MOVL R8, SI // all 8 bytes equal at R8: use it as the candidate
9931 JMP candidate_match_encodeBetterBlockAsm8B
9932
9933 no_short_found_encodeBetterBlockAsm8B: // no 8-byte match; compare only the low 4 bytes
9934 CMPL R10, DI
9935 JEQ candidate_match_encodeBetterBlockAsm8B
9936 CMPL R11, DI
9937 JEQ candidateS_match_encodeBetterBlockAsm8B
9938 MOVL 20(SP), DX // no match: advance to the saved next position
9939 JMP search_loop_encodeBetterBlockAsm8B
9940
9941 candidateS_match_encodeBetterBlockAsm8B: // 4-byte match from the second table; first look for a match at DX+1
9942 SHRQ $0x08, DI // DI = the bytes starting at src[DX+1]
9943 MOVQ DI, R10
9944 SHLQ $0x10, R10
9945 IMULQ R9, R10
9946 SHRQ $0x36, R10 // 6-byte hash of src[DX+1..]
9947 MOVL (AX)(R10*4), SI // SI = first-table candidate for position DX+1
9948 INCL DX
9949 MOVL DX, (AX)(R10*4) // record DX+1 in the first table
9950 CMPL (BX)(SI*1), DI
9951 JEQ candidate_match_encodeBetterBlockAsm8B // 4 bytes match at DX+1: keep DX+1 and that candidate
9952 DECL DX // otherwise go back to DX with the second-table candidate
9953 MOVL R8, SI
9954
9955 candidate_match_encodeBetterBlockAsm8B: // DX = match position, SI = candidate position; extend the match backwards
9956 MOVL 12(SP), DI // DI = lower bound for DX (first byte not yet emitted)
9957 TESTL SI, SI
9958 JZ match_extend_back_end_encodeBetterBlockAsm8B // candidate is already at offset 0
9959
9960 match_extend_back_loop_encodeBetterBlockAsm8B:
9961 CMPL DX, DI
9962 JBE match_extend_back_end_encodeBetterBlockAsm8B // do not move before the first unemitted byte
9963 MOVB -1(BX)(SI*1), R8
9964 MOVB -1(BX)(DX*1), R9
9965 CMPB R8, R9
9966 JNE match_extend_back_end_encodeBetterBlockAsm8B // preceding bytes differ
9967 LEAL -1(DX), DX // step both positions back by one byte
9968 DECL SI
9969 JZ match_extend_back_end_encodeBetterBlockAsm8B // candidate reached offset 0
9970 JMP match_extend_back_loop_encodeBetterBlockAsm8B
9971
9972 match_extend_back_end_encodeBetterBlockAsm8B:
9973 MOVL DX, DI
9974 SUBL 12(SP), DI // DI = number of pending bytes before the match
9975 LEAQ 3(CX)(DI*1), DI // DI = dst cursor + pending bytes + 3
9976 CMPQ DI, (SP)
9977 JB match_dst_size_check_encodeBetterBlockAsm8B // still below the dst limit
9978 MOVQ $0x00000000, ret+56(FP) // would reach the dst limit: return 0
9979 RET
9980
9981 match_dst_size_check_encodeBetterBlockAsm8B: // measure how far the match extends forward
9982 MOVL DX, DI // DI = start of the match in src
9983 ADDL $0x04, DX // skip 4 bytes on both sides (every path here follows an equal 4- or 8-byte compare)
9984 ADDL $0x04, SI
9985 MOVQ src_len+32(FP), R8
9986 SUBL DX, R8 // R8 = bytes left in src from DX
9987 LEAQ (BX)(DX*1), R9 // R9 = &src[DX]
9988 LEAQ (BX)(SI*1), R10 // R10 = &src[SI]
9989
9990 // matchLen
9991 XORL R12, R12 // R12 = number of equal bytes found so far
9992
9993 matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B: // compare 16 bytes per iteration while at least 16 remain
9994 CMPL R8, $0x10
9995 JB matchlen_match8_match_nolit_encodeBetterBlockAsm8B
9996 MOVQ (R9)(R12*1), R11
9997 MOVQ 8(R9)(R12*1), R13
9998 XORQ (R10)(R12*1), R11 // non-zero means a difference in the first 8 bytes
9999 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B
10000 XORQ 8(R10)(R12*1), R13 // non-zero means a difference in the second 8 bytes
10001 JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B
10002 LEAL -16(R8), R8
10003 LEAL 16(R12), R12
10004 JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B
10005
10006 matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B: // first difference is in the second 8-byte word
10007 #ifdef GOAMD64_v3
10008 TZCNTQ R13, R13 // index of the lowest set (differing) bit
10009
10010 #else
10011 BSFQ R13, R13 // same bit index without requiring BMI
10012
10013 #endif
10014 SARQ $0x03, R13 // bit index / 8 = equal bytes within this word
10015 LEAL 8(R12)(R13*1), R12 // add them plus the 8 equal bytes of the first word
10016 JMP match_nolit_end_encodeBetterBlockAsm8B
10017
10018 matchlen_match8_match_nolit_encodeBetterBlockAsm8B: // fewer than 16 bytes left: try one 8-byte compare
10019 CMPL R8, $0x08
10020 JB matchlen_match4_match_nolit_encodeBetterBlockAsm8B
10021 MOVQ (R9)(R12*1), R11
10022 XORQ (R10)(R12*1), R11
10023 JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B
10024 LEAL -8(R8), R8
10025 LEAL 8(R12), R12
10026 JMP matchlen_match4_match_nolit_encodeBetterBlockAsm8B
10027
10028 matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B: // first difference is in the 8-byte word held in R11
10029 #ifdef GOAMD64_v3
10030 TZCNTQ R11, R11
10031
10032 #else
10033 BSFQ R11, R11
10034
10035 #endif
10036 SARQ $0x03, R11 // bit index / 8 = equal bytes within this word
10037 LEAL (R12)(R11*1), R12
10038 JMP match_nolit_end_encodeBetterBlockAsm8B
10039
10040 matchlen_match4_match_nolit_encodeBetterBlockAsm8B: // try a 4-byte compare
10041 CMPL R8, $0x04
10042 JB matchlen_match2_match_nolit_encodeBetterBlockAsm8B
10043 MOVL (R9)(R12*1), R11
10044 CMPL (R10)(R12*1), R11
10045 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B
10046 LEAL -4(R8), R8
10047 LEAL 4(R12), R12
10048
10049 matchlen_match2_match_nolit_encodeBetterBlockAsm8B: // try a 2-byte compare
10050 CMPL R8, $0x01
10051 JE matchlen_match1_match_nolit_encodeBetterBlockAsm8B // exactly one byte left
10052 JB match_nolit_end_encodeBetterBlockAsm8B // no bytes left
10053 MOVW (R9)(R12*1), R11
10054 CMPW (R10)(R12*1), R11
10055 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm8B
10056 LEAL 2(R12), R12
10057 SUBL $0x02, R8
10058 JZ match_nolit_end_encodeBetterBlockAsm8B // nothing left after these 2 bytes
10059
10060 matchlen_match1_match_nolit_encodeBetterBlockAsm8B: // compare a single byte
10061 MOVB (R9)(R12*1), R11
10062 CMPB (R10)(R12*1), R11
10063 JNE match_nolit_end_encodeBetterBlockAsm8B
10064 LEAL 1(R12), R12
10065
10066 match_nolit_end_encodeBetterBlockAsm8B: // R12 = equal bytes beyond the first 4; DI = match start; DX, SI = match/candidate position + 4
10067 MOVL DX, R8
10068 SUBL SI, R8 // R8 = match offset (DX - SI)
10069
10070 // Check if repeat
10071 CMPL 16(SP), R8
10072 JEQ match_is_repeat_encodeBetterBlockAsm8B // same offset as the previous match
10073 MOVL R8, 16(SP) // remember this offset for the next comparison
10074 MOVL 12(SP), SI
10075 CMPL SI, DI
10076 JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B // no pending bytes before the match
10077 MOVL DI, R9
10078 MOVL DI, 12(SP) // pending bytes up to the match start are emitted below
10079 LEAQ (BX)(SI*1), R10 // R10 = address of the first pending byte
10080 SUBL SI, R9 // R9 = number of pending bytes
10081 LEAL -1(R9), SI // SI = length - 1, the value written into the header
10082 CMPL SI, $0x3c
10083 JB one_byte_match_emit_encodeBetterBlockAsm8B // length-1 < 60: single header byte
10084 CMPL SI, $0x00000100
10085 JB two_bytes_match_emit_encodeBetterBlockAsm8B // length-1 < 256: 0xf0 plus one length byte
10086 JB three_bytes_match_emit_encodeBetterBlockAsm8B // never taken: flags are unchanged since the JB above; larger lengths fall through
10087
10088 three_bytes_match_emit_encodeBetterBlockAsm8B:
10089 MOVB $0xf4, (CX) // header byte 0xf4 followed by 16-bit length-1
10090 MOVW SI, 1(CX)
10091 ADDQ $0x03, CX
10092 JMP memmove_long_match_emit_encodeBetterBlockAsm8B
10093
10094 two_bytes_match_emit_encodeBetterBlockAsm8B:
10095 MOVB $0xf0, (CX) // header byte 0xf0 followed by 8-bit length-1
10096 MOVB SI, 1(CX)
10097 ADDQ $0x02, CX
10098 CMPL SI, $0x40
10099 JB memmove_match_emit_encodeBetterBlockAsm8B // length-1 < 64: use the short copy
10100 JMP memmove_long_match_emit_encodeBetterBlockAsm8B
10101
10102 one_byte_match_emit_encodeBetterBlockAsm8B:
10103 SHLB $0x02, SI // header byte = (length-1) << 2
10104 MOVB SI, (CX)
10105 ADDQ $0x01, CX
10106
10107 memmove_match_emit_encodeBetterBlockAsm8B: // copy R9 bytes from R10 to CX using overlapping fixed-size moves
10108 LEAQ (CX)(R9*1), SI // SI = dst cursor after the copied bytes
10109
10110 // genMemMoveShort
10111 CMPQ R9, $0x04
10112 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4
10113 CMPQ R9, $0x08
10114 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7
10115 CMPQ R9, $0x10
10116 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16
10117 CMPQ R9, $0x20
10118 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32
10119 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64
10120
10121 emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4: // R9 <= 4: one 4-byte move, even when fewer bytes are needed; the cursor still advances by R9
10122 MOVL (R10), R11
10123 MOVL R11, (CX)
10124 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
10125
10126 emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7: // first 4 and last 4 bytes, possibly overlapping
10127 MOVL (R10), R11
10128 MOVL -4(R10)(R9*1), R10
10129 MOVL R11, (CX)
10130 MOVL R10, -4(CX)(R9*1)
10131 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
10132
10133 emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: // first 8 and last 8 bytes, possibly overlapping
10134 MOVQ (R10), R11
10135 MOVQ -8(R10)(R9*1), R10
10136 MOVQ R11, (CX)
10137 MOVQ R10, -8(CX)(R9*1)
10138 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
10139
10140 emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: // first 16 and last 16 bytes, possibly overlapping
10141 MOVOU (R10), X0
10142 MOVOU -16(R10)(R9*1), X1
10143 MOVOU X0, (CX)
10144 MOVOU X1, -16(CX)(R9*1)
10145 JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B
10146
10147 emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: // first 32 and last 32 bytes, possibly overlapping
10148 MOVOU (R10), X0
10149 MOVOU 16(R10), X1
10150 MOVOU -32(R10)(R9*1), X2
10151 MOVOU -16(R10)(R9*1), X3
10152 MOVOU X0, (CX)
10153 MOVOU X1, 16(CX)
10154 MOVOU X2, -32(CX)(R9*1)
10155 MOVOU X3, -16(CX)(R9*1)
10156
10157 memmove_end_copy_match_emit_encodeBetterBlockAsm8B:
10158 MOVQ SI, CX // advance the dst cursor past the copied bytes
10159 JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B
10160
10161 memmove_long_match_emit_encodeBetterBlockAsm8B: // copy R9 bytes from R10 to CX in 32-byte chunks
10162 LEAQ (CX)(R9*1), SI // SI = dst cursor after the copied bytes
10163
10164 // genMemMoveLong
10165 MOVOU (R10), X0 // X0..X3 hold the first and last 32 source bytes; they are stored last
10166 MOVOU 16(R10), X1
10167 MOVOU -32(R10)(R9*1), X2
10168 MOVOU -16(R10)(R9*1), X3
10169 MOVQ R9, R13
10170 SHRQ $0x05, R13 // R13 = length / 32
10171 MOVQ CX, R11
10172 ANDL $0x0000001f, R11 // R11 = CX mod 32
10173 MOVQ $0x00000040, R14
10174 SUBQ R11, R14 // R14 = 64 - (CX mod 32), so CX + R14 - 32 is 32-byte aligned for the MOVOA stores
10175 DECQ R13
10176 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF as set by the SUBQ above, so this tests that CF together with ZF from DECQ
10177 LEAQ -32(R10)(R14*1), R11
10178 LEAQ -32(CX)(R14*1), R15
10179
10180 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back:
10181 MOVOU (R11), X4
10182 MOVOU 16(R11), X5
10183 MOVOA X4, (R15)
10184 MOVOA X5, 16(R15)
10185 ADDQ $0x20, R15
10186 ADDQ $0x20, R11
10187 ADDQ $0x20, R14
10188 DECQ R13
10189 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back // NOTE(review): CF here comes from the ADDQ above, ZF from DECQ
10190
10191 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: // copy 32 bytes ending at offset R14 with aligned stores
10192 MOVOU -32(R10)(R14*1), X4
10193 MOVOU -16(R10)(R14*1), X5
10194 MOVOA X4, -32(CX)(R14*1)
10195 MOVOA X5, -16(CX)(R14*1)
10196 ADDQ $0x20, R14
10197 CMPQ R9, R14
10198 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 // continue while length >= R14
10199 MOVOU X0, (CX) // finally store the saved first and last 32 bytes with unaligned stores
10200 MOVOU X1, 16(CX)
10201 MOVOU X2, -32(CX)(R9*1)
10202 MOVOU X3, -16(CX)(R9*1)
10203 MOVQ SI, CX // advance the dst cursor past the copied bytes
10204
10205 emit_literal_done_match_emit_encodeBetterBlockAsm8B: // emit the match: R12 = extra length, R8 = offset
10206 ADDL R12, DX // DX = end of the match (DX already included the first 4 bytes)
10207 ADDL $0x04, R12 // R12 = full match length
10208 MOVL DX, 12(SP) // everything before DX is accounted for once the copy is written
10209
10210 // emitCopy
10211 CMPL R12, $0x40
10212 JBE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B // length <= 64
10213 CMPL R8, $0x00000800
10214 JAE long_offset_short_match_nolit_encodeBetterBlockAsm8B // offset >= 2048
10215 MOVL $0x00000001, SI
10216 LEAL 16(SI), SI // SI = 0x11
10217 MOVB R8, 1(CX) // second byte = low 8 bits of the offset
10218 SHRL $0x08, R8
10219 SHLL $0x05, R8
10220 ORL R8, SI // offset bits 8 and up go into bits 5 and up of the first byte
10221 MOVB SI, (CX)
10222 ADDQ $0x02, CX
10223 SUBL $0x08, R12 // 8 bytes of length are removed here (presumably the length encoded by 0x11 - confirm); the rest is emitted below
10224
10225 // emitRepeat
10226 LEAL -4(R12), R12 // R12 = remaining length - 4, as expected at the label jumped to
10227 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
10228 MOVL R12, SI // unreachable: from here to the next label follows an unconditional JMP
10229 LEAL -4(R12), R12
10230 CMPL SI, $0x08
10231 JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
10232 CMPL SI, $0x0c
10233 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b
10234
10235 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: // R12 = remaining length - 4
10236 CMPL R12, $0x00000104
10237 JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b // below 260: 3-byte form
10238 LEAL -256(R12), R12
10239 MOVW $0x0019, (CX) // bytes 0x19, 0x00
10240 MOVW R12, 2(CX) // followed by 16-bit (length - 4 - 256)
10241 ADDQ $0x04, CX
10242 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10243
10244 repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b:
10245 LEAL -4(R12), R12 // R12 = length - 8
10246 MOVW $0x0015, (CX) // bytes 0x15, 0x00
10247 MOVB R12, 2(CX) // followed by 8-bit (length - 8)
10248 ADDQ $0x03, CX
10249 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10250
10251 repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: // NOTE(review): the only branch to this label visible here is in the unreachable run above
10252 SHLL $0x02, R12
10253 ORL $0x01, R12
10254 MOVW R12, (CX)
10255 ADDQ $0x02, CX
10256 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10257 XORQ SI, SI // unreachable: from here to the next label follows an unconditional JMP
10258 LEAL 1(SI)(R12*4), R12
10259 MOVB R8, 1(CX)
10260 SARL $0x08, R8
10261 SHLL $0x05, R8
10262 ORL R8, R12
10263 MOVB R12, (CX)
10264 ADDQ $0x02, CX
10265 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10266
10267 long_offset_short_match_nolit_encodeBetterBlockAsm8B: // length > 64 and offset >= 2048
10268 MOVB $0xee, (CX) // byte 0xee followed by the 16-bit offset
10269 MOVW R8, 1(CX)
10270 LEAL -60(R12), R12 // 60 bytes of length are removed here; the rest is emitted below
10271 ADDQ $0x03, CX
10272
10273 // emitRepeat
10274 MOVL R12, SI // SI = remaining length
10275 LEAL -4(R12), R12 // R12 = remaining length - 4
10276 CMPL SI, $0x08
10277 JBE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short // remaining length <= 8: 2-byte form
10278 CMPL SI, $0x0c
10279 JAE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short // target is the next instruction, so both outcomes continue there
10280
10281 cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
10282 CMPL R12, $0x00000104
10283 JB repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short // below 260: 3-byte form
10284 LEAL -256(R12), R12
10285 MOVW $0x0019, (CX) // bytes 0x19, 0x00
10286 MOVW R12, 2(CX) // followed by 16-bit (length - 4 - 256)
10287 ADDQ $0x04, CX
10288 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10289
10290 repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
10291 LEAL -4(R12), R12 // R12 = length - 8
10292 MOVW $0x0015, (CX) // bytes 0x15, 0x00
10293 MOVB R12, 2(CX) // followed by 8-bit (length - 8)
10294 ADDQ $0x03, CX
10295 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10296
10297 repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short:
10298 SHLL $0x02, R12
10299 ORL $0x01, R12 // R12 = ((length - 4) << 2) | 1
10300 MOVW R12, (CX) // written as a 16-bit value
10301 ADDQ $0x02, CX
10302 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10303 XORQ SI, SI // unreachable: from here to the next label follows an unconditional JMP
10304 LEAL 1(SI)(R12*4), R12
10305 MOVB R8, 1(CX)
10306 SARL $0x08, R8
10307 SHLL $0x05, R8
10308 ORL R8, R12
10309 MOVB R12, (CX)
10310 ADDQ $0x02, CX
10311 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10312
10313 two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: // length <= 64
10314 MOVL R12, SI
10315 SHLL $0x02, SI // SI = 4 * length
10316 CMPL R12, $0x0c
10317 JAE emit_copy_three_match_nolit_encodeBetterBlockAsm8B // length >= 12: 3-byte form
10318 LEAL -15(SI), SI // SI = 4*length - 15 = ((length - 4) << 2) | 1
10319 MOVB R8, 1(CX) // second byte = low 8 bits of the offset
10320 SHRL $0x08, R8
10321 SHLL $0x05, R8
10322 ORL R8, SI // offset bits 8 and up go into bits 5 and up of the first byte
10323 MOVB SI, (CX)
10324 ADDQ $0x02, CX
10325 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10326
10327 emit_copy_three_match_nolit_encodeBetterBlockAsm8B:
10328 LEAL -2(SI), SI // SI = 4*length - 2 = ((length - 1) << 2) | 2
10329 MOVB SI, (CX)
10330 MOVW R8, 1(CX) // followed by the 16-bit offset
10331 ADDQ $0x03, CX
10332 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10333
10334 match_is_repeat_encodeBetterBlockAsm8B: // match offset equals 16(SP): emit pending bytes, then a repeat
10335 MOVL 12(SP), SI
10336 CMPL SI, DI
10337 JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B // no pending bytes before the match
10338 MOVL DI, R8 // R8 is reused for the pending-byte count from here on
10339 MOVL DI, 12(SP) // pending bytes up to the match start are emitted below
10340 LEAQ (BX)(SI*1), R9 // R9 = address of the first pending byte
10341 SUBL SI, R8 // R8 = number of pending bytes
10342 LEAL -1(R8), SI // SI = length - 1, the value written into the header
10343 CMPL SI, $0x3c
10344 JB one_byte_match_emit_repeat_encodeBetterBlockAsm8B // length-1 < 60: single header byte
10345 CMPL SI, $0x00000100
10346 JB two_bytes_match_emit_repeat_encodeBetterBlockAsm8B // length-1 < 256: 0xf0 plus one length byte
10347 JB three_bytes_match_emit_repeat_encodeBetterBlockAsm8B // never taken: flags are unchanged since the JB above; larger lengths fall through
10348
10349 three_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
10350 MOVB $0xf4, (CX) // header byte 0xf4 followed by 16-bit length-1
10351 MOVW SI, 1(CX)
10352 ADDQ $0x03, CX
10353 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
10354
10355 two_bytes_match_emit_repeat_encodeBetterBlockAsm8B:
10356 MOVB $0xf0, (CX) // header byte 0xf0 followed by 8-bit length-1
10357 MOVB SI, 1(CX)
10358 ADDQ $0x02, CX
10359 CMPL SI, $0x40
10360 JB memmove_match_emit_repeat_encodeBetterBlockAsm8B // length-1 < 64: use the short copy
10361 JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B
10362
10363 one_byte_match_emit_repeat_encodeBetterBlockAsm8B:
10364 SHLB $0x02, SI // header byte = (length-1) << 2
10365 MOVB SI, (CX)
10366 ADDQ $0x01, CX
10367
10368 memmove_match_emit_repeat_encodeBetterBlockAsm8B: // copy R8 bytes from R9 to CX using overlapping fixed-size moves
10369 LEAQ (CX)(R8*1), SI // SI = dst cursor after the copied bytes
10370
10371 // genMemMoveShort
10372 CMPQ R8, $0x04
10373 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4
10374 CMPQ R8, $0x08
10375 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7
10376 CMPQ R8, $0x10
10377 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16
10378 CMPQ R8, $0x20
10379 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32
10380 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64
10381
10382 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4: // R8 <= 4: one 4-byte move, even when fewer bytes are needed; the cursor still advances by R8
10383 MOVL (R9), R10
10384 MOVL R10, (CX)
10385 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
10386
10387 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7: // first 4 and last 4 bytes, possibly overlapping
10388 MOVL (R9), R10
10389 MOVL -4(R9)(R8*1), R9
10390 MOVL R10, (CX)
10391 MOVL R9, -4(CX)(R8*1)
10392 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
10393
10394 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16: // first 8 and last 8 bytes, possibly overlapping
10395 MOVQ (R9), R10
10396 MOVQ -8(R9)(R8*1), R9
10397 MOVQ R10, (CX)
10398 MOVQ R9, -8(CX)(R8*1)
10399 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
10400
10401 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32: // first 16 and last 16 bytes, possibly overlapping
10402 MOVOU (R9), X0
10403 MOVOU -16(R9)(R8*1), X1
10404 MOVOU X0, (CX)
10405 MOVOU X1, -16(CX)(R8*1)
10406 JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B
10407
10408 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64: // first 32 and last 32 bytes, possibly overlapping
10409 MOVOU (R9), X0
10410 MOVOU 16(R9), X1
10411 MOVOU -32(R9)(R8*1), X2
10412 MOVOU -16(R9)(R8*1), X3
10413 MOVOU X0, (CX)
10414 MOVOU X1, 16(CX)
10415 MOVOU X2, -32(CX)(R8*1)
10416 MOVOU X3, -16(CX)(R8*1)
10417
10418 memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B:
10419 MOVQ SI, CX // advance the dst cursor past the copied bytes
10420 JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B
10421
10422 memmove_long_match_emit_repeat_encodeBetterBlockAsm8B: // copy R8 bytes from R9 to CX in 32-byte chunks
10423 LEAQ (CX)(R8*1), SI // SI = dst cursor after the copied bytes
10424
10425 // genMemMoveLong
10426 MOVOU (R9), X0 // X0..X3 hold the first and last 32 source bytes; they are stored last
10427 MOVOU 16(R9), X1
10428 MOVOU -32(R9)(R8*1), X2
10429 MOVOU -16(R9)(R8*1), X3
10430 MOVQ R8, R11
10431 SHRQ $0x05, R11 // R11 = length / 32
10432 MOVQ CX, R10
10433 ANDL $0x0000001f, R10 // R10 = CX mod 32
10434 MOVQ $0x00000040, R13
10435 SUBQ R10, R13 // R13 = 64 - (CX mod 32), so CX + R13 - 32 is 32-byte aligned for the MOVOA stores
10436 DECQ R11
10437 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF as set by the SUBQ above, so this tests that CF together with ZF from DECQ
10438 LEAQ -32(R9)(R13*1), R10
10439 LEAQ -32(CX)(R13*1), R14
10440
10441 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back:
10442 MOVOU (R10), X4
10443 MOVOU 16(R10), X5
10444 MOVOA X4, (R14)
10445 MOVOA X5, 16(R14)
10446 ADDQ $0x20, R14
10447 ADDQ $0x20, R10
10448 ADDQ $0x20, R13
10449 DECQ R11
10450 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back // NOTE(review): CF here comes from the ADDQ above, ZF from DECQ
10451
10452 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: // copy 32 bytes ending at offset R13 with aligned stores
10453 MOVOU -32(R9)(R13*1), X4
10454 MOVOU -16(R9)(R13*1), X5
10455 MOVOA X4, -32(CX)(R13*1)
10456 MOVOA X5, -16(CX)(R13*1)
10457 ADDQ $0x20, R13
10458 CMPQ R8, R13
10459 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 // continue while length >= R13
10460 MOVOU X0, (CX) // finally store the saved first and last 32 bytes with unaligned stores
10461 MOVOU X1, 16(CX)
10462 MOVOU X2, -32(CX)(R8*1)
10463 MOVOU X3, -16(CX)(R8*1)
10464 MOVQ SI, CX // advance the dst cursor past the copied bytes
10465
10466 emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B: // emit a repeat for a match of R12 + 4 bytes
10467 ADDL R12, DX // DX = end of the match (DX already included the first 4 bytes)
10468 ADDL $0x04, R12 // R12 = full match length
10469 MOVL DX, 12(SP) // everything before DX is accounted for once the repeat is written
10470
10471 // emitRepeat
10472 MOVL R12, SI // SI = match length
10473 LEAL -4(R12), R12 // R12 = length - 4
10474 CMPL SI, $0x08
10475 JBE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B // length <= 8: 2-byte form
10476 CMPL SI, $0x0c
10477 JAE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B // target is the next instruction, so both outcomes continue there
10478
10479 cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B:
10480 CMPL R12, $0x00000104
10481 JB repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B // length - 4 < 260: 3-byte form
10482 LEAL -256(R12), R12
10483 MOVW $0x0019, (CX) // bytes 0x19, 0x00
10484 MOVW R12, 2(CX) // followed by 16-bit (length - 4 - 256)
10485 ADDQ $0x04, CX
10486 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10487
10488 repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B:
10489 LEAL -4(R12), R12 // R12 = length - 8
10490 MOVW $0x0015, (CX) // bytes 0x15, 0x00
10491 MOVB R12, 2(CX) // followed by 8-bit (length - 8)
10492 ADDQ $0x03, CX
10493 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10494
10495 repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B:
10496 SHLL $0x02, R12
10497 ORL $0x01, R12 // R12 = ((length - 4) << 2) | 1
10498 MOVW R12, (CX) // 16-bit store; the high byte is 0 because length <= 8 on this path
10499 ADDQ $0x02, CX
10500 JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B
10501 XORQ SI, SI // unreachable: from here to the next label follows an unconditional JMP
10502 LEAL 1(SI)(R12*4), R12
10503 MOVB R8, 1(CX)
10504 SARL $0x08, R8
10505 SHLL $0x05, R8
10506 ORL R8, R12
10507 MOVB R12, (CX)
10508 ADDQ $0x02, CX
10509
10510 match_nolit_emitcopy_end_encodeBetterBlockAsm8B: // after a match: check limits, then index positions inside the match
10511 CMPL DX, 8(SP)
10512 JAE emit_remainder_encodeBetterBlockAsm8B // position is at or past the search limit
10513 CMPQ CX, (SP)
10514 JB match_nolit_dst_ok_encodeBetterBlockAsm8B
10515 MOVQ $0x00000000, ret+56(FP) // dst cursor reached the limit: return 0
10516 RET
10517
10518 match_nolit_dst_ok_encodeBetterBlockAsm8B:
10519 MOVQ $0x0000cf1bbcdcbf9b, SI // multiplier for the 6-byte hash
10520 MOVQ $0x9e3779b1, R8 // multiplier for the 4-byte hash
10521 LEAQ 1(DI), DI // DI = match start + 1
10522 LEAQ -2(DX), R9 // R9 = match end - 2
10523 MOVQ (BX)(DI*1), R10 // R10 = 8 bytes at DI
10524 MOVQ 1(BX)(DI*1), R11 // R11 = 8 bytes at DI + 1
10525 MOVQ (BX)(R9*1), R12 // R12 = 8 bytes at R9
10526 MOVQ 1(BX)(R9*1), R13 // R13 = 8 bytes at R9 + 1
10527 SHLQ $0x10, R10
10528 IMULQ SI, R10
10529 SHRQ $0x36, R10 // R10 = 10-bit 6-byte hash for DI
10530 SHLQ $0x20, R11
10531 IMULQ R8, R11
10532 SHRQ $0x38, R11 // R11 = 8-bit 4-byte hash for DI + 1
10533 SHLQ $0x10, R12
10534 IMULQ SI, R12
10535 SHRQ $0x36, R12 // R12 = 10-bit 6-byte hash for R9
10536 SHLQ $0x20, R13
10537 IMULQ R8, R13
10538 SHRQ $0x38, R13 // R13 = 8-bit 4-byte hash for R9 + 1
10539 LEAQ 1(DI), R8
10540 LEAQ 1(R9), R14
10541 MOVL DI, (AX)(R10*4) // first table: positions DI and R9
10542 MOVL R9, (AX)(R12*4)
10543 MOVL R8, 4096(AX)(R11*4) // second table (tmp+4096): positions DI + 1 and R9 + 1
10544 MOVL R14, 4096(AX)(R13*4)
10545 LEAQ 1(R9)(DI*1), R8
10546 SHRQ $0x01, R8 // R8 = (DI + R9 + 1) / 2, a position midway between the two
10547 ADDQ $0x01, DI
10548 SUBQ $0x01, R9
10549
10550 index_loop_encodeBetterBlockAsm8B: // index every second position from DI and from R8 in the first table while R8 < R9
10551 CMPQ R8, R9
10552 JAE search_loop_encodeBetterBlockAsm8B // done: resume searching at DX
10553 MOVQ (BX)(DI*1), R10
10554 MOVQ (BX)(R8*1), R11
10555 SHLQ $0x10, R10
10556 IMULQ SI, R10
10557 SHRQ $0x36, R10
10558 SHLQ $0x10, R11
10559 IMULQ SI, R11
10560 SHRQ $0x36, R11
10561 MOVL DI, (AX)(R10*4)
10562 MOVL R8, (AX)(R11*4)
10563 ADDQ $0x02, DI
10564 ADDQ $0x02, R8
10565 JMP index_loop_encodeBetterBlockAsm8B
10566
10567 emit_remainder_encodeBetterBlockAsm8B: // emit everything from 12(SP) to the end of src, then return
10568 MOVQ src_len+32(FP), AX // AX is reused; the hash tables are not accessed after this point
10569 SUBL 12(SP), AX // AX = number of bytes not yet emitted
10570 LEAQ 3(CX)(AX*1), AX // AX = dst cursor + remaining bytes + 3
10571 CMPQ AX, (SP)
10572 JB emit_remainder_ok_encodeBetterBlockAsm8B // still below the dst limit
10573 MOVQ $0x00000000, ret+56(FP) // would reach the dst limit: return 0
10574 RET
10575
10576 emit_remainder_ok_encodeBetterBlockAsm8B:
10577 MOVQ src_len+32(FP), AX
10578 MOVL 12(SP), DX
10579 CMPL DX, AX
10580 JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm8B // nothing left to emit
10581 MOVL AX, SI
10582 MOVL AX, 12(SP)
10583 LEAQ (BX)(DX*1), AX // AX = address of the first remaining byte
10584 SUBL DX, SI // SI = number of remaining bytes
10585 LEAL -1(SI), DX // DX = length - 1, the value written into the header
10586 CMPL DX, $0x3c
10587 JB one_byte_emit_remainder_encodeBetterBlockAsm8B // length-1 < 60: single header byte
10588 CMPL DX, $0x00000100
10589 JB two_bytes_emit_remainder_encodeBetterBlockAsm8B // length-1 < 256: 0xf0 plus one length byte
10590 JB three_bytes_emit_remainder_encodeBetterBlockAsm8B // never taken: flags are unchanged since the JB above; larger lengths fall through
10591
10592 three_bytes_emit_remainder_encodeBetterBlockAsm8B:
10593 MOVB $0xf4, (CX) // header byte 0xf4 followed by 16-bit length-1
10594 MOVW DX, 1(CX)
10595 ADDQ $0x03, CX
10596 JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B
10597
10598 two_bytes_emit_remainder_encodeBetterBlockAsm8B:
10599 MOVB $0xf0, (CX) // header byte 0xf0 followed by 8-bit length-1
10600 MOVB DL, 1(CX)
10601 ADDQ $0x02, CX
10602 CMPL DX, $0x40
10603 JB memmove_emit_remainder_encodeBetterBlockAsm8B // length-1 < 64: use the short copy
10604 JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B
10605
10606 one_byte_emit_remainder_encodeBetterBlockAsm8B:
10607 SHLB $0x02, DL // header byte = (length-1) << 2
10608 MOVB DL, (CX)
10609 ADDQ $0x01, CX
10610
10611 memmove_emit_remainder_encodeBetterBlockAsm8B: // copy SI bytes from AX to CX using exact-size overlapping moves
10612 LEAQ (CX)(SI*1), DX // DX = dst cursor after the copied bytes
10613 MOVL SI, BX // BX = length; the src base in BX is no longer used
10614
10615 // genMemMoveShort
10616 CMPQ BX, $0x03
10617 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2
10618 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3
10619 CMPQ BX, $0x08
10620 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7
10621 CMPQ BX, $0x10
10622 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16
10623 CMPQ BX, $0x20
10624 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32
10625 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64
10626
10627 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2: // first byte and last byte (the same byte when length is 1)
10628 MOVB (AX), SI
10629 MOVB -1(AX)(BX*1), AL
10630 MOVB SI, (CX)
10631 MOVB AL, -1(CX)(BX*1)
10632 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
10633
10634 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3: // 2 bytes plus the third byte
10635 MOVW (AX), SI
10636 MOVB 2(AX), AL
10637 MOVW SI, (CX)
10638 MOVB AL, 2(CX)
10639 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
10640
10641 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: // first 4 and last 4 bytes, possibly overlapping
10642 MOVL (AX), SI
10643 MOVL -4(AX)(BX*1), AX
10644 MOVL SI, (CX)
10645 MOVL AX, -4(CX)(BX*1)
10646 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
10647
10648 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16: // first 8 and last 8 bytes, possibly overlapping
10649 MOVQ (AX), SI
10650 MOVQ -8(AX)(BX*1), AX
10651 MOVQ SI, (CX)
10652 MOVQ AX, -8(CX)(BX*1)
10653 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
10654
10655 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32: // first 16 and last 16 bytes, possibly overlapping
10656 MOVOU (AX), X0
10657 MOVOU -16(AX)(BX*1), X1
10658 MOVOU X0, (CX)
10659 MOVOU X1, -16(CX)(BX*1)
10660 JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B
10661
10662 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64: // first 32 and last 32 bytes, possibly overlapping
10663 MOVOU (AX), X0
10664 MOVOU 16(AX), X1
10665 MOVOU -32(AX)(BX*1), X2
10666 MOVOU -16(AX)(BX*1), X3
10667 MOVOU X0, (CX)
10668 MOVOU X1, 16(CX)
10669 MOVOU X2, -32(CX)(BX*1)
10670 MOVOU X3, -16(CX)(BX*1)
10671
10672 memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B:
10673 MOVQ DX, CX // advance the dst cursor past the copied bytes
10674 JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B
10675
10676 memmove_long_emit_remainder_encodeBetterBlockAsm8B: // copy SI bytes from AX to CX in 32-byte chunks
10677 LEAQ (CX)(SI*1), DX // DX = dst cursor after the copied bytes
10678 MOVL SI, BX // BX = length; the src base in BX is no longer used
10679
10680 // genMemMoveLong
10681 MOVOU (AX), X0 // X0..X3 hold the first and last 32 source bytes; they are stored last
10682 MOVOU 16(AX), X1
10683 MOVOU -32(AX)(BX*1), X2
10684 MOVOU -16(AX)(BX*1), X3
10685 MOVQ BX, DI
10686 SHRQ $0x05, DI // DI = length / 32
10687 MOVQ CX, SI
10688 ANDL $0x0000001f, SI // SI = CX mod 32
10689 MOVQ $0x00000040, R8
10690 SUBQ SI, R8 // R8 = 64 - (CX mod 32), so CX + R8 - 32 is 32-byte aligned for the MOVOA stores
10691 DECQ DI
10692 JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 // NOTE(review): DECQ leaves CF as set by the SUBQ above, so this tests that CF together with ZF from DECQ
10693 LEAQ -32(AX)(R8*1), SI
10694 LEAQ -32(CX)(R8*1), R9
10695
10696 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back:
10697 MOVOU (SI), X4
10698 MOVOU 16(SI), X5
10699 MOVOA X4, (R9)
10700 MOVOA X5, 16(R9)
10701 ADDQ $0x20, R9
10702 ADDQ $0x20, SI
10703 ADDQ $0x20, R8
10704 DECQ DI
10705 JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back // NOTE(review): CF here comes from the ADDQ above, ZF from DECQ
10706
10707 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: // copy 32 bytes ending at offset R8 with aligned stores
10708 MOVOU -32(AX)(R8*1), X4
10709 MOVOU -16(AX)(R8*1), X5
10710 MOVOA X4, -32(CX)(R8*1)
10711 MOVOA X5, -16(CX)(R8*1)
10712 ADDQ $0x20, R8
10713 CMPQ BX, R8
10714 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 // continue while length >= R8
10715 MOVOU X0, (CX) // finally store the saved first and last 32 bytes with unaligned stores
10716 MOVOU X1, 16(CX)
10717 MOVOU X2, -32(CX)(BX*1)
10718 MOVOU X3, -16(CX)(BX*1)
10719 MOVQ DX, CX // advance the dst cursor past the copied bytes
10720
10721 emit_literal_done_emit_remainder_encodeBetterBlockAsm8B:
10722 MOVQ dst_base+0(FP), AX
10723 SUBQ AX, CX // CX = number of bytes written to dst
10724 MOVQ CX, ret+56(FP) // return that count
10725 RET
10726
10727 // func encodeSnappyBlockAsm(dst []byte, src []byte, tmp *[65536]byte) int
10728 // Requires: BMI, SSE2
10729 TEXT ·encodeSnappyBlockAsm(SB), $24-64
10730 MOVQ tmp+48(FP), AX
10731 MOVQ dst_base+0(FP), CX
10732 MOVQ $0x00000200, DX
10733 MOVQ AX, BX
10734 PXOR X0, X0
10735
10736 zero_loop_encodeSnappyBlockAsm:
10737 MOVOU X0, (BX)
10738 MOVOU X0, 16(BX)
10739 MOVOU X0, 32(BX)
10740 MOVOU X0, 48(BX)
10741 MOVOU X0, 64(BX)
10742 MOVOU X0, 80(BX)
10743 MOVOU X0, 96(BX)
10744 MOVOU X0, 112(BX)
10745 ADDQ $0x80, BX
10746 DECQ DX
10747 JNZ zero_loop_encodeSnappyBlockAsm
10748 MOVL $0x00000000, 12(SP)
10749 MOVQ src_len+32(FP), DX
10750 LEAQ -9(DX), BX
10751 LEAQ -8(DX), SI
10752 MOVL SI, 8(SP)
10753 SHRQ $0x05, DX
10754 SUBL DX, BX
10755 LEAQ (CX)(BX*1), BX
10756 MOVQ BX, (SP)
10757 MOVL $0x00000001, DX
10758 MOVL DX, 16(SP)
10759 MOVQ src_base+24(FP), BX
10760
10761 search_loop_encodeSnappyBlockAsm:
10762 MOVL DX, SI
10763 SUBL 12(SP), SI
10764 SHRL $0x06, SI
10765 LEAL 4(DX)(SI*1), SI
10766 CMPL SI, 8(SP)
10767 JAE emit_remainder_encodeSnappyBlockAsm
10768 MOVQ (BX)(DX*1), DI
10769 MOVL SI, 20(SP)
10770 MOVQ $0x0000cf1bbcdcbf9b, R9
10771 MOVQ DI, R10
10772 MOVQ DI, R11
10773 SHRQ $0x08, R11
10774 SHLQ $0x10, R10
10775 IMULQ R9, R10
10776 SHRQ $0x32, R10
10777 SHLQ $0x10, R11
10778 IMULQ R9, R11
10779 SHRQ $0x32, R11
10780 MOVL (AX)(R10*4), SI
10781 MOVL (AX)(R11*4), R8
10782 MOVL DX, (AX)(R10*4)
10783 LEAL 1(DX), R10
10784 MOVL R10, (AX)(R11*4)
10785 MOVQ DI, R10
10786 SHRQ $0x10, R10
10787 SHLQ $0x10, R10
10788 IMULQ R9, R10
10789 SHRQ $0x32, R10
10790 MOVL DX, R9
10791 SUBL 16(SP), R9
10792 MOVL 1(BX)(R9*1), R11
10793 MOVQ DI, R9
10794 SHRQ $0x08, R9
10795 CMPL R9, R11
10796 JNE no_repeat_found_encodeSnappyBlockAsm
10797 LEAL 1(DX), DI
10798 MOVL 12(SP), SI
10799 MOVL DI, R8
10800 SUBL 16(SP), R8
10801 JZ repeat_extend_back_end_encodeSnappyBlockAsm
10802
10803 repeat_extend_back_loop_encodeSnappyBlockAsm:
10804 CMPL DI, SI
10805 JBE repeat_extend_back_end_encodeSnappyBlockAsm
10806 MOVB -1(BX)(R8*1), R9
10807 MOVB -1(BX)(DI*1), R10
10808 CMPB R9, R10
10809 JNE repeat_extend_back_end_encodeSnappyBlockAsm
10810 LEAL -1(DI), DI
10811 DECL R8
10812 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm
10813
10814 repeat_extend_back_end_encodeSnappyBlockAsm:
10815 MOVL DI, SI
10816 SUBL 12(SP), SI
10817 LEAQ 5(CX)(SI*1), SI
10818 CMPQ SI, (SP)
10819 JB repeat_dst_size_check_encodeSnappyBlockAsm
10820 MOVQ $0x00000000, ret+56(FP)
10821 RET
10822
10823 repeat_dst_size_check_encodeSnappyBlockAsm:
10824 MOVL 12(SP), SI
10825 CMPL SI, DI
10826 JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm
10827 MOVL DI, R8
10828 MOVL DI, 12(SP)
10829 LEAQ (BX)(SI*1), R9
10830 SUBL SI, R8
10831 LEAL -1(R8), SI
10832 CMPL SI, $0x3c
10833 JB one_byte_repeat_emit_encodeSnappyBlockAsm
10834 CMPL SI, $0x00000100
10835 JB two_bytes_repeat_emit_encodeSnappyBlockAsm
10836 CMPL SI, $0x00010000
10837 JB three_bytes_repeat_emit_encodeSnappyBlockAsm
10838 CMPL SI, $0x01000000
10839 JB four_bytes_repeat_emit_encodeSnappyBlockAsm
10840 MOVB $0xfc, (CX)
10841 MOVL SI, 1(CX)
10842 ADDQ $0x05, CX
10843 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
10844
10845 four_bytes_repeat_emit_encodeSnappyBlockAsm:
10846 MOVL SI, R10
10847 SHRL $0x10, R10
10848 MOVB $0xf8, (CX)
10849 MOVW SI, 1(CX)
10850 MOVB R10, 3(CX)
10851 ADDQ $0x04, CX
10852 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
10853
10854 three_bytes_repeat_emit_encodeSnappyBlockAsm:
10855 MOVB $0xf4, (CX)
10856 MOVW SI, 1(CX)
10857 ADDQ $0x03, CX
10858 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
10859
10860 two_bytes_repeat_emit_encodeSnappyBlockAsm:
10861 MOVB $0xf0, (CX)
10862 MOVB SI, 1(CX)
10863 ADDQ $0x02, CX
10864 CMPL SI, $0x40
10865 JB memmove_repeat_emit_encodeSnappyBlockAsm
10866 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm
10867
10868 one_byte_repeat_emit_encodeSnappyBlockAsm:
10869 SHLB $0x02, SI
10870 MOVB SI, (CX)
10871 ADDQ $0x01, CX
10872
10873 memmove_repeat_emit_encodeSnappyBlockAsm:
10874 LEAQ (CX)(R8*1), SI
10875
10876 // genMemMoveShort
10877 CMPQ R8, $0x08
10878 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8
10879 CMPQ R8, $0x10
10880 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16
10881 CMPQ R8, $0x20
10882 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32
10883 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64
10884
10885 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8:
10886 MOVQ (R9), R10
10887 MOVQ R10, (CX)
10888 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
10889
10890 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16:
10891 MOVQ (R9), R10
10892 MOVQ -8(R9)(R8*1), R9
10893 MOVQ R10, (CX)
10894 MOVQ R9, -8(CX)(R8*1)
10895 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
10896
10897 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32:
10898 MOVOU (R9), X0
10899 MOVOU -16(R9)(R8*1), X1
10900 MOVOU X0, (CX)
10901 MOVOU X1, -16(CX)(R8*1)
10902 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm
10903
10904 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64:
10905 MOVOU (R9), X0
10906 MOVOU 16(R9), X1
10907 MOVOU -32(R9)(R8*1), X2
10908 MOVOU -16(R9)(R8*1), X3
10909 MOVOU X0, (CX)
10910 MOVOU X1, 16(CX)
10911 MOVOU X2, -32(CX)(R8*1)
10912 MOVOU X3, -16(CX)(R8*1)
10913
10914 memmove_end_copy_repeat_emit_encodeSnappyBlockAsm:
10915 MOVQ SI, CX
10916 JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm
10917
10918 memmove_long_repeat_emit_encodeSnappyBlockAsm:
10919 LEAQ (CX)(R8*1), SI
10920
10921 // genMemMoveLong
10922 MOVOU (R9), X0
10923 MOVOU 16(R9), X1
10924 MOVOU -32(R9)(R8*1), X2
10925 MOVOU -16(R9)(R8*1), X3
10926 MOVQ R8, R11
10927 SHRQ $0x05, R11
10928 MOVQ CX, R10
10929 ANDL $0x0000001f, R10
10930 MOVQ $0x00000040, R12
10931 SUBQ R10, R12
10932 DECQ R11
10933 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
10934 LEAQ -32(R9)(R12*1), R10
10935 LEAQ -32(CX)(R12*1), R13
10936
10937 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back:
10938 MOVOU (R10), X4
10939 MOVOU 16(R10), X5
10940 MOVOA X4, (R13)
10941 MOVOA X5, 16(R13)
10942 ADDQ $0x20, R13
10943 ADDQ $0x20, R10
10944 ADDQ $0x20, R12
10945 DECQ R11
10946 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back
10947
10948 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
10949 MOVOU -32(R9)(R12*1), X4
10950 MOVOU -16(R9)(R12*1), X5
10951 MOVOA X4, -32(CX)(R12*1)
10952 MOVOA X5, -16(CX)(R12*1)
10953 ADDQ $0x20, R12
10954 CMPQ R8, R12
10955 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
10956 MOVOU X0, (CX)
10957 MOVOU X1, 16(CX)
10958 MOVOU X2, -32(CX)(R8*1)
10959 MOVOU X3, -16(CX)(R8*1)
10960 MOVQ SI, CX
10961
10962 emit_literal_done_repeat_emit_encodeSnappyBlockAsm:
10963 ADDL $0x05, DX
10964 MOVL DX, SI
10965 SUBL 16(SP), SI
10966 MOVQ src_len+32(FP), R8
10967 SUBL DX, R8
10968 LEAQ (BX)(DX*1), R9
10969 LEAQ (BX)(SI*1), SI
10970
10971 // matchLen
10972 XORL R11, R11
10973
10974 matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm:
10975 CMPL R8, $0x10
10976 JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm
10977 MOVQ (R9)(R11*1), R10
10978 MOVQ 8(R9)(R11*1), R12
10979 XORQ (SI)(R11*1), R10
10980 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm
10981 XORQ 8(SI)(R11*1), R12
10982 JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm
10983 LEAL -16(R8), R8
10984 LEAL 16(R11), R11
10985 JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm
10986
10987 matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm:
10988 #ifdef GOAMD64_v3
10989 TZCNTQ R12, R12
10990
10991 #else
10992 BSFQ R12, R12
10993
10994 #endif
10995 SARQ $0x03, R12
10996 LEAL 8(R11)(R12*1), R11
10997 JMP repeat_extend_forward_end_encodeSnappyBlockAsm
10998
10999 matchlen_match8_repeat_extend_encodeSnappyBlockAsm:
11000 CMPL R8, $0x08
11001 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm
11002 MOVQ (R9)(R11*1), R10
11003 XORQ (SI)(R11*1), R10
11004 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm
11005 LEAL -8(R8), R8
11006 LEAL 8(R11), R11
11007 JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm
11008
11009 matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm:
11010 #ifdef GOAMD64_v3
11011 TZCNTQ R10, R10
11012
11013 #else
11014 BSFQ R10, R10
11015
11016 #endif
11017 SARQ $0x03, R10
11018 LEAL (R11)(R10*1), R11
11019 JMP repeat_extend_forward_end_encodeSnappyBlockAsm
11020
11021 matchlen_match4_repeat_extend_encodeSnappyBlockAsm:
11022 CMPL R8, $0x04
11023 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm
11024 MOVL (R9)(R11*1), R10
11025 CMPL (SI)(R11*1), R10
11026 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm
11027 LEAL -4(R8), R8
11028 LEAL 4(R11), R11
11029
11030 matchlen_match2_repeat_extend_encodeSnappyBlockAsm:
11031 CMPL R8, $0x01
11032 JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm
11033 JB repeat_extend_forward_end_encodeSnappyBlockAsm
11034 MOVW (R9)(R11*1), R10
11035 CMPW (SI)(R11*1), R10
11036 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm
11037 LEAL 2(R11), R11
11038 SUBL $0x02, R8
11039 JZ repeat_extend_forward_end_encodeSnappyBlockAsm
11040
11041 matchlen_match1_repeat_extend_encodeSnappyBlockAsm:
11042 MOVB (R9)(R11*1), R10
11043 CMPB (SI)(R11*1), R10
11044 JNE repeat_extend_forward_end_encodeSnappyBlockAsm
11045 LEAL 1(R11), R11
11046
11047 repeat_extend_forward_end_encodeSnappyBlockAsm:
11048 ADDL R11, DX
11049 MOVL DX, SI
11050 SUBL DI, SI
11051 MOVL 16(SP), DI
11052
11053 // emitCopy
11054 CMPL DI, $0x00010000
11055 JB two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm
11056
11057 four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm:
11058 CMPL SI, $0x40
11059 JBE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
11060 MOVB $0xff, (CX)
11061 MOVL DI, 1(CX)
11062 LEAL -64(SI), SI
11063 ADDQ $0x05, CX
11064 CMPL SI, $0x04
11065 JB four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm
11066 JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm
11067
11068 four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm:
11069 TESTL SI, SI
11070 JZ repeat_end_emit_encodeSnappyBlockAsm
11071 XORL R8, R8
11072 LEAL -1(R8)(SI*4), SI
11073 MOVB SI, (CX)
11074 MOVL DI, 1(CX)
11075 ADDQ $0x05, CX
11076 JMP repeat_end_emit_encodeSnappyBlockAsm
11077
11078 two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm:
11079 CMPL SI, $0x40
11080 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm
11081 MOVB $0xee, (CX)
11082 MOVW DI, 1(CX)
11083 LEAL -60(SI), SI
11084 ADDQ $0x03, CX
11085 JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm
11086
11087 two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm:
11088 MOVL SI, R8
11089 SHLL $0x02, R8
11090 CMPL SI, $0x0c
11091 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
11092 CMPL DI, $0x00000800
11093 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm
11094 LEAL -15(R8), R8
11095 MOVB DI, 1(CX)
11096 SHRL $0x08, DI
11097 SHLL $0x05, DI
11098 ORL DI, R8
11099 MOVB R8, (CX)
11100 ADDQ $0x02, CX
11101 JMP repeat_end_emit_encodeSnappyBlockAsm
11102
11103 emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm:
11104 LEAL -2(R8), R8
11105 MOVB R8, (CX)
11106 MOVW DI, 1(CX)
11107 ADDQ $0x03, CX
11108
11109 repeat_end_emit_encodeSnappyBlockAsm:
11110 MOVL DX, 12(SP)
11111 JMP search_loop_encodeSnappyBlockAsm
11112
11113 no_repeat_found_encodeSnappyBlockAsm:
11114 CMPL (BX)(SI*1), DI
11115 JEQ candidate_match_encodeSnappyBlockAsm
11116 SHRQ $0x08, DI
11117 MOVL (AX)(R10*4), SI
11118 LEAL 2(DX), R9
11119 CMPL (BX)(R8*1), DI
11120 JEQ candidate2_match_encodeSnappyBlockAsm
11121 MOVL R9, (AX)(R10*4)
11122 SHRQ $0x08, DI
11123 CMPL (BX)(SI*1), DI
11124 JEQ candidate3_match_encodeSnappyBlockAsm
11125 MOVL 20(SP), DX
11126 JMP search_loop_encodeSnappyBlockAsm
11127
11128 candidate3_match_encodeSnappyBlockAsm:
11129 ADDL $0x02, DX
11130 JMP candidate_match_encodeSnappyBlockAsm
11131
11132 candidate2_match_encodeSnappyBlockAsm:
11133 MOVL R9, (AX)(R10*4)
11134 INCL DX
11135 MOVL R8, SI
11136
11137 candidate_match_encodeSnappyBlockAsm:
11138 MOVL 12(SP), DI
11139 TESTL SI, SI
11140 JZ match_extend_back_end_encodeSnappyBlockAsm
11141
11142 match_extend_back_loop_encodeSnappyBlockAsm:
11143 CMPL DX, DI
11144 JBE match_extend_back_end_encodeSnappyBlockAsm
11145 MOVB -1(BX)(SI*1), R8
11146 MOVB -1(BX)(DX*1), R9
11147 CMPB R8, R9
11148 JNE match_extend_back_end_encodeSnappyBlockAsm
11149 LEAL -1(DX), DX
11150 DECL SI
11151 JZ match_extend_back_end_encodeSnappyBlockAsm
11152 JMP match_extend_back_loop_encodeSnappyBlockAsm
11153
11154 match_extend_back_end_encodeSnappyBlockAsm:
11155 MOVL DX, DI
11156 SUBL 12(SP), DI
11157 LEAQ 5(CX)(DI*1), DI
11158 CMPQ DI, (SP)
11159 JB match_dst_size_check_encodeSnappyBlockAsm
11160 MOVQ $0x00000000, ret+56(FP)
11161 RET
11162
11163 match_dst_size_check_encodeSnappyBlockAsm:
11164 MOVL DX, DI
11165 MOVL 12(SP), R8
11166 CMPL R8, DI
11167 JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm
11168 MOVL DI, R9
11169 MOVL DI, 12(SP)
11170 LEAQ (BX)(R8*1), DI
11171 SUBL R8, R9
11172 LEAL -1(R9), R8
11173 CMPL R8, $0x3c
11174 JB one_byte_match_emit_encodeSnappyBlockAsm
11175 CMPL R8, $0x00000100
11176 JB two_bytes_match_emit_encodeSnappyBlockAsm
11177 CMPL R8, $0x00010000
11178 JB three_bytes_match_emit_encodeSnappyBlockAsm
11179 CMPL R8, $0x01000000
11180 JB four_bytes_match_emit_encodeSnappyBlockAsm
11181 MOVB $0xfc, (CX)
11182 MOVL R8, 1(CX)
11183 ADDQ $0x05, CX
11184 JMP memmove_long_match_emit_encodeSnappyBlockAsm
11185
11186 four_bytes_match_emit_encodeSnappyBlockAsm:
11187 MOVL R8, R10
11188 SHRL $0x10, R10
11189 MOVB $0xf8, (CX)
11190 MOVW R8, 1(CX)
11191 MOVB R10, 3(CX)
11192 ADDQ $0x04, CX
11193 JMP memmove_long_match_emit_encodeSnappyBlockAsm
11194
11195 three_bytes_match_emit_encodeSnappyBlockAsm:
11196 MOVB $0xf4, (CX)
11197 MOVW R8, 1(CX)
11198 ADDQ $0x03, CX
11199 JMP memmove_long_match_emit_encodeSnappyBlockAsm
11200
11201 two_bytes_match_emit_encodeSnappyBlockAsm:
11202 MOVB $0xf0, (CX)
11203 MOVB R8, 1(CX)
11204 ADDQ $0x02, CX
11205 CMPL R8, $0x40
11206 JB memmove_match_emit_encodeSnappyBlockAsm
11207 JMP memmove_long_match_emit_encodeSnappyBlockAsm
11208
11209 one_byte_match_emit_encodeSnappyBlockAsm:
11210 SHLB $0x02, R8
11211 MOVB R8, (CX)
11212 ADDQ $0x01, CX
11213
11214 memmove_match_emit_encodeSnappyBlockAsm:
11215 LEAQ (CX)(R9*1), R8
11216
11217 // genMemMoveShort
11218 CMPQ R9, $0x08
11219 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8
11220 CMPQ R9, $0x10
11221 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16
11222 CMPQ R9, $0x20
11223 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32
11224 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64
11225
11226 emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8:
11227 MOVQ (DI), R10
11228 MOVQ R10, (CX)
11229 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
11230
11231 emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16:
11232 MOVQ (DI), R10
11233 MOVQ -8(DI)(R9*1), DI
11234 MOVQ R10, (CX)
11235 MOVQ DI, -8(CX)(R9*1)
11236 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
11237
11238 emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32:
11239 MOVOU (DI), X0
11240 MOVOU -16(DI)(R9*1), X1
11241 MOVOU X0, (CX)
11242 MOVOU X1, -16(CX)(R9*1)
11243 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm
11244
11245 emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64:
11246 MOVOU (DI), X0
11247 MOVOU 16(DI), X1
11248 MOVOU -32(DI)(R9*1), X2
11249 MOVOU -16(DI)(R9*1), X3
11250 MOVOU X0, (CX)
11251 MOVOU X1, 16(CX)
11252 MOVOU X2, -32(CX)(R9*1)
11253 MOVOU X3, -16(CX)(R9*1)
11254
11255 memmove_end_copy_match_emit_encodeSnappyBlockAsm:
11256 MOVQ R8, CX
11257 JMP emit_literal_done_match_emit_encodeSnappyBlockAsm
11258
11259 memmove_long_match_emit_encodeSnappyBlockAsm:
11260 LEAQ (CX)(R9*1), R8
11261
11262 // genMemMoveLong
11263 MOVOU (DI), X0
11264 MOVOU 16(DI), X1
11265 MOVOU -32(DI)(R9*1), X2
11266 MOVOU -16(DI)(R9*1), X3
11267 MOVQ R9, R11
11268 SHRQ $0x05, R11
11269 MOVQ CX, R10
11270 ANDL $0x0000001f, R10
11271 MOVQ $0x00000040, R12
11272 SUBQ R10, R12
11273 DECQ R11
11274 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
11275 LEAQ -32(DI)(R12*1), R10
11276 LEAQ -32(CX)(R12*1), R13
11277
11278 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back:
11279 MOVOU (R10), X4
11280 MOVOU 16(R10), X5
11281 MOVOA X4, (R13)
11282 MOVOA X5, 16(R13)
11283 ADDQ $0x20, R13
11284 ADDQ $0x20, R10
11285 ADDQ $0x20, R12
11286 DECQ R11
11287 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back
11288
11289 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
11290 MOVOU -32(DI)(R12*1), X4
11291 MOVOU -16(DI)(R12*1), X5
11292 MOVOA X4, -32(CX)(R12*1)
11293 MOVOA X5, -16(CX)(R12*1)
11294 ADDQ $0x20, R12
11295 CMPQ R9, R12
11296 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32
11297 MOVOU X0, (CX)
11298 MOVOU X1, 16(CX)
11299 MOVOU X2, -32(CX)(R9*1)
11300 MOVOU X3, -16(CX)(R9*1)
11301 MOVQ R8, CX
11302
11303 emit_literal_done_match_emit_encodeSnappyBlockAsm:
11304 match_nolit_loop_encodeSnappyBlockAsm:
11305 MOVL DX, DI
11306 SUBL SI, DI
11307 MOVL DI, 16(SP)
11308 ADDL $0x04, DX
11309 ADDL $0x04, SI
11310 MOVQ src_len+32(FP), DI
11311 SUBL DX, DI
11312 LEAQ (BX)(DX*1), R8
11313 LEAQ (BX)(SI*1), SI
11314
11315 // matchLen
11316 XORL R10, R10
11317
11318 matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm:
11319 CMPL DI, $0x10
11320 JB matchlen_match8_match_nolit_encodeSnappyBlockAsm
11321 MOVQ (R8)(R10*1), R9
11322 MOVQ 8(R8)(R10*1), R11
11323 XORQ (SI)(R10*1), R9
11324 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm
11325 XORQ 8(SI)(R10*1), R11
11326 JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm
11327 LEAL -16(DI), DI
11328 LEAL 16(R10), R10
11329 JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm
11330
11331 matchlen_bsf_16match_nolit_encodeSnappyBlockAsm:
11332 #ifdef GOAMD64_v3
11333 TZCNTQ R11, R11
11334
11335 #else
11336 BSFQ R11, R11
11337
11338 #endif
11339 SARQ $0x03, R11
11340 LEAL 8(R10)(R11*1), R10
11341 JMP match_nolit_end_encodeSnappyBlockAsm
11342
11343 matchlen_match8_match_nolit_encodeSnappyBlockAsm:
11344 CMPL DI, $0x08
11345 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm
11346 MOVQ (R8)(R10*1), R9
11347 XORQ (SI)(R10*1), R9
11348 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm
11349 LEAL -8(DI), DI
11350 LEAL 8(R10), R10
11351 JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm
11352
11353 matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm:
11354 #ifdef GOAMD64_v3
11355 TZCNTQ R9, R9
11356
11357 #else
11358 BSFQ R9, R9
11359
11360 #endif
11361 SARQ $0x03, R9
11362 LEAL (R10)(R9*1), R10
11363 JMP match_nolit_end_encodeSnappyBlockAsm
11364
11365 matchlen_match4_match_nolit_encodeSnappyBlockAsm:
11366 CMPL DI, $0x04
11367 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm
11368 MOVL (R8)(R10*1), R9
11369 CMPL (SI)(R10*1), R9
11370 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm
11371 LEAL -4(DI), DI
11372 LEAL 4(R10), R10
11373
11374 matchlen_match2_match_nolit_encodeSnappyBlockAsm:
11375 CMPL DI, $0x01
11376 JE matchlen_match1_match_nolit_encodeSnappyBlockAsm
11377 JB match_nolit_end_encodeSnappyBlockAsm
11378 MOVW (R8)(R10*1), R9
11379 CMPW (SI)(R10*1), R9
11380 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm
11381 LEAL 2(R10), R10
11382 SUBL $0x02, DI
11383 JZ match_nolit_end_encodeSnappyBlockAsm
11384
11385 matchlen_match1_match_nolit_encodeSnappyBlockAsm:
11386 MOVB (R8)(R10*1), R9
11387 CMPB (SI)(R10*1), R9
11388 JNE match_nolit_end_encodeSnappyBlockAsm
11389 LEAL 1(R10), R10
11390
11391 match_nolit_end_encodeSnappyBlockAsm:
11392 ADDL R10, DX
11393 MOVL 16(SP), SI
11394 ADDL $0x04, R10
11395 MOVL DX, 12(SP)
11396
11397 // emitCopy
11398 CMPL SI, $0x00010000
11399 JB two_byte_offset_match_nolit_encodeSnappyBlockAsm
11400
11401 four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm:
11402 CMPL R10, $0x40
11403 JBE four_bytes_remain_match_nolit_encodeSnappyBlockAsm
11404 MOVB $0xff, (CX)
11405 MOVL SI, 1(CX)
11406 LEAL -64(R10), R10
11407 ADDQ $0x05, CX
11408 CMPL R10, $0x04
11409 JB four_bytes_remain_match_nolit_encodeSnappyBlockAsm
11410 JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm
11411
11412 four_bytes_remain_match_nolit_encodeSnappyBlockAsm:
11413 TESTL R10, R10
11414 JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm
11415 XORL DI, DI
11416 LEAL -1(DI)(R10*4), R10
11417 MOVB R10, (CX)
11418 MOVL SI, 1(CX)
11419 ADDQ $0x05, CX
11420 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
11421
11422 two_byte_offset_match_nolit_encodeSnappyBlockAsm:
11423 CMPL R10, $0x40
11424 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm
11425 MOVB $0xee, (CX)
11426 MOVW SI, 1(CX)
11427 LEAL -60(R10), R10
11428 ADDQ $0x03, CX
11429 JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm
11430
11431 two_byte_offset_short_match_nolit_encodeSnappyBlockAsm:
11432 MOVL R10, DI
11433 SHLL $0x02, DI
11434 CMPL R10, $0x0c
11435 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm
11436 CMPL SI, $0x00000800
11437 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm
11438 LEAL -15(DI), DI
11439 MOVB SI, 1(CX)
11440 SHRL $0x08, SI
11441 SHLL $0x05, SI
11442 ORL SI, DI
11443 MOVB DI, (CX)
11444 ADDQ $0x02, CX
11445 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm
11446
11447 emit_copy_three_match_nolit_encodeSnappyBlockAsm:
11448 LEAL -2(DI), DI
11449 MOVB DI, (CX)
11450 MOVW SI, 1(CX)
11451 ADDQ $0x03, CX
11452
11453 match_nolit_emitcopy_end_encodeSnappyBlockAsm:
11454 CMPL DX, 8(SP)
11455 JAE emit_remainder_encodeSnappyBlockAsm
11456 MOVQ -2(BX)(DX*1), DI
11457 CMPQ CX, (SP)
11458 JB match_nolit_dst_ok_encodeSnappyBlockAsm
11459 MOVQ $0x00000000, ret+56(FP)
11460 RET
11461
11462 match_nolit_dst_ok_encodeSnappyBlockAsm:
11463 MOVQ $0x0000cf1bbcdcbf9b, R9
11464 MOVQ DI, R8
11465 SHRQ $0x10, DI
11466 MOVQ DI, SI
11467 SHLQ $0x10, R8
11468 IMULQ R9, R8
11469 SHRQ $0x32, R8
11470 SHLQ $0x10, SI
11471 IMULQ R9, SI
11472 SHRQ $0x32, SI
11473 LEAL -2(DX), R9
11474 LEAQ (AX)(SI*4), R10
11475 MOVL (R10), SI
11476 MOVL R9, (AX)(R8*4)
11477 MOVL DX, (R10)
11478 CMPL (BX)(SI*1), DI
11479 JEQ match_nolit_loop_encodeSnappyBlockAsm
11480 INCL DX
11481 JMP search_loop_encodeSnappyBlockAsm
11482
11483 emit_remainder_encodeSnappyBlockAsm:
11484 MOVQ src_len+32(FP), AX
11485 SUBL 12(SP), AX
11486 LEAQ 5(CX)(AX*1), AX
11487 CMPQ AX, (SP)
11488 JB emit_remainder_ok_encodeSnappyBlockAsm
11489 MOVQ $0x00000000, ret+56(FP)
11490 RET
11491
11492 emit_remainder_ok_encodeSnappyBlockAsm:
11493 MOVQ src_len+32(FP), AX
11494 MOVL 12(SP), DX
11495 CMPL DX, AX
11496 JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm
11497 MOVL AX, SI
11498 MOVL AX, 12(SP)
11499 LEAQ (BX)(DX*1), AX
11500 SUBL DX, SI
11501 LEAL -1(SI), DX
11502 CMPL DX, $0x3c
11503 JB one_byte_emit_remainder_encodeSnappyBlockAsm
11504 CMPL DX, $0x00000100
11505 JB two_bytes_emit_remainder_encodeSnappyBlockAsm
11506 CMPL DX, $0x00010000
11507 JB three_bytes_emit_remainder_encodeSnappyBlockAsm
11508 CMPL DX, $0x01000000
11509 JB four_bytes_emit_remainder_encodeSnappyBlockAsm
11510 MOVB $0xfc, (CX)
11511 MOVL DX, 1(CX)
11512 ADDQ $0x05, CX
11513 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
11514
11515 four_bytes_emit_remainder_encodeSnappyBlockAsm:
11516 MOVL DX, BX
11517 SHRL $0x10, BX
11518 MOVB $0xf8, (CX)
11519 MOVW DX, 1(CX)
11520 MOVB BL, 3(CX)
11521 ADDQ $0x04, CX
11522 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
11523
11524 three_bytes_emit_remainder_encodeSnappyBlockAsm:
11525 MOVB $0xf4, (CX)
11526 MOVW DX, 1(CX)
11527 ADDQ $0x03, CX
11528 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
11529
11530 two_bytes_emit_remainder_encodeSnappyBlockAsm:
11531 MOVB $0xf0, (CX)
11532 MOVB DL, 1(CX)
11533 ADDQ $0x02, CX
11534 CMPL DX, $0x40
11535 JB memmove_emit_remainder_encodeSnappyBlockAsm
11536 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm
11537
11538 one_byte_emit_remainder_encodeSnappyBlockAsm:
11539 SHLB $0x02, DL
11540 MOVB DL, (CX)
11541 ADDQ $0x01, CX
11542
11543 memmove_emit_remainder_encodeSnappyBlockAsm:
11544 LEAQ (CX)(SI*1), DX
11545 MOVL SI, BX
11546
11547 // genMemMoveShort
11548 CMPQ BX, $0x03
11549 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2
11550 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3
11551 CMPQ BX, $0x08
11552 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7
11553 CMPQ BX, $0x10
11554 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16
11555 CMPQ BX, $0x20
11556 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32
11557 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64
11558
11559 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2:
11560 MOVB (AX), SI
11561 MOVB -1(AX)(BX*1), AL
11562 MOVB SI, (CX)
11563 MOVB AL, -1(CX)(BX*1)
11564 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
11565
11566 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3:
11567 MOVW (AX), SI
11568 MOVB 2(AX), AL
11569 MOVW SI, (CX)
11570 MOVB AL, 2(CX)
11571 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
11572
11573 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7:
11574 MOVL (AX), SI
11575 MOVL -4(AX)(BX*1), AX
11576 MOVL SI, (CX)
11577 MOVL AX, -4(CX)(BX*1)
11578 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
11579
11580 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16:
11581 MOVQ (AX), SI
11582 MOVQ -8(AX)(BX*1), AX
11583 MOVQ SI, (CX)
11584 MOVQ AX, -8(CX)(BX*1)
11585 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
11586
11587 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32:
11588 MOVOU (AX), X0
11589 MOVOU -16(AX)(BX*1), X1
11590 MOVOU X0, (CX)
11591 MOVOU X1, -16(CX)(BX*1)
11592 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm
11593
11594 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64:
11595 MOVOU (AX), X0
11596 MOVOU 16(AX), X1
11597 MOVOU -32(AX)(BX*1), X2
11598 MOVOU -16(AX)(BX*1), X3
11599 MOVOU X0, (CX)
11600 MOVOU X1, 16(CX)
11601 MOVOU X2, -32(CX)(BX*1)
11602 MOVOU X3, -16(CX)(BX*1)
11603
11604 memmove_end_copy_emit_remainder_encodeSnappyBlockAsm:
11605 MOVQ DX, CX
11606 JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm
11607
11608 memmove_long_emit_remainder_encodeSnappyBlockAsm:
11609 LEAQ (CX)(SI*1), DX
11610 MOVL SI, BX
11611
11612 // genMemMoveLong
11613 MOVOU (AX), X0
11614 MOVOU 16(AX), X1
11615 MOVOU -32(AX)(BX*1), X2
11616 MOVOU -16(AX)(BX*1), X3
11617 MOVQ BX, DI
11618 SHRQ $0x05, DI
11619 MOVQ CX, SI
11620 ANDL $0x0000001f, SI
11621 MOVQ $0x00000040, R8
11622 SUBQ SI, R8
11623 DECQ DI
11624 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
11625 LEAQ -32(AX)(R8*1), SI
11626 LEAQ -32(CX)(R8*1), R9
11627
11628 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back:
11629 MOVOU (SI), X4
11630 MOVOU 16(SI), X5
11631 MOVOA X4, (R9)
11632 MOVOA X5, 16(R9)
11633 ADDQ $0x20, R9
11634 ADDQ $0x20, SI
11635 ADDQ $0x20, R8
11636 DECQ DI
11637 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back
11638
11639 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32:
11640 MOVOU -32(AX)(R8*1), X4
11641 MOVOU -16(AX)(R8*1), X5
11642 MOVOA X4, -32(CX)(R8*1)
11643 MOVOA X5, -16(CX)(R8*1)
11644 ADDQ $0x20, R8
11645 CMPQ BX, R8
11646 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32
11647 MOVOU X0, (CX)
11648 MOVOU X1, 16(CX)
11649 MOVOU X2, -32(CX)(BX*1)
11650 MOVOU X3, -16(CX)(BX*1)
11651 MOVQ DX, CX
11652
11653 emit_literal_done_emit_remainder_encodeSnappyBlockAsm:
11654 MOVQ dst_base+0(FP), AX
11655 SUBQ AX, CX
11656 MOVQ CX, ret+56(FP)
11657 RET
11658
11659 // func encodeSnappyBlockAsm64K(dst []byte, src []byte, tmp *[65536]byte) int
11660 // Requires: BMI, SSE2
11661 TEXT ·encodeSnappyBlockAsm64K(SB), $24-64
11662 MOVQ tmp+48(FP), AX
11663 MOVQ dst_base+0(FP), CX
11664 MOVQ $0x00000200, DX
11665 MOVQ AX, BX
11666 PXOR X0, X0
11667
11668 zero_loop_encodeSnappyBlockAsm64K:
11669 MOVOU X0, (BX)
11670 MOVOU X0, 16(BX)
11671 MOVOU X0, 32(BX)
11672 MOVOU X0, 48(BX)
11673 MOVOU X0, 64(BX)
11674 MOVOU X0, 80(BX)
11675 MOVOU X0, 96(BX)
11676 MOVOU X0, 112(BX)
11677 ADDQ $0x80, BX
11678 DECQ DX
11679 JNZ zero_loop_encodeSnappyBlockAsm64K
11680 MOVL $0x00000000, 12(SP)
11681 MOVQ src_len+32(FP), DX
11682 LEAQ -9(DX), BX
11683 LEAQ -8(DX), SI
11684 MOVL SI, 8(SP)
11685 SHRQ $0x05, DX
11686 SUBL DX, BX
11687 LEAQ (CX)(BX*1), BX
11688 MOVQ BX, (SP)
11689 MOVL $0x00000001, DX
11690 MOVL DX, 16(SP)
11691 MOVQ src_base+24(FP), BX
11692
11693 search_loop_encodeSnappyBlockAsm64K:
11694 MOVL DX, SI
11695 SUBL 12(SP), SI
11696 SHRL $0x06, SI
11697 LEAL 4(DX)(SI*1), SI
11698 CMPL SI, 8(SP)
11699 JAE emit_remainder_encodeSnappyBlockAsm64K
11700 MOVQ (BX)(DX*1), DI
11701 MOVL SI, 20(SP)
11702 MOVQ $0x0000cf1bbcdcbf9b, R9
11703 MOVQ DI, R10
11704 MOVQ DI, R11
11705 SHRQ $0x08, R11
11706 SHLQ $0x10, R10
11707 IMULQ R9, R10
11708 SHRQ $0x32, R10
11709 SHLQ $0x10, R11
11710 IMULQ R9, R11
11711 SHRQ $0x32, R11
11712 MOVL (AX)(R10*4), SI
11713 MOVL (AX)(R11*4), R8
11714 MOVL DX, (AX)(R10*4)
11715 LEAL 1(DX), R10
11716 MOVL R10, (AX)(R11*4)
11717 MOVQ DI, R10
11718 SHRQ $0x10, R10
11719 SHLQ $0x10, R10
11720 IMULQ R9, R10
11721 SHRQ $0x32, R10
11722 MOVL DX, R9
11723 SUBL 16(SP), R9
11724 MOVL 1(BX)(R9*1), R11
11725 MOVQ DI, R9
11726 SHRQ $0x08, R9
11727 CMPL R9, R11
11728 JNE no_repeat_found_encodeSnappyBlockAsm64K
11729 LEAL 1(DX), DI
11730 MOVL 12(SP), SI
11731 MOVL DI, R8
11732 SUBL 16(SP), R8
11733 JZ repeat_extend_back_end_encodeSnappyBlockAsm64K
11734
11735 repeat_extend_back_loop_encodeSnappyBlockAsm64K:
11736 CMPL DI, SI
11737 JBE repeat_extend_back_end_encodeSnappyBlockAsm64K
11738 MOVB -1(BX)(R8*1), R9
11739 MOVB -1(BX)(DI*1), R10
11740 CMPB R9, R10
11741 JNE repeat_extend_back_end_encodeSnappyBlockAsm64K
11742 LEAL -1(DI), DI
11743 DECL R8
11744 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K
11745
11746 repeat_extend_back_end_encodeSnappyBlockAsm64K:
11747 MOVL DI, SI
11748 SUBL 12(SP), SI
11749 LEAQ 3(CX)(SI*1), SI
11750 CMPQ SI, (SP)
11751 JB repeat_dst_size_check_encodeSnappyBlockAsm64K
11752 MOVQ $0x00000000, ret+56(FP)
11753 RET
11754
11755 repeat_dst_size_check_encodeSnappyBlockAsm64K:
11756 MOVL 12(SP), SI
11757 CMPL SI, DI
11758 JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
11759 MOVL DI, R8
11760 MOVL DI, 12(SP)
11761 LEAQ (BX)(SI*1), R9
11762 SUBL SI, R8
11763 LEAL -1(R8), SI
11764 CMPL SI, $0x3c
11765 JB one_byte_repeat_emit_encodeSnappyBlockAsm64K
11766 CMPL SI, $0x00000100
11767 JB two_bytes_repeat_emit_encodeSnappyBlockAsm64K
11768 JB three_bytes_repeat_emit_encodeSnappyBlockAsm64K
11769
11770 three_bytes_repeat_emit_encodeSnappyBlockAsm64K:
11771 MOVB $0xf4, (CX)
11772 MOVW SI, 1(CX)
11773 ADDQ $0x03, CX
11774 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
11775
11776 two_bytes_repeat_emit_encodeSnappyBlockAsm64K:
11777 MOVB $0xf0, (CX)
11778 MOVB SI, 1(CX)
11779 ADDQ $0x02, CX
11780 CMPL SI, $0x40
11781 JB memmove_repeat_emit_encodeSnappyBlockAsm64K
11782 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K
11783
11784 one_byte_repeat_emit_encodeSnappyBlockAsm64K:
11785 SHLB $0x02, SI
11786 MOVB SI, (CX)
11787 ADDQ $0x01, CX
11788
11789 memmove_repeat_emit_encodeSnappyBlockAsm64K:
11790 LEAQ (CX)(R8*1), SI
11791
11792 // genMemMoveShort
11793 CMPQ R8, $0x08
11794 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8
11795 CMPQ R8, $0x10
11796 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
11797 CMPQ R8, $0x20
11798 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
11799 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
11800
11801 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8:
11802 MOVQ (R9), R10
11803 MOVQ R10, (CX)
11804 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
11805
11806 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
11807 MOVQ (R9), R10
11808 MOVQ -8(R9)(R8*1), R9
11809 MOVQ R10, (CX)
11810 MOVQ R9, -8(CX)(R8*1)
11811 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
11812
11813 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
11814 MOVOU (R9), X0
11815 MOVOU -16(R9)(R8*1), X1
11816 MOVOU X0, (CX)
11817 MOVOU X1, -16(CX)(R8*1)
11818 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K
11819
11820 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
11821 MOVOU (R9), X0
11822 MOVOU 16(R9), X1
11823 MOVOU -32(R9)(R8*1), X2
11824 MOVOU -16(R9)(R8*1), X3
11825 MOVOU X0, (CX)
11826 MOVOU X1, 16(CX)
11827 MOVOU X2, -32(CX)(R8*1)
11828 MOVOU X3, -16(CX)(R8*1)
11829
11830 memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K:
11831 MOVQ SI, CX
11832 JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
11833
11834 memmove_long_repeat_emit_encodeSnappyBlockAsm64K:
11835 LEAQ (CX)(R8*1), SI
11836
11837 // genMemMoveLong
11838 MOVOU (R9), X0
11839 MOVOU 16(R9), X1
11840 MOVOU -32(R9)(R8*1), X2
11841 MOVOU -16(R9)(R8*1), X3
11842 MOVQ R8, R11
11843 SHRQ $0x05, R11
11844 MOVQ CX, R10
11845 ANDL $0x0000001f, R10
11846 MOVQ $0x00000040, R12
11847 SUBQ R10, R12
11848 DECQ R11
11849 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
11850 LEAQ -32(R9)(R12*1), R10
11851 LEAQ -32(CX)(R12*1), R13
11852
11853 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
11854 MOVOU (R10), X4
11855 MOVOU 16(R10), X5
11856 MOVOA X4, (R13)
11857 MOVOA X5, 16(R13)
11858 ADDQ $0x20, R13
11859 ADDQ $0x20, R10
11860 ADDQ $0x20, R12
11861 DECQ R11
11862 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
11863
11864 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
11865 MOVOU -32(R9)(R12*1), X4
11866 MOVOU -16(R9)(R12*1), X5
11867 MOVOA X4, -32(CX)(R12*1)
11868 MOVOA X5, -16(CX)(R12*1)
11869 ADDQ $0x20, R12
11870 CMPQ R8, R12
11871 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
11872 MOVOU X0, (CX)
11873 MOVOU X1, 16(CX)
11874 MOVOU X2, -32(CX)(R8*1)
11875 MOVOU X3, -16(CX)(R8*1)
11876 MOVQ SI, CX
11877
11878 emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K:
11879 ADDL $0x05, DX
11880 MOVL DX, SI
11881 SUBL 16(SP), SI
11882 MOVQ src_len+32(FP), R8
11883 SUBL DX, R8
11884 LEAQ (BX)(DX*1), R9
11885 LEAQ (BX)(SI*1), SI
11886
11887 // matchLen
11888 XORL R11, R11
11889
11890 matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K:
11891 CMPL R8, $0x10
11892 JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K
11893 MOVQ (R9)(R11*1), R10
11894 MOVQ 8(R9)(R11*1), R12
11895 XORQ (SI)(R11*1), R10
11896 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K
11897 XORQ 8(SI)(R11*1), R12
11898 JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K
11899 LEAL -16(R8), R8
11900 LEAL 16(R11), R11
11901 JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K
11902
11903 matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K:
11904 #ifdef GOAMD64_v3
11905 TZCNTQ R12, R12
11906
11907 #else
11908 BSFQ R12, R12
11909
11910 #endif
11911 SARQ $0x03, R12
11912 LEAL 8(R11)(R12*1), R11
11913 JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K
11914
11915 matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K:
11916 CMPL R8, $0x08
11917 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K
11918 MOVQ (R9)(R11*1), R10
11919 XORQ (SI)(R11*1), R10
11920 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K
11921 LEAL -8(R8), R8
11922 LEAL 8(R11), R11
11923 JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K
11924
11925 matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K:
11926 #ifdef GOAMD64_v3
11927 TZCNTQ R10, R10
11928
11929 #else
11930 BSFQ R10, R10
11931
11932 #endif
11933 SARQ $0x03, R10
11934 LEAL (R11)(R10*1), R11
11935 JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K
11936
11937 matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K:
11938 CMPL R8, $0x04
11939 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
11940 MOVL (R9)(R11*1), R10
11941 CMPL (SI)(R11*1), R10
11942 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
11943 LEAL -4(R8), R8
11944 LEAL 4(R11), R11
11945
11946 matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K:
11947 CMPL R8, $0x01
11948 JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
11949 JB repeat_extend_forward_end_encodeSnappyBlockAsm64K
11950 MOVW (R9)(R11*1), R10
11951 CMPW (SI)(R11*1), R10
11952 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
11953 LEAL 2(R11), R11
11954 SUBL $0x02, R8
11955 JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K
11956
11957 matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K:
11958 MOVB (R9)(R11*1), R10
11959 CMPB (SI)(R11*1), R10
11960 JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K
11961 LEAL 1(R11), R11
11962
11963 repeat_extend_forward_end_encodeSnappyBlockAsm64K:
11964 ADDL R11, DX
11965 MOVL DX, SI
11966 SUBL DI, SI
11967 MOVL 16(SP), DI
11968
11969 // emitCopy
11970 two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K:
11971 CMPL SI, $0x40
11972 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K
11973 MOVB $0xee, (CX)
11974 MOVW DI, 1(CX)
11975 LEAL -60(SI), SI
11976 ADDQ $0x03, CX
11977 JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K
11978
11979 two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K:
11980 MOVL SI, R8
11981 SHLL $0x02, R8
11982 CMPL SI, $0x0c
11983 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
11984 CMPL DI, $0x00000800
11985 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
11986 LEAL -15(R8), R8
11987 MOVB DI, 1(CX)
11988 SHRL $0x08, DI
11989 SHLL $0x05, DI
11990 ORL DI, R8
11991 MOVB R8, (CX)
11992 ADDQ $0x02, CX
11993 JMP repeat_end_emit_encodeSnappyBlockAsm64K
11994
11995 emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K:
11996 LEAL -2(R8), R8
11997 MOVB R8, (CX)
11998 MOVW DI, 1(CX)
11999 ADDQ $0x03, CX
12000
12001 repeat_end_emit_encodeSnappyBlockAsm64K:
12002 MOVL DX, 12(SP)
12003 JMP search_loop_encodeSnappyBlockAsm64K
12004
12005 no_repeat_found_encodeSnappyBlockAsm64K:
12006 CMPL (BX)(SI*1), DI
12007 JEQ candidate_match_encodeSnappyBlockAsm64K
12008 SHRQ $0x08, DI
12009 MOVL (AX)(R10*4), SI
12010 LEAL 2(DX), R9
12011 CMPL (BX)(R8*1), DI
12012 JEQ candidate2_match_encodeSnappyBlockAsm64K
12013 MOVL R9, (AX)(R10*4)
12014 SHRQ $0x08, DI
12015 CMPL (BX)(SI*1), DI
12016 JEQ candidate3_match_encodeSnappyBlockAsm64K
12017 MOVL 20(SP), DX
12018 JMP search_loop_encodeSnappyBlockAsm64K
12019
12020 candidate3_match_encodeSnappyBlockAsm64K:
12021 ADDL $0x02, DX
12022 JMP candidate_match_encodeSnappyBlockAsm64K
12023
12024 candidate2_match_encodeSnappyBlockAsm64K:
12025 MOVL R9, (AX)(R10*4)
12026 INCL DX
12027 MOVL R8, SI
12028
12029 candidate_match_encodeSnappyBlockAsm64K:
12030 MOVL 12(SP), DI
12031 TESTL SI, SI
12032 JZ match_extend_back_end_encodeSnappyBlockAsm64K
12033
12034 match_extend_back_loop_encodeSnappyBlockAsm64K:
12035 CMPL DX, DI
12036 JBE match_extend_back_end_encodeSnappyBlockAsm64K
12037 MOVB -1(BX)(SI*1), R8
12038 MOVB -1(BX)(DX*1), R9
12039 CMPB R8, R9
12040 JNE match_extend_back_end_encodeSnappyBlockAsm64K
12041 LEAL -1(DX), DX
12042 DECL SI
12043 JZ match_extend_back_end_encodeSnappyBlockAsm64K
12044 JMP match_extend_back_loop_encodeSnappyBlockAsm64K
12045
12046 match_extend_back_end_encodeSnappyBlockAsm64K:
12047 MOVL DX, DI
12048 SUBL 12(SP), DI
12049 LEAQ 3(CX)(DI*1), DI
12050 CMPQ DI, (SP)
12051 JB match_dst_size_check_encodeSnappyBlockAsm64K
12052 MOVQ $0x00000000, ret+56(FP)
12053 RET
12054
12055 match_dst_size_check_encodeSnappyBlockAsm64K:
12056 MOVL DX, DI
12057 MOVL 12(SP), R8
12058 CMPL R8, DI
12059 JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K
12060 MOVL DI, R9
12061 MOVL DI, 12(SP)
12062 LEAQ (BX)(R8*1), DI
12063 SUBL R8, R9
12064 LEAL -1(R9), R8
12065 CMPL R8, $0x3c
12066 JB one_byte_match_emit_encodeSnappyBlockAsm64K
12067 CMPL R8, $0x00000100
12068 JB two_bytes_match_emit_encodeSnappyBlockAsm64K
12069 JB three_bytes_match_emit_encodeSnappyBlockAsm64K
12070
12071 three_bytes_match_emit_encodeSnappyBlockAsm64K:
12072 MOVB $0xf4, (CX)
12073 MOVW R8, 1(CX)
12074 ADDQ $0x03, CX
12075 JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
12076
12077 two_bytes_match_emit_encodeSnappyBlockAsm64K:
12078 MOVB $0xf0, (CX)
12079 MOVB R8, 1(CX)
12080 ADDQ $0x02, CX
12081 CMPL R8, $0x40
12082 JB memmove_match_emit_encodeSnappyBlockAsm64K
12083 JMP memmove_long_match_emit_encodeSnappyBlockAsm64K
12084
12085 one_byte_match_emit_encodeSnappyBlockAsm64K:
12086 SHLB $0x02, R8
12087 MOVB R8, (CX)
12088 ADDQ $0x01, CX
12089
12090 memmove_match_emit_encodeSnappyBlockAsm64K:
12091 LEAQ (CX)(R9*1), R8
12092
12093 // genMemMoveShort
12094 CMPQ R9, $0x08
12095 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8
12096 CMPQ R9, $0x10
12097 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
12098 CMPQ R9, $0x20
12099 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
12100 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64
12101
12102 emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8:
12103 MOVQ (DI), R10
12104 MOVQ R10, (CX)
12105 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
12106
12107 emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
12108 MOVQ (DI), R10
12109 MOVQ -8(DI)(R9*1), DI
12110 MOVQ R10, (CX)
12111 MOVQ DI, -8(CX)(R9*1)
12112 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
12113
12114 emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
12115 MOVOU (DI), X0
12116 MOVOU -16(DI)(R9*1), X1
12117 MOVOU X0, (CX)
12118 MOVOU X1, -16(CX)(R9*1)
12119 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K
12120
12121 emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
12122 MOVOU (DI), X0
12123 MOVOU 16(DI), X1
12124 MOVOU -32(DI)(R9*1), X2
12125 MOVOU -16(DI)(R9*1), X3
12126 MOVOU X0, (CX)
12127 MOVOU X1, 16(CX)
12128 MOVOU X2, -32(CX)(R9*1)
12129 MOVOU X3, -16(CX)(R9*1)
12130
12131 memmove_end_copy_match_emit_encodeSnappyBlockAsm64K:
12132 MOVQ R8, CX
12133 JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K
12134
12135 memmove_long_match_emit_encodeSnappyBlockAsm64K:
12136 LEAQ (CX)(R9*1), R8
12137
12138 // genMemMoveLong
12139 MOVOU (DI), X0
12140 MOVOU 16(DI), X1
12141 MOVOU -32(DI)(R9*1), X2
12142 MOVOU -16(DI)(R9*1), X3
12143 MOVQ R9, R11
12144 SHRQ $0x05, R11
12145 MOVQ CX, R10
12146 ANDL $0x0000001f, R10
12147 MOVQ $0x00000040, R12
12148 SUBQ R10, R12
12149 DECQ R11
12150 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
12151 LEAQ -32(DI)(R12*1), R10
12152 LEAQ -32(CX)(R12*1), R13
12153
12154 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
12155 MOVOU (R10), X4
12156 MOVOU 16(R10), X5
12157 MOVOA X4, (R13)
12158 MOVOA X5, 16(R13)
12159 ADDQ $0x20, R13
12160 ADDQ $0x20, R10
12161 ADDQ $0x20, R12
12162 DECQ R11
12163 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back
12164
12165 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
12166 MOVOU -32(DI)(R12*1), X4
12167 MOVOU -16(DI)(R12*1), X5
12168 MOVOA X4, -32(CX)(R12*1)
12169 MOVOA X5, -16(CX)(R12*1)
12170 ADDQ $0x20, R12
12171 CMPQ R9, R12
12172 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
12173 MOVOU X0, (CX)
12174 MOVOU X1, 16(CX)
12175 MOVOU X2, -32(CX)(R9*1)
12176 MOVOU X3, -16(CX)(R9*1)
12177 MOVQ R8, CX
12178
12179 emit_literal_done_match_emit_encodeSnappyBlockAsm64K:
12180 match_nolit_loop_encodeSnappyBlockAsm64K:
12181 MOVL DX, DI
12182 SUBL SI, DI
12183 MOVL DI, 16(SP)
12184 ADDL $0x04, DX
12185 ADDL $0x04, SI
12186 MOVQ src_len+32(FP), DI
12187 SUBL DX, DI
12188 LEAQ (BX)(DX*1), R8
12189 LEAQ (BX)(SI*1), SI
12190
12191 // matchLen
12192 XORL R10, R10
12193
12194 matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K:
12195 CMPL DI, $0x10
12196 JB matchlen_match8_match_nolit_encodeSnappyBlockAsm64K
12197 MOVQ (R8)(R10*1), R9
12198 MOVQ 8(R8)(R10*1), R11
12199 XORQ (SI)(R10*1), R9
12200 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K
12201 XORQ 8(SI)(R10*1), R11
12202 JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K
12203 LEAL -16(DI), DI
12204 LEAL 16(R10), R10
12205 JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K
12206
12207 matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K:
12208 #ifdef GOAMD64_v3
12209 TZCNTQ R11, R11
12210
12211 #else
12212 BSFQ R11, R11
12213
12214 #endif
12215 SARQ $0x03, R11
12216 LEAL 8(R10)(R11*1), R10
12217 JMP match_nolit_end_encodeSnappyBlockAsm64K
12218
12219 matchlen_match8_match_nolit_encodeSnappyBlockAsm64K:
12220 CMPL DI, $0x08
12221 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm64K
12222 MOVQ (R8)(R10*1), R9
12223 XORQ (SI)(R10*1), R9
12224 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K
12225 LEAL -8(DI), DI
12226 LEAL 8(R10), R10
12227 JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm64K
12228
12229 matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K:
12230 #ifdef GOAMD64_v3
12231 TZCNTQ R9, R9
12232
12233 #else
12234 BSFQ R9, R9
12235
12236 #endif
12237 SARQ $0x03, R9
12238 LEAL (R10)(R9*1), R10
12239 JMP match_nolit_end_encodeSnappyBlockAsm64K
12240
12241 matchlen_match4_match_nolit_encodeSnappyBlockAsm64K:
12242 CMPL DI, $0x04
12243 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
12244 MOVL (R8)(R10*1), R9
12245 CMPL (SI)(R10*1), R9
12246 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
12247 LEAL -4(DI), DI
12248 LEAL 4(R10), R10
12249
12250 matchlen_match2_match_nolit_encodeSnappyBlockAsm64K:
12251 CMPL DI, $0x01
12252 JE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
12253 JB match_nolit_end_encodeSnappyBlockAsm64K
12254 MOVW (R8)(R10*1), R9
12255 CMPW (SI)(R10*1), R9
12256 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
12257 LEAL 2(R10), R10
12258 SUBL $0x02, DI
12259 JZ match_nolit_end_encodeSnappyBlockAsm64K
12260
12261 matchlen_match1_match_nolit_encodeSnappyBlockAsm64K:
12262 MOVB (R8)(R10*1), R9
12263 CMPB (SI)(R10*1), R9
12264 JNE match_nolit_end_encodeSnappyBlockAsm64K
12265 LEAL 1(R10), R10
12266
12267 match_nolit_end_encodeSnappyBlockAsm64K:
12268 ADDL R10, DX
12269 MOVL 16(SP), SI
12270 ADDL $0x04, R10
12271 MOVL DX, 12(SP)
12272
12273 // emitCopy
12274 two_byte_offset_match_nolit_encodeSnappyBlockAsm64K:
12275 CMPL R10, $0x40
12276 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K
12277 MOVB $0xee, (CX)
12278 MOVW SI, 1(CX)
12279 LEAL -60(R10), R10
12280 ADDQ $0x03, CX
12281 JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K
12282
12283 two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K:
12284 MOVL R10, DI
12285 SHLL $0x02, DI
12286 CMPL R10, $0x0c
12287 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
12288 CMPL SI, $0x00000800
12289 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
12290 LEAL -15(DI), DI
12291 MOVB SI, 1(CX)
12292 SHRL $0x08, SI
12293 SHLL $0x05, SI
12294 ORL SI, DI
12295 MOVB DI, (CX)
12296 ADDQ $0x02, CX
12297 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K
12298
12299 emit_copy_three_match_nolit_encodeSnappyBlockAsm64K:
12300 LEAL -2(DI), DI
12301 MOVB DI, (CX)
12302 MOVW SI, 1(CX)
12303 ADDQ $0x03, CX
12304
12305 match_nolit_emitcopy_end_encodeSnappyBlockAsm64K:
12306 CMPL DX, 8(SP)
12307 JAE emit_remainder_encodeSnappyBlockAsm64K
12308 MOVQ -2(BX)(DX*1), DI
12309 CMPQ CX, (SP)
12310 JB match_nolit_dst_ok_encodeSnappyBlockAsm64K
12311 MOVQ $0x00000000, ret+56(FP)
12312 RET
12313
12314 match_nolit_dst_ok_encodeSnappyBlockAsm64K:
12315 MOVQ $0x0000cf1bbcdcbf9b, R9
12316 MOVQ DI, R8
12317 SHRQ $0x10, DI
12318 MOVQ DI, SI
12319 SHLQ $0x10, R8
12320 IMULQ R9, R8
12321 SHRQ $0x32, R8
12322 SHLQ $0x10, SI
12323 IMULQ R9, SI
12324 SHRQ $0x32, SI
12325 LEAL -2(DX), R9
12326 LEAQ (AX)(SI*4), R10
12327 MOVL (R10), SI
12328 MOVL R9, (AX)(R8*4)
12329 MOVL DX, (R10)
12330 CMPL (BX)(SI*1), DI
12331 JEQ match_nolit_loop_encodeSnappyBlockAsm64K
12332 INCL DX
12333 JMP search_loop_encodeSnappyBlockAsm64K
12334
12335 emit_remainder_encodeSnappyBlockAsm64K:
12336 MOVQ src_len+32(FP), AX
12337 SUBL 12(SP), AX
12338 LEAQ 3(CX)(AX*1), AX
12339 CMPQ AX, (SP)
12340 JB emit_remainder_ok_encodeSnappyBlockAsm64K
12341 MOVQ $0x00000000, ret+56(FP)
12342 RET
12343
12344 emit_remainder_ok_encodeSnappyBlockAsm64K:
12345 MOVQ src_len+32(FP), AX
12346 MOVL 12(SP), DX
12347 CMPL DX, AX
12348 JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
12349 MOVL AX, SI
12350 MOVL AX, 12(SP)
12351 LEAQ (BX)(DX*1), AX
12352 SUBL DX, SI
12353 LEAL -1(SI), DX
12354 CMPL DX, $0x3c
12355 JB one_byte_emit_remainder_encodeSnappyBlockAsm64K
12356 CMPL DX, $0x00000100
12357 JB two_bytes_emit_remainder_encodeSnappyBlockAsm64K
12358 JB three_bytes_emit_remainder_encodeSnappyBlockAsm64K
12359
12360 three_bytes_emit_remainder_encodeSnappyBlockAsm64K:
12361 MOVB $0xf4, (CX)
12362 MOVW DX, 1(CX)
12363 ADDQ $0x03, CX
12364 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K
12365
12366 two_bytes_emit_remainder_encodeSnappyBlockAsm64K:
12367 MOVB $0xf0, (CX)
12368 MOVB DL, 1(CX)
12369 ADDQ $0x02, CX
12370 CMPL DX, $0x40
12371 JB memmove_emit_remainder_encodeSnappyBlockAsm64K
12372 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K
12373
12374 one_byte_emit_remainder_encodeSnappyBlockAsm64K:
12375 SHLB $0x02, DL
12376 MOVB DL, (CX)
12377 ADDQ $0x01, CX
12378
12379 memmove_emit_remainder_encodeSnappyBlockAsm64K:
12380 LEAQ (CX)(SI*1), DX
12381 MOVL SI, BX
12382
12383 // genMemMoveShort
12384 CMPQ BX, $0x03
12385 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2
12386 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3
12387 CMPQ BX, $0x08
12388 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7
12389 CMPQ BX, $0x10
12390 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16
12391 CMPQ BX, $0x20
12392 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32
12393 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64
12394
12395 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2:
12396 MOVB (AX), SI
12397 MOVB -1(AX)(BX*1), AL
12398 MOVB SI, (CX)
12399 MOVB AL, -1(CX)(BX*1)
12400 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
12401
12402 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3:
12403 MOVW (AX), SI
12404 MOVB 2(AX), AL
12405 MOVW SI, (CX)
12406 MOVB AL, 2(CX)
12407 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
12408
12409 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7:
12410 MOVL (AX), SI
12411 MOVL -4(AX)(BX*1), AX
12412 MOVL SI, (CX)
12413 MOVL AX, -4(CX)(BX*1)
12414 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
12415
12416 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16:
12417 MOVQ (AX), SI
12418 MOVQ -8(AX)(BX*1), AX
12419 MOVQ SI, (CX)
12420 MOVQ AX, -8(CX)(BX*1)
12421 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
12422
12423 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32:
12424 MOVOU (AX), X0
12425 MOVOU -16(AX)(BX*1), X1
12426 MOVOU X0, (CX)
12427 MOVOU X1, -16(CX)(BX*1)
12428 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K
12429
12430 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64:
12431 MOVOU (AX), X0
12432 MOVOU 16(AX), X1
12433 MOVOU -32(AX)(BX*1), X2
12434 MOVOU -16(AX)(BX*1), X3
12435 MOVOU X0, (CX)
12436 MOVOU X1, 16(CX)
12437 MOVOU X2, -32(CX)(BX*1)
12438 MOVOU X3, -16(CX)(BX*1)
12439
12440 memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K:
12441 MOVQ DX, CX
12442 JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
12443
12444 memmove_long_emit_remainder_encodeSnappyBlockAsm64K:
12445 LEAQ (CX)(SI*1), DX
12446 MOVL SI, BX
12447
12448 // genMemMoveLong
12449 MOVOU (AX), X0
12450 MOVOU 16(AX), X1
12451 MOVOU -32(AX)(BX*1), X2
12452 MOVOU -16(AX)(BX*1), X3
12453 MOVQ BX, DI
12454 SHRQ $0x05, DI
12455 MOVQ CX, SI
12456 ANDL $0x0000001f, SI
12457 MOVQ $0x00000040, R8
12458 SUBQ SI, R8
12459 DECQ DI
12460 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
12461 LEAQ -32(AX)(R8*1), SI
12462 LEAQ -32(CX)(R8*1), R9
12463
12464 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back:
12465 MOVOU (SI), X4
12466 MOVOU 16(SI), X5
12467 MOVOA X4, (R9)
12468 MOVOA X5, 16(R9)
12469 ADDQ $0x20, R9
12470 ADDQ $0x20, SI
12471 ADDQ $0x20, R8
12472 DECQ DI
12473 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back
12474
12475 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
12476 MOVOU -32(AX)(R8*1), X4
12477 MOVOU -16(AX)(R8*1), X5
12478 MOVOA X4, -32(CX)(R8*1)
12479 MOVOA X5, -16(CX)(R8*1)
12480 ADDQ $0x20, R8
12481 CMPQ BX, R8
12482 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
12483 MOVOU X0, (CX)
12484 MOVOU X1, 16(CX)
12485 MOVOU X2, -32(CX)(BX*1)
12486 MOVOU X3, -16(CX)(BX*1)
12487 MOVQ DX, CX
12488
12489 emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K:
12490 MOVQ dst_base+0(FP), AX
12491 SUBQ AX, CX
12492 MOVQ CX, ret+56(FP)
12493 RET
12494
12495 // func encodeSnappyBlockAsm12B(dst []byte, src []byte, tmp *[16384]byte) int
12496 // Requires: BMI, SSE2
12497 TEXT ·encodeSnappyBlockAsm12B(SB), $24-64
12498 MOVQ tmp+48(FP), AX
12499 MOVQ dst_base+0(FP), CX
12500 MOVQ $0x00000080, DX
12501 MOVQ AX, BX
12502 PXOR X0, X0
12503
12504 zero_loop_encodeSnappyBlockAsm12B:
12505 MOVOU X0, (BX)
12506 MOVOU X0, 16(BX)
12507 MOVOU X0, 32(BX)
12508 MOVOU X0, 48(BX)
12509 MOVOU X0, 64(BX)
12510 MOVOU X0, 80(BX)
12511 MOVOU X0, 96(BX)
12512 MOVOU X0, 112(BX)
12513 ADDQ $0x80, BX
12514 DECQ DX
12515 JNZ zero_loop_encodeSnappyBlockAsm12B
12516 MOVL $0x00000000, 12(SP)
12517 MOVQ src_len+32(FP), DX
12518 LEAQ -9(DX), BX
12519 LEAQ -8(DX), SI
12520 MOVL SI, 8(SP)
12521 SHRQ $0x05, DX
12522 SUBL DX, BX
12523 LEAQ (CX)(BX*1), BX
12524 MOVQ BX, (SP)
12525 MOVL $0x00000001, DX
12526 MOVL DX, 16(SP)
12527 MOVQ src_base+24(FP), BX
12528
12529 search_loop_encodeSnappyBlockAsm12B:
12530 MOVL DX, SI
12531 SUBL 12(SP), SI
12532 SHRL $0x05, SI
12533 LEAL 4(DX)(SI*1), SI
12534 CMPL SI, 8(SP)
12535 JAE emit_remainder_encodeSnappyBlockAsm12B
12536 MOVQ (BX)(DX*1), DI
12537 MOVL SI, 20(SP)
12538 MOVQ $0x000000cf1bbcdcbb, R9
12539 MOVQ DI, R10
12540 MOVQ DI, R11
12541 SHRQ $0x08, R11
12542 SHLQ $0x18, R10
12543 IMULQ R9, R10
12544 SHRQ $0x34, R10
12545 SHLQ $0x18, R11
12546 IMULQ R9, R11
12547 SHRQ $0x34, R11
12548 MOVL (AX)(R10*4), SI
12549 MOVL (AX)(R11*4), R8
12550 MOVL DX, (AX)(R10*4)
12551 LEAL 1(DX), R10
12552 MOVL R10, (AX)(R11*4)
12553 MOVQ DI, R10
12554 SHRQ $0x10, R10
12555 SHLQ $0x18, R10
12556 IMULQ R9, R10
12557 SHRQ $0x34, R10
12558 MOVL DX, R9
12559 SUBL 16(SP), R9
12560 MOVL 1(BX)(R9*1), R11
12561 MOVQ DI, R9
12562 SHRQ $0x08, R9
12563 CMPL R9, R11
12564 JNE no_repeat_found_encodeSnappyBlockAsm12B
12565 LEAL 1(DX), DI
12566 MOVL 12(SP), SI
12567 MOVL DI, R8
12568 SUBL 16(SP), R8
12569 JZ repeat_extend_back_end_encodeSnappyBlockAsm12B
12570
12571 repeat_extend_back_loop_encodeSnappyBlockAsm12B:
12572 CMPL DI, SI
12573 JBE repeat_extend_back_end_encodeSnappyBlockAsm12B
12574 MOVB -1(BX)(R8*1), R9
12575 MOVB -1(BX)(DI*1), R10
12576 CMPB R9, R10
12577 JNE repeat_extend_back_end_encodeSnappyBlockAsm12B
12578 LEAL -1(DI), DI
12579 DECL R8
12580 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B
12581
12582 repeat_extend_back_end_encodeSnappyBlockAsm12B:
12583 MOVL DI, SI
12584 SUBL 12(SP), SI
12585 LEAQ 3(CX)(SI*1), SI
12586 CMPQ SI, (SP)
12587 JB repeat_dst_size_check_encodeSnappyBlockAsm12B
12588 MOVQ $0x00000000, ret+56(FP)
12589 RET
12590
12591 repeat_dst_size_check_encodeSnappyBlockAsm12B:
12592 MOVL 12(SP), SI
12593 CMPL SI, DI
12594 JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
12595 MOVL DI, R8
12596 MOVL DI, 12(SP)
12597 LEAQ (BX)(SI*1), R9
12598 SUBL SI, R8
12599 LEAL -1(R8), SI
12600 CMPL SI, $0x3c
12601 JB one_byte_repeat_emit_encodeSnappyBlockAsm12B
12602 CMPL SI, $0x00000100
12603 JB two_bytes_repeat_emit_encodeSnappyBlockAsm12B
12604 JB three_bytes_repeat_emit_encodeSnappyBlockAsm12B
12605
12606 three_bytes_repeat_emit_encodeSnappyBlockAsm12B:
12607 MOVB $0xf4, (CX)
12608 MOVW SI, 1(CX)
12609 ADDQ $0x03, CX
12610 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
12611
12612 two_bytes_repeat_emit_encodeSnappyBlockAsm12B:
12613 MOVB $0xf0, (CX)
12614 MOVB SI, 1(CX)
12615 ADDQ $0x02, CX
12616 CMPL SI, $0x40
12617 JB memmove_repeat_emit_encodeSnappyBlockAsm12B
12618 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B
12619
12620 one_byte_repeat_emit_encodeSnappyBlockAsm12B:
12621 SHLB $0x02, SI
12622 MOVB SI, (CX)
12623 ADDQ $0x01, CX
12624
12625 memmove_repeat_emit_encodeSnappyBlockAsm12B:
12626 LEAQ (CX)(R8*1), SI
12627
12628 // genMemMoveShort
12629 CMPQ R8, $0x08
12630 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8
12631 CMPQ R8, $0x10
12632 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
12633 CMPQ R8, $0x20
12634 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
12635 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
12636
12637 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8:
12638 MOVQ (R9), R10
12639 MOVQ R10, (CX)
12640 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
12641
12642 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
12643 MOVQ (R9), R10
12644 MOVQ -8(R9)(R8*1), R9
12645 MOVQ R10, (CX)
12646 MOVQ R9, -8(CX)(R8*1)
12647 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
12648
12649 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
12650 MOVOU (R9), X0
12651 MOVOU -16(R9)(R8*1), X1
12652 MOVOU X0, (CX)
12653 MOVOU X1, -16(CX)(R8*1)
12654 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B
12655
12656 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
12657 MOVOU (R9), X0
12658 MOVOU 16(R9), X1
12659 MOVOU -32(R9)(R8*1), X2
12660 MOVOU -16(R9)(R8*1), X3
12661 MOVOU X0, (CX)
12662 MOVOU X1, 16(CX)
12663 MOVOU X2, -32(CX)(R8*1)
12664 MOVOU X3, -16(CX)(R8*1)
12665
12666 memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B:
12667 MOVQ SI, CX
12668 JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B
12669
12670 memmove_long_repeat_emit_encodeSnappyBlockAsm12B:
12671 LEAQ (CX)(R8*1), SI
12672
12673 // genMemMoveLong
12674 MOVOU (R9), X0
12675 MOVOU 16(R9), X1
12676 MOVOU -32(R9)(R8*1), X2
12677 MOVOU -16(R9)(R8*1), X3
12678 MOVQ R8, R11
12679 SHRQ $0x05, R11
12680 MOVQ CX, R10
12681 ANDL $0x0000001f, R10
12682 MOVQ $0x00000040, R12
12683 SUBQ R10, R12
12684 DECQ R11
12685 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
12686 LEAQ -32(R9)(R12*1), R10
12687 LEAQ -32(CX)(R12*1), R13
12688
12689 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
12690 MOVOU (R10), X4
12691 MOVOU 16(R10), X5
12692 MOVOA X4, (R13)
12693 MOVOA X5, 16(R13)
12694 ADDQ $0x20, R13
12695 ADDQ $0x20, R10
12696 ADDQ $0x20, R12
12697 DECQ R11
12698 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
12699
12700 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
12701 MOVOU -32(R9)(R12*1), X4
12702 MOVOU -16(R9)(R12*1), X5
12703 MOVOA X4, -32(CX)(R12*1)
12704 MOVOA X5, -16(CX)(R12*1)
12705 ADDQ $0x20, R12
12706 CMPQ R8, R12
12707 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
12708 MOVOU X0, (CX)
12709 MOVOU X1, 16(CX)
12710 MOVOU X2, -32(CX)(R8*1)
12711 MOVOU X3, -16(CX)(R8*1)
12712 MOVQ SI, CX
12713
12714 emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B:
12715 ADDL $0x05, DX
12716 MOVL DX, SI
12717 SUBL 16(SP), SI
12718 MOVQ src_len+32(FP), R8
12719 SUBL DX, R8
12720 LEAQ (BX)(DX*1), R9
12721 LEAQ (BX)(SI*1), SI
12722
12723 // matchLen
12724 XORL R11, R11
12725
12726 matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B:
12727 CMPL R8, $0x10
12728 JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B
12729 MOVQ (R9)(R11*1), R10
12730 MOVQ 8(R9)(R11*1), R12
12731 XORQ (SI)(R11*1), R10
12732 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B
12733 XORQ 8(SI)(R11*1), R12
12734 JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B
12735 LEAL -16(R8), R8
12736 LEAL 16(R11), R11
12737 JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B
12738
12739 matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B:
12740 #ifdef GOAMD64_v3
12741 TZCNTQ R12, R12
12742
12743 #else
12744 BSFQ R12, R12
12745
12746 #endif
12747 SARQ $0x03, R12
12748 LEAL 8(R11)(R12*1), R11
12749 JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B
12750
12751 matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B:
12752 CMPL R8, $0x08
12753 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B
12754 MOVQ (R9)(R11*1), R10
12755 XORQ (SI)(R11*1), R10
12756 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B
12757 LEAL -8(R8), R8
12758 LEAL 8(R11), R11
12759 JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B
12760
12761 matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B:
12762 #ifdef GOAMD64_v3
12763 TZCNTQ R10, R10
12764
12765 #else
12766 BSFQ R10, R10
12767
12768 #endif
12769 SARQ $0x03, R10
12770 LEAL (R11)(R10*1), R11
12771 JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B
12772
12773 matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B:
12774 CMPL R8, $0x04
12775 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
12776 MOVL (R9)(R11*1), R10
12777 CMPL (SI)(R11*1), R10
12778 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B
12779 LEAL -4(R8), R8
12780 LEAL 4(R11), R11
12781
12782 matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B:
12783 CMPL R8, $0x01
12784 JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
12785 JB repeat_extend_forward_end_encodeSnappyBlockAsm12B
12786 MOVW (R9)(R11*1), R10
12787 CMPW (SI)(R11*1), R10
12788 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B
12789 LEAL 2(R11), R11
12790 SUBL $0x02, R8
12791 JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B
12792
12793 matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B:
12794 MOVB (R9)(R11*1), R10
12795 CMPB (SI)(R11*1), R10
12796 JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B
12797 LEAL 1(R11), R11
12798
12799 repeat_extend_forward_end_encodeSnappyBlockAsm12B:
12800 ADDL R11, DX
12801 MOVL DX, SI
12802 SUBL DI, SI
12803 MOVL 16(SP), DI
12804
12805 // emitCopy
12806 two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B:
12807 CMPL SI, $0x40
12808 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B
12809 MOVB $0xee, (CX)
12810 MOVW DI, 1(CX)
12811 LEAL -60(SI), SI
12812 ADDQ $0x03, CX
12813 JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B
12814
12815 two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B:
12816 MOVL SI, R8
12817 SHLL $0x02, R8
12818 CMPL SI, $0x0c
12819 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
12820 CMPL DI, $0x00000800
12821 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B
12822 LEAL -15(R8), R8
12823 MOVB DI, 1(CX)
12824 SHRL $0x08, DI
12825 SHLL $0x05, DI
12826 ORL DI, R8
12827 MOVB R8, (CX)
12828 ADDQ $0x02, CX
12829 JMP repeat_end_emit_encodeSnappyBlockAsm12B
12830
12831 emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B:
12832 LEAL -2(R8), R8
12833 MOVB R8, (CX)
12834 MOVW DI, 1(CX)
12835 ADDQ $0x03, CX
12836
12837 repeat_end_emit_encodeSnappyBlockAsm12B:
12838 MOVL DX, 12(SP)
12839 JMP search_loop_encodeSnappyBlockAsm12B
12840
12841 no_repeat_found_encodeSnappyBlockAsm12B:
12842 CMPL (BX)(SI*1), DI
12843 JEQ candidate_match_encodeSnappyBlockAsm12B
12844 SHRQ $0x08, DI
12845 MOVL (AX)(R10*4), SI
12846 LEAL 2(DX), R9
12847 CMPL (BX)(R8*1), DI
12848 JEQ candidate2_match_encodeSnappyBlockAsm12B
12849 MOVL R9, (AX)(R10*4)
12850 SHRQ $0x08, DI
12851 CMPL (BX)(SI*1), DI
12852 JEQ candidate3_match_encodeSnappyBlockAsm12B
12853 MOVL 20(SP), DX
12854 JMP search_loop_encodeSnappyBlockAsm12B
12855
12856 candidate3_match_encodeSnappyBlockAsm12B:
12857 ADDL $0x02, DX
12858 JMP candidate_match_encodeSnappyBlockAsm12B
12859
12860 candidate2_match_encodeSnappyBlockAsm12B:
12861 MOVL R9, (AX)(R10*4)
12862 INCL DX
12863 MOVL R8, SI
12864
12865 candidate_match_encodeSnappyBlockAsm12B:
12866 MOVL 12(SP), DI
12867 TESTL SI, SI
12868 JZ match_extend_back_end_encodeSnappyBlockAsm12B
12869
12870 match_extend_back_loop_encodeSnappyBlockAsm12B:
12871 CMPL DX, DI
12872 JBE match_extend_back_end_encodeSnappyBlockAsm12B
12873 MOVB -1(BX)(SI*1), R8
12874 MOVB -1(BX)(DX*1), R9
12875 CMPB R8, R9
12876 JNE match_extend_back_end_encodeSnappyBlockAsm12B
12877 LEAL -1(DX), DX
12878 DECL SI
12879 JZ match_extend_back_end_encodeSnappyBlockAsm12B
12880 JMP match_extend_back_loop_encodeSnappyBlockAsm12B
12881
12882 match_extend_back_end_encodeSnappyBlockAsm12B:
12883 MOVL DX, DI
12884 SUBL 12(SP), DI
12885 LEAQ 3(CX)(DI*1), DI
12886 CMPQ DI, (SP)
12887 JB match_dst_size_check_encodeSnappyBlockAsm12B
12888 MOVQ $0x00000000, ret+56(FP)
12889 RET
12890
12891 match_dst_size_check_encodeSnappyBlockAsm12B:
12892 MOVL DX, DI
12893 MOVL 12(SP), R8
12894 CMPL R8, DI
12895 JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B
12896 MOVL DI, R9
12897 MOVL DI, 12(SP)
12898 LEAQ (BX)(R8*1), DI
12899 SUBL R8, R9
12900 LEAL -1(R9), R8
12901 CMPL R8, $0x3c
12902 JB one_byte_match_emit_encodeSnappyBlockAsm12B
12903 CMPL R8, $0x00000100
12904 JB two_bytes_match_emit_encodeSnappyBlockAsm12B
12905 JB three_bytes_match_emit_encodeSnappyBlockAsm12B
12906
12907 three_bytes_match_emit_encodeSnappyBlockAsm12B:
12908 MOVB $0xf4, (CX)
12909 MOVW R8, 1(CX)
12910 ADDQ $0x03, CX
12911 JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
12912
12913 two_bytes_match_emit_encodeSnappyBlockAsm12B:
12914 MOVB $0xf0, (CX)
12915 MOVB R8, 1(CX)
12916 ADDQ $0x02, CX
12917 CMPL R8, $0x40
12918 JB memmove_match_emit_encodeSnappyBlockAsm12B
12919 JMP memmove_long_match_emit_encodeSnappyBlockAsm12B
12920
12921 one_byte_match_emit_encodeSnappyBlockAsm12B:
12922 SHLB $0x02, R8
12923 MOVB R8, (CX)
12924 ADDQ $0x01, CX
12925
12926 memmove_match_emit_encodeSnappyBlockAsm12B:
12927 LEAQ (CX)(R9*1), R8
12928
12929 // genMemMoveShort
12930 CMPQ R9, $0x08
12931 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8
12932 CMPQ R9, $0x10
12933 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16
12934 CMPQ R9, $0x20
12935 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32
12936 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64
12937
12938 emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8:
12939 MOVQ (DI), R10
12940 MOVQ R10, (CX)
12941 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
12942
12943 emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16:
12944 MOVQ (DI), R10
12945 MOVQ -8(DI)(R9*1), DI
12946 MOVQ R10, (CX)
12947 MOVQ DI, -8(CX)(R9*1)
12948 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
12949
12950 emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32:
12951 MOVOU (DI), X0
12952 MOVOU -16(DI)(R9*1), X1
12953 MOVOU X0, (CX)
12954 MOVOU X1, -16(CX)(R9*1)
12955 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B
12956
12957 emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64:
12958 MOVOU (DI), X0
12959 MOVOU 16(DI), X1
12960 MOVOU -32(DI)(R9*1), X2
12961 MOVOU -16(DI)(R9*1), X3
12962 MOVOU X0, (CX)
12963 MOVOU X1, 16(CX)
12964 MOVOU X2, -32(CX)(R9*1)
12965 MOVOU X3, -16(CX)(R9*1)
12966
12967 memmove_end_copy_match_emit_encodeSnappyBlockAsm12B:
12968 MOVQ R8, CX
12969 JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B
12970
12971 memmove_long_match_emit_encodeSnappyBlockAsm12B:
12972 LEAQ (CX)(R9*1), R8
12973
12974 // genMemMoveLong
12975 MOVOU (DI), X0
12976 MOVOU 16(DI), X1
12977 MOVOU -32(DI)(R9*1), X2
12978 MOVOU -16(DI)(R9*1), X3
12979 MOVQ R9, R11
12980 SHRQ $0x05, R11
12981 MOVQ CX, R10
12982 ANDL $0x0000001f, R10
12983 MOVQ $0x00000040, R12
12984 SUBQ R10, R12
12985 DECQ R11
12986 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
12987 LEAQ -32(DI)(R12*1), R10
12988 LEAQ -32(CX)(R12*1), R13
12989
12990 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back:
12991 MOVOU (R10), X4
12992 MOVOU 16(R10), X5
12993 MOVOA X4, (R13)
12994 MOVOA X5, 16(R13)
12995 ADDQ $0x20, R13
12996 ADDQ $0x20, R10
12997 ADDQ $0x20, R12
12998 DECQ R11
12999 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back
13000
13001 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
13002 MOVOU -32(DI)(R12*1), X4
13003 MOVOU -16(DI)(R12*1), X5
13004 MOVOA X4, -32(CX)(R12*1)
13005 MOVOA X5, -16(CX)(R12*1)
13006 ADDQ $0x20, R12
13007 CMPQ R9, R12
13008 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
13009 MOVOU X0, (CX)
13010 MOVOU X1, 16(CX)
13011 MOVOU X2, -32(CX)(R9*1)
13012 MOVOU X3, -16(CX)(R9*1)
13013 MOVQ R8, CX
13014
// Match-length computation. Both labels below mark the same address.
// On entry DX is the current source offset and SI the offset of the earlier
// candidate position. BX is the base register for every source access in this
// span (presumably src_base, loaded before this span - confirm).
13015 emit_literal_done_match_emit_encodeSnappyBlockAsm12B:
13016 match_nolit_loop_encodeSnappyBlockAsm12B:
// 16(SP) = DX - SI: the backward distance used by the copy emitted afterwards.
13017 MOVL DX, DI
13018 SUBL SI, DI
13019 MOVL DI, 16(SP)
// Skip 4 bytes at both positions. The jump into match_nolit_loop follows a
// successful 4-byte compare of these positions; the fall-through entry
// presumably does too (its compare is outside this span).
13020 ADDL $0x04, DX
13021 ADDL $0x04, SI
// DI = src_len - DX: source bytes still available for comparison.
13022 MOVQ src_len+32(FP), DI
13023 SUBL DX, DI
// R8 and SI become pointers to the two byte runs being compared.
13024 LEAQ (BX)(DX*1), R8
13025 LEAQ (BX)(SI*1), SI
13026
13027 // matchLen
// R10 accumulates the number of equal bytes found.
13028 XORL R10, R10
13029
// Compare 16 bytes (two 8-byte words) per iteration while DI >= 16.
13030 matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B:
13031 CMPL DI, $0x10
13032 JB matchlen_match8_match_nolit_encodeSnappyBlockAsm12B
13033 MOVQ (R8)(R10*1), R9
13034 MOVQ 8(R8)(R10*1), R11
// A non-zero XOR means the words differ; the bsf_* labels locate the byte.
13035 XORQ (SI)(R10*1), R9
13036 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B
13037 XORQ 8(SI)(R10*1), R11
13038 JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B
13039 LEAL -16(DI), DI
13040 LEAL 16(R10), R10
13041 JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B
13042
// Difference is in the second word: lowest set bit index / 8 is the number of
// equal low-order (first in memory) bytes; add 8 for the equal first word.
13043 matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B:
13044 #ifdef GOAMD64_v3
13045 TZCNTQ R11, R11
13046
13047 #else
13048 BSFQ R11, R11
13049
13050 #endif
13051 SARQ $0x03, R11
13052 LEAL 8(R10)(R11*1), R10
13053 JMP match_nolit_end_encodeSnappyBlockAsm12B
13054
// Fewer than 16 bytes left: try one 8-byte word if at least 8 remain.
13055 matchlen_match8_match_nolit_encodeSnappyBlockAsm12B:
13056 CMPL DI, $0x08
13057 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm12B
13058 MOVQ (R8)(R10*1), R9
13059 XORQ (SI)(R10*1), R9
13060 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B
13061 LEAL -8(DI), DI
13062 LEAL 8(R10), R10
13063 JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm12B
13064
// Difference is in the word held in R9: add the count of equal leading bytes.
13065 matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B:
13066 #ifdef GOAMD64_v3
13067 TZCNTQ R9, R9
13068
13069 #else
13070 BSFQ R9, R9
13071
13072 #endif
13073 SARQ $0x03, R9
13074 LEAL (R10)(R9*1), R10
13075 JMP match_nolit_end_encodeSnappyBlockAsm12B
13076
// Tail handling: 4-byte, then 2-byte, then 1-byte compares. A failed 4-byte
// compare falls into the 2-byte step without advancing.
13077 matchlen_match4_match_nolit_encodeSnappyBlockAsm12B:
13078 CMPL DI, $0x04
13079 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
13080 MOVL (R8)(R10*1), R9
13081 CMPL (SI)(R10*1), R9
13082 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B
13083 LEAL -4(DI), DI
13084 LEAL 4(R10), R10
13085
13086 matchlen_match2_match_nolit_encodeSnappyBlockAsm12B:
// DI == 1: only a single byte is left; DI == 0: nothing left to compare.
13087 CMPL DI, $0x01
13088 JE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
13089 JB match_nolit_end_encodeSnappyBlockAsm12B
13090 MOVW (R8)(R10*1), R9
13091 CMPW (SI)(R10*1), R9
13092 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B
13093 LEAL 2(R10), R10
13094 SUBL $0x02, DI
13095 JZ match_nolit_end_encodeSnappyBlockAsm12B
13096
13097 matchlen_match1_match_nolit_encodeSnappyBlockAsm12B:
13098 MOVB (R8)(R10*1), R9
13099 CMPB (SI)(R10*1), R9
13100 JNE match_nolit_end_encodeSnappyBlockAsm12B
13101 LEAL 1(R10), R10
13102
// Emit the copy for the match just measured.
// R10 holds the number of equal bytes found after the first 4; CX is the
// output write pointer.
13103 match_nolit_end_encodeSnappyBlockAsm12B:
// Advance DX past the match, reload the distance saved in 16(SP), and make
// R10 the total match length (the 4 skipped bytes included).
13104 ADDL R10, DX
13105 MOVL 16(SP), SI
13106 ADDL $0x04, R10
// 12(SP) = DX: everything before DX is now accounted for in the output.
13107 MOVL DX, 12(SP)
13108
13109 // emitCopy
// While more than 64 bytes remain, write a 3-byte element: tag 0xee
// (= 59<<2 | 2, i.e. a 60-byte copy in the 2-byte-offset form of the Snappy
// format) followed by the 16-bit distance, then loop with 60 fewer bytes.
13110 two_byte_offset_match_nolit_encodeSnappyBlockAsm12B:
13111 CMPL R10, $0x40
13112 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B
13113 MOVB $0xee, (CX)
13114 MOVW SI, 1(CX)
13115 LEAL -60(R10), R10
13116 ADDQ $0x03, CX
13117 JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B
13118
// Length <= 64. DI = length * 4 is the starting point for either tag form.
13119 two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B:
13120 MOVL R10, DI
13121 SHLL $0x02, DI
// The 2-byte element is used only when length < 12 and distance < 2048;
// otherwise the 3-byte element below is written.
13122 CMPL R10, $0x0c
13123 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
13124 CMPL SI, $0x00000800
13125 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B
// 2-byte element: tag = ((length-4)<<2 | 1) | (distance>>8)<<5, where
// length*4 - 15 == (length-4)<<2 | 1; second byte = low 8 bits of distance.
// The low byte is stored before SI is shifted.
13126 LEAL -15(DI), DI
13127 MOVB SI, 1(CX)
13128 SHRL $0x08, SI
13129 SHLL $0x05, SI
13130 ORL SI, DI
13131 MOVB DI, (CX)
13132 ADDQ $0x02, CX
13133 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B
13134
// 3-byte element: tag = (length-1)<<2 | 2 (length*4 - 2), then the 16-bit
// distance.
13135 emit_copy_three_match_nolit_encodeSnappyBlockAsm12B:
13136 LEAL -2(DI), DI
13137 MOVB DI, (CX)
13138 MOVW SI, 1(CX)
13139 ADDQ $0x03, CX
13140
// After a copy has been emitted: check the input and output limits, record two
// positions in the hash table, and test whether another match starts at DX.
13141 match_nolit_emitcopy_end_encodeSnappyBlockAsm12B:
// Stop searching once DX reaches the limit kept in 8(SP).
13142 CMPL DX, 8(SP)
13143 JAE emit_remainder_encodeSnappyBlockAsm12B
// DI = 8 source bytes starting at offset DX-2.
13144 MOVQ -2(BX)(DX*1), DI
// Output pointer at or beyond the limit in (SP): store 0 as the result and
// return (presumably telling the caller the block did not fit - confirm).
13145 CMPQ CX, (SP)
13146 JB match_nolit_dst_ok_encodeSnappyBlockAsm12B
13147 MOVQ $0x00000000, ret+56(FP)
13148 RET
13149
13150 match_nolit_dst_ok_encodeSnappyBlockAsm12B:
// Hash multiplier. Each hash shifts left by 24 (dropping the top 3 bytes of
// the loaded word), multiplies, and keeps the top 12 bits (>> 52) as index.
13151 MOVQ $0x000000cf1bbcdcbb, R9
// R8 = word at DX-2; DI and SI = the same word shifted down 2 bytes, i.e. the
// bytes starting at DX.
13152 MOVQ DI, R8
13153 SHRQ $0x10, DI
13154 MOVQ DI, SI
13155 SHLQ $0x18, R8
13156 IMULQ R9, R8
13157 SHRQ $0x34, R8
13158 SHLQ $0x18, SI
13159 IMULQ R9, SI
13160 SHRQ $0x34, SI
// AX is the base of the table of 4-byte entries (presumably tmp - confirm).
// Read the previous entry for DX's hash into SI, then record DX-2 and DX.
13161 LEAL -2(DX), R9
13162 LEAQ (AX)(SI*4), R10
13163 MOVL (R10), SI
13164 MOVL R9, (AX)(R8*4)
13165 MOVL DX, (R10)
// If the 4 bytes at the previous entry equal the 4 bytes at DX, continue
// with another match right away; otherwise resume the search at DX+1.
13166 CMPL (BX)(SI*1), DI
13167 JEQ match_nolit_loop_encodeSnappyBlockAsm12B
13168 INCL DX
13169 JMP search_loop_encodeSnappyBlockAsm12B
13170
// Output-space check before the trailing literal run is written.
// AX = CX + (src_len - 12(SP)) + 3: the output pointer after the remaining
// source bytes plus 3 bytes, the largest literal header written below.
13171 emit_remainder_encodeSnappyBlockAsm12B:
13172 MOVQ src_len+32(FP), AX
13173 SUBL 12(SP), AX
13174 LEAQ 3(CX)(AX*1), AX
// Proceed only if that stays below the limit pointer in (SP); otherwise
// store 0 as the result and return.
13175 CMPQ AX, (SP)
13176 JB emit_remainder_ok_encodeSnappyBlockAsm12B
13177 MOVQ $0x00000000, ret+56(FP)
13178 RET
13179
// Write the header of the final literal run covering source offsets
// [12(SP), src_len). Nothing is written when the two are equal.
13180 emit_remainder_ok_encodeSnappyBlockAsm12B:
13181 MOVQ src_len+32(FP), AX
13182 MOVL 12(SP), DX
13183 CMPL DX, AX
13184 JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
// SI = run length, AX = pointer to the first byte of the run (BX + start),
// DX = length - 1 (the value encoded in the header). 12(SP) becomes src_len.
13185 MOVL AX, SI
13186 MOVL AX, 12(SP)
13187 LEAQ (BX)(DX*1), AX
13188 SUBL DX, SI
13189 LEAL -1(SI), DX
// Header size depends on length-1: < 60 -> 1 byte, < 256 -> 2 bytes,
// otherwise 3 bytes.
13190 CMPL DX, $0x3c
13191 JB one_byte_emit_remainder_encodeSnappyBlockAsm12B
13192 CMPL DX, $0x00000100
13193 JB two_bytes_emit_remainder_encodeSnappyBlockAsm12B
// This second JB tests the same flags as the one above and so is never taken;
// execution falls through into the three-byte case either way.
13194 JB three_bytes_emit_remainder_encodeSnappyBlockAsm12B
13195
// 3-byte header: tag 0xf4 followed by length-1 as a 16-bit value.
// NOTE(review): only the low 16 bits of length-1 are stored; assumes the
// caller bounds the input size accordingly - confirm.
13196 three_bytes_emit_remainder_encodeSnappyBlockAsm12B:
13197 MOVB $0xf4, (CX)
13198 MOVW DX, 1(CX)
13199 ADDQ $0x03, CX
13200 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B
13201
// 2-byte header: tag 0xf0 followed by length-1 as one byte. Runs with
// length-1 < 64 use the short copy, longer ones the long copy.
13202 two_bytes_emit_remainder_encodeSnappyBlockAsm12B:
13203 MOVB $0xf0, (CX)
13204 MOVB DL, 1(CX)
13205 ADDQ $0x02, CX
13206 CMPL DX, $0x40
13207 JB memmove_emit_remainder_encodeSnappyBlockAsm12B
13208 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B
13209
// 1-byte header: (length-1) << 2; falls through into the short copy.
13210 one_byte_emit_remainder_encodeSnappyBlockAsm12B:
13211 SHLB $0x02, DL
13212 MOVB DL, (CX)
13213 ADDQ $0x01, CX
13214
// Short copy of the final literal run: SI bytes from AX to CX.
// DX = CX + SI is the output pointer after the copy. BX is overwritten with
// the length here and no longer addresses the source.
13215 memmove_emit_remainder_encodeSnappyBlockAsm12B:
13216 LEAQ (CX)(SI*1), DX
13217 MOVL SI, BX
13218
13219 // genMemMoveShort
// Dispatch on length: <3, ==3, 4..7, 8..16, 17..32, otherwise the 33..64
// case. Each case loads the first and last chunk of the run (which may
// overlap) and then stores both; all loads happen before the stores.
13220 CMPQ BX, $0x03
13221 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2
13222 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3
13223 CMPQ BX, $0x08
13224 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7
13225 CMPQ BX, $0x10
13226 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16
13227 CMPQ BX, $0x20
13228 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32
13229 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64
13230
// 1 or 2 bytes: first byte and last byte (the same byte when length is 1).
13231 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2:
13232 MOVB (AX), SI
13233 MOVB -1(AX)(BX*1), AL
13234 MOVB SI, (CX)
13235 MOVB AL, -1(CX)(BX*1)
13236 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
13237
// Exactly 3 bytes: one 2-byte move plus one 1-byte move.
13238 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3:
13239 MOVW (AX), SI
13240 MOVB 2(AX), AL
13241 MOVW SI, (CX)
13242 MOVB AL, 2(CX)
13243 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
13244
// 4..7 bytes: two overlapping 4-byte moves.
13245 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7:
13246 MOVL (AX), SI
13247 MOVL -4(AX)(BX*1), AX
13248 MOVL SI, (CX)
13249 MOVL AX, -4(CX)(BX*1)
13250 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
13251
// 8..16 bytes: two overlapping 8-byte moves.
13252 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16:
13253 MOVQ (AX), SI
13254 MOVQ -8(AX)(BX*1), AX
13255 MOVQ SI, (CX)
13256 MOVQ AX, -8(CX)(BX*1)
13257 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
13258
// 17..32 bytes: two overlapping 16-byte unaligned moves.
13259 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32:
13260 MOVOU (AX), X0
13261 MOVOU -16(AX)(BX*1), X1
13262 MOVOU X0, (CX)
13263 MOVOU X1, -16(CX)(BX*1)
13264 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B
13265
// 33..64 bytes: first 32 and last 32 bytes as four 16-byte unaligned moves.
13266 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64:
13267 MOVOU (AX), X0
13268 MOVOU 16(AX), X1
13269 MOVOU -32(AX)(BX*1), X2
13270 MOVOU -16(AX)(BX*1), X3
13271 MOVOU X0, (CX)
13272 MOVOU X1, 16(CX)
13273 MOVOU X2, -32(CX)(BX*1)
13274 MOVOU X3, -16(CX)(BX*1)
13275
// Advance the output pointer past the copied bytes.
13276 memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B:
13277 MOVQ DX, CX
13278 JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B
13279
// Long copy of the final literal run (SI bytes from AX to CX), followed by
// the function's normal return.
// DX = CX + SI is the output pointer after the copy; BX is overwritten with
// the length.
13280 memmove_long_emit_remainder_encodeSnappyBlockAsm12B:
13281 LEAQ (CX)(SI*1), DX
13282 MOVL SI, BX
13283
13284 // genMemMoveLong
// Capture the first 32 and last 32 source bytes; they are stored at the end.
13285 MOVOU (AX), X0
13286 MOVOU 16(AX), X1
13287 MOVOU -32(AX)(BX*1), X2
13288 MOVOU -16(AX)(BX*1), X3
// DI = length / 32 (number of 32-byte chunks).
13289 MOVQ BX, DI
13290 SHRQ $0x05, DI
// R8 = 64 - (CX & 31). CX + R8 - 32 is then a multiple of 32, so the aligned
// MOVOA stores below, which address -32(CX)(R8*1), hit aligned addresses.
13291 MOVQ CX, SI
13292 ANDL $0x0000001f, SI
13293 MOVQ $0x00000040, R8
13294 SUBQ SI, R8
// DECQ leaves CF unchanged, so JA here (and JNA below) also depends on the CF
// left by the preceding SUBQ/ADDQ.
// NOTE(review): confirm the intended branch conditions against the generator
// before changing anything in this sequence.
13295 DECQ DI
13296 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
13297 LEAQ -32(AX)(R8*1), SI
13298 LEAQ -32(CX)(R8*1), R9
13299
// 32 bytes per pass: unaligned loads from SI, aligned stores to R9.
13300 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back:
13301 MOVOU (SI), X4
13302 MOVOU 16(SI), X5
13303 MOVOA X4, (R9)
13304 MOVOA X5, 16(R9)
13305 ADDQ $0x20, R9
13306 ADDQ $0x20, SI
13307 ADDQ $0x20, R8
13308 DECQ DI
13309 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back
13310
// 32 bytes per pass indexed by R8; repeats while length (BX) >= R8.
13311 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32:
13312 MOVOU -32(AX)(R8*1), X4
13313 MOVOU -16(AX)(R8*1), X5
13314 MOVOA X4, -32(CX)(R8*1)
13315 MOVOA X5, -16(CX)(R8*1)
13316 ADDQ $0x20, R8
13317 CMPQ BX, R8
13318 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32
// Store the first and last 32 bytes captured in X0-X3, then advance CX.
13319 MOVOU X0, (CX)
13320 MOVOU X1, 16(CX)
13321 MOVOU X2, -32(CX)(BX*1)
13322 MOVOU X3, -16(CX)(BX*1)
13323 MOVQ DX, CX
13324
// Normal exit: the result is CX - dst_base, the number of bytes written.
13325 emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B:
13326 MOVQ dst_base+0(FP), AX
13327 SUBQ AX, CX
13328 MOVQ CX, ret+56(FP)
13329 RET
13330
13331 // func encodeSnappyBlockAsm10B(dst []byte, src []byte, tmp *[4096]byte) int
13332 // Requires: BMI, SSE2
//
// Prologue. Register roles set up here: AX = tmp (table base), CX = output
// write pointer (starts at dst_base), BX = src_base, DX = current source
// offset. The 24-byte frame holds:
//   (SP)   output limit pointer (8 bytes)
//   8(SP)  src_len - 8, the limit compared against source offsets
//   12(SP) source offset up to which input has been emitted (starts at 0)
//   16(SP) distance used by the repeat check (starts at 1)
//   20(SP) written in the search loop
13333 TEXT ·encodeSnappyBlockAsm10B(SB), $24-64
13334 MOVQ tmp+48(FP), AX
13335 MOVQ dst_base+0(FP), CX
// Clear tmp: 0x20 iterations of 0x80 bytes = 4096 bytes.
13336 MOVQ $0x00000020, DX
13337 MOVQ AX, BX
13338 PXOR X0, X0
13339
13340 zero_loop_encodeSnappyBlockAsm10B:
13341 MOVOU X0, (BX)
13342 MOVOU X0, 16(BX)
13343 MOVOU X0, 32(BX)
13344 MOVOU X0, 48(BX)
13345 MOVOU X0, 64(BX)
13346 MOVOU X0, 80(BX)
13347 MOVOU X0, 96(BX)
13348 MOVOU X0, 112(BX)
13349 ADDQ $0x80, BX
13350 DECQ DX
13351 JNZ zero_loop_encodeSnappyBlockAsm10B
13352 MOVL $0x00000000, 12(SP)
13353 MOVQ src_len+32(FP), DX
13354 LEAQ -9(DX), BX
13355 LEAQ -8(DX), SI
13356 MOVL SI, 8(SP)
// (SP) = dst_base + (src_len - 9 - src_len/32); the subtraction is 32-bit.
13357 SHRQ $0x05, DX
13358 SUBL DX, BX
13359 LEAQ (CX)(BX*1), BX
13360 MOVQ BX, (SP)
// Searching starts at source offset 1.
13361 MOVL $0x00000001, DX
13362 MOVL DX, 16(SP)
13363 MOVQ src_base+24(FP), BX
13364
// One search step at source offset DX (AX = table base, BX = src_base).
13365 search_loop_encodeSnappyBlockAsm10B:
// SI = DX + 4 + ((DX - 12(SP)) >> 5): the offset to try next if nothing
// matches here; the stride grows with the distance since the last emit.
13366 MOVL DX, SI
13367 SUBL 12(SP), SI
13368 SHRL $0x05, SI
13369 LEAL 4(DX)(SI*1), SI
// Leave the search once that offset reaches the limit in 8(SP).
13370 CMPL SI, 8(SP)
13371 JAE emit_remainder_encodeSnappyBlockAsm10B
// DI = 8 source bytes at DX; 20(SP) keeps the next offset for later.
13372 MOVQ (BX)(DX*1), DI
13373 MOVL SI, 20(SP)
// Hashing: shift left by 32 (only the low 4 bytes take part), multiply by
// 0x9e3779b1, keep the top 10 bits (>> 54) as the table index.
// R10 = hash of the bytes at DX, R11 = hash of the bytes at DX+1.
13374 MOVQ $0x9e3779b1, R9
13375 MOVQ DI, R10
13376 MOVQ DI, R11
13377 SHRQ $0x08, R11
13378 SHLQ $0x20, R10
13379 IMULQ R9, R10
13380 SHRQ $0x36, R10
13381 SHLQ $0x20, R11
13382 IMULQ R9, R11
13383 SHRQ $0x36, R11
// Fetch the previous table entries (SI for DX, R8 for DX+1), then record
// DX and DX+1 in their place.
13384 MOVL (AX)(R10*4), SI
13385 MOVL (AX)(R11*4), R8
13386 MOVL DX, (AX)(R10*4)
13387 LEAL 1(DX), R10
13388 MOVL R10, (AX)(R11*4)
// R10 = hash of the bytes at DX+2 (used after the repeat check fails).
13389 MOVQ DI, R10
13390 SHRQ $0x10, R10
13391 SHLQ $0x20, R10
13392 IMULQ R9, R10
13393 SHRQ $0x36, R10
// Repeat check: compare the 4 bytes at DX+1 with the 4 bytes located 16(SP)
// bytes before them.
13394 MOVL DX, R9
13395 SUBL 16(SP), R9
13396 MOVL 1(BX)(R9*1), R11
13397 MOVQ DI, R9
13398 SHRQ $0x08, R9
13399 CMPL R9, R11
13400 JNE no_repeat_found_encodeSnappyBlockAsm10B
// Repeat found. DI = DX+1 (start of the match), SI = 12(SP),
// R8 = DI - 16(SP) (the earlier position). If R8 is 0 there is nothing
// before it, so backward extension is skipped.
13401 LEAL 1(DX), DI
13402 MOVL 12(SP), SI
13403 MOVL DI, R8
13404 SUBL 16(SP), R8
13405 JZ repeat_extend_back_end_encodeSnappyBlockAsm10B
13406
// Extend the repeat match backwards: while the byte before DI equals the byte
// before R8, move both back by one. Stops when DI reaches SI (the offset in
// 12(SP)), when the bytes differ, or when R8 reaches 0.
13407 repeat_extend_back_loop_encodeSnappyBlockAsm10B:
13408 CMPL DI, SI
13409 JBE repeat_extend_back_end_encodeSnappyBlockAsm10B
13410 MOVB -1(BX)(R8*1), R9
13411 MOVB -1(BX)(DI*1), R10
13412 CMPB R9, R10
13413 JNE repeat_extend_back_end_encodeSnappyBlockAsm10B
13414 LEAL -1(DI), DI
13415 DECL R8
13416 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B
13417
// Output-space check: SI = CX + (DI - 12(SP)) + 3, i.e. the output pointer
// after the pending literal bytes plus 3 more. Continue only if that stays
// below the limit in (SP); otherwise store 0 as the result and return.
13418 repeat_extend_back_end_encodeSnappyBlockAsm10B:
13419 MOVL DI, SI
13420 SUBL 12(SP), SI
13421 LEAQ 3(CX)(SI*1), SI
13422 CMPQ SI, (SP)
13423 JB repeat_dst_size_check_encodeSnappyBlockAsm10B
13424 MOVQ $0x00000000, ret+56(FP)
13425 RET
13426
13427 repeat_dst_size_check_encodeSnappyBlockAsm10B:
13428 MOVL 12(SP), SI
13429 CMPL SI, DI
13430 JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
13431 MOVL DI, R8
13432 MOVL DI, 12(SP)
13433 LEAQ (BX)(SI*1), R9
13434 SUBL SI, R8
13435 LEAL -1(R8), SI
13436 CMPL SI, $0x3c
13437 JB one_byte_repeat_emit_encodeSnappyBlockAsm10B
13438 CMPL SI, $0x00000100
13439 JB two_bytes_repeat_emit_encodeSnappyBlockAsm10B
13440 JB three_bytes_repeat_emit_encodeSnappyBlockAsm10B
13441
13442 three_bytes_repeat_emit_encodeSnappyBlockAsm10B:
13443 MOVB $0xf4, (CX)
13444 MOVW SI, 1(CX)
13445 ADDQ $0x03, CX
13446 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
13447
13448 two_bytes_repeat_emit_encodeSnappyBlockAsm10B:
13449 MOVB $0xf0, (CX)
13450 MOVB SI, 1(CX)
13451 ADDQ $0x02, CX
13452 CMPL SI, $0x40
13453 JB memmove_repeat_emit_encodeSnappyBlockAsm10B
13454 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B
13455
13456 one_byte_repeat_emit_encodeSnappyBlockAsm10B:
13457 SHLB $0x02, SI
13458 MOVB SI, (CX)
13459 ADDQ $0x01, CX
13460
13461 memmove_repeat_emit_encodeSnappyBlockAsm10B:
13462 LEAQ (CX)(R8*1), SI
13463
13464 // genMemMoveShort
13465 CMPQ R8, $0x08
13466 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8
13467 CMPQ R8, $0x10
13468 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
13469 CMPQ R8, $0x20
13470 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
13471 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
13472
13473 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8:
13474 MOVQ (R9), R10
13475 MOVQ R10, (CX)
13476 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
13477
13478 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
13479 MOVQ (R9), R10
13480 MOVQ -8(R9)(R8*1), R9
13481 MOVQ R10, (CX)
13482 MOVQ R9, -8(CX)(R8*1)
13483 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
13484
13485 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
13486 MOVOU (R9), X0
13487 MOVOU -16(R9)(R8*1), X1
13488 MOVOU X0, (CX)
13489 MOVOU X1, -16(CX)(R8*1)
13490 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B
13491
13492 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
13493 MOVOU (R9), X0
13494 MOVOU 16(R9), X1
13495 MOVOU -32(R9)(R8*1), X2
13496 MOVOU -16(R9)(R8*1), X3
13497 MOVOU X0, (CX)
13498 MOVOU X1, 16(CX)
13499 MOVOU X2, -32(CX)(R8*1)
13500 MOVOU X3, -16(CX)(R8*1)
13501
13502 memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B:
13503 MOVQ SI, CX
13504 JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
13505
13506 memmove_long_repeat_emit_encodeSnappyBlockAsm10B:
13507 LEAQ (CX)(R8*1), SI
13508
13509 // genMemMoveLong
13510 MOVOU (R9), X0
13511 MOVOU 16(R9), X1
13512 MOVOU -32(R9)(R8*1), X2
13513 MOVOU -16(R9)(R8*1), X3
13514 MOVQ R8, R11
13515 SHRQ $0x05, R11
13516 MOVQ CX, R10
13517 ANDL $0x0000001f, R10
13518 MOVQ $0x00000040, R12
13519 SUBQ R10, R12
13520 DECQ R11
13521 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
13522 LEAQ -32(R9)(R12*1), R10
13523 LEAQ -32(CX)(R12*1), R13
13524
13525 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
13526 MOVOU (R10), X4
13527 MOVOU 16(R10), X5
13528 MOVOA X4, (R13)
13529 MOVOA X5, 16(R13)
13530 ADDQ $0x20, R13
13531 ADDQ $0x20, R10
13532 ADDQ $0x20, R12
13533 DECQ R11
13534 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
13535
13536 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
13537 MOVOU -32(R9)(R12*1), X4
13538 MOVOU -16(R9)(R12*1), X5
13539 MOVOA X4, -32(CX)(R12*1)
13540 MOVOA X5, -16(CX)(R12*1)
13541 ADDQ $0x20, R12
13542 CMPQ R8, R12
13543 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
13544 MOVOU X0, (CX)
13545 MOVOU X1, 16(CX)
13546 MOVOU X2, -32(CX)(R8*1)
13547 MOVOU X3, -16(CX)(R8*1)
13548 MOVQ SI, CX
13549
13550 emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B:
13551 ADDL $0x05, DX
13552 MOVL DX, SI
13553 SUBL 16(SP), SI
13554 MOVQ src_len+32(FP), R8
13555 SUBL DX, R8
13556 LEAQ (BX)(DX*1), R9
13557 LEAQ (BX)(SI*1), SI
13558
13559 // matchLen
13560 XORL R11, R11
13561
13562 matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B:
13563 CMPL R8, $0x10
13564 JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B
13565 MOVQ (R9)(R11*1), R10
13566 MOVQ 8(R9)(R11*1), R12
13567 XORQ (SI)(R11*1), R10
13568 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B
13569 XORQ 8(SI)(R11*1), R12
13570 JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B
13571 LEAL -16(R8), R8
13572 LEAL 16(R11), R11
13573 JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B
13574
13575 matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B:
13576 #ifdef GOAMD64_v3
13577 TZCNTQ R12, R12
13578
13579 #else
13580 BSFQ R12, R12
13581
13582 #endif
13583 SARQ $0x03, R12
13584 LEAL 8(R11)(R12*1), R11
13585 JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B
13586
13587 matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B:
13588 CMPL R8, $0x08
13589 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B
13590 MOVQ (R9)(R11*1), R10
13591 XORQ (SI)(R11*1), R10
13592 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B
13593 LEAL -8(R8), R8
13594 LEAL 8(R11), R11
13595 JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B
13596
13597 matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B:
13598 #ifdef GOAMD64_v3
13599 TZCNTQ R10, R10
13600
13601 #else
13602 BSFQ R10, R10
13603
13604 #endif
13605 SARQ $0x03, R10
13606 LEAL (R11)(R10*1), R11
13607 JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B
13608
13609 matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B:
13610 CMPL R8, $0x04
13611 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
13612 MOVL (R9)(R11*1), R10
13613 CMPL (SI)(R11*1), R10
13614 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
13615 LEAL -4(R8), R8
13616 LEAL 4(R11), R11
13617
13618 matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B:
13619 CMPL R8, $0x01
13620 JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
13621 JB repeat_extend_forward_end_encodeSnappyBlockAsm10B
13622 MOVW (R9)(R11*1), R10
13623 CMPW (SI)(R11*1), R10
13624 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
13625 LEAL 2(R11), R11
13626 SUBL $0x02, R8
13627 JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B
13628
13629 matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B:
13630 MOVB (R9)(R11*1), R10
13631 CMPB (SI)(R11*1), R10
13632 JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B
13633 LEAL 1(R11), R11
13634
13635 repeat_extend_forward_end_encodeSnappyBlockAsm10B:
13636 ADDL R11, DX
13637 MOVL DX, SI
13638 SUBL DI, SI
13639 MOVL 16(SP), DI
13640
13641 // emitCopy
13642 two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B:
13643 CMPL SI, $0x40
13644 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B
13645 MOVB $0xee, (CX)
13646 MOVW DI, 1(CX)
13647 LEAL -60(SI), SI
13648 ADDQ $0x03, CX
13649 JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B
13650
13651 two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B:
13652 MOVL SI, R8
13653 SHLL $0x02, R8
13654 CMPL SI, $0x0c
13655 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
13656 CMPL DI, $0x00000800
13657 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
13658 LEAL -15(R8), R8
13659 MOVB DI, 1(CX)
13660 SHRL $0x08, DI
13661 SHLL $0x05, DI
13662 ORL DI, R8
13663 MOVB R8, (CX)
13664 ADDQ $0x02, CX
13665 JMP repeat_end_emit_encodeSnappyBlockAsm10B
13666
13667 emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B:
13668 LEAL -2(R8), R8
13669 MOVB R8, (CX)
13670 MOVW DI, 1(CX)
13671 ADDQ $0x03, CX
13672
13673 repeat_end_emit_encodeSnappyBlockAsm10B:
13674 MOVL DX, 12(SP)
13675 JMP search_loop_encodeSnappyBlockAsm10B
13676
// No repeat at DX+1: test the table candidates for DX, DX+1 and DX+2.
// On entry (from the search loop): DI = 8 source bytes at DX, SI = previous
// table entry for DX, R8 = previous table entry for DX+1, R10 = table index
// for the bytes at DX+2.
13677 no_repeat_found_encodeSnappyBlockAsm10B:
// Candidate for DX: compare 4 bytes.
13678 CMPL (BX)(SI*1), DI
13679 JEQ candidate_match_encodeSnappyBlockAsm10B
// DI now holds the bytes at DX+1; SI = previous table entry for DX+2;
// R9 = DX+2.
13680 SHRQ $0x08, DI
13681 MOVL (AX)(R10*4), SI
13682 LEAL 2(DX), R9
// Candidate for DX+1.
13683 CMPL (BX)(R8*1), DI
13684 JEQ candidate2_match_encodeSnappyBlockAsm10B
// Record DX+2 in the table, then test the candidate for DX+2.
13685 MOVL R9, (AX)(R10*4)
13686 SHRQ $0x08, DI
13687 CMPL (BX)(SI*1), DI
13688 JEQ candidate3_match_encodeSnappyBlockAsm10B
// Nothing matched: continue at the offset saved in 20(SP).
13689 MOVL 20(SP), DX
13690 JMP search_loop_encodeSnappyBlockAsm10B
13691
// Match at DX+2: SI already holds its candidate.
13692 candidate3_match_encodeSnappyBlockAsm10B:
13693 ADDL $0x02, DX
13694 JMP candidate_match_encodeSnappyBlockAsm10B
13695
// Match at DX+1: still record DX+2 in the table, then use R8 as candidate.
13696 candidate2_match_encodeSnappyBlockAsm10B:
13697 MOVL R9, (AX)(R10*4)
13698 INCL DX
13699 MOVL R8, SI
13700
// A match of at least 4 bytes exists between offset DX and candidate SI.
// Skip backward extension when the candidate is at offset 0.
13701 candidate_match_encodeSnappyBlockAsm10B:
13702 MOVL 12(SP), DI
13703 TESTL SI, SI
13704 JZ match_extend_back_end_encodeSnappyBlockAsm10B
13705
// Extend backwards while the preceding bytes are equal, DX stays above the
// offset in 12(SP), and SI stays above 0.
13706 match_extend_back_loop_encodeSnappyBlockAsm10B:
13707 CMPL DX, DI
13708 JBE match_extend_back_end_encodeSnappyBlockAsm10B
13709 MOVB -1(BX)(SI*1), R8
13710 MOVB -1(BX)(DX*1), R9
13711 CMPB R8, R9
13712 JNE match_extend_back_end_encodeSnappyBlockAsm10B
13713 LEAL -1(DX), DX
13714 DECL SI
13715 JZ match_extend_back_end_encodeSnappyBlockAsm10B
13716 JMP match_extend_back_loop_encodeSnappyBlockAsm10B
13717
// Output-space check: DI = CX + (DX - 12(SP)) + 3. Continue only if that
// stays below the limit in (SP); otherwise store 0 as the result and return.
13718 match_extend_back_end_encodeSnappyBlockAsm10B:
13719 MOVL DX, DI
13720 SUBL 12(SP), DI
13721 LEAQ 3(CX)(DI*1), DI
13722 CMPQ DI, (SP)
13723 JB match_dst_size_check_encodeSnappyBlockAsm10B
13724 MOVQ $0x00000000, ret+56(FP)
13725 RET
13726
13727 match_dst_size_check_encodeSnappyBlockAsm10B:
13728 MOVL DX, DI
13729 MOVL 12(SP), R8
13730 CMPL R8, DI
13731 JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B
13732 MOVL DI, R9
13733 MOVL DI, 12(SP)
13734 LEAQ (BX)(R8*1), DI
13735 SUBL R8, R9
13736 LEAL -1(R9), R8
13737 CMPL R8, $0x3c
13738 JB one_byte_match_emit_encodeSnappyBlockAsm10B
13739 CMPL R8, $0x00000100
13740 JB two_bytes_match_emit_encodeSnappyBlockAsm10B
13741 JB three_bytes_match_emit_encodeSnappyBlockAsm10B
13742
13743 three_bytes_match_emit_encodeSnappyBlockAsm10B:
13744 MOVB $0xf4, (CX)
13745 MOVW R8, 1(CX)
13746 ADDQ $0x03, CX
13747 JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
13748
13749 two_bytes_match_emit_encodeSnappyBlockAsm10B:
13750 MOVB $0xf0, (CX)
13751 MOVB R8, 1(CX)
13752 ADDQ $0x02, CX
13753 CMPL R8, $0x40
13754 JB memmove_match_emit_encodeSnappyBlockAsm10B
13755 JMP memmove_long_match_emit_encodeSnappyBlockAsm10B
13756
13757 one_byte_match_emit_encodeSnappyBlockAsm10B:
13758 SHLB $0x02, R8
13759 MOVB R8, (CX)
13760 ADDQ $0x01, CX
13761
13762 memmove_match_emit_encodeSnappyBlockAsm10B:
13763 LEAQ (CX)(R9*1), R8
13764
13765 // genMemMoveShort
13766 CMPQ R9, $0x08
13767 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8
13768 CMPQ R9, $0x10
13769 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
13770 CMPQ R9, $0x20
13771 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
13772 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64
13773
13774 emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8:
13775 MOVQ (DI), R10
13776 MOVQ R10, (CX)
13777 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
13778
13779 emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
13780 MOVQ (DI), R10
13781 MOVQ -8(DI)(R9*1), DI
13782 MOVQ R10, (CX)
13783 MOVQ DI, -8(CX)(R9*1)
13784 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
13785
13786 emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
13787 MOVOU (DI), X0
13788 MOVOU -16(DI)(R9*1), X1
13789 MOVOU X0, (CX)
13790 MOVOU X1, -16(CX)(R9*1)
13791 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B
13792
13793 emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
13794 MOVOU (DI), X0
13795 MOVOU 16(DI), X1
13796 MOVOU -32(DI)(R9*1), X2
13797 MOVOU -16(DI)(R9*1), X3
13798 MOVOU X0, (CX)
13799 MOVOU X1, 16(CX)
13800 MOVOU X2, -32(CX)(R9*1)
13801 MOVOU X3, -16(CX)(R9*1)
13802
13803 memmove_end_copy_match_emit_encodeSnappyBlockAsm10B:
13804 MOVQ R8, CX
13805 JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B
13806
13807 memmove_long_match_emit_encodeSnappyBlockAsm10B:
13808 LEAQ (CX)(R9*1), R8
13809
13810 // genMemMoveLong
13811 MOVOU (DI), X0
13812 MOVOU 16(DI), X1
13813 MOVOU -32(DI)(R9*1), X2
13814 MOVOU -16(DI)(R9*1), X3
13815 MOVQ R9, R11
13816 SHRQ $0x05, R11
13817 MOVQ CX, R10
13818 ANDL $0x0000001f, R10
13819 MOVQ $0x00000040, R12
13820 SUBQ R10, R12
13821 DECQ R11
13822 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
13823 LEAQ -32(DI)(R12*1), R10
13824 LEAQ -32(CX)(R12*1), R13
13825
13826 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
13827 MOVOU (R10), X4
13828 MOVOU 16(R10), X5
13829 MOVOA X4, (R13)
13830 MOVOA X5, 16(R13)
13831 ADDQ $0x20, R13
13832 ADDQ $0x20, R10
13833 ADDQ $0x20, R12
13834 DECQ R11
13835 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back
13836
13837 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
13838 MOVOU -32(DI)(R12*1), X4
13839 MOVOU -16(DI)(R12*1), X5
13840 MOVOA X4, -32(CX)(R12*1)
13841 MOVOA X5, -16(CX)(R12*1)
13842 ADDQ $0x20, R12
13843 CMPQ R9, R12
13844 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
13845 MOVOU X0, (CX)
13846 MOVOU X1, 16(CX)
13847 MOVOU X2, -32(CX)(R9*1)
13848 MOVOU X3, -16(CX)(R9*1)
13849 MOVQ R8, CX
13850
13851 emit_literal_done_match_emit_encodeSnappyBlockAsm10B:
13852 match_nolit_loop_encodeSnappyBlockAsm10B:
13853 MOVL DX, DI
13854 SUBL SI, DI
13855 MOVL DI, 16(SP)
13856 ADDL $0x04, DX
13857 ADDL $0x04, SI
13858 MOVQ src_len+32(FP), DI
13859 SUBL DX, DI
13860 LEAQ (BX)(DX*1), R8
13861 LEAQ (BX)(SI*1), SI
13862
13863 // matchLen
13864 XORL R10, R10
13865
13866 matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B:
13867 CMPL DI, $0x10
13868 JB matchlen_match8_match_nolit_encodeSnappyBlockAsm10B
13869 MOVQ (R8)(R10*1), R9
13870 MOVQ 8(R8)(R10*1), R11
13871 XORQ (SI)(R10*1), R9
13872 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B
13873 XORQ 8(SI)(R10*1), R11
13874 JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B
13875 LEAL -16(DI), DI
13876 LEAL 16(R10), R10
13877 JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B
13878
13879 matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B:
13880 #ifdef GOAMD64_v3
13881 TZCNTQ R11, R11
13882
13883 #else
13884 BSFQ R11, R11
13885
13886 #endif
13887 SARQ $0x03, R11
13888 LEAL 8(R10)(R11*1), R10
13889 JMP match_nolit_end_encodeSnappyBlockAsm10B
13890
13891 matchlen_match8_match_nolit_encodeSnappyBlockAsm10B:
13892 CMPL DI, $0x08
13893 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm10B
13894 MOVQ (R8)(R10*1), R9
13895 XORQ (SI)(R10*1), R9
13896 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B
13897 LEAL -8(DI), DI
13898 LEAL 8(R10), R10
13899 JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm10B
13900
13901 matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B:
13902 #ifdef GOAMD64_v3
13903 TZCNTQ R9, R9
13904
13905 #else
13906 BSFQ R9, R9
13907
13908 #endif
13909 SARQ $0x03, R9
13910 LEAL (R10)(R9*1), R10
13911 JMP match_nolit_end_encodeSnappyBlockAsm10B
13912
13913 matchlen_match4_match_nolit_encodeSnappyBlockAsm10B:
13914 CMPL DI, $0x04
13915 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
13916 MOVL (R8)(R10*1), R9
13917 CMPL (SI)(R10*1), R9
13918 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
13919 LEAL -4(DI), DI
13920 LEAL 4(R10), R10
13921
13922 matchlen_match2_match_nolit_encodeSnappyBlockAsm10B:
13923 CMPL DI, $0x01
13924 JE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
13925 JB match_nolit_end_encodeSnappyBlockAsm10B
13926 MOVW (R8)(R10*1), R9
13927 CMPW (SI)(R10*1), R9
13928 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
13929 LEAL 2(R10), R10
13930 SUBL $0x02, DI
13931 JZ match_nolit_end_encodeSnappyBlockAsm10B
13932
13933 matchlen_match1_match_nolit_encodeSnappyBlockAsm10B:
13934 MOVB (R8)(R10*1), R9
13935 CMPB (SI)(R10*1), R9
13936 JNE match_nolit_end_encodeSnappyBlockAsm10B
13937 LEAL 1(R10), R10
13938
13939 match_nolit_end_encodeSnappyBlockAsm10B:
13940 ADDL R10, DX
13941 MOVL 16(SP), SI
13942 ADDL $0x04, R10
13943 MOVL DX, 12(SP)
13944
13945 // emitCopy
13946 two_byte_offset_match_nolit_encodeSnappyBlockAsm10B:
13947 CMPL R10, $0x40
13948 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B
13949 MOVB $0xee, (CX)
13950 MOVW SI, 1(CX)
13951 LEAL -60(R10), R10
13952 ADDQ $0x03, CX
13953 JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B
13954
13955 two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B:
13956 MOVL R10, DI
13957 SHLL $0x02, DI
13958 CMPL R10, $0x0c
13959 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
13960 CMPL SI, $0x00000800
13961 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
13962 LEAL -15(DI), DI
13963 MOVB SI, 1(CX)
13964 SHRL $0x08, SI
13965 SHLL $0x05, SI
13966 ORL SI, DI
13967 MOVB DI, (CX)
13968 ADDQ $0x02, CX
13969 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B
13970
13971 emit_copy_three_match_nolit_encodeSnappyBlockAsm10B:
13972 LEAL -2(DI), DI
13973 MOVB DI, (CX)
13974 MOVW SI, 1(CX)
13975 ADDQ $0x03, CX
13976
13977 match_nolit_emitcopy_end_encodeSnappyBlockAsm10B:
13978 CMPL DX, 8(SP)
13979 JAE emit_remainder_encodeSnappyBlockAsm10B
13980 MOVQ -2(BX)(DX*1), DI
13981 CMPQ CX, (SP)
13982 JB match_nolit_dst_ok_encodeSnappyBlockAsm10B
13983 MOVQ $0x00000000, ret+56(FP)
13984 RET
13985
13986 match_nolit_dst_ok_encodeSnappyBlockAsm10B:
13987 MOVQ $0x9e3779b1, R9
13988 MOVQ DI, R8
13989 SHRQ $0x10, DI
13990 MOVQ DI, SI
13991 SHLQ $0x20, R8
13992 IMULQ R9, R8
13993 SHRQ $0x36, R8
13994 SHLQ $0x20, SI
13995 IMULQ R9, SI
13996 SHRQ $0x36, SI
13997 LEAL -2(DX), R9
13998 LEAQ (AX)(SI*4), R10
13999 MOVL (R10), SI
14000 MOVL R9, (AX)(R8*4)
14001 MOVL DX, (R10)
14002 CMPL (BX)(SI*1), DI
14003 JEQ match_nolit_loop_encodeSnappyBlockAsm10B
14004 INCL DX
14005 JMP search_loop_encodeSnappyBlockAsm10B
14006
14007 emit_remainder_encodeSnappyBlockAsm10B:
14008 MOVQ src_len+32(FP), AX
14009 SUBL 12(SP), AX
14010 LEAQ 3(CX)(AX*1), AX
14011 CMPQ AX, (SP)
14012 JB emit_remainder_ok_encodeSnappyBlockAsm10B
14013 MOVQ $0x00000000, ret+56(FP)
14014 RET
14015
14016 emit_remainder_ok_encodeSnappyBlockAsm10B:
14017 MOVQ src_len+32(FP), AX
14018 MOVL 12(SP), DX
14019 CMPL DX, AX
14020 JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
14021 MOVL AX, SI
14022 MOVL AX, 12(SP)
14023 LEAQ (BX)(DX*1), AX
14024 SUBL DX, SI
14025 LEAL -1(SI), DX
14026 CMPL DX, $0x3c
14027 JB one_byte_emit_remainder_encodeSnappyBlockAsm10B
14028 CMPL DX, $0x00000100
14029 JB two_bytes_emit_remainder_encodeSnappyBlockAsm10B
14030 JB three_bytes_emit_remainder_encodeSnappyBlockAsm10B
14031
14032 three_bytes_emit_remainder_encodeSnappyBlockAsm10B:
14033 MOVB $0xf4, (CX)
14034 MOVW DX, 1(CX)
14035 ADDQ $0x03, CX
14036 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B
14037
14038 two_bytes_emit_remainder_encodeSnappyBlockAsm10B:
14039 MOVB $0xf0, (CX)
14040 MOVB DL, 1(CX)
14041 ADDQ $0x02, CX
14042 CMPL DX, $0x40
14043 JB memmove_emit_remainder_encodeSnappyBlockAsm10B
14044 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B
14045
14046 one_byte_emit_remainder_encodeSnappyBlockAsm10B:
14047 SHLB $0x02, DL
14048 MOVB DL, (CX)
14049 ADDQ $0x01, CX
14050
14051 memmove_emit_remainder_encodeSnappyBlockAsm10B:
14052 LEAQ (CX)(SI*1), DX
14053 MOVL SI, BX
14054
14055 // genMemMoveShort
14056 CMPQ BX, $0x03
14057 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2
14058 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3
14059 CMPQ BX, $0x08
14060 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7
14061 CMPQ BX, $0x10
14062 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16
14063 CMPQ BX, $0x20
14064 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32
14065 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64
14066
14067 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2:
14068 MOVB (AX), SI
14069 MOVB -1(AX)(BX*1), AL
14070 MOVB SI, (CX)
14071 MOVB AL, -1(CX)(BX*1)
14072 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
14073
14074 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3:
14075 MOVW (AX), SI
14076 MOVB 2(AX), AL
14077 MOVW SI, (CX)
14078 MOVB AL, 2(CX)
14079 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
14080
14081 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7:
14082 MOVL (AX), SI
14083 MOVL -4(AX)(BX*1), AX
14084 MOVL SI, (CX)
14085 MOVL AX, -4(CX)(BX*1)
14086 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
14087
14088 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16:
14089 MOVQ (AX), SI
14090 MOVQ -8(AX)(BX*1), AX
14091 MOVQ SI, (CX)
14092 MOVQ AX, -8(CX)(BX*1)
14093 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
14094
14095 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32:
14096 MOVOU (AX), X0
14097 MOVOU -16(AX)(BX*1), X1
14098 MOVOU X0, (CX)
14099 MOVOU X1, -16(CX)(BX*1)
14100 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B
14101
14102 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64:
14103 MOVOU (AX), X0
14104 MOVOU 16(AX), X1
14105 MOVOU -32(AX)(BX*1), X2
14106 MOVOU -16(AX)(BX*1), X3
14107 MOVOU X0, (CX)
14108 MOVOU X1, 16(CX)
14109 MOVOU X2, -32(CX)(BX*1)
14110 MOVOU X3, -16(CX)(BX*1)
14111
14112 memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B:
14113 MOVQ DX, CX
14114 JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
14115
14116 memmove_long_emit_remainder_encodeSnappyBlockAsm10B:
14117 LEAQ (CX)(SI*1), DX
14118 MOVL SI, BX
14119
14120 // genMemMoveLong
14121 MOVOU (AX), X0
14122 MOVOU 16(AX), X1
14123 MOVOU -32(AX)(BX*1), X2
14124 MOVOU -16(AX)(BX*1), X3
14125 MOVQ BX, DI
14126 SHRQ $0x05, DI
14127 MOVQ CX, SI
14128 ANDL $0x0000001f, SI
14129 MOVQ $0x00000040, R8
14130 SUBQ SI, R8
14131 DECQ DI
14132 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
14133 LEAQ -32(AX)(R8*1), SI
14134 LEAQ -32(CX)(R8*1), R9
14135
14136 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back:
14137 MOVOU (SI), X4
14138 MOVOU 16(SI), X5
14139 MOVOA X4, (R9)
14140 MOVOA X5, 16(R9)
14141 ADDQ $0x20, R9
14142 ADDQ $0x20, SI
14143 ADDQ $0x20, R8
14144 DECQ DI
14145 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back
14146
14147 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
14148 MOVOU -32(AX)(R8*1), X4
14149 MOVOU -16(AX)(R8*1), X5
14150 MOVOA X4, -32(CX)(R8*1)
14151 MOVOA X5, -16(CX)(R8*1)
14152 ADDQ $0x20, R8
14153 CMPQ BX, R8
14154 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
14155 MOVOU X0, (CX)
14156 MOVOU X1, 16(CX)
14157 MOVOU X2, -32(CX)(BX*1)
14158 MOVOU X3, -16(CX)(BX*1)
14159 MOVQ DX, CX
14160
14161 emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B:
14162 MOVQ dst_base+0(FP), AX
14163 SUBQ AX, CX
14164 MOVQ CX, ret+56(FP)
14165 RET
14166
14167 // func encodeSnappyBlockAsm8B(dst []byte, src []byte, tmp *[1024]byte) int
14168 // Requires: BMI, SSE2
//
// Encodes src into dst using a 256-entry (8-bit hash) table of 32-bit source
// offsets kept in tmp. The result written to ret is the number of bytes
// produced (CX - dst_base), or 0 when the output would reach the limit
// computed at entry (see (SP) below).
//
// Register roles in the main loop, as established by the code below:
//   AX = tmp (hash table base)      BX = src base
//   CX = current write pointer in dst
//   DX = current position in src (a 32-bit offset)
//
// Stack frame (24 bytes):
//    0(SP) 8 bytes: dst limit = dst_base + (len(src) - 9 - len(src)>>5)
//    8(SP) 4 bytes: len(src) - 8, upper bound for search positions
//   12(SP) 4 bytes: offset of the first src byte not yet emitted
//   16(SP) 4 bytes: offset (distance) of the most recent match, initially 1
//   20(SP) 4 bytes: next search position to use if nothing matches
14169 TEXT ·encodeSnappyBlockAsm8B(SB), $24-64
14170 MOVQ tmp+48(FP), AX
14171 MOVQ dst_base+0(FP), CX
14172 MOVQ $0x00000008, DX
14173 MOVQ AX, BX
14174 PXOR X0, X0
14175
// Clear the table: 8 iterations x 128 bytes = 1024 bytes of tmp.
14176 zero_loop_encodeSnappyBlockAsm8B:
14177 MOVOU X0, (BX)
14178 MOVOU X0, 16(BX)
14179 MOVOU X0, 32(BX)
14180 MOVOU X0, 48(BX)
14181 MOVOU X0, 64(BX)
14182 MOVOU X0, 80(BX)
14183 MOVOU X0, 96(BX)
14184 MOVOU X0, 112(BX)
14185 ADDQ $0x80, BX
14186 DECQ DX
14187 JNZ zero_loop_encodeSnappyBlockAsm8B
// Initialise the frame slots described in the header, then start searching
// at position 1 with BX pointing at the source bytes.
14188 MOVL $0x00000000, 12(SP)
14189 MOVQ src_len+32(FP), DX
14190 LEAQ -9(DX), BX
14191 LEAQ -8(DX), SI
14192 MOVL SI, 8(SP)
14193 SHRQ $0x05, DX
14194 SUBL DX, BX
14195 LEAQ (CX)(BX*1), BX
14196 MOVQ BX, (SP)
14197 MOVL $0x00000001, DX
14198 MOVL DX, 16(SP)
14199 MOVQ src_base+24(FP), BX
14200
// Main search loop. The next position is DX + 4 + ((DX - 12(SP)) >> 4), so
// the step grows with the amount of pending, unmatched input. Once that
// position reaches 8(SP) the rest of the input is emitted as literals.
14201 search_loop_encodeSnappyBlockAsm8B:
14202 MOVL DX, SI
14203 SUBL 12(SP), SI
14204 SHRL $0x04, SI
14205 LEAL 4(DX)(SI*1), SI
14206 CMPL SI, 8(SP)
14207 JAE emit_remainder_encodeSnappyBlockAsm8B
// DI = 8 source bytes at DX. Each hash is ((low 4 bytes << 32) * 0x9e3779b1) >> 56,
// i.e. an 8-bit table index. R10 hashes the bytes at DX, R11 those at DX+1.
14208 MOVQ (BX)(DX*1), DI
14209 MOVL SI, 20(SP)
14210 MOVQ $0x9e3779b1, R9
14211 MOVQ DI, R10
14212 MOVQ DI, R11
14213 SHRQ $0x08, R11
14214 SHLQ $0x20, R10
14215 IMULQ R9, R10
14216 SHRQ $0x38, R10
14217 SHLQ $0x20, R11
14218 IMULQ R9, R11
14219 SHRQ $0x38, R11
// Load the previous table entries (SI for DX, R8 for DX+1) and replace them
// with the current positions.
14220 MOVL (AX)(R10*4), SI
14221 MOVL (AX)(R11*4), R8
14222 MOVL DX, (AX)(R10*4)
14223 LEAL 1(DX), R10
14224 MOVL R10, (AX)(R11*4)
// R10 = table index for the bytes at DX+2 (used in no_repeat_found).
14225 MOVQ DI, R10
14226 SHRQ $0x10, R10
14227 SHLQ $0x20, R10
14228 IMULQ R9, R10
14229 SHRQ $0x38, R10
// Repeat check: compare the 4 bytes at DX+1 with the 4 bytes located
// 16(SP) bytes earlier (the previous match offset).
14230 MOVL DX, R9
14231 SUBL 16(SP), R9
14232 MOVL 1(BX)(R9*1), R11
14233 MOVQ DI, R9
14234 SHRQ $0x08, R9
14235 CMPL R9, R11
14236 JNE no_repeat_found_encodeSnappyBlockAsm8B
// Repeat found at DX+1. DI = match start, R8 = start of the earlier copy,
// SI = 12(SP) (lower bound for extending backwards).
14237 LEAL 1(DX), DI
14238 MOVL 12(SP), SI
14239 MOVL DI, R8
14240 SUBL 16(SP), R8
14241 JZ repeat_extend_back_end_encodeSnappyBlockAsm8B
14242
// Move the match start backwards while the preceding bytes are equal, the
// start stays above 12(SP), and the earlier copy has not reached offset 0.
14243 repeat_extend_back_loop_encodeSnappyBlockAsm8B:
14244 CMPL DI, SI
14245 JBE repeat_extend_back_end_encodeSnappyBlockAsm8B
14246 MOVB -1(BX)(R8*1), R9
14247 MOVB -1(BX)(DI*1), R10
14248 CMPB R9, R10
14249 JNE repeat_extend_back_end_encodeSnappyBlockAsm8B
14250 LEAL -1(DI), DI
14251 DECL R8
14252 JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B
14253
// Output bound check: give up (return 0) if CX + pending literal length + 3
// would reach the dst limit stored at (SP).
14254 repeat_extend_back_end_encodeSnappyBlockAsm8B:
14255 MOVL DI, SI
14256 SUBL 12(SP), SI
14257 LEAQ 3(CX)(SI*1), SI
14258 CMPQ SI, (SP)
14259 JB repeat_dst_size_check_encodeSnappyBlockAsm8B
14260 MOVQ $0x00000000, ret+56(FP)
14261 RET
14262
// Emit the pending literals src[12(SP):DI] (skipped when empty).
// R9 = source pointer, R8 = length, SI = length-1; 12(SP) is advanced to DI.
// Header forms chosen by length-1: < 0x3c one byte ((length-1)<<2),
// < 0x100 the byte 0xf0 plus one length byte, otherwise 0xf4 plus a
// 16-bit length.
14263 repeat_dst_size_check_encodeSnappyBlockAsm8B:
14264 MOVL 12(SP), SI
14265 CMPL SI, DI
14266 JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
14267 MOVL DI, R8
14268 MOVL DI, 12(SP)
14269 LEAQ (BX)(SI*1), R9
14270 SUBL SI, R8
14271 LEAL -1(R8), SI
14272 CMPL SI, $0x3c
14273 JB one_byte_repeat_emit_encodeSnappyBlockAsm8B
14274 CMPL SI, $0x00000100
14275 JB two_bytes_repeat_emit_encodeSnappyBlockAsm8B
// The second JB tests the same flags as the first, so it is never taken here;
// execution falls through to the label it names.
14276 JB three_bytes_repeat_emit_encodeSnappyBlockAsm8B
14277
14278 three_bytes_repeat_emit_encodeSnappyBlockAsm8B:
14279 MOVB $0xf4, (CX)
14280 MOVW SI, 1(CX)
14281 ADDQ $0x03, CX
14282 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
14283
// Two-byte header; lengths up to 64 (length-1 < 0x40) use the short copy.
14284 two_bytes_repeat_emit_encodeSnappyBlockAsm8B:
14285 MOVB $0xf0, (CX)
14286 MOVB SI, 1(CX)
14287 ADDQ $0x02, CX
14288 CMPL SI, $0x40
14289 JB memmove_repeat_emit_encodeSnappyBlockAsm8B
14290 JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B
14291
14292 one_byte_repeat_emit_encodeSnappyBlockAsm8B:
14293 SHLB $0x02, SI
14294 MOVB SI, (CX)
14295 ADDQ $0x01, CX
14296
// Short copy of R8 bytes from (R9) to (CX); SI = CX + R8 is the new write
// pointer. Lengths of 8 or less always move a full 8 bytes; only the write
// pointer reflects the real length.
14297 memmove_repeat_emit_encodeSnappyBlockAsm8B:
14298 LEAQ (CX)(R8*1), SI
14299
14300 // genMemMoveShort
14301 CMPQ R8, $0x08
14302 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8
14303 CMPQ R8, $0x10
14304 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
14305 CMPQ R8, $0x20
14306 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
14307 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
14308
14309 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8:
14310 MOVQ (R9), R10
14311 MOVQ R10, (CX)
14312 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
14313
// The remaining cases copy the head and the tail with overlapping moves.
14314 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
14315 MOVQ (R9), R10
14316 MOVQ -8(R9)(R8*1), R9
14317 MOVQ R10, (CX)
14318 MOVQ R9, -8(CX)(R8*1)
14319 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
14320
14321 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
14322 MOVOU (R9), X0
14323 MOVOU -16(R9)(R8*1), X1
14324 MOVOU X0, (CX)
14325 MOVOU X1, -16(CX)(R8*1)
14326 JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B
14327
14328 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
14329 MOVOU (R9), X0
14330 MOVOU 16(R9), X1
14331 MOVOU -32(R9)(R8*1), X2
14332 MOVOU -16(R9)(R8*1), X3
14333 MOVOU X0, (CX)
14334 MOVOU X1, 16(CX)
14335 MOVOU X2, -32(CX)(R8*1)
14336 MOVOU X3, -16(CX)(R8*1)
14337
14338 memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B:
14339 MOVQ SI, CX
14340 JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B
14341
// Long copy of R8 bytes from (R9) to (CX). X0-X3 capture the first and last
// 32 source bytes up front and are stored last. The loops in between move
// 32 bytes per step using aligned stores: R12 starts at 64 - (CX & 31), so
// CX + R12 - 32 is 32-byte aligned.
14342 memmove_long_repeat_emit_encodeSnappyBlockAsm8B:
14343 LEAQ (CX)(R8*1), SI
14344
14345 // genMemMoveLong
14346 MOVOU (R9), X0
14347 MOVOU 16(R9), X1
14348 MOVOU -32(R9)(R8*1), X2
14349 MOVOU -16(R9)(R8*1), X3
14350 MOVQ R8, R11
14351 SHRQ $0x05, R11
14352 MOVQ CX, R10
14353 ANDL $0x0000001f, R10
14354 MOVQ $0x00000040, R12
14355 SUBQ R10, R12
// DECQ does not modify CF, so JA combines ZF from the decrement with the CF
// left by the SUBQ above. NOTE(review): flag-dependent sequence emitted by
// the generator; keep the instruction order intact.
14356 DECQ R11
14357 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
14358 LEAQ -32(R9)(R12*1), R10
14359 LEAQ -32(CX)(R12*1), R13
14360
14361 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
14362 MOVOU (R10), X4
14363 MOVOU 16(R10), X5
14364 MOVOA X4, (R13)
14365 MOVOA X5, 16(R13)
14366 ADDQ $0x20, R13
14367 ADDQ $0x20, R10
14368 ADDQ $0x20, R12
14369 DECQ R11
14370 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
14371
// Copy 32-byte chunks at offset R12-32 for as long as R12 <= R8, then store
// the saved head and tail and advance the write pointer.
14372 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
14373 MOVOU -32(R9)(R12*1), X4
14374 MOVOU -16(R9)(R12*1), X5
14375 MOVOA X4, -32(CX)(R12*1)
14376 MOVOA X5, -16(CX)(R12*1)
14377 ADDQ $0x20, R12
14378 CMPQ R8, R12
14379 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
14380 MOVOU X0, (CX)
14381 MOVOU X1, 16(CX)
14382 MOVOU X2, -32(CX)(R8*1)
14383 MOVOU X3, -16(CX)(R8*1)
14384 MOVQ SI, CX
14385
// Extend the repeat forwards. DX += 5 skips to the byte after the 4 bytes
// already compared at DX+1. R9 = src+DX, SI = src+DX-offset,
// R8 = bytes remaining in src, R11 = number of additional matching bytes.
14386 emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B:
14387 ADDL $0x05, DX
14388 MOVL DX, SI
14389 SUBL 16(SP), SI
14390 MOVQ src_len+32(FP), R8
14391 SUBL DX, R8
14392 LEAQ (BX)(DX*1), R9
14393 LEAQ (BX)(SI*1), SI
14394
14395 // matchLen
14396 XORL R11, R11
14397
// Compare 16 bytes per iteration; on a difference the XOR result is
// non-zero and its lowest set bit locates the first differing byte.
14398 matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B:
14399 CMPL R8, $0x10
14400 JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B
14401 MOVQ (R9)(R11*1), R10
14402 MOVQ 8(R9)(R11*1), R12
14403 XORQ (SI)(R11*1), R10
14404 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B
14405 XORQ 8(SI)(R11*1), R12
14406 JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B
14407 LEAL -16(R8), R8
14408 LEAL 16(R11), R11
14409 JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B
14410
// Difference in the second 8 bytes: add 8 + (index of lowest set bit >> 3).
14411 matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B:
14412 #ifdef GOAMD64_v3
14413 TZCNTQ R12, R12
14414
14415 #else
14416 BSFQ R12, R12
14417
14418 #endif
14419 SARQ $0x03, R12
14420 LEAL 8(R11)(R12*1), R11
14421 JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B
14422
14423 matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B:
14424 CMPL R8, $0x08
14425 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B
14426 MOVQ (R9)(R11*1), R10
14427 XORQ (SI)(R11*1), R10
14428 JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B
14429 LEAL -8(R8), R8
14430 LEAL 8(R11), R11
14431 JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B
14432
// Difference in the first 8 bytes: add (index of lowest set bit >> 3).
14433 matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B:
14434 #ifdef GOAMD64_v3
14435 TZCNTQ R10, R10
14436
14437 #else
14438 BSFQ R10, R10
14439
14440 #endif
14441 SARQ $0x03, R10
14442 LEAL (R11)(R10*1), R11
14443 JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B
14444
// Fewer than 8 bytes left: try 4, then 2, then 1 byte.
14445 matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B:
14446 CMPL R8, $0x04
14447 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
14448 MOVL (R9)(R11*1), R10
14449 CMPL (SI)(R11*1), R10
14450 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B
14451 LEAL -4(R8), R8
14452 LEAL 4(R11), R11
14453
14454 matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B:
14455 CMPL R8, $0x01
14456 JE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
14457 JB repeat_extend_forward_end_encodeSnappyBlockAsm8B
14458 MOVW (R9)(R11*1), R10
14459 CMPW (SI)(R11*1), R10
14460 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B
14461 LEAL 2(R11), R11
14462 SUBL $0x02, R8
14463 JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B
14464
14465 matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B:
14466 MOVB (R9)(R11*1), R10
14467 CMPB (SI)(R11*1), R10
14468 JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B
14469 LEAL 1(R11), R11
14470
// DX = end of the match, SI = total match length (end - start in DI),
// DI = match offset reloaded from 16(SP).
14471 repeat_extend_forward_end_encodeSnappyBlockAsm8B:
14472 ADDL R11, DX
14473 MOVL DX, SI
14474 SUBL DI, SI
14475 MOVL 16(SP), DI
14476
14477 // emitCopy
// While more than 64 bytes remain, emit a 3-byte element (byte 0xee plus the
// 16-bit offset) and subtract 60 from the remaining length.
14478 two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B:
14479 CMPL SI, $0x40
14480 JBE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B
14481 MOVB $0xee, (CX)
14482 MOVW DI, 1(CX)
14483 LEAL -60(SI), SI
14484 ADDQ $0x03, CX
14485 JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B
14486
// Final element. Length < 12: two bytes, first = (length<<2) - 15 combined
// with (offset>>8)<<5, second = low offset byte. Otherwise three bytes:
// (length<<2) - 2 followed by the 16-bit offset.
14487 two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B:
14488 MOVL SI, R8
14489 SHLL $0x02, R8
14490 CMPL SI, $0x0c
14491 JAE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B
14492 LEAL -15(R8), R8
14493 MOVB DI, 1(CX)
14494 SHRL $0x08, DI
14495 SHLL $0x05, DI
14496 ORL DI, R8
14497 MOVB R8, (CX)
14498 ADDQ $0x02, CX
14499 JMP repeat_end_emit_encodeSnappyBlockAsm8B
14500
14501 emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B:
14502 LEAL -2(R8), R8
14503 MOVB R8, (CX)
14504 MOVW DI, 1(CX)
14505 ADDQ $0x03, CX
14506
// Everything up to DX has been emitted; resume searching from DX.
14507 repeat_end_emit_encodeSnappyBlockAsm8B:
14508 MOVL DX, 12(SP)
14509 JMP search_loop_encodeSnappyBlockAsm8B
14510
// No repeat. Test the table candidates in turn, comparing 4 bytes each:
//   SI (old entry for DX)   against the bytes at DX,
//   R8 (old entry for DX+1) against the bytes at DX+1,
//   the old entry for DX+2  against the bytes at DX+2.
// The DX+2 slot is updated to DX+2 along the way. If none match, continue
// from the position saved in 20(SP).
14511 no_repeat_found_encodeSnappyBlockAsm8B:
14512 CMPL (BX)(SI*1), DI
14513 JEQ candidate_match_encodeSnappyBlockAsm8B
14514 SHRQ $0x08, DI
14515 MOVL (AX)(R10*4), SI
14516 LEAL 2(DX), R9
14517 CMPL (BX)(R8*1), DI
14518 JEQ candidate2_match_encodeSnappyBlockAsm8B
14519 MOVL R9, (AX)(R10*4)
14520 SHRQ $0x08, DI
14521 CMPL (BX)(SI*1), DI
14522 JEQ candidate3_match_encodeSnappyBlockAsm8B
14523 MOVL 20(SP), DX
14524 JMP search_loop_encodeSnappyBlockAsm8B
14525
// Match at DX+2 (candidate already in SI).
14526 candidate3_match_encodeSnappyBlockAsm8B:
14527 ADDL $0x02, DX
14528 JMP candidate_match_encodeSnappyBlockAsm8B
14529
// Match at DX+1: still record DX+2 in the table, then use R8 as candidate.
14530 candidate2_match_encodeSnappyBlockAsm8B:
14531 MOVL R9, (AX)(R10*4)
14532 INCL DX
14533 MOVL R8, SI
14534
// DX = match position, SI = candidate position. Extend both backwards while
// the preceding bytes are equal, DX stays above 12(SP) and SI is non-zero.
14535 candidate_match_encodeSnappyBlockAsm8B:
14536 MOVL 12(SP), DI
14537 TESTL SI, SI
14538 JZ match_extend_back_end_encodeSnappyBlockAsm8B
14539
14540 match_extend_back_loop_encodeSnappyBlockAsm8B:
14541 CMPL DX, DI
14542 JBE match_extend_back_end_encodeSnappyBlockAsm8B
14543 MOVB -1(BX)(SI*1), R8
14544 MOVB -1(BX)(DX*1), R9
14545 CMPB R8, R9
14546 JNE match_extend_back_end_encodeSnappyBlockAsm8B
14547 LEAL -1(DX), DX
14548 DECL SI
14549 JZ match_extend_back_end_encodeSnappyBlockAsm8B
14550 JMP match_extend_back_loop_encodeSnappyBlockAsm8B
14551
// Output bound check for the pending literals (same test as in the repeat
// path): return 0 if CX + literal length + 3 would reach the limit at (SP).
14552 match_extend_back_end_encodeSnappyBlockAsm8B:
14553 MOVL DX, DI
14554 SUBL 12(SP), DI
14555 LEAQ 3(CX)(DI*1), DI
14556 CMPQ DI, (SP)
14557 JB match_dst_size_check_encodeSnappyBlockAsm8B
14558 MOVQ $0x00000000, ret+56(FP)
14559 RET
14560
// Emit the pending literals src[12(SP):DX] (skipped when empty).
// DI = source pointer, R9 = length, R8 = length-1; 12(SP) is advanced to DX.
// Header selection is the same as in the repeat path above.
14561 match_dst_size_check_encodeSnappyBlockAsm8B:
14562 MOVL DX, DI
14563 MOVL 12(SP), R8
14564 CMPL R8, DI
14565 JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B
14566 MOVL DI, R9
14567 MOVL DI, 12(SP)
14568 LEAQ (BX)(R8*1), DI
14569 SUBL R8, R9
14570 LEAL -1(R9), R8
14571 CMPL R8, $0x3c
14572 JB one_byte_match_emit_encodeSnappyBlockAsm8B
14573 CMPL R8, $0x00000100
14574 JB two_bytes_match_emit_encodeSnappyBlockAsm8B
// Never taken (same flags as the previous JB); falls through to the label.
14575 JB three_bytes_match_emit_encodeSnappyBlockAsm8B
14576
14577 three_bytes_match_emit_encodeSnappyBlockAsm8B:
14578 MOVB $0xf4, (CX)
14579 MOVW R8, 1(CX)
14580 ADDQ $0x03, CX
14581 JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
14582
14583 two_bytes_match_emit_encodeSnappyBlockAsm8B:
14584 MOVB $0xf0, (CX)
14585 MOVB R8, 1(CX)
14586 ADDQ $0x02, CX
14587 CMPL R8, $0x40
14588 JB memmove_match_emit_encodeSnappyBlockAsm8B
14589 JMP memmove_long_match_emit_encodeSnappyBlockAsm8B
14590
14591 one_byte_match_emit_encodeSnappyBlockAsm8B:
14592 SHLB $0x02, R8
14593 MOVB R8, (CX)
14594 ADDQ $0x01, CX
14595
// Short copy of R9 bytes from (DI) to (CX); R8 = CX + R9 is the new write
// pointer. Lengths of 8 or less always move a full 8 bytes.
14596 memmove_match_emit_encodeSnappyBlockAsm8B:
14597 LEAQ (CX)(R9*1), R8
14598
14599 // genMemMoveShort
14600 CMPQ R9, $0x08
14601 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8
14602 CMPQ R9, $0x10
14603 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16
14604 CMPQ R9, $0x20
14605 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32
14606 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64
14607
14608 emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8:
14609 MOVQ (DI), R10
14610 MOVQ R10, (CX)
14611 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
14612
14613 emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16:
14614 MOVQ (DI), R10
14615 MOVQ -8(DI)(R9*1), DI
14616 MOVQ R10, (CX)
14617 MOVQ DI, -8(CX)(R9*1)
14618 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
14619
14620 emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32:
14621 MOVOU (DI), X0
14622 MOVOU -16(DI)(R9*1), X1
14623 MOVOU X0, (CX)
14624 MOVOU X1, -16(CX)(R9*1)
14625 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B
14626
14627 emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64:
14628 MOVOU (DI), X0
14629 MOVOU 16(DI), X1
14630 MOVOU -32(DI)(R9*1), X2
14631 MOVOU -16(DI)(R9*1), X3
14632 MOVOU X0, (CX)
14633 MOVOU X1, 16(CX)
14634 MOVOU X2, -32(CX)(R9*1)
14635 MOVOU X3, -16(CX)(R9*1)
14636
14637 memmove_end_copy_match_emit_encodeSnappyBlockAsm8B:
14638 MOVQ R8, CX
14639 JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B
14640
// Long copy of R9 bytes from (DI) to (CX); same scheme as the long copy in
// the repeat path: head/tail saved in X0-X3, aligned 32-byte stores between.
14641 memmove_long_match_emit_encodeSnappyBlockAsm8B:
14642 LEAQ (CX)(R9*1), R8
14643
14644 // genMemMoveLong
14645 MOVOU (DI), X0
14646 MOVOU 16(DI), X1
14647 MOVOU -32(DI)(R9*1), X2
14648 MOVOU -16(DI)(R9*1), X3
14649 MOVQ R9, R11
14650 SHRQ $0x05, R11
14651 MOVQ CX, R10
14652 ANDL $0x0000001f, R10
14653 MOVQ $0x00000040, R12
14654 SUBQ R10, R12
14655 DECQ R11
14656 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
14657 LEAQ -32(DI)(R12*1), R10
14658 LEAQ -32(CX)(R12*1), R13
14659
14660 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back:
14661 MOVOU (R10), X4
14662 MOVOU 16(R10), X5
14663 MOVOA X4, (R13)
14664 MOVOA X5, 16(R13)
14665 ADDQ $0x20, R13
14666 ADDQ $0x20, R10
14667 ADDQ $0x20, R12
14668 DECQ R11
14669 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back
14670
14671 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
14672 MOVOU -32(DI)(R12*1), X4
14673 MOVOU -16(DI)(R12*1), X5
14674 MOVOA X4, -32(CX)(R12*1)
14675 MOVOA X5, -16(CX)(R12*1)
14676 ADDQ $0x20, R12
14677 CMPQ R9, R12
14678 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
14679 MOVOU X0, (CX)
14680 MOVOU X1, 16(CX)
14681 MOVOU X2, -32(CX)(R9*1)
14682 MOVOU X3, -16(CX)(R9*1)
14683 MOVQ R8, CX
14684
// Match with no pending literals. Record the offset DX - SI in 16(SP), skip
// the 4 bytes already known to match, then measure the rest:
// R8 = src+DX, SI = src+candidate, DI = bytes remaining, R10 = extra length.
14685 emit_literal_done_match_emit_encodeSnappyBlockAsm8B:
14686 match_nolit_loop_encodeSnappyBlockAsm8B:
14687 MOVL DX, DI
14688 SUBL SI, DI
14689 MOVL DI, 16(SP)
14690 ADDL $0x04, DX
14691 ADDL $0x04, SI
14692 MOVQ src_len+32(FP), DI
14693 SUBL DX, DI
14694 LEAQ (BX)(DX*1), R8
14695 LEAQ (BX)(SI*1), SI
14696
14697 // matchLen
14698 XORL R10, R10
14699
14700 matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B:
14701 CMPL DI, $0x10
14702 JB matchlen_match8_match_nolit_encodeSnappyBlockAsm8B
14703 MOVQ (R8)(R10*1), R9
14704 MOVQ 8(R8)(R10*1), R11
14705 XORQ (SI)(R10*1), R9
14706 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B
14707 XORQ 8(SI)(R10*1), R11
14708 JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B
14709 LEAL -16(DI), DI
14710 LEAL 16(R10), R10
14711 JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B
14712
14713 matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B:
14714 #ifdef GOAMD64_v3
14715 TZCNTQ R11, R11
14716
14717 #else
14718 BSFQ R11, R11
14719
14720 #endif
14721 SARQ $0x03, R11
14722 LEAL 8(R10)(R11*1), R10
14723 JMP match_nolit_end_encodeSnappyBlockAsm8B
14724
14725 matchlen_match8_match_nolit_encodeSnappyBlockAsm8B:
14726 CMPL DI, $0x08
14727 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm8B
14728 MOVQ (R8)(R10*1), R9
14729 XORQ (SI)(R10*1), R9
14730 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B
14731 LEAL -8(DI), DI
14732 LEAL 8(R10), R10
14733 JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm8B
14734
14735 matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B:
14736 #ifdef GOAMD64_v3
14737 TZCNTQ R9, R9
14738
14739 #else
14740 BSFQ R9, R9
14741
14742 #endif
14743 SARQ $0x03, R9
14744 LEAL (R10)(R9*1), R10
14745 JMP match_nolit_end_encodeSnappyBlockAsm8B
14746
14747 matchlen_match4_match_nolit_encodeSnappyBlockAsm8B:
14748 CMPL DI, $0x04
14749 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
14750 MOVL (R8)(R10*1), R9
14751 CMPL (SI)(R10*1), R9
14752 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm8B
14753 LEAL -4(DI), DI
14754 LEAL 4(R10), R10
14755
14756 matchlen_match2_match_nolit_encodeSnappyBlockAsm8B:
14757 CMPL DI, $0x01
14758 JE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
14759 JB match_nolit_end_encodeSnappyBlockAsm8B
14760 MOVW (R8)(R10*1), R9
14761 CMPW (SI)(R10*1), R9
14762 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B
14763 LEAL 2(R10), R10
14764 SUBL $0x02, DI
14765 JZ match_nolit_end_encodeSnappyBlockAsm8B
14766
14767 matchlen_match1_match_nolit_encodeSnappyBlockAsm8B:
14768 MOVB (R8)(R10*1), R9
14769 CMPB (SI)(R10*1), R9
14770 JNE match_nolit_end_encodeSnappyBlockAsm8B
14771 LEAL 1(R10), R10
14772
// DX = end of match (also stored to 12(SP)), SI = offset from 16(SP),
// R10 = total match length (extension + the initial 4 bytes).
14773 match_nolit_end_encodeSnappyBlockAsm8B:
14774 ADDL R10, DX
14775 MOVL 16(SP), SI
14776 ADDL $0x04, R10
14777 MOVL DX, 12(SP)
14778
14779 // emitCopy
// Same encoding as the copy emission in the repeat path, with the length in
// R10 and the offset in SI.
14780 two_byte_offset_match_nolit_encodeSnappyBlockAsm8B:
14781 CMPL R10, $0x40
14782 JBE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B
14783 MOVB $0xee, (CX)
14784 MOVW SI, 1(CX)
14785 LEAL -60(R10), R10
14786 ADDQ $0x03, CX
14787 JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B
14788
14789 two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B:
14790 MOVL R10, DI
14791 SHLL $0x02, DI
14792 CMPL R10, $0x0c
14793 JAE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B
14794 LEAL -15(DI), DI
14795 MOVB SI, 1(CX)
14796 SHRL $0x08, SI
14797 SHLL $0x05, SI
14798 ORL SI, DI
14799 MOVB DI, (CX)
14800 ADDQ $0x02, CX
14801 JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B
14802
14803 emit_copy_three_match_nolit_encodeSnappyBlockAsm8B:
14804 LEAL -2(DI), DI
14805 MOVB DI, (CX)
14806 MOVW SI, 1(CX)
14807 ADDQ $0x03, CX
14808
// After the copy: stop searching if DX reached 8(SP); return 0 if the write
// pointer reached the dst limit. DI = 8 source bytes starting at DX-2.
14809 match_nolit_emitcopy_end_encodeSnappyBlockAsm8B:
14810 CMPL DX, 8(SP)
14811 JAE emit_remainder_encodeSnappyBlockAsm8B
14812 MOVQ -2(BX)(DX*1), DI
14813 CMPQ CX, (SP)
14814 JB match_nolit_dst_ok_encodeSnappyBlockAsm8B
14815 MOVQ $0x00000000, ret+56(FP)
14816 RET
14817
// Store DX-2 and DX in the table under the hashes of their bytes, and read
// the previous entry for DX into SI. If its 4 bytes equal those at DX,
// start another match immediately; otherwise advance one byte and search.
14818 match_nolit_dst_ok_encodeSnappyBlockAsm8B:
14819 MOVQ $0x9e3779b1, R9
14820 MOVQ DI, R8
14821 SHRQ $0x10, DI
14822 MOVQ DI, SI
14823 SHLQ $0x20, R8
14824 IMULQ R9, R8
14825 SHRQ $0x38, R8
14826 SHLQ $0x20, SI
14827 IMULQ R9, SI
14828 SHRQ $0x38, SI
14829 LEAL -2(DX), R9
14830 LEAQ (AX)(SI*4), R10
14831 MOVL (R10), SI
14832 MOVL R9, (AX)(R8*4)
14833 MOVL DX, (R10)
14834 CMPL (BX)(SI*1), DI
14835 JEQ match_nolit_loop_encodeSnappyBlockAsm8B
14836 INCL DX
14837 JMP search_loop_encodeSnappyBlockAsm8B
14838
// Search finished. AX is reused as scratch from here on (the table pointer
// is no longer read). Return 0 if the remaining literals plus 3 bytes would
// reach the dst limit.
14839 emit_remainder_encodeSnappyBlockAsm8B:
14840 MOVQ src_len+32(FP), AX
14841 SUBL 12(SP), AX
14842 LEAQ 3(CX)(AX*1), AX
14843 CMPQ AX, (SP)
14844 JB emit_remainder_ok_encodeSnappyBlockAsm8B
14845 MOVQ $0x00000000, ret+56(FP)
14846 RET
14847
// Emit src[12(SP):len(src)] as literals (skipped when empty).
// AX = source pointer, SI = length, DX = length-1.
14848 emit_remainder_ok_encodeSnappyBlockAsm8B:
14849 MOVQ src_len+32(FP), AX
14850 MOVL 12(SP), DX
14851 CMPL DX, AX
14852 JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
14853 MOVL AX, SI
14854 MOVL AX, 12(SP)
14855 LEAQ (BX)(DX*1), AX
14856 SUBL DX, SI
14857 LEAL -1(SI), DX
14858 CMPL DX, $0x3c
14859 JB one_byte_emit_remainder_encodeSnappyBlockAsm8B
14860 CMPL DX, $0x00000100
14861 JB two_bytes_emit_remainder_encodeSnappyBlockAsm8B
// Never taken (same flags as the previous JB); falls through to the label.
14862 JB three_bytes_emit_remainder_encodeSnappyBlockAsm8B
14863
14864 three_bytes_emit_remainder_encodeSnappyBlockAsm8B:
14865 MOVB $0xf4, (CX)
14866 MOVW DX, 1(CX)
14867 ADDQ $0x03, CX
14868 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B
14869
14870 two_bytes_emit_remainder_encodeSnappyBlockAsm8B:
14871 MOVB $0xf0, (CX)
14872 MOVB DL, 1(CX)
14873 ADDQ $0x02, CX
14874 CMPL DX, $0x40
14875 JB memmove_emit_remainder_encodeSnappyBlockAsm8B
14876 JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B
14877
14878 one_byte_emit_remainder_encodeSnappyBlockAsm8B:
14879 SHLB $0x02, DL
14880 MOVB DL, (CX)
14881 ADDQ $0x01, CX
14882
// Short copy of the final literals. BX is overwritten with the length (the
// src base is no longer needed); DX = CX + length is the final write pointer.
// Unlike the copies inside the search loop, this dispatch has separate cases
// for 1-2, 3 and 4-7 bytes and moves only bytes inside the literal.
14883 memmove_emit_remainder_encodeSnappyBlockAsm8B:
14884 LEAQ (CX)(SI*1), DX
14885 MOVL SI, BX
14886
14887 // genMemMoveShort
14888 CMPQ BX, $0x03
14889 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2
14890 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3
14891 CMPQ BX, $0x08
14892 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7
14893 CMPQ BX, $0x10
14894 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16
14895 CMPQ BX, $0x20
14896 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32
14897 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64
14898
14899 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2:
14900 MOVB (AX), SI
14901 MOVB -1(AX)(BX*1), AL
14902 MOVB SI, (CX)
14903 MOVB AL, -1(CX)(BX*1)
14904 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
14905
14906 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3:
14907 MOVW (AX), SI
14908 MOVB 2(AX), AL
14909 MOVW SI, (CX)
14910 MOVB AL, 2(CX)
14911 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
14912
14913 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7:
14914 MOVL (AX), SI
14915 MOVL -4(AX)(BX*1), AX
14916 MOVL SI, (CX)
14917 MOVL AX, -4(CX)(BX*1)
14918 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
14919
14920 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16:
14921 MOVQ (AX), SI
14922 MOVQ -8(AX)(BX*1), AX
14923 MOVQ SI, (CX)
14924 MOVQ AX, -8(CX)(BX*1)
14925 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
14926
14927 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32:
14928 MOVOU (AX), X0
14929 MOVOU -16(AX)(BX*1), X1
14930 MOVOU X0, (CX)
14931 MOVOU X1, -16(CX)(BX*1)
14932 JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B
14933
14934 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64:
14935 MOVOU (AX), X0
14936 MOVOU 16(AX), X1
14937 MOVOU -32(AX)(BX*1), X2
14938 MOVOU -16(AX)(BX*1), X3
14939 MOVOU X0, (CX)
14940 MOVOU X1, 16(CX)
14941 MOVOU X2, -32(CX)(BX*1)
14942 MOVOU X3, -16(CX)(BX*1)
14943
14944 memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B:
14945 MOVQ DX, CX
14946 JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B
14947
// Long copy of the final literals: BX = length, AX = source, CX = destination.
// Same scheme as the long copies above (head/tail in X0-X3, aligned 32-byte
// stores in between).
14948 memmove_long_emit_remainder_encodeSnappyBlockAsm8B:
14949 LEAQ (CX)(SI*1), DX
14950 MOVL SI, BX
14951
14952 // genMemMoveLong
14953 MOVOU (AX), X0
14954 MOVOU 16(AX), X1
14955 MOVOU -32(AX)(BX*1), X2
14956 MOVOU -16(AX)(BX*1), X3
14957 MOVQ BX, DI
14958 SHRQ $0x05, DI
14959 MOVQ CX, SI
14960 ANDL $0x0000001f, SI
14961 MOVQ $0x00000040, R8
14962 SUBQ SI, R8
14963 DECQ DI
14964 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
14965 LEAQ -32(AX)(R8*1), SI
14966 LEAQ -32(CX)(R8*1), R9
14967
14968 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back:
14969 MOVOU (SI), X4
14970 MOVOU 16(SI), X5
14971 MOVOA X4, (R9)
14972 MOVOA X5, 16(R9)
14973 ADDQ $0x20, R9
14974 ADDQ $0x20, SI
14975 ADDQ $0x20, R8
14976 DECQ DI
14977 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back
14978
14979 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32:
14980 MOVOU -32(AX)(R8*1), X4
14981 MOVOU -16(AX)(R8*1), X5
14982 MOVOA X4, -32(CX)(R8*1)
14983 MOVOA X5, -16(CX)(R8*1)
14984 ADDQ $0x20, R8
14985 CMPQ BX, R8
14986 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
14987 MOVOU X0, (CX)
14988 MOVOU X1, 16(CX)
14989 MOVOU X2, -32(CX)(BX*1)
14990 MOVOU X3, -16(CX)(BX*1)
14991 MOVQ DX, CX
14992
// Return the number of bytes written: CX - dst_base.
14993 emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
14994 MOVQ dst_base+0(FP), AX
14995 SUBQ AX, CX
14996 MOVQ CX, ret+56(FP)
14997 RET
14998
14999 // func encodeSnappyBetterBlockAsm(dst []byte, src []byte, tmp *[589824]byte) int
15000 // Requires: BMI, SSE2
//
// Encodes src into dst as a sequence of literal and copy elements and returns
// the number of bytes written to dst. Returns 0 whenever the output pointer
// would reach the limit computed in the prologue.
//
// tmp holds two tables of 32-bit src positions:
//   tmp[0:524288]       "long" table,  1<<17 entries, indexed by a hash of 7 bytes
//   tmp[524288:589824]  "short" table, 1<<14 entries, indexed by a hash of 4 bytes
//
// Stack frame (24 bytes):
//   0(SP)   dst limit pointer: dst_base + (src_len - 9 - src_len>>5)
//   8(SP)   src_len - 8; the search stops once a position reaches this value
//   12(SP)  src index of the first byte that has not been emitted yet
//   16(SP)  offset of the most recently accepted match (only written in this function)
//   20(SP)  next src position to probe if the current probe finds nothing
//
// Register roles in the main loop:
//   AX = tmp, BX = src base, CX = dst write pointer, DX = current src position.
//
// NOTE(review): src_len - 9 / src_len - 8 wrap for inputs shorter than 9 bytes;
// presumably the Go caller never passes such short blocks - confirm.
15001 TEXT ·encodeSnappyBetterBlockAsm(SB), $24-64
15002 MOVQ tmp+48(FP), AX
15003 MOVQ dst_base+0(FP), CX
15004 MOVQ $0x00001200, DX  // 0x1200 iterations * 128 bytes = 589824 bytes (all of tmp)
15005 MOVQ AX, BX
15006 PXOR X0, X0
15007
// Clear both tables, 128 bytes per iteration.
15008 zero_loop_encodeSnappyBetterBlockAsm:
15009 MOVOU X0, (BX)
15010 MOVOU X0, 16(BX)
15011 MOVOU X0, 32(BX)
15012 MOVOU X0, 48(BX)
15013 MOVOU X0, 64(BX)
15014 MOVOU X0, 80(BX)
15015 MOVOU X0, 96(BX)
15016 MOVOU X0, 112(BX)
15017 ADDQ $0x80, BX
15018 DECQ DX
15019 JNZ zero_loop_encodeSnappyBetterBlockAsm
15020 MOVL $0x00000000, 12(SP)  // nothing emitted yet
15021 MOVQ src_len+32(FP), DX
15022 LEAQ -9(DX), BX
15023 LEAQ -8(DX), SI
15024 MOVL SI, 8(SP)  // search limit = src_len - 8 (every probe loads 8 bytes)
15025 SHRQ $0x05, DX
15026 SUBL DX, BX  // BX = src_len - 9 - src_len/32
15027 LEAQ (CX)(BX*1), BX
15028 MOVQ BX, (SP)  // dst limit pointer
15029 MOVL $0x00000001, DX  // searching starts at src position 1
15030 MOVL $0x00000000, 16(SP)
15031 MOVQ src_base+24(FP), BX
15032
// Main search loop. The step to the next probe position grows with the
// distance since the last emit: 1 + (DX - 12(SP))>>7, capped at 100.
15033 search_loop_encodeSnappyBetterBlockAsm:
15034 MOVL DX, SI
15035 SUBL 12(SP), SI
15036 SHRL $0x07, SI
15037 CMPL SI, $0x63
15038 JBE check_maxskip_ok_encodeSnappyBetterBlockAsm
15039 LEAL 100(DX), SI  // skip would exceed 99: use the fixed maximum step of 100
15040 JMP check_maxskip_cont_encodeSnappyBetterBlockAsm
15041
15042 check_maxskip_ok_encodeSnappyBetterBlockAsm:
15043 LEAL 1(DX)(SI*1), SI
15044
15045 check_maxskip_cont_encodeSnappyBetterBlockAsm:
15046 CMPL SI, 8(SP)
15047 JAE emit_remainder_encodeSnappyBetterBlockAsm  // next position past the search limit: flush the tail
15048 MOVQ (BX)(DX*1), DI  // DI = 8 bytes at src[DX]
15049 MOVL SI, 20(SP)  // remember the next probe position
15050 MOVQ $0x00cf1bbcdcbfa563, R9  // multiplier for the long (7-byte) hash
15051 MOVQ $0x9e3779b1, SI  // multiplier for the short (4-byte) hash
15052 MOVQ DI, R10
15053 MOVQ DI, R11
15054 SHLQ $0x08, R10  // drop the top byte so only the low 7 bytes are hashed
15055 IMULQ R9, R10
15056 SHRQ $0x2f, R10  // keep the top 17 bits: long table index
15057 SHLQ $0x20, R11  // keep only the low 4 bytes
15058 IMULQ SI, R11
15059 SHRQ $0x32, R11  // keep the top 14 bits: short table index
15060 MOVL (AX)(R10*4), SI  // SI = long-table candidate position
15061 MOVL 524288(AX)(R11*4), R8  // R8 = short-table candidate position
15062 MOVL DX, (AX)(R10*4)  // both tables now point at the current position
15063 MOVL DX, 524288(AX)(R11*4)
15064 MOVQ (BX)(SI*1), R10  // 8 bytes at the long candidate
15065 MOVQ (BX)(R8*1), R11  // 8 bytes at the short candidate
15066 CMPQ R10, DI
15067 JEQ candidate_match_encodeSnappyBetterBlockAsm  // full 8-byte match at the long candidate
15068 CMPQ R11, DI
15069 JNE no_short_found_encodeSnappyBetterBlockAsm
15070 MOVL R8, SI  // full 8-byte match at the short candidate
15071 JMP candidate_match_encodeSnappyBetterBlockAsm
15072
// No 8-byte match: fall back to comparing only the low 4 bytes.
15073 no_short_found_encodeSnappyBetterBlockAsm:
15074 CMPL R10, DI
15075 JEQ candidate_match_encodeSnappyBetterBlockAsm
15076 CMPL R11, DI
15077 JEQ candidateS_match_encodeSnappyBetterBlockAsm
15078 MOVL 20(SP), DX  // nothing found: advance to the saved next position
15079 JMP search_loop_encodeSnappyBetterBlockAsm
15080
// The short candidate matches 4 bytes. Before accepting it, probe the long
// table for position DX+1 and prefer that candidate if its first 4 bytes match.
15081 candidateS_match_encodeSnappyBetterBlockAsm:
15082 SHRQ $0x08, DI  // DI = bytes starting at src[DX+1]
15083 MOVQ DI, R10
15084 SHLQ $0x08, R10
15085 IMULQ R9, R10
15086 SHRQ $0x2f, R10
15087 MOVL (AX)(R10*4), SI
15088 INCL DX
15089 MOVL DX, (AX)(R10*4)
15090 CMPL (BX)(SI*1), DI
15091 JEQ candidate_match_encodeSnappyBetterBlockAsm
15092 DECL DX  // long probe at DX+1 failed: go back to DX with the short candidate
15093 MOVL R8, SI
15094
// Here DX = match position, SI = candidate position.
15095 candidate_match_encodeSnappyBetterBlockAsm:
15096 MOVL 12(SP), DI
15097 TESTL SI, SI
15098 JZ match_extend_back_end_encodeSnappyBetterBlockAsm  // candidate at index 0 cannot be extended backwards
15099
// Extend the match backwards while the preceding bytes agree, without going
// below the first unemitted byte (DI) or below src index 0 for the candidate.
15100 match_extend_back_loop_encodeSnappyBetterBlockAsm:
15101 CMPL DX, DI
15102 JBE match_extend_back_end_encodeSnappyBetterBlockAsm
15103 MOVB -1(BX)(SI*1), R8
15104 MOVB -1(BX)(DX*1), R9
15105 CMPB R8, R9
15106 JNE match_extend_back_end_encodeSnappyBetterBlockAsm
15107 LEAL -1(DX), DX
15108 DECL SI
15109 JZ match_extend_back_end_encodeSnappyBetterBlockAsm
15110 JMP match_extend_back_loop_encodeSnappyBetterBlockAsm
15111
// Output bound check: pending literal length + 5 bytes must stay below the
// dst limit, otherwise give up and return 0.
15112 match_extend_back_end_encodeSnappyBetterBlockAsm:
15113 MOVL DX, DI
15114 SUBL 12(SP), DI
15115 LEAQ 5(CX)(DI*1), DI
15116 CMPQ DI, (SP)
15117 JB match_dst_size_check_encodeSnappyBetterBlockAsm
15118 MOVQ $0x00000000, ret+56(FP)
15119 RET
15120
15121 match_dst_size_check_encodeSnappyBetterBlockAsm:
15122 MOVL DX, DI  // DI = start of the match (end of the pending literals)
15123 ADDL $0x04, DX  // skip the 4 bytes already compared
15124 ADDL $0x04, SI
15125 MOVQ src_len+32(FP), R8
15126 SUBL DX, R8  // R8 = bytes left in src after DX
15127 LEAQ (BX)(DX*1), R9  // R9 = &src[DX]
15128 LEAQ (BX)(SI*1), R10  // R10 = &src[SI]
15129
15130 // matchLen
// Counts into R12 how many further bytes at R9 and R10 are equal, comparing
// 16, 8, 4, 2 and finally 1 byte at a time; R8 bounds the comparison.
15131 XORL R12, R12
15132
15133 matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm:
15134 CMPL R8, $0x10
15135 JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm
15136 MOVQ (R9)(R12*1), R11
15137 MOVQ 8(R9)(R12*1), R13
15138 XORQ (R10)(R12*1), R11  // non-zero result marks a difference in the first 8 bytes
15139 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm
15140 XORQ 8(R10)(R12*1), R13
15141 JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm
15142 LEAL -16(R8), R8
15143 LEAL 16(R12), R12
15144 JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm
15145
// Difference in the second 8-byte word: lowest set bit / 8 = equal bytes in
// that word; add 8 for the first word.
15146 matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm:
15147 #ifdef GOAMD64_v3
15148 TZCNTQ R13, R13
15149
15150 #else
15151 BSFQ R13, R13
15152
15153 #endif
15154 SARQ $0x03, R13
15155 LEAL 8(R12)(R13*1), R12
15156 JMP match_nolit_end_encodeSnappyBetterBlockAsm
15157
15158 matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm:
15159 CMPL R8, $0x08
15160 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm
15161 MOVQ (R9)(R12*1), R11
15162 XORQ (R10)(R12*1), R11
15163 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm
15164 LEAL -8(R8), R8
15165 LEAL 8(R12), R12
15166 JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm
15167
// Difference in the current 8-byte word: lowest set bit / 8 = equal bytes.
15168 matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm:
15169 #ifdef GOAMD64_v3
15170 TZCNTQ R11, R11
15171
15172 #else
15173 BSFQ R11, R11
15174
15175 #endif
15176 SARQ $0x03, R11
15177 LEAL (R12)(R11*1), R12
15178 JMP match_nolit_end_encodeSnappyBetterBlockAsm
15179
15180 matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm:
15181 CMPL R8, $0x04
15182 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
15183 MOVL (R9)(R12*1), R11
15184 CMPL (R10)(R12*1), R11
15185 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
15186 LEAL -4(R8), R8
15187 LEAL 4(R12), R12
15188
15189 matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm:
15190 CMPL R8, $0x01
15191 JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm  // exactly one byte left
15192 JB match_nolit_end_encodeSnappyBetterBlockAsm  // nothing left
15193 MOVW (R9)(R12*1), R11
15194 CMPW (R10)(R12*1), R11
15195 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
15196 LEAL 2(R12), R12
15197 SUBL $0x02, R8
15198 JZ match_nolit_end_encodeSnappyBetterBlockAsm
15199
15200 matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm:
15201 MOVB (R9)(R12*1), R11
15202 CMPB (R10)(R12*1), R11
15203 JNE match_nolit_end_encodeSnappyBetterBlockAsm
15204 LEAL 1(R12), R12
15205
15206 match_nolit_end_encodeSnappyBetterBlockAsm:
15207 MOVL DX, R8
15208 SUBL SI, R8  // R8 = match offset (distance back to the candidate)
15209
15210 // Reject weak matches: at most 1 byte beyond the initial 4 combined with an offset above 65535.
15211 CMPL R12, $0x01
15212 JA match_length_ok_encodeSnappyBetterBlockAsm
15213 CMPL R8, $0x0000ffff
15214 JBE match_length_ok_encodeSnappyBetterBlockAsm
15215 MOVL 20(SP), DX  // resume searching one past the saved next position
15216 INCL DX
15217 JMP search_loop_encodeSnappyBetterBlockAsm
15218
// Match accepted. Emit the pending literals src[12(SP):DI] first, if any.
15219 match_length_ok_encodeSnappyBetterBlockAsm:
15220 MOVL R8, 16(SP)
15221 MOVL 12(SP), SI
15222 CMPL SI, DI
15223 JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm  // no pending literals
15224 MOVL DI, R9
15225 MOVL DI, 12(SP)
15226 LEAQ (BX)(SI*1), R10  // R10 = literal source pointer
15227 SUBL SI, R9  // R9 = literal length
15228 LEAL -1(R9), SI  // SI = length - 1, the value stored in the literal header
// Header size depends on length-1: < 60 fits in the tag byte itself; otherwise
// the tag is 0xf0/0xf4/0xf8/0xfc (60..63 << 2) followed by 1/2/3/4 length bytes.
15229 CMPL SI, $0x3c
15230 JB one_byte_match_emit_encodeSnappyBetterBlockAsm
15231 CMPL SI, $0x00000100
15232 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm
15233 CMPL SI, $0x00010000
15234 JB three_bytes_match_emit_encodeSnappyBetterBlockAsm
15235 CMPL SI, $0x01000000
15236 JB four_bytes_match_emit_encodeSnappyBetterBlockAsm
15237 MOVB $0xfc, (CX)
15238 MOVL SI, 1(CX)
15239 ADDQ $0x05, CX
15240 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
15241
15242 four_bytes_match_emit_encodeSnappyBetterBlockAsm:
15243 MOVL SI, R11
15244 SHRL $0x10, R11  // third length byte
15245 MOVB $0xf8, (CX)
15246 MOVW SI, 1(CX)
15247 MOVB R11, 3(CX)
15248 ADDQ $0x04, CX
15249 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
15250
15251 three_bytes_match_emit_encodeSnappyBetterBlockAsm:
15252 MOVB $0xf4, (CX)
15253 MOVW SI, 1(CX)
15254 ADDQ $0x03, CX
15255 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
15256
15257 two_bytes_match_emit_encodeSnappyBetterBlockAsm:
15258 MOVB $0xf0, (CX)
15259 MOVB SI, 1(CX)
15260 ADDQ $0x02, CX
15261 CMPL SI, $0x40  // up to 64 bytes still go through the short copy
15262 JB memmove_match_emit_encodeSnappyBetterBlockAsm
15263 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm
15264
15265 one_byte_match_emit_encodeSnappyBetterBlockAsm:
15266 SHLB $0x02, SI  // tag byte = (length-1) << 2
15267 MOVB SI, (CX)
15268 ADDQ $0x01, CX
15269
// Copy R9 (<= 64) literal bytes from R10 to CX; SI = CX after the copy.
15270 memmove_match_emit_encodeSnappyBetterBlockAsm:
15271 LEAQ (CX)(R9*1), SI
15272
15273 // genMemMoveShort
15274 CMPQ R9, $0x08
15275 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8
15276 CMPQ R9, $0x10
15277 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16
15278 CMPQ R9, $0x20
15279 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32
15280 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64
15281
// Lengths up to 8: always moves a full 8 bytes; CX still advances by R9 only.
15282 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8:
15283 MOVQ (R10), R11
15284 MOVQ R11, (CX)
15285 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
15286
// The remaining cases use two (or four) possibly overlapping moves anchored at
// the start and at the end of the range.
15287 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16:
15288 MOVQ (R10), R11
15289 MOVQ -8(R10)(R9*1), R10
15290 MOVQ R11, (CX)
15291 MOVQ R10, -8(CX)(R9*1)
15292 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
15293
15294 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32:
15295 MOVOU (R10), X0
15296 MOVOU -16(R10)(R9*1), X1
15297 MOVOU X0, (CX)
15298 MOVOU X1, -16(CX)(R9*1)
15299 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm
15300
15301 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64:
15302 MOVOU (R10), X0
15303 MOVOU 16(R10), X1
15304 MOVOU -32(R10)(R9*1), X2
15305 MOVOU -16(R10)(R9*1), X3
15306 MOVOU X0, (CX)
15307 MOVOU X1, 16(CX)
15308 MOVOU X2, -32(CX)(R9*1)
15309 MOVOU X3, -16(CX)(R9*1)
15310
15311 memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm:
15312 MOVQ SI, CX
15313 JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm
15314
// Copy R9 literal bytes from R10 to CX for the longer cases; SI = CX after the copy.
15315 memmove_long_match_emit_encodeSnappyBetterBlockAsm:
15316 LEAQ (CX)(R9*1), SI
15317
15318 // genMemMoveLong
// The first and last 32 bytes are loaded up front and stored unaligned at the
// very end; the middle is copied in 32-byte steps using aligned stores (MOVOA).
15319 MOVOU (R10), X0
15320 MOVOU 16(R10), X1
15321 MOVOU -32(R10)(R9*1), X2
15322 MOVOU -16(R10)(R9*1), X3
15323 MOVQ R9, R13
15324 SHRQ $0x05, R13  // R13 = number of whole 32-byte chunks
15325 MOVQ CX, R11
15326 ANDL $0x0000001f, R11  // misalignment of dst within 32 bytes
15327 MOVQ $0x00000040, R14
15328 SUBQ R11, R14  // R14 = 64 - misalignment, so CX+R14-32 is 32-byte aligned
// NOTE(review): DECQ does not modify CF, so the JA/JNA below combine ZF from
// DECQ with CF left by the preceding SUBQ/ADDQ - confirm this is intended by the generator.
15329 DECQ R13
15330 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
15331 LEAQ -32(R10)(R14*1), R11
15332 LEAQ -32(CX)(R14*1), R15
15333
15334 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back:
15335 MOVOU (R11), X4
15336 MOVOU 16(R11), X5
15337 MOVOA X4, (R15)
15338 MOVOA X5, 16(R15)
15339 ADDQ $0x20, R15
15340 ADDQ $0x20, R11
15341 ADDQ $0x20, R14
15342 DECQ R13
15343 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back
15344
15345 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
15346 MOVOU -32(R10)(R14*1), X4
15347 MOVOU -16(R10)(R14*1), X5
15348 MOVOA X4, -32(CX)(R14*1)
15349 MOVOA X5, -16(CX)(R14*1)
15350 ADDQ $0x20, R14
15351 CMPQ R9, R14
15352 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
15353 MOVOU X0, (CX)  // saved head and tail cover the unaligned edges
15354 MOVOU X1, 16(CX)
15355 MOVOU X2, -32(CX)(R9*1)
15356 MOVOU X3, -16(CX)(R9*1)
15357 MOVQ SI, CX
15358
15359 emit_literal_done_match_emit_encodeSnappyBetterBlockAsm:
15360 ADDL R12, DX  // DX = src position just past the match
15361 ADDL $0x04, R12  // R12 = full match length (the initial 4 bytes + extension)
15362 MOVL DX, 12(SP)  // everything up to DX counts as emitted
15363
15364 // emitCopy
// Encodes a copy of R12 bytes at offset R8. Offsets of 65536 and above use
// 5-byte elements (tag + 32-bit offset); smaller offsets use 2- or 3-byte elements.
15365 CMPL R8, $0x00010000
15366 JB two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
15367
// Long offsets: emit 64-byte copies (tag 0xff) while more than 64 bytes remain.
15368 four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm:
15369 CMPL R12, $0x40
15370 JBE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
15371 MOVB $0xff, (CX)
15372 MOVL R8, 1(CX)
15373 LEAL -64(R12), R12
15374 ADDQ $0x05, CX
15375 CMPL R12, $0x04
15376 JB four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
15377 JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm
15378
15379 four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm:
15380 TESTL R12, R12
15381 JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
15382 XORL SI, SI
15383 LEAL -1(SI)(R12*4), R12  // tag = length*4 - 1, i.e. ((length-1) << 2) | 3
15384 MOVB R12, (CX)
15385 MOVL R8, 1(CX)
15386 ADDQ $0x05, CX
15387 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
15388
// Offsets below 65536: emit 60-byte copies (tag 0xee + 16-bit offset) while
// more than 64 bytes remain.
15389 two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm:
15390 CMPL R12, $0x40
15391 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm
15392 MOVB $0xee, (CX)
15393 MOVW R8, 1(CX)
15394 LEAL -60(R12), R12
15395 ADDQ $0x03, CX
15396 JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm
15397
// Final element: the 2-byte form is used only when length < 12 and offset < 2048.
15398 two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm:
15399 MOVL R12, SI
15400 SHLL $0x02, SI
15401 CMPL R12, $0x0c
15402 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
15403 CMPL R8, $0x00000800
15404 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
15405 LEAL -15(SI), SI  // length*4 - 15 = ((length-4) << 2) | 1
15406 MOVB R8, 1(CX)  // low 8 bits of the offset
15407 SHRL $0x08, R8
15408 SHLL $0x05, R8  // high offset bits go into bits 5 and up of the tag byte
15409 ORL R8, SI
15410 MOVB SI, (CX)
15411 ADDQ $0x02, CX
15412 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
15413
15414 emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm:
15415 LEAL -2(SI), SI  // length*4 - 2 = ((length-1) << 2) | 2
15416 MOVB SI, (CX)
15417 MOVW R8, 1(CX)
15418 ADDQ $0x03, CX
15419
15420 match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
15421 CMPL DX, 8(SP)
15422 JAE emit_remainder_encodeSnappyBetterBlockAsm  // reached the search limit
15423 CMPQ CX, (SP)
15424 JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm
15425 MOVQ $0x00000000, ret+56(FP)  // dst limit reached: report failure
15426 RET
15427
// Index positions covered by the match so later searches can find them.
// DI becomes match start + 1 and R9 becomes match end - 2; each gets a long-table
// entry, and DI+1 / R9+1 get short-table entries.
15428 match_nolit_dst_ok_encodeSnappyBetterBlockAsm:
15429 MOVQ $0x00cf1bbcdcbfa563, SI
15430 MOVQ $0x9e3779b1, R8
15431 LEAQ 1(DI), DI
15432 LEAQ -2(DX), R9
15433 MOVQ (BX)(DI*1), R10
15434 MOVQ 1(BX)(DI*1), R11
15435 MOVQ (BX)(R9*1), R12
15436 MOVQ 1(BX)(R9*1), R13
15437 SHLQ $0x08, R10
15438 IMULQ SI, R10
15439 SHRQ $0x2f, R10
15440 SHLQ $0x20, R11
15441 IMULQ R8, R11
15442 SHRQ $0x32, R11
15443 SHLQ $0x08, R12
15444 IMULQ SI, R12
15445 SHRQ $0x2f, R12
15446 SHLQ $0x20, R13
15447 IMULQ R8, R13
15448 SHRQ $0x32, R13
15449 LEAQ 1(DI), R8
15450 LEAQ 1(R9), R14
15451 MOVL DI, (AX)(R10*4)
15452 MOVL R9, (AX)(R12*4)
15453 MOVL R8, 524288(AX)(R11*4)
15454 MOVL R14, 524288(AX)(R13*4)
15455 LEAQ 1(R9)(DI*1), R8
15456 SHRQ $0x01, R8  // R8 = midpoint between the two indexed positions
15457 ADDQ $0x01, DI
15458 SUBQ $0x01, R9
15459
// Fill the long table with every second position, walking DI up from the start
// and R8 up from the midpoint, until R8 reaches R9.
15460 index_loop_encodeSnappyBetterBlockAsm:
15461 CMPQ R8, R9
15462 JAE search_loop_encodeSnappyBetterBlockAsm
15463 MOVQ (BX)(DI*1), R10
15464 MOVQ (BX)(R8*1), R11
15465 SHLQ $0x08, R10
15466 IMULQ SI, R10
15467 SHRQ $0x2f, R10
15468 SHLQ $0x08, R11
15469 IMULQ SI, R11
15470 SHRQ $0x2f, R11
15471 MOVL DI, (AX)(R10*4)
15472 MOVL R8, (AX)(R11*4)
15473 ADDQ $0x02, DI
15474 ADDQ $0x02, R8
15475 JMP index_loop_encodeSnappyBetterBlockAsm
15476
// Flush src[12(SP):src_len] as one final literal. AX (tmp) and BX (src base)
// are reused as scratch from here on.
15477 emit_remainder_encodeSnappyBetterBlockAsm:
15478 MOVQ src_len+32(FP), AX
15479 SUBL 12(SP), AX
15480 LEAQ 5(CX)(AX*1), AX  // worst case: 5 header bytes + remaining literals
15481 CMPQ AX, (SP)
15482 JB emit_remainder_ok_encodeSnappyBetterBlockAsm
15483 MOVQ $0x00000000, ret+56(FP)
15484 RET
15485
15486 emit_remainder_ok_encodeSnappyBetterBlockAsm:
15487 MOVQ src_len+32(FP), AX
15488 MOVL 12(SP), DX
15489 CMPL DX, AX
15490 JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm  // nothing left to emit
15491 MOVL AX, SI
15492 MOVL AX, 12(SP)
15493 LEAQ (BX)(DX*1), AX  // AX = literal source pointer
15494 SUBL DX, SI  // SI = literal length
15495 LEAL -1(SI), DX  // DX = length - 1 for the literal header
15496 CMPL DX, $0x3c
15497 JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm
15498 CMPL DX, $0x00000100
15499 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm
15500 CMPL DX, $0x00010000
15501 JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm
15502 CMPL DX, $0x01000000
15503 JB four_bytes_emit_remainder_encodeSnappyBetterBlockAsm
15504 MOVB $0xfc, (CX)
15505 MOVL DX, 1(CX)
15506 ADDQ $0x05, CX
15507 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
15508
15509 four_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
15510 MOVL DX, BX
15511 SHRL $0x10, BX  // third length byte
15512 MOVB $0xf8, (CX)
15513 MOVW DX, 1(CX)
15514 MOVB BL, 3(CX)
15515 ADDQ $0x04, CX
15516 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
15517
15518 three_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
15519 MOVB $0xf4, (CX)
15520 MOVW DX, 1(CX)
15521 ADDQ $0x03, CX
15522 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
15523
15524 two_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
15525 MOVB $0xf0, (CX)
15526 MOVB DL, 1(CX)
15527 ADDQ $0x02, CX
15528 CMPL DX, $0x40
15529 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm
15530 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm
15531
15532 one_byte_emit_remainder_encodeSnappyBetterBlockAsm:
15533 SHLB $0x02, DL
15534 MOVB DL, (CX)
15535 ADDQ $0x01, CX
15536
// Copy BX (<= 64) bytes from AX to CX; DX = CX after the copy. Unlike the
// match-emit copy above, every size class here moves exactly BX bytes.
15537 memmove_emit_remainder_encodeSnappyBetterBlockAsm:
15538 LEAQ (CX)(SI*1), DX
15539 MOVL SI, BX
15540
15541 // genMemMoveShort
15542 CMPQ BX, $0x03
15543 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2
15544 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3
15545 CMPQ BX, $0x08
15546 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7
15547 CMPQ BX, $0x10
15548 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16
15549 CMPQ BX, $0x20
15550 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32
15551 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64
15552
15553 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2:
15554 MOVB (AX), SI
15555 MOVB -1(AX)(BX*1), AL
15556 MOVB SI, (CX)
15557 MOVB AL, -1(CX)(BX*1)
15558 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
15559
15560 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3:
15561 MOVW (AX), SI
15562 MOVB 2(AX), AL
15563 MOVW SI, (CX)
15564 MOVB AL, 2(CX)
15565 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
15566
15567 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7:
15568 MOVL (AX), SI
15569 MOVL -4(AX)(BX*1), AX
15570 MOVL SI, (CX)
15571 MOVL AX, -4(CX)(BX*1)
15572 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
15573
15574 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16:
15575 MOVQ (AX), SI
15576 MOVQ -8(AX)(BX*1), AX
15577 MOVQ SI, (CX)
15578 MOVQ AX, -8(CX)(BX*1)
15579 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
15580
15581 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32:
15582 MOVOU (AX), X0
15583 MOVOU -16(AX)(BX*1), X1
15584 MOVOU X0, (CX)
15585 MOVOU X1, -16(CX)(BX*1)
15586 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm
15587
15588 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64:
15589 MOVOU (AX), X0
15590 MOVOU 16(AX), X1
15591 MOVOU -32(AX)(BX*1), X2
15592 MOVOU -16(AX)(BX*1), X3
15593 MOVOU X0, (CX)
15594 MOVOU X1, 16(CX)
15595 MOVOU X2, -32(CX)(BX*1)
15596 MOVOU X3, -16(CX)(BX*1)
15597
15598 memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm:
15599 MOVQ DX, CX
15600 JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm
15601
// Long copy of BX bytes from AX to CX; same scheme as the long literal copy in
// the match path (head/tail saved in X0-X3, aligned 32-byte stores in between).
15602 memmove_long_emit_remainder_encodeSnappyBetterBlockAsm:
15603 LEAQ (CX)(SI*1), DX
15604 MOVL SI, BX
15605
15606 // genMemMoveLong
15607 MOVOU (AX), X0
15608 MOVOU 16(AX), X1
15609 MOVOU -32(AX)(BX*1), X2
15610 MOVOU -16(AX)(BX*1), X3
15611 MOVQ BX, DI
15612 SHRQ $0x05, DI  // DI = number of whole 32-byte chunks
15613 MOVQ CX, SI
15614 ANDL $0x0000001f, SI  // misalignment of dst within 32 bytes
15615 MOVQ $0x00000040, R8
15616 SUBQ SI, R8  // R8 = 64 - misalignment
15617 DECQ DI
15618 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
15619 LEAQ -32(AX)(R8*1), SI
15620 LEAQ -32(CX)(R8*1), R9
15621
15622 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back:
15623 MOVOU (SI), X4
15624 MOVOU 16(SI), X5
15625 MOVOA X4, (R9)
15626 MOVOA X5, 16(R9)
15627 ADDQ $0x20, R9
15628 ADDQ $0x20, SI
15629 ADDQ $0x20, R8
15630 DECQ DI
15631 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back
15632
15633 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
15634 MOVOU -32(AX)(R8*1), X4
15635 MOVOU -16(AX)(R8*1), X5
15636 MOVOA X4, -32(CX)(R8*1)
15637 MOVOA X5, -16(CX)(R8*1)
15638 ADDQ $0x20, R8
15639 CMPQ BX, R8
15640 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
15641 MOVOU X0, (CX)  // saved head and tail cover the unaligned edges
15642 MOVOU X1, 16(CX)
15643 MOVOU X2, -32(CX)(BX*1)
15644 MOVOU X3, -16(CX)(BX*1)
15645 MOVQ DX, CX
15646
// Return the number of bytes written: final write pointer minus dst_base.
15647 emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm:
15648 MOVQ dst_base+0(FP), AX
15649 SUBQ AX, CX
15650 MOVQ CX, ret+56(FP)
15651 RET
15652
15653 // func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte, tmp *[294912]byte) int
15654 // Requires: BMI, SSE2
15655 TEXT ·encodeSnappyBetterBlockAsm64K(SB), $24-64
15656 MOVQ tmp+48(FP), AX
15657 MOVQ dst_base+0(FP), CX
15658 MOVQ $0x00000900, DX
15659 MOVQ AX, BX
15660 PXOR X0, X0
15661
15662 zero_loop_encodeSnappyBetterBlockAsm64K:
15663 MOVOU X0, (BX)
15664 MOVOU X0, 16(BX)
15665 MOVOU X0, 32(BX)
15666 MOVOU X0, 48(BX)
15667 MOVOU X0, 64(BX)
15668 MOVOU X0, 80(BX)
15669 MOVOU X0, 96(BX)
15670 MOVOU X0, 112(BX)
15671 ADDQ $0x80, BX
15672 DECQ DX
15673 JNZ zero_loop_encodeSnappyBetterBlockAsm64K
15674 MOVL $0x00000000, 12(SP)
15675 MOVQ src_len+32(FP), DX
15676 LEAQ -9(DX), BX
15677 LEAQ -8(DX), SI
15678 MOVL SI, 8(SP)
15679 SHRQ $0x05, DX
15680 SUBL DX, BX
15681 LEAQ (CX)(BX*1), BX
15682 MOVQ BX, (SP)
15683 MOVL $0x00000001, DX
15684 MOVL $0x00000000, 16(SP)
15685 MOVQ src_base+24(FP), BX
15686
15687 search_loop_encodeSnappyBetterBlockAsm64K:
15688 MOVL DX, SI
15689 SUBL 12(SP), SI
15690 SHRL $0x07, SI
15691 LEAL 1(DX)(SI*1), SI
15692 CMPL SI, 8(SP)
15693 JAE emit_remainder_encodeSnappyBetterBlockAsm64K
15694 MOVQ (BX)(DX*1), DI
15695 MOVL SI, 20(SP)
15696 MOVQ $0x00cf1bbcdcbfa563, R9
15697 MOVQ $0x9e3779b1, SI
15698 MOVQ DI, R10
15699 MOVQ DI, R11
15700 SHLQ $0x08, R10
15701 IMULQ R9, R10
15702 SHRQ $0x30, R10
15703 SHLQ $0x20, R11
15704 IMULQ SI, R11
15705 SHRQ $0x33, R11
15706 MOVL (AX)(R10*4), SI
15707 MOVL 262144(AX)(R11*4), R8
15708 MOVL DX, (AX)(R10*4)
15709 MOVL DX, 262144(AX)(R11*4)
15710 MOVQ (BX)(SI*1), R10
15711 MOVQ (BX)(R8*1), R11
15712 CMPQ R10, DI
15713 JEQ candidate_match_encodeSnappyBetterBlockAsm64K
15714 CMPQ R11, DI
15715 JNE no_short_found_encodeSnappyBetterBlockAsm64K
15716 MOVL R8, SI
15717 JMP candidate_match_encodeSnappyBetterBlockAsm64K
15718
15719 no_short_found_encodeSnappyBetterBlockAsm64K:
15720 CMPL R10, DI
15721 JEQ candidate_match_encodeSnappyBetterBlockAsm64K
15722 CMPL R11, DI
15723 JEQ candidateS_match_encodeSnappyBetterBlockAsm64K
15724 MOVL 20(SP), DX
15725 JMP search_loop_encodeSnappyBetterBlockAsm64K
15726
15727 candidateS_match_encodeSnappyBetterBlockAsm64K:
15728 SHRQ $0x08, DI
15729 MOVQ DI, R10
15730 SHLQ $0x08, R10
15731 IMULQ R9, R10
15732 SHRQ $0x30, R10
15733 MOVL (AX)(R10*4), SI
15734 INCL DX
15735 MOVL DX, (AX)(R10*4)
15736 CMPL (BX)(SI*1), DI
15737 JEQ candidate_match_encodeSnappyBetterBlockAsm64K
15738 DECL DX
15739 MOVL R8, SI
15740
15741 candidate_match_encodeSnappyBetterBlockAsm64K:
15742 MOVL 12(SP), DI
15743 TESTL SI, SI
15744 JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
15745
15746 match_extend_back_loop_encodeSnappyBetterBlockAsm64K:
15747 CMPL DX, DI
15748 JBE match_extend_back_end_encodeSnappyBetterBlockAsm64K
15749 MOVB -1(BX)(SI*1), R8
15750 MOVB -1(BX)(DX*1), R9
15751 CMPB R8, R9
15752 JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K
15753 LEAL -1(DX), DX
15754 DECL SI
15755 JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K
15756 JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K
15757
15758 match_extend_back_end_encodeSnappyBetterBlockAsm64K:
15759 MOVL DX, DI
15760 SUBL 12(SP), DI
15761 LEAQ 3(CX)(DI*1), DI
15762 CMPQ DI, (SP)
15763 JB match_dst_size_check_encodeSnappyBetterBlockAsm64K
15764 MOVQ $0x00000000, ret+56(FP)
15765 RET
15766
15767 match_dst_size_check_encodeSnappyBetterBlockAsm64K:
15768 MOVL DX, DI
15769 ADDL $0x04, DX
15770 ADDL $0x04, SI
15771 MOVQ src_len+32(FP), R8
15772 SUBL DX, R8
15773 LEAQ (BX)(DX*1), R9
15774 LEAQ (BX)(SI*1), R10
15775
15776 // matchLen
15777 XORL R12, R12
15778
15779 matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K:
15780 CMPL R8, $0x10
15781 JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K
15782 MOVQ (R9)(R12*1), R11
15783 MOVQ 8(R9)(R12*1), R13
15784 XORQ (R10)(R12*1), R11
15785 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K
15786 XORQ 8(R10)(R12*1), R13
15787 JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K
15788 LEAL -16(R8), R8
15789 LEAL 16(R12), R12
15790 JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K
15791
15792 matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K:
15793 #ifdef GOAMD64_v3
15794 TZCNTQ R13, R13
15795
15796 #else
15797 BSFQ R13, R13
15798
15799 #endif
15800 SARQ $0x03, R13
15801 LEAL 8(R12)(R13*1), R12
15802 JMP match_nolit_end_encodeSnappyBetterBlockAsm64K
15803
15804 matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K:
15805 CMPL R8, $0x08
15806 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K
15807 MOVQ (R9)(R12*1), R11
15808 XORQ (R10)(R12*1), R11
15809 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K
15810 LEAL -8(R8), R8
15811 LEAL 8(R12), R12
15812 JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K
15813
15814 matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K:
15815 #ifdef GOAMD64_v3
15816 TZCNTQ R11, R11
15817
15818 #else
15819 BSFQ R11, R11
15820
15821 #endif
15822 SARQ $0x03, R11
15823 LEAL (R12)(R11*1), R12
15824 JMP match_nolit_end_encodeSnappyBetterBlockAsm64K
15825
15826 matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K:
15827 CMPL R8, $0x04
15828 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
15829 MOVL (R9)(R12*1), R11
15830 CMPL (R10)(R12*1), R11
15831 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K
15832 LEAL -4(R8), R8
15833 LEAL 4(R12), R12
15834
15835 matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K:
15836 CMPL R8, $0x01
15837 JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
15838 JB match_nolit_end_encodeSnappyBetterBlockAsm64K
15839 MOVW (R9)(R12*1), R11
15840 CMPW (R10)(R12*1), R11
15841 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K
15842 LEAL 2(R12), R12
15843 SUBL $0x02, R8
15844 JZ match_nolit_end_encodeSnappyBetterBlockAsm64K
15845
15846 matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K:
15847 MOVB (R9)(R12*1), R11
15848 CMPB (R10)(R12*1), R11
15849 JNE match_nolit_end_encodeSnappyBetterBlockAsm64K
15850 LEAL 1(R12), R12
15851
15852 match_nolit_end_encodeSnappyBetterBlockAsm64K:
15853 MOVL DX, R8
15854 SUBL SI, R8
15855
15856 // Check if repeat
15857 MOVL R8, 16(SP)
15858 MOVL 12(SP), SI
15859 CMPL SI, DI
15860 JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
15861 MOVL DI, R9
15862 MOVL DI, 12(SP)
15863 LEAQ (BX)(SI*1), R10
15864 SUBL SI, R9
15865 LEAL -1(R9), SI
15866 CMPL SI, $0x3c
15867 JB one_byte_match_emit_encodeSnappyBetterBlockAsm64K
15868 CMPL SI, $0x00000100
15869 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm64K
15870 JB three_bytes_match_emit_encodeSnappyBetterBlockAsm64K
15871
15872 three_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
15873 MOVB $0xf4, (CX)
15874 MOVW SI, 1(CX)
15875 ADDQ $0x03, CX
15876 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
15877
15878 two_bytes_match_emit_encodeSnappyBetterBlockAsm64K:
15879 MOVB $0xf0, (CX)
15880 MOVB SI, 1(CX)
15881 ADDQ $0x02, CX
15882 CMPL SI, $0x40
15883 JB memmove_match_emit_encodeSnappyBetterBlockAsm64K
15884 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K
15885
15886 one_byte_match_emit_encodeSnappyBetterBlockAsm64K:
15887 SHLB $0x02, SI
15888 MOVB SI, (CX)
15889 ADDQ $0x01, CX
15890
15891 memmove_match_emit_encodeSnappyBetterBlockAsm64K:
15892 LEAQ (CX)(R9*1), SI
15893
15894 // genMemMoveShort
15895 CMPQ R9, $0x08
15896 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8
15897 CMPQ R9, $0x10
15898 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
15899 CMPQ R9, $0x20
15900 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
15901 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
15902
15903 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8:
15904 MOVQ (R10), R11
15905 MOVQ R11, (CX)
15906 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
15907
15908 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
15909 MOVQ (R10), R11
15910 MOVQ -8(R10)(R9*1), R10
15911 MOVQ R11, (CX)
15912 MOVQ R10, -8(CX)(R9*1)
15913 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
15914
15915 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
15916 MOVOU (R10), X0
15917 MOVOU -16(R10)(R9*1), X1
15918 MOVOU X0, (CX)
15919 MOVOU X1, -16(CX)(R9*1)
15920 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K
15921
15922 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
15923 MOVOU (R10), X0
15924 MOVOU 16(R10), X1
15925 MOVOU -32(R10)(R9*1), X2
15926 MOVOU -16(R10)(R9*1), X3
15927 MOVOU X0, (CX)
15928 MOVOU X1, 16(CX)
15929 MOVOU X2, -32(CX)(R9*1)
15930 MOVOU X3, -16(CX)(R9*1)
15931
15932 memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K:
15933 MOVQ SI, CX
15934 JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K
15935
15936 memmove_long_match_emit_encodeSnappyBetterBlockAsm64K:
15937 LEAQ (CX)(R9*1), SI
15938
15939 // genMemMoveLong
15940 MOVOU (R10), X0
15941 MOVOU 16(R10), X1
15942 MOVOU -32(R10)(R9*1), X2
15943 MOVOU -16(R10)(R9*1), X3
15944 MOVQ R9, R13
15945 SHRQ $0x05, R13
15946 MOVQ CX, R11
15947 ANDL $0x0000001f, R11
15948 MOVQ $0x00000040, R14
15949 SUBQ R11, R14
15950 DECQ R13
15951 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
15952 LEAQ -32(R10)(R14*1), R11
15953 LEAQ -32(CX)(R14*1), R15
15954
15955 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
15956 MOVOU (R11), X4
15957 MOVOU 16(R11), X5
15958 MOVOA X4, (R15)
15959 MOVOA X5, 16(R15)
15960 ADDQ $0x20, R15
15961 ADDQ $0x20, R11
15962 ADDQ $0x20, R14
15963 DECQ R13
15964 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
15965
15966 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
15967 MOVOU -32(R10)(R14*1), X4
15968 MOVOU -16(R10)(R14*1), X5
15969 MOVOA X4, -32(CX)(R14*1)
15970 MOVOA X5, -16(CX)(R14*1)
15971 ADDQ $0x20, R14
15972 CMPQ R9, R14
15973 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
15974 MOVOU X0, (CX)
15975 MOVOU X1, 16(CX)
15976 MOVOU X2, -32(CX)(R9*1)
15977 MOVOU X3, -16(CX)(R9*1)
15978 MOVQ SI, CX
15979
15980 emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K:
15981 ADDL R12, DX
15982 ADDL $0x04, R12
15983 MOVL DX, 12(SP)
15984
15985 // emitCopy
15986 two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K:
15987 CMPL R12, $0x40
15988 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K
15989 MOVB $0xee, (CX)
15990 MOVW R8, 1(CX)
15991 LEAL -60(R12), R12
15992 ADDQ $0x03, CX
15993 JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K
15994
15995 two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K:
15996 MOVL R12, SI
15997 SHLL $0x02, SI
15998 CMPL R12, $0x0c
15999 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
16000 CMPL R8, $0x00000800
16001 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K
16002 LEAL -15(SI), SI
16003 MOVB R8, 1(CX)
16004 SHRL $0x08, R8
16005 SHLL $0x05, R8
16006 ORL R8, SI
16007 MOVB SI, (CX)
16008 ADDQ $0x02, CX
16009 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K
16010
16011 emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K:
16012 LEAL -2(SI), SI
16013 MOVB SI, (CX)
16014 MOVW R8, 1(CX)
16015 ADDQ $0x03, CX
16016
16017 match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K:
16018 CMPL DX, 8(SP)
16019 JAE emit_remainder_encodeSnappyBetterBlockAsm64K
16020 CMPQ CX, (SP)
16021 JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K
16022 MOVQ $0x00000000, ret+56(FP)
16023 RET
16024
16025 match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K:
16026 MOVQ $0x00cf1bbcdcbfa563, SI
16027 MOVQ $0x9e3779b1, R8
16028 LEAQ 1(DI), DI
16029 LEAQ -2(DX), R9
16030 MOVQ (BX)(DI*1), R10
16031 MOVQ 1(BX)(DI*1), R11
16032 MOVQ (BX)(R9*1), R12
16033 MOVQ 1(BX)(R9*1), R13
16034 SHLQ $0x08, R10
16035 IMULQ SI, R10
16036 SHRQ $0x30, R10
16037 SHLQ $0x20, R11
16038 IMULQ R8, R11
16039 SHRQ $0x33, R11
16040 SHLQ $0x08, R12
16041 IMULQ SI, R12
16042 SHRQ $0x30, R12
16043 SHLQ $0x20, R13
16044 IMULQ R8, R13
16045 SHRQ $0x33, R13
16046 LEAQ 1(DI), R8
16047 LEAQ 1(R9), R14
16048 MOVL DI, (AX)(R10*4)
16049 MOVL R9, (AX)(R12*4)
16050 MOVL R8, 262144(AX)(R11*4)
16051 MOVL R14, 262144(AX)(R13*4)
16052 LEAQ 1(R9)(DI*1), R8
16053 SHRQ $0x01, R8
16054 ADDQ $0x01, DI
16055 SUBQ $0x01, R9
16056
16057 index_loop_encodeSnappyBetterBlockAsm64K:
16058 CMPQ R8, R9
16059 JAE search_loop_encodeSnappyBetterBlockAsm64K
16060 MOVQ (BX)(DI*1), R10
16061 MOVQ (BX)(R8*1), R11
16062 SHLQ $0x08, R10
16063 IMULQ SI, R10
16064 SHRQ $0x30, R10
16065 SHLQ $0x08, R11
16066 IMULQ SI, R11
16067 SHRQ $0x30, R11
16068 MOVL DI, (AX)(R10*4)
16069 MOVL R8, (AX)(R11*4)
16070 ADDQ $0x02, DI
16071 ADDQ $0x02, R8
16072 JMP index_loop_encodeSnappyBetterBlockAsm64K
16073
16074 emit_remainder_encodeSnappyBetterBlockAsm64K:
16075 MOVQ src_len+32(FP), AX
16076 SUBL 12(SP), AX
16077 LEAQ 3(CX)(AX*1), AX
16078 CMPQ AX, (SP)
16079 JB emit_remainder_ok_encodeSnappyBetterBlockAsm64K
16080 MOVQ $0x00000000, ret+56(FP)
16081 RET
16082
16083 emit_remainder_ok_encodeSnappyBetterBlockAsm64K:
16084 MOVQ src_len+32(FP), AX
16085 MOVL 12(SP), DX
16086 CMPL DX, AX
16087 JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
16088 MOVL AX, SI
16089 MOVL AX, 12(SP)
16090 LEAQ (BX)(DX*1), AX
16091 SUBL DX, SI
16092 LEAL -1(SI), DX
16093 CMPL DX, $0x3c
16094 JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K
16095 CMPL DX, $0x00000100
16096 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
16097 JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K
16098
16099 three_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
16100 MOVB $0xf4, (CX)
16101 MOVW DX, 1(CX)
16102 ADDQ $0x03, CX
16103 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
16104
16105 two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K:
16106 MOVB $0xf0, (CX)
16107 MOVB DL, 1(CX)
16108 ADDQ $0x02, CX
16109 CMPL DX, $0x40
16110 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm64K
16111 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K
16112
16113 one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K:
16114 SHLB $0x02, DL
16115 MOVB DL, (CX)
16116 ADDQ $0x01, CX
16117
16118 memmove_emit_remainder_encodeSnappyBetterBlockAsm64K:
16119 LEAQ (CX)(SI*1), DX
16120 MOVL SI, BX
16121
16122 // genMemMoveShort
16123 CMPQ BX, $0x03
16124 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2
16125 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3
16126 CMPQ BX, $0x08
16127 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7
16128 CMPQ BX, $0x10
16129 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16
16130 CMPQ BX, $0x20
16131 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32
16132 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64
16133
16134 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2:
16135 MOVB (AX), SI
16136 MOVB -1(AX)(BX*1), AL
16137 MOVB SI, (CX)
16138 MOVB AL, -1(CX)(BX*1)
16139 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
16140
16141 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3:
16142 MOVW (AX), SI
16143 MOVB 2(AX), AL
16144 MOVW SI, (CX)
16145 MOVB AL, 2(CX)
16146 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
16147
16148 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7:
16149 MOVL (AX), SI
16150 MOVL -4(AX)(BX*1), AX
16151 MOVL SI, (CX)
16152 MOVL AX, -4(CX)(BX*1)
16153 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
16154
16155 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16:
16156 MOVQ (AX), SI
16157 MOVQ -8(AX)(BX*1), AX
16158 MOVQ SI, (CX)
16159 MOVQ AX, -8(CX)(BX*1)
16160 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
16161
16162 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32:
16163 MOVOU (AX), X0
16164 MOVOU -16(AX)(BX*1), X1
16165 MOVOU X0, (CX)
16166 MOVOU X1, -16(CX)(BX*1)
16167 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K
16168
16169 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64:
16170 MOVOU (AX), X0
16171 MOVOU 16(AX), X1
16172 MOVOU -32(AX)(BX*1), X2
16173 MOVOU -16(AX)(BX*1), X3
16174 MOVOU X0, (CX)
16175 MOVOU X1, 16(CX)
16176 MOVOU X2, -32(CX)(BX*1)
16177 MOVOU X3, -16(CX)(BX*1)
16178
16179 memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K:
16180 MOVQ DX, CX
16181 JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K
16182
16183 memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K:
16184 LEAQ (CX)(SI*1), DX
16185 MOVL SI, BX
16186
16187 // genMemMoveLong
16188 MOVOU (AX), X0
16189 MOVOU 16(AX), X1
16190 MOVOU -32(AX)(BX*1), X2
16191 MOVOU -16(AX)(BX*1), X3
16192 MOVQ BX, DI
16193 SHRQ $0x05, DI
16194 MOVQ CX, SI
16195 ANDL $0x0000001f, SI
16196 MOVQ $0x00000040, R8
16197 SUBQ SI, R8
16198 DECQ DI
16199 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
16200 LEAQ -32(AX)(R8*1), SI
16201 LEAQ -32(CX)(R8*1), R9
16202
16203 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back:
16204 MOVOU (SI), X4
16205 MOVOU 16(SI), X5
16206 MOVOA X4, (R9)
16207 MOVOA X5, 16(R9)
16208 ADDQ $0x20, R9
16209 ADDQ $0x20, SI
16210 ADDQ $0x20, R8
16211 DECQ DI
16212 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back
16213
16214 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32:
16215 MOVOU -32(AX)(R8*1), X4
16216 MOVOU -16(AX)(R8*1), X5
16217 MOVOA X4, -32(CX)(R8*1)
16218 MOVOA X5, -16(CX)(R8*1)
16219 ADDQ $0x20, R8
16220 CMPQ BX, R8
16221 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32
16222 MOVOU X0, (CX)
16223 MOVOU X1, 16(CX)
16224 MOVOU X2, -32(CX)(BX*1)
16225 MOVOU X3, -16(CX)(BX*1)
16226 MOVQ DX, CX
16227
16228 emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K:
16229 MOVQ dst_base+0(FP), AX
16230 SUBQ AX, CX
16231 MOVQ CX, ret+56(FP)
16232 RET
16233
16234 // func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte, tmp *[81920]byte) int
16235 // Requires: BMI, SSE2
//
// Compresses src into dst, using tmp as two hash tables of earlier src
// positions, and returns the number of bytes written to dst. Returns 0 as soon
// as the dst write cursor would reach the limit stored in 0(SP).
//
// tmp layout, as indexed below:
//   tmp+0      "long" table:  16384 uint32 entries, 14-bit hash of 6 input bytes
//   tmp+65536  "short" table:  4096 uint32 entries, 12-bit hash of 4 input bytes
//
// Stack slots:
//   0(SP)   dst limit pointer: dst_base + (src_len - 9 - src_len/32)
//   8(SP)   search limit: src_len - 8
//   12(SP)  src index of the first byte not yet emitted to dst
//   16(SP)  offset of the most recent match (stored, never read in this function)
//   20(SP)  next src position to try if the current one yields no match
//
// Register roles in the main loop:
//   AX = tmp base, BX = src base, CX = dst write cursor, DX = current src position.
//
// NOTE(review): nothing here checks that src_len is large enough for the
// 8-byte loads or for src_len - 9 to be non-negative; presumably the Go caller
// guarantees a minimum input size - confirm.
// NOTE(review): the tag bytes written (0xf0, 0xf4, 0xee, ...) look like the
// Snappy block-format literal/copy tags; this is inferred from the constants
// and the function name - confirm against the format description.
16236 TEXT ·encodeSnappyBetterBlockAsm12B(SB), $24-64
16237 MOVQ tmp+48(FP), AX // AX = tmp (hash table base)
16238 MOVQ dst_base+0(FP), CX // CX = dst write cursor
16239 MOVQ $0x00000280, DX // 0x280 iterations * 0x80 bytes = 81920 bytes = all of tmp
16240 MOVQ AX, BX
16241 PXOR X0, X0
16242
// Clear the whole of tmp, 128 bytes per iteration.
16243 zero_loop_encodeSnappyBetterBlockAsm12B:
16244 MOVOU X0, (BX)
16245 MOVOU X0, 16(BX)
16246 MOVOU X0, 32(BX)
16247 MOVOU X0, 48(BX)
16248 MOVOU X0, 64(BX)
16249 MOVOU X0, 80(BX)
16250 MOVOU X0, 96(BX)
16251 MOVOU X0, 112(BX)
16252 ADDQ $0x80, BX
16253 DECQ DX
16254 JNZ zero_loop_encodeSnappyBetterBlockAsm12B
16255 MOVL $0x00000000, 12(SP) // nothing emitted yet
16256 MOVQ src_len+32(FP), DX
16257 LEAQ -9(DX), BX
16258 LEAQ -8(DX), SI
16259 MOVL SI, 8(SP) // search limit = src_len - 8
16260 SHRQ $0x05, DX
16261 SUBL DX, BX // BX = src_len - 9 - src_len/32 (32-bit arithmetic)
16262 LEAQ (CX)(BX*1), BX
16263 MOVQ BX, (SP) // dst limit pointer
16264 MOVL $0x00000001, DX // searching starts at src position 1
16265 MOVL $0x00000000, 16(SP)
16266 MOVQ src_base+24(FP), BX // BX = src base
16267
// Main search loop: hash the bytes at DX, look up both tables, try to match.
16268 search_loop_encodeSnappyBetterBlockAsm12B:
16269 MOVL DX, SI
16270 SUBL 12(SP), SI
16271 SHRL $0x06, SI
16272 LEAL 1(DX)(SI*1), SI // next position = DX + 1 + (DX - 12(SP))/64: step grows with unmatched distance
16273 CMPL SI, 8(SP)
16274 JAE emit_remainder_encodeSnappyBetterBlockAsm12B // next position at or past the search limit: stop searching
16275 MOVQ (BX)(DX*1), DI // DI = 8 src bytes at the current position
16276 MOVL SI, 20(SP)
16277 MOVQ $0x0000cf1bbcdcbf9b, R9 // multiplier for the 6-byte hash
16278 MOVQ $0x9e3779b1, SI // multiplier for the 4-byte hash
16279 MOVQ DI, R10
16280 MOVQ DI, R11
16281 SHLQ $0x10, R10 // drop the top 2 bytes: hash covers the low 6 bytes
16282 IMULQ R9, R10
16283 SHRQ $0x32, R10 // keep the top 14 bits -> long table index
16284 SHLQ $0x20, R11 // drop the top 4 bytes: hash covers the low 4 bytes
16285 IMULQ SI, R11
16286 SHRQ $0x34, R11 // keep the top 12 bits -> short table index
16287 MOVL (AX)(R10*4), SI // SI = long-table candidate position
16288 MOVL 65536(AX)(R11*4), R8 // R8 = short-table candidate position
16289 MOVL DX, (AX)(R10*4) // record the current position in both tables
16290 MOVL DX, 65536(AX)(R11*4)
16291 MOVQ (BX)(SI*1), R10 // 8 bytes at the long candidate
16292 MOVQ (BX)(R8*1), R11 // 8 bytes at the short candidate
16293 CMPQ R10, DI
16294 JEQ candidate_match_encodeSnappyBetterBlockAsm12B // long candidate matches all 8 bytes
16295 CMPQ R11, DI
16296 JNE no_short_found_encodeSnappyBetterBlockAsm12B
16297 MOVL R8, SI // short candidate matches all 8 bytes: use it
16298 JMP candidate_match_encodeSnappyBetterBlockAsm12B
16299
// Neither candidate matched 8 bytes; fall back to 4-byte comparisons.
16300 no_short_found_encodeSnappyBetterBlockAsm12B:
16301 CMPL R10, DI
16302 JEQ candidate_match_encodeSnappyBetterBlockAsm12B // long candidate matches 4 bytes
16303 CMPL R11, DI
16304 JEQ candidateS_match_encodeSnappyBetterBlockAsm12B // short candidate matches 4 bytes
16305 MOVL 20(SP), DX // no match: advance to the precomputed next position
16306 JMP search_loop_encodeSnappyBetterBlockAsm12B
16307
// Short candidate matched 4 bytes. Before using it, check whether the long
// table has a 4-byte match for position DX+1; if so prefer that one.
16308 candidateS_match_encodeSnappyBetterBlockAsm12B:
16309 SHRQ $0x08, DI // DI = the src bytes starting at DX+1
16310 MOVQ DI, R10
16311 SHLQ $0x10, R10
16312 IMULQ R9, R10
16313 SHRQ $0x32, R10 // long-table index for position DX+1
16314 MOVL (AX)(R10*4), SI
16315 INCL DX
16316 MOVL DX, (AX)(R10*4) // record DX+1 in the long table
16317 CMPL (BX)(SI*1), DI
16318 JEQ candidate_match_encodeSnappyBetterBlockAsm12B // long match at DX+1: keep the incremented DX
16319 DECL DX // otherwise go back to DX and use the short candidate
16320 MOVL R8, SI
16321
// DX = match position, SI = candidate position. Extend the match backwards.
16322 candidate_match_encodeSnappyBetterBlockAsm12B:
16323 MOVL 12(SP), DI // DI = lower bound for DX (first unemitted byte)
16324 TESTL SI, SI
16325 JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B // candidate at src[0]: nothing before it
16326
16327 match_extend_back_loop_encodeSnappyBetterBlockAsm12B:
16328 CMPL DX, DI
16329 JBE match_extend_back_end_encodeSnappyBetterBlockAsm12B // do not extend into already-emitted bytes
16330 MOVB -1(BX)(SI*1), R8
16331 MOVB -1(BX)(DX*1), R9
16332 CMPB R8, R9
16333 JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B
16334 LEAL -1(DX), DX // LEAL keeps the flags untouched; the DECL below sets ZF for the JZ
16335 DECL SI
16336 JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B
16337 JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B
16338
// Bail out with 0 if cursor + pending literal bytes + 3 reaches the dst limit.
16339 match_extend_back_end_encodeSnappyBetterBlockAsm12B:
16340 MOVL DX, DI
16341 SUBL 12(SP), DI // DI = number of pending literal bytes
16342 LEAQ 3(CX)(DI*1), DI
16343 CMPQ DI, (SP)
16344 JB match_dst_size_check_encodeSnappyBetterBlockAsm12B
16345 MOVQ $0x00000000, ret+56(FP)
16346 RET
16347
16348 match_dst_size_check_encodeSnappyBetterBlockAsm12B:
16349 MOVL DX, DI // DI = match start (kept until the re-index step)
16350 ADDL $0x04, DX // skip the 4 bytes already known to match
16351 ADDL $0x04, SI
16352 MOVQ src_len+32(FP), R8
16353 SUBL DX, R8 // R8 = src bytes remaining after DX
16354 LEAQ (BX)(DX*1), R9 // R9 = &src[DX]
16355 LEAQ (BX)(SI*1), R10 // R10 = &src[SI]
16356
// R12 counts how many further bytes at R9 and R10 are equal (at most R8).
16357 // matchLen
16358 XORL R12, R12
16359
16360 matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B:
16361 CMPL R8, $0x10
16362 JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B
16363 MOVQ (R9)(R12*1), R11
16364 MOVQ 8(R9)(R12*1), R13
16365 XORQ (R10)(R12*1), R11 // non-zero result marks the differing bits
16366 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B
16367 XORQ 8(R10)(R12*1), R13
16368 JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B
16369 LEAL -16(R8), R8
16370 LEAL 16(R12), R12
16371 JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B
16372
// Mismatch in the second 8 bytes: lowest set bit index / 8 = equal bytes in that word.
16373 matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B:
16374 #ifdef GOAMD64_v3
16375 TZCNTQ R13, R13
16376
16377 #else
16378 BSFQ R13, R13
16379
16380 #endif
16381 SARQ $0x03, R13
16382 LEAL 8(R12)(R13*1), R12 // + 8 for the first word, which matched fully
16383 JMP match_nolit_end_encodeSnappyBetterBlockAsm12B
16384
16385 matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B:
16386 CMPL R8, $0x08
16387 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B
16388 MOVQ (R9)(R12*1), R11
16389 XORQ (R10)(R12*1), R11
16390 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B
16391 LEAL -8(R8), R8
16392 LEAL 8(R12), R12
16393 JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B
16394
// Mismatch inside an 8-byte word: lowest set bit index / 8 = equal bytes.
16395 matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B:
16396 #ifdef GOAMD64_v3
16397 TZCNTQ R11, R11
16398
16399 #else
16400 BSFQ R11, R11
16401
16402 #endif
16403 SARQ $0x03, R11
16404 LEAL (R12)(R11*1), R12
16405 JMP match_nolit_end_encodeSnappyBetterBlockAsm12B
16406
16407 matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B:
16408 CMPL R8, $0x04
16409 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
16410 MOVL (R9)(R12*1), R11
16411 CMPL (R10)(R12*1), R11
16412 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B
16413 LEAL -4(R8), R8
16414 LEAL 4(R12), R12
16415
16416 matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B:
16417 CMPL R8, $0x01
16418 JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B // exactly 1 byte left
16419 JB match_nolit_end_encodeSnappyBetterBlockAsm12B // 0 bytes left
16420 MOVW (R9)(R12*1), R11
16421 CMPW (R10)(R12*1), R11
16422 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B
16423 LEAL 2(R12), R12
16424 SUBL $0x02, R8
16425 JZ match_nolit_end_encodeSnappyBetterBlockAsm12B
16426
16427 matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B:
16428 MOVB (R9)(R12*1), R11
16429 CMPB (R10)(R12*1), R11
16430 JNE match_nolit_end_encodeSnappyBetterBlockAsm12B
16431 LEAL 1(R12), R12
16432
// R12 = extra match length beyond the first 4 bytes.
16433 match_nolit_end_encodeSnappyBetterBlockAsm12B:
16434 MOVL DX, R8
16435 SUBL SI, R8 // R8 = match offset (distance back from DX to the candidate)
16436
16437 // Check if repeat
16438 MOVL R8, 16(SP) // store the offset; no repeat handling follows in this variant
16439 MOVL 12(SP), SI
16440 CMPL SI, DI
16441 JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B // no pending literal bytes
16442 MOVL DI, R9
16443 MOVL DI, 12(SP)
16444 LEAQ (BX)(SI*1), R10 // R10 = start of the pending literal bytes in src
16445 SUBL SI, R9 // R9 = literal length
16446 LEAL -1(R9), SI // SI = length - 1, the value written into the tag
16447 CMPL SI, $0x3c
16448 JB one_byte_match_emit_encodeSnappyBetterBlockAsm12B // length-1 < 60: fits in the tag byte
16449 CMPL SI, $0x00000100
16450 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm12B // length-1 < 256: tag + 1 length byte
16451 JB three_bytes_match_emit_encodeSnappyBetterBlockAsm12B // flags unchanged since the CMPL, so never taken; falls through to the same label
16452
16453 three_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
16454 MOVB $0xf4, (CX) // tag 0xf4 (61<<2), followed by 16-bit length-1
16455 MOVW SI, 1(CX)
16456 ADDQ $0x03, CX
16457 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
16458
16459 two_bytes_match_emit_encodeSnappyBetterBlockAsm12B:
16460 MOVB $0xf0, (CX) // tag 0xf0 (60<<2), followed by 8-bit length-1
16461 MOVB SI, 1(CX)
16462 ADDQ $0x02, CX
16463 CMPL SI, $0x40
16464 JB memmove_match_emit_encodeSnappyBetterBlockAsm12B // length <= 64: short copy
16465 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B
16466
16467 one_byte_match_emit_encodeSnappyBetterBlockAsm12B:
16468 SHLB $0x02, SI // tag = (length-1) << 2
16469 MOVB SI, (CX)
16470 ADDQ $0x01, CX
16471
// Copy R9 (<= 64) literal bytes from R10 to CX; SI = dst cursor after the copy.
16472 memmove_match_emit_encodeSnappyBetterBlockAsm12B:
16473 LEAQ (CX)(R9*1), SI
16474
16475 // genMemMoveShort
16476 CMPQ R9, $0x08
16477 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8
16478 CMPQ R9, $0x10
16479 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
16480 CMPQ R9, $0x20
16481 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
16482 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
16483
// Lengths 1..8 all use one 8-byte load/store; CX still advances by exactly R9.
// NOTE(review): this reads and writes up to 7 bytes beyond the literal;
// presumably covered by the src_len-8 search limit and the dst limit - confirm.
16484 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8:
16485 MOVQ (R10), R11
16486 MOVQ R11, (CX)
16487 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
16488
// Larger sizes copy the first and last N bytes; the two copies may overlap.
16489 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
16490 MOVQ (R10), R11
16491 MOVQ -8(R10)(R9*1), R10
16492 MOVQ R11, (CX)
16493 MOVQ R10, -8(CX)(R9*1)
16494 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
16495
16496 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
16497 MOVOU (R10), X0
16498 MOVOU -16(R10)(R9*1), X1
16499 MOVOU X0, (CX)
16500 MOVOU X1, -16(CX)(R9*1)
16501 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B
16502
16503 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
16504 MOVOU (R10), X0
16505 MOVOU 16(R10), X1
16506 MOVOU -32(R10)(R9*1), X2
16507 MOVOU -16(R10)(R9*1), X3
16508 MOVOU X0, (CX)
16509 MOVOU X1, 16(CX)
16510 MOVOU X2, -32(CX)(R9*1)
16511 MOVOU X3, -16(CX)(R9*1)
16512
16513 memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B:
16514 MOVQ SI, CX // advance the dst cursor past the literal bytes
16515 JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B
16516
// Copy R9 (>= 64) literal bytes from R10 to CX; SI = dst cursor after the copy.
16517 memmove_long_match_emit_encodeSnappyBetterBlockAsm12B:
16518 LEAQ (CX)(R9*1), SI
16519
16520 // genMemMoveLong
16521 MOVOU (R10), X0 // save the first 32 and last 32 source bytes; they are
16522 MOVOU 16(R10), X1 // stored last, covering the unaligned head and tail
16523 MOVOU -32(R10)(R9*1), X2
16524 MOVOU -16(R10)(R9*1), X3
16525 MOVQ R9, R13
16526 SHRQ $0x05, R13 // R13 = length / 32
16527 MOVQ CX, R11
16528 ANDL $0x0000001f, R11 // R11 = CX mod 32
16529 MOVQ $0x00000040, R14
16530 SUBQ R11, R14 // R14 = 64 - (CX mod 32), so CX + R14 - 32 is 32-byte aligned
16531 DECQ R13 // DECQ leaves CF as set by the SUBQ above
16532 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
16533 LEAQ -32(R10)(R14*1), R11
16534 LEAQ -32(CX)(R14*1), R15
16535
// 32 bytes per iteration: unaligned loads, aligned (MOVOA) stores.
16536 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
16537 MOVOU (R11), X4
16538 MOVOU 16(R11), X5
16539 MOVOA X4, (R15)
16540 MOVOA X5, 16(R15)
16541 ADDQ $0x20, R15
16542 ADDQ $0x20, R11
16543 ADDQ $0x20, R14
16544 DECQ R13
16545 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
16546
// Copy 32-byte chunks at offset R14-32 while R9 >= R14, then store head and tail.
16547 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
16548 MOVOU -32(R10)(R14*1), X4
16549 MOVOU -16(R10)(R14*1), X5
16550 MOVOA X4, -32(CX)(R14*1)
16551 MOVOA X5, -16(CX)(R14*1)
16552 ADDQ $0x20, R14
16553 CMPQ R9, R14
16554 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
16555 MOVOU X0, (CX)
16556 MOVOU X1, 16(CX)
16557 MOVOU X2, -32(CX)(R9*1)
16558 MOVOU X3, -16(CX)(R9*1)
16559 MOVQ SI, CX
16560
16561 emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B:
16562 ADDL R12, DX // DX = src position just past the match
16563 ADDL $0x04, R12 // R12 = total match length (extra + the initial 4)
16564 MOVL DX, 12(SP) // everything up to DX counts as emitted
16565
// Emit copy operations for length R12 at offset R8.
16566 // emitCopy
16567 two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B:
16568 CMPL R12, $0x40
16569 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B
16570 MOVB $0xee, (CX) // length > 64: tag 0xee (59<<2 | 2) + 16-bit offset, i.e. a 60-byte chunk
16571 MOVW R8, 1(CX)
16572 LEAL -60(R12), R12
16573 ADDQ $0x03, CX
16574 JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B
16575
16576 two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B:
16577 MOVL R12, SI
16578 SHLL $0x02, SI // SI = length << 2
16579 CMPL R12, $0x0c
16580 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B // length >= 12: 3-byte form
16581 CMPL R8, $0x00000800
16582 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B // offset >= 2048: 3-byte form
16583 LEAL -15(SI), SI // 2-byte form: tag = ((length-4) << 2) | 1
16584 MOVB R8, 1(CX) // low 8 bits of the offset
16585 SHRL $0x08, R8
16586 SHLL $0x05, R8 // offset bits 8 and up go into tag bits 5 and up
16587 ORL R8, SI
16588 MOVB SI, (CX)
16589 ADDQ $0x02, CX
16590 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B
16591
16592 emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B:
16593 LEAL -2(SI), SI // 3-byte form: tag = ((length-1) << 2) | 2
16594 MOVB SI, (CX)
16595 MOVW R8, 1(CX) // 16-bit offset
16596 ADDQ $0x03, CX
16597
16598 match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B:
16599 CMPL DX, 8(SP)
16600 JAE emit_remainder_encodeSnappyBetterBlockAsm12B // at or past the search limit: flush the tail
16601 CMPQ CX, (SP)
16602 JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B
16603 MOVQ $0x00000000, ret+56(FP) // dst cursor reached the limit: return 0
16604 RET
16605
// Add positions inside the match to the tables. DI is still the match start
// and DX the position just past the match.
16606 match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B:
16607 MOVQ $0x0000cf1bbcdcbf9b, SI // 6-byte hash multiplier
16608 MOVQ $0x9e3779b1, R8 // 4-byte hash multiplier
16609 LEAQ 1(DI), DI // DI = match start + 1
16610 LEAQ -2(DX), R9 // R9 = match end - 2
16611 MOVQ (BX)(DI*1), R10
16612 MOVQ 1(BX)(DI*1), R11
16613 MOVQ (BX)(R9*1), R12
16614 MOVQ 1(BX)(R9*1), R13
16615 SHLQ $0x10, R10
16616 IMULQ SI, R10
16617 SHRQ $0x32, R10 // long hash at DI
16618 SHLQ $0x20, R11
16619 IMULQ R8, R11
16620 SHRQ $0x34, R11 // short hash at DI+1
16621 SHLQ $0x10, R12
16622 IMULQ SI, R12
16623 SHRQ $0x32, R12 // long hash at R9
16624 SHLQ $0x20, R13
16625 IMULQ R8, R13
16626 SHRQ $0x34, R13 // short hash at R9+1
16627 LEAQ 1(DI), R8
16628 LEAQ 1(R9), R14
16629 MOVL DI, (AX)(R10*4)
16630 MOVL R9, (AX)(R12*4)
16631 MOVL R8, 65536(AX)(R11*4)
16632 MOVL R14, 65536(AX)(R13*4)
16633 LEAQ 1(R9)(DI*1), R8
16634 SHRQ $0x01, R8 // R8 = (DI + R9 + 1) / 2, the midpoint
16635 ADDQ $0x01, DI
16636 SUBQ $0x01, R9
16637
// Add long-table entries at DI and R8, stepping both by 2, until R8 >= R9.
16638 index_loop_encodeSnappyBetterBlockAsm12B:
16639 CMPQ R8, R9
16640 JAE search_loop_encodeSnappyBetterBlockAsm12B
16641 MOVQ (BX)(DI*1), R10
16642 MOVQ (BX)(R8*1), R11
16643 SHLQ $0x10, R10
16644 IMULQ SI, R10
16645 SHRQ $0x32, R10
16646 SHLQ $0x10, R11
16647 IMULQ SI, R11
16648 SHRQ $0x32, R11
16649 MOVL DI, (AX)(R10*4)
16650 MOVL R8, (AX)(R11*4)
16651 ADDQ $0x02, DI
16652 ADDQ $0x02, R8
16653 JMP index_loop_encodeSnappyBetterBlockAsm12B
16654
// Searching is finished: emit src[12(SP):src_len] as one final literal.
16655 emit_remainder_encodeSnappyBetterBlockAsm12B:
16656 MOVQ src_len+32(FP), AX
16657 SUBL 12(SP), AX // AX = remaining literal bytes
16658 LEAQ 3(CX)(AX*1), AX
16659 CMPQ AX, (SP)
16660 JB emit_remainder_ok_encodeSnappyBetterBlockAsm12B
16661 MOVQ $0x00000000, ret+56(FP) // cursor + remaining + 3 reaches the dst limit: return 0
16662 RET
16663
16664 emit_remainder_ok_encodeSnappyBetterBlockAsm12B:
16665 MOVQ src_len+32(FP), AX
16666 MOVL 12(SP), DX
16667 CMPL DX, AX
16668 JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B // nothing left to emit
16669 MOVL AX, SI
16670 MOVL AX, 12(SP)
16671 LEAQ (BX)(DX*1), AX // AX = start of the remaining bytes in src
16672 SUBL DX, SI // SI = literal length
16673 LEAL -1(SI), DX // DX = length - 1, the value written into the tag
16674 CMPL DX, $0x3c
16675 JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B
16676 CMPL DX, $0x00000100
16677 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B
16678 JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B // flags unchanged since the CMPL, so never taken; falls through to the same label
16679
16680 three_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
16681 MOVB $0xf4, (CX) // tag 0xf4 (61<<2), followed by 16-bit length-1
16682 MOVW DX, 1(CX)
16683 ADDQ $0x03, CX
16684 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
16685
16686 two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B:
16687 MOVB $0xf0, (CX) // tag 0xf0 (60<<2), followed by 8-bit length-1
16688 MOVB DL, 1(CX)
16689 ADDQ $0x02, CX
16690 CMPL DX, $0x40
16691 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm12B
16692 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B
16693
16694 one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B:
16695 SHLB $0x02, DL // tag = (length-1) << 2
16696 MOVB DL, (CX)
16697 ADDQ $0x01, CX
16698
// Copy SI (<= 64) bytes from AX to CX. Unlike the match-emit copy above, this
// one dispatches on the exact size and does not copy past the end.
16699 memmove_emit_remainder_encodeSnappyBetterBlockAsm12B:
16700 LEAQ (CX)(SI*1), DX // DX = dst cursor after the copy
16701 MOVL SI, BX // BX = length (the src base in BX is no longer used)
16702
16703 // genMemMoveShort
16704 CMPQ BX, $0x03
16705 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2
16706 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3
16707 CMPQ BX, $0x08
16708 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7
16709 CMPQ BX, $0x10
16710 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16
16711 CMPQ BX, $0x20
16712 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32
16713 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64
16714
// First byte and last byte (the same byte when the length is 1).
16715 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2:
16716 MOVB (AX), SI
16717 MOVB -1(AX)(BX*1), AL
16718 MOVB SI, (CX)
16719 MOVB AL, -1(CX)(BX*1)
16720 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
16721
16722 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3:
16723 MOVW (AX), SI
16724 MOVB 2(AX), AL
16725 MOVW SI, (CX)
16726 MOVB AL, 2(CX)
16727 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
16728
// The remaining cases copy the first and last N bytes; the copies may overlap.
16729 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7:
16730 MOVL (AX), SI
16731 MOVL -4(AX)(BX*1), AX
16732 MOVL SI, (CX)
16733 MOVL AX, -4(CX)(BX*1)
16734 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
16735
16736 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16:
16737 MOVQ (AX), SI
16738 MOVQ -8(AX)(BX*1), AX
16739 MOVQ SI, (CX)
16740 MOVQ AX, -8(CX)(BX*1)
16741 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
16742
16743 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32:
16744 MOVOU (AX), X0
16745 MOVOU -16(AX)(BX*1), X1
16746 MOVOU X0, (CX)
16747 MOVOU X1, -16(CX)(BX*1)
16748 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B
16749
16750 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64:
16751 MOVOU (AX), X0
16752 MOVOU 16(AX), X1
16753 MOVOU -32(AX)(BX*1), X2
16754 MOVOU -16(AX)(BX*1), X3
16755 MOVOU X0, (CX)
16756 MOVOU X1, 16(CX)
16757 MOVOU X2, -32(CX)(BX*1)
16758 MOVOU X3, -16(CX)(BX*1)
16759
16760 memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B:
16761 MOVQ DX, CX // advance the dst cursor past the literal bytes
16762 JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B
16763
// Copy SI (>= 64) bytes from AX to CX; same scheme as the long copy above.
16764 memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B:
16765 LEAQ (CX)(SI*1), DX // DX = dst cursor after the copy
16766 MOVL SI, BX // BX = length
16767
16768 // genMemMoveLong
16769 MOVOU (AX), X0 // save the first 32 and last 32 source bytes; stored last
16770 MOVOU 16(AX), X1
16771 MOVOU -32(AX)(BX*1), X2
16772 MOVOU -16(AX)(BX*1), X3
16773 MOVQ BX, DI
16774 SHRQ $0x05, DI // DI = length / 32
16775 MOVQ CX, SI
16776 ANDL $0x0000001f, SI // SI = CX mod 32
16777 MOVQ $0x00000040, R8
16778 SUBQ SI, R8 // R8 = 64 - (CX mod 32), so CX + R8 - 32 is 32-byte aligned
16779 DECQ DI // DECQ leaves CF as set by the SUBQ above
16780 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
16781 LEAQ -32(AX)(R8*1), SI
16782 LEAQ -32(CX)(R8*1), R9
16783
// 32 bytes per iteration: unaligned loads, aligned (MOVOA) stores.
16784 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back:
16785 MOVOU (SI), X4
16786 MOVOU 16(SI), X5
16787 MOVOA X4, (R9)
16788 MOVOA X5, 16(R9)
16789 ADDQ $0x20, R9
16790 ADDQ $0x20, SI
16791 ADDQ $0x20, R8
16792 DECQ DI
16793 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back
16794
// Copy 32-byte chunks at offset R8-32 while BX >= R8, then store head and tail.
16795 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32:
16796 MOVOU -32(AX)(R8*1), X4
16797 MOVOU -16(AX)(R8*1), X5
16798 MOVOA X4, -32(CX)(R8*1)
16799 MOVOA X5, -16(CX)(R8*1)
16800 ADDQ $0x20, R8
16801 CMPQ BX, R8
16802 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32
16803 MOVOU X0, (CX)
16804 MOVOU X1, 16(CX)
16805 MOVOU X2, -32(CX)(BX*1)
16806 MOVOU X3, -16(CX)(BX*1)
16807 MOVQ DX, CX
16808
// Return the number of bytes written: dst cursor minus dst base.
16809 emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B:
16810 MOVQ dst_base+0(FP), AX
16811 SUBQ AX, CX
16812 MOVQ CX, ret+56(FP)
16813 RET
16814
16815 // func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte, tmp *[20480]byte) int
16816 // Requires: BMI, SSE2
16817 TEXT ·encodeSnappyBetterBlockAsm10B(SB), $24-64
16818 MOVQ tmp+48(FP), AX
16819 MOVQ dst_base+0(FP), CX
16820 MOVQ $0x000000a0, DX
16821 MOVQ AX, BX
16822 PXOR X0, X0
16823
16824 zero_loop_encodeSnappyBetterBlockAsm10B:
16825 MOVOU X0, (BX)
16826 MOVOU X0, 16(BX)
16827 MOVOU X0, 32(BX)
16828 MOVOU X0, 48(BX)
16829 MOVOU X0, 64(BX)
16830 MOVOU X0, 80(BX)
16831 MOVOU X0, 96(BX)
16832 MOVOU X0, 112(BX)
16833 ADDQ $0x80, BX
16834 DECQ DX
16835 JNZ zero_loop_encodeSnappyBetterBlockAsm10B
16836 MOVL $0x00000000, 12(SP)
16837 MOVQ src_len+32(FP), DX
16838 LEAQ -9(DX), BX
16839 LEAQ -8(DX), SI
16840 MOVL SI, 8(SP)
16841 SHRQ $0x05, DX
16842 SUBL DX, BX
16843 LEAQ (CX)(BX*1), BX
16844 MOVQ BX, (SP)
16845 MOVL $0x00000001, DX
16846 MOVL $0x00000000, 16(SP)
16847 MOVQ src_base+24(FP), BX
16848
16849 search_loop_encodeSnappyBetterBlockAsm10B:
16850 MOVL DX, SI
16851 SUBL 12(SP), SI
16852 SHRL $0x05, SI
16853 LEAL 1(DX)(SI*1), SI
16854 CMPL SI, 8(SP)
16855 JAE emit_remainder_encodeSnappyBetterBlockAsm10B
16856 MOVQ (BX)(DX*1), DI
16857 MOVL SI, 20(SP)
16858 MOVQ $0x0000cf1bbcdcbf9b, R9
16859 MOVQ $0x9e3779b1, SI
16860 MOVQ DI, R10
16861 MOVQ DI, R11
16862 SHLQ $0x10, R10
16863 IMULQ R9, R10
16864 SHRQ $0x34, R10
16865 SHLQ $0x20, R11
16866 IMULQ SI, R11
16867 SHRQ $0x36, R11
16868 MOVL (AX)(R10*4), SI
16869 MOVL 16384(AX)(R11*4), R8
16870 MOVL DX, (AX)(R10*4)
16871 MOVL DX, 16384(AX)(R11*4)
16872 MOVQ (BX)(SI*1), R10
16873 MOVQ (BX)(R8*1), R11
16874 CMPQ R10, DI
16875 JEQ candidate_match_encodeSnappyBetterBlockAsm10B
16876 CMPQ R11, DI
16877 JNE no_short_found_encodeSnappyBetterBlockAsm10B
16878 MOVL R8, SI
16879 JMP candidate_match_encodeSnappyBetterBlockAsm10B
16880
16881 no_short_found_encodeSnappyBetterBlockAsm10B:
16882 CMPL R10, DI
16883 JEQ candidate_match_encodeSnappyBetterBlockAsm10B
16884 CMPL R11, DI
16885 JEQ candidateS_match_encodeSnappyBetterBlockAsm10B
16886 MOVL 20(SP), DX
16887 JMP search_loop_encodeSnappyBetterBlockAsm10B
16888
16889 candidateS_match_encodeSnappyBetterBlockAsm10B:
16890 SHRQ $0x08, DI
16891 MOVQ DI, R10
16892 SHLQ $0x10, R10
16893 IMULQ R9, R10
16894 SHRQ $0x34, R10
16895 MOVL (AX)(R10*4), SI
16896 INCL DX
16897 MOVL DX, (AX)(R10*4)
16898 CMPL (BX)(SI*1), DI
16899 JEQ candidate_match_encodeSnappyBetterBlockAsm10B
16900 DECL DX
16901 MOVL R8, SI
16902
16903 candidate_match_encodeSnappyBetterBlockAsm10B:
16904 MOVL 12(SP), DI
16905 TESTL SI, SI
16906 JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
16907
16908 match_extend_back_loop_encodeSnappyBetterBlockAsm10B:
16909 CMPL DX, DI
16910 JBE match_extend_back_end_encodeSnappyBetterBlockAsm10B
16911 MOVB -1(BX)(SI*1), R8
16912 MOVB -1(BX)(DX*1), R9
16913 CMPB R8, R9
16914 JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B
16915 LEAL -1(DX), DX
16916 DECL SI
16917 JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B
16918 JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B
16919
16920 match_extend_back_end_encodeSnappyBetterBlockAsm10B:
16921 MOVL DX, DI
16922 SUBL 12(SP), DI
16923 LEAQ 3(CX)(DI*1), DI
16924 CMPQ DI, (SP)
16925 JB match_dst_size_check_encodeSnappyBetterBlockAsm10B
16926 MOVQ $0x00000000, ret+56(FP)
16927 RET
16928
16929 match_dst_size_check_encodeSnappyBetterBlockAsm10B:
16930 MOVL DX, DI
16931 ADDL $0x04, DX
16932 ADDL $0x04, SI
16933 MOVQ src_len+32(FP), R8
16934 SUBL DX, R8
16935 LEAQ (BX)(DX*1), R9
16936 LEAQ (BX)(SI*1), R10
16937
16938 // matchLen
16939 XORL R12, R12
16940
16941 matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B:
16942 CMPL R8, $0x10
16943 JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B
16944 MOVQ (R9)(R12*1), R11
16945 MOVQ 8(R9)(R12*1), R13
16946 XORQ (R10)(R12*1), R11
16947 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B
16948 XORQ 8(R10)(R12*1), R13
16949 JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B
16950 LEAL -16(R8), R8
16951 LEAL 16(R12), R12
16952 JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B
16953
16954 matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B:
16955 #ifdef GOAMD64_v3
16956 TZCNTQ R13, R13
16957
16958 #else
16959 BSFQ R13, R13
16960
16961 #endif
16962 SARQ $0x03, R13
16963 LEAL 8(R12)(R13*1), R12
16964 JMP match_nolit_end_encodeSnappyBetterBlockAsm10B
16965
16966 matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B:
16967 CMPL R8, $0x08
16968 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B
16969 MOVQ (R9)(R12*1), R11
16970 XORQ (R10)(R12*1), R11
16971 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B
16972 LEAL -8(R8), R8
16973 LEAL 8(R12), R12
16974 JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B
16975
16976 matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B:
16977 #ifdef GOAMD64_v3
16978 TZCNTQ R11, R11
16979
16980 #else
16981 BSFQ R11, R11
16982
16983 #endif
16984 SARQ $0x03, R11
16985 LEAL (R12)(R11*1), R12
16986 JMP match_nolit_end_encodeSnappyBetterBlockAsm10B
16987
16988 matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B:
16989 CMPL R8, $0x04
16990 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
16991 MOVL (R9)(R12*1), R11
16992 CMPL (R10)(R12*1), R11
16993 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B
16994 LEAL -4(R8), R8
16995 LEAL 4(R12), R12
16996
16997 matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B:
16998 CMPL R8, $0x01
16999 JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
17000 JB match_nolit_end_encodeSnappyBetterBlockAsm10B
17001 MOVW (R9)(R12*1), R11
17002 CMPW (R10)(R12*1), R11
17003 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B
17004 LEAL 2(R12), R12
17005 SUBL $0x02, R8
17006 JZ match_nolit_end_encodeSnappyBetterBlockAsm10B
17007
17008 matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B:
17009 MOVB (R9)(R12*1), R11
17010 CMPB (R10)(R12*1), R11
17011 JNE match_nolit_end_encodeSnappyBetterBlockAsm10B
17012 LEAL 1(R12), R12
17013
17014 match_nolit_end_encodeSnappyBetterBlockAsm10B:
17015 MOVL DX, R8
17016 SUBL SI, R8
17017
17018 // Check if repeat
17019 MOVL R8, 16(SP)
17020 MOVL 12(SP), SI
17021 CMPL SI, DI
17022 JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
17023 MOVL DI, R9
17024 MOVL DI, 12(SP)
17025 LEAQ (BX)(SI*1), R10
17026 SUBL SI, R9
17027 LEAL -1(R9), SI
17028 CMPL SI, $0x3c
17029 JB one_byte_match_emit_encodeSnappyBetterBlockAsm10B
17030 CMPL SI, $0x00000100
17031 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm10B
17032 JB three_bytes_match_emit_encodeSnappyBetterBlockAsm10B
17033
17034 three_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
17035 MOVB $0xf4, (CX)
17036 MOVW SI, 1(CX)
17037 ADDQ $0x03, CX
17038 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
17039
17040 two_bytes_match_emit_encodeSnappyBetterBlockAsm10B:
17041 MOVB $0xf0, (CX)
17042 MOVB SI, 1(CX)
17043 ADDQ $0x02, CX
17044 CMPL SI, $0x40
17045 JB memmove_match_emit_encodeSnappyBetterBlockAsm10B
17046 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B
17047
17048 one_byte_match_emit_encodeSnappyBetterBlockAsm10B:
17049 SHLB $0x02, SI
17050 MOVB SI, (CX)
17051 ADDQ $0x01, CX
17052
17053 memmove_match_emit_encodeSnappyBetterBlockAsm10B:
17054 LEAQ (CX)(R9*1), SI
17055
17056 // genMemMoveShort
17057 CMPQ R9, $0x08
17058 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8
17059 CMPQ R9, $0x10
17060 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
17061 CMPQ R9, $0x20
17062 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
17063 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
17064
17065 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8:
17066 MOVQ (R10), R11
17067 MOVQ R11, (CX)
17068 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
17069
17070 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
17071 MOVQ (R10), R11
17072 MOVQ -8(R10)(R9*1), R10
17073 MOVQ R11, (CX)
17074 MOVQ R10, -8(CX)(R9*1)
17075 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
17076
17077 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
17078 MOVOU (R10), X0
17079 MOVOU -16(R10)(R9*1), X1
17080 MOVOU X0, (CX)
17081 MOVOU X1, -16(CX)(R9*1)
17082 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B
17083
17084 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
17085 MOVOU (R10), X0
17086 MOVOU 16(R10), X1
17087 MOVOU -32(R10)(R9*1), X2
17088 MOVOU -16(R10)(R9*1), X3
17089 MOVOU X0, (CX)
17090 MOVOU X1, 16(CX)
17091 MOVOU X2, -32(CX)(R9*1)
17092 MOVOU X3, -16(CX)(R9*1)
17093
17094 memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B:
17095 MOVQ SI, CX
17096 JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B
17097
17098 memmove_long_match_emit_encodeSnappyBetterBlockAsm10B:
17099 LEAQ (CX)(R9*1), SI
17100
17101 // genMemMoveLong
17102 MOVOU (R10), X0
17103 MOVOU 16(R10), X1
17104 MOVOU -32(R10)(R9*1), X2
17105 MOVOU -16(R10)(R9*1), X3
17106 MOVQ R9, R13
17107 SHRQ $0x05, R13
17108 MOVQ CX, R11
17109 ANDL $0x0000001f, R11
17110 MOVQ $0x00000040, R14
17111 SUBQ R11, R14
17112 DECQ R13
17113 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
17114 LEAQ -32(R10)(R14*1), R11
17115 LEAQ -32(CX)(R14*1), R15
17116
17117 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
17118 MOVOU (R11), X4
17119 MOVOU 16(R11), X5
17120 MOVOA X4, (R15)
17121 MOVOA X5, 16(R15)
17122 ADDQ $0x20, R15
17123 ADDQ $0x20, R11
17124 ADDQ $0x20, R14
17125 DECQ R13
17126 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
17127
17128 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
17129 MOVOU -32(R10)(R14*1), X4
17130 MOVOU -16(R10)(R14*1), X5
17131 MOVOA X4, -32(CX)(R14*1)
17132 MOVOA X5, -16(CX)(R14*1)
17133 ADDQ $0x20, R14
17134 CMPQ R9, R14
17135 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
17136 MOVOU X0, (CX)
17137 MOVOU X1, 16(CX)
17138 MOVOU X2, -32(CX)(R9*1)
17139 MOVOU X3, -16(CX)(R9*1)
17140 MOVQ SI, CX
17141
17142 emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B:
17143 ADDL R12, DX
17144 ADDL $0x04, R12
17145 MOVL DX, 12(SP)
17146
17147 // emitCopy
17148 two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B:
17149 CMPL R12, $0x40
17150 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B
17151 MOVB $0xee, (CX)
17152 MOVW R8, 1(CX)
17153 LEAL -60(R12), R12
17154 ADDQ $0x03, CX
17155 JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B
17156
17157 two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B:
17158 MOVL R12, SI
17159 SHLL $0x02, SI
17160 CMPL R12, $0x0c
17161 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
17162 CMPL R8, $0x00000800
17163 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B
17164 LEAL -15(SI), SI
17165 MOVB R8, 1(CX)
17166 SHRL $0x08, R8
17167 SHLL $0x05, R8
17168 ORL R8, SI
17169 MOVB SI, (CX)
17170 ADDQ $0x02, CX
17171 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B
17172
17173 emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B:
17174 LEAL -2(SI), SI
17175 MOVB SI, (CX)
17176 MOVW R8, 1(CX)
17177 ADDQ $0x03, CX
17178
17179 match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B:
17180 CMPL DX, 8(SP)
17181 JAE emit_remainder_encodeSnappyBetterBlockAsm10B
17182 CMPQ CX, (SP)
17183 JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B
17184 MOVQ $0x00000000, ret+56(FP)
17185 RET
17186
17187 match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B:
17188 MOVQ $0x0000cf1bbcdcbf9b, SI
17189 MOVQ $0x9e3779b1, R8
17190 LEAQ 1(DI), DI
17191 LEAQ -2(DX), R9
17192 MOVQ (BX)(DI*1), R10
17193 MOVQ 1(BX)(DI*1), R11
17194 MOVQ (BX)(R9*1), R12
17195 MOVQ 1(BX)(R9*1), R13
17196 SHLQ $0x10, R10
17197 IMULQ SI, R10
17198 SHRQ $0x34, R10
17199 SHLQ $0x20, R11
17200 IMULQ R8, R11
17201 SHRQ $0x36, R11
17202 SHLQ $0x10, R12
17203 IMULQ SI, R12
17204 SHRQ $0x34, R12
17205 SHLQ $0x20, R13
17206 IMULQ R8, R13
17207 SHRQ $0x36, R13
17208 LEAQ 1(DI), R8
17209 LEAQ 1(R9), R14
17210 MOVL DI, (AX)(R10*4)
17211 MOVL R9, (AX)(R12*4)
17212 MOVL R8, 16384(AX)(R11*4)
17213 MOVL R14, 16384(AX)(R13*4)
17214 LEAQ 1(R9)(DI*1), R8
17215 SHRQ $0x01, R8
17216 ADDQ $0x01, DI
17217 SUBQ $0x01, R9
17218
17219 index_loop_encodeSnappyBetterBlockAsm10B:
17220 CMPQ R8, R9
17221 JAE search_loop_encodeSnappyBetterBlockAsm10B
17222 MOVQ (BX)(DI*1), R10
17223 MOVQ (BX)(R8*1), R11
17224 SHLQ $0x10, R10
17225 IMULQ SI, R10
17226 SHRQ $0x34, R10
17227 SHLQ $0x10, R11
17228 IMULQ SI, R11
17229 SHRQ $0x34, R11
17230 MOVL DI, (AX)(R10*4)
17231 MOVL R8, (AX)(R11*4)
17232 ADDQ $0x02, DI
17233 ADDQ $0x02, R8
17234 JMP index_loop_encodeSnappyBetterBlockAsm10B
17235
17236 emit_remainder_encodeSnappyBetterBlockAsm10B:
17237 MOVQ src_len+32(FP), AX
17238 SUBL 12(SP), AX
17239 LEAQ 3(CX)(AX*1), AX
17240 CMPQ AX, (SP)
17241 JB emit_remainder_ok_encodeSnappyBetterBlockAsm10B
17242 MOVQ $0x00000000, ret+56(FP)
17243 RET
17244
17245 emit_remainder_ok_encodeSnappyBetterBlockAsm10B:
17246 MOVQ src_len+32(FP), AX
17247 MOVL 12(SP), DX
17248 CMPL DX, AX
17249 JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
17250 MOVL AX, SI
17251 MOVL AX, 12(SP)
17252 LEAQ (BX)(DX*1), AX
17253 SUBL DX, SI
17254 LEAL -1(SI), DX
17255 CMPL DX, $0x3c
17256 JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B
17257 CMPL DX, $0x00000100
17258 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B
17259 JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B
17260
17261 three_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
17262 MOVB $0xf4, (CX)
17263 MOVW DX, 1(CX)
17264 ADDQ $0x03, CX
17265 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B
17266
17267 two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B:
17268 MOVB $0xf0, (CX)
17269 MOVB DL, 1(CX)
17270 ADDQ $0x02, CX
17271 CMPL DX, $0x40
17272 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm10B
17273 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B
17274
17275 one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B:
17276 SHLB $0x02, DL
17277 MOVB DL, (CX)
17278 ADDQ $0x01, CX
17279
17280 memmove_emit_remainder_encodeSnappyBetterBlockAsm10B:
17281 LEAQ (CX)(SI*1), DX
17282 MOVL SI, BX
17283
17284 // genMemMoveShort
17285 CMPQ BX, $0x03
17286 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2
17287 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3
17288 CMPQ BX, $0x08
17289 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7
17290 CMPQ BX, $0x10
17291 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16
17292 CMPQ BX, $0x20
17293 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32
17294 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64
17295
17296 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2:
17297 MOVB (AX), SI
17298 MOVB -1(AX)(BX*1), AL
17299 MOVB SI, (CX)
17300 MOVB AL, -1(CX)(BX*1)
17301 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
17302
17303 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3:
17304 MOVW (AX), SI
17305 MOVB 2(AX), AL
17306 MOVW SI, (CX)
17307 MOVB AL, 2(CX)
17308 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
17309
17310 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7:
17311 MOVL (AX), SI
17312 MOVL -4(AX)(BX*1), AX
17313 MOVL SI, (CX)
17314 MOVL AX, -4(CX)(BX*1)
17315 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
17316
17317 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16:
17318 MOVQ (AX), SI
17319 MOVQ -8(AX)(BX*1), AX
17320 MOVQ SI, (CX)
17321 MOVQ AX, -8(CX)(BX*1)
17322 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
17323
17324 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32:
17325 MOVOU (AX), X0
17326 MOVOU -16(AX)(BX*1), X1
17327 MOVOU X0, (CX)
17328 MOVOU X1, -16(CX)(BX*1)
17329 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B
17330
17331 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64:
17332 MOVOU (AX), X0
17333 MOVOU 16(AX), X1
17334 MOVOU -32(AX)(BX*1), X2
17335 MOVOU -16(AX)(BX*1), X3
17336 MOVOU X0, (CX)
17337 MOVOU X1, 16(CX)
17338 MOVOU X2, -32(CX)(BX*1)
17339 MOVOU X3, -16(CX)(BX*1)
17340
17341 memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B:
17342 MOVQ DX, CX
17343 JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B
17344
17345 memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B:
17346 LEAQ (CX)(SI*1), DX
17347 MOVL SI, BX
17348
17349 // genMemMoveLong
17350 MOVOU (AX), X0
17351 MOVOU 16(AX), X1
17352 MOVOU -32(AX)(BX*1), X2
17353 MOVOU -16(AX)(BX*1), X3
17354 MOVQ BX, DI
17355 SHRQ $0x05, DI
17356 MOVQ CX, SI
17357 ANDL $0x0000001f, SI
17358 MOVQ $0x00000040, R8
17359 SUBQ SI, R8
17360 DECQ DI
17361 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
17362 LEAQ -32(AX)(R8*1), SI
17363 LEAQ -32(CX)(R8*1), R9
17364
17365 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back:
17366 MOVOU (SI), X4
17367 MOVOU 16(SI), X5
17368 MOVOA X4, (R9)
17369 MOVOA X5, 16(R9)
17370 ADDQ $0x20, R9
17371 ADDQ $0x20, SI
17372 ADDQ $0x20, R8
17373 DECQ DI
17374 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back
17375
17376 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32:
17377 MOVOU -32(AX)(R8*1), X4
17378 MOVOU -16(AX)(R8*1), X5
17379 MOVOA X4, -32(CX)(R8*1)
17380 MOVOA X5, -16(CX)(R8*1)
17381 ADDQ $0x20, R8
17382 CMPQ BX, R8
17383 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32
17384 MOVOU X0, (CX)
17385 MOVOU X1, 16(CX)
17386 MOVOU X2, -32(CX)(BX*1)
17387 MOVOU X3, -16(CX)(BX*1)
17388 MOVQ DX, CX
17389
17390 emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B:
17391 MOVQ dst_base+0(FP), AX
17392 SUBQ AX, CX
17393 MOVQ CX, ret+56(FP)
17394 RET
17395
17396 // func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte, tmp *[5120]byte) int
17397 // Requires: BMI, SSE2
//
// Encodes src into dst as a sequence of literal and copy elements, using tmp
// as scratch space for two hash tables, and returns the number of bytes
// written to dst. Returns 0 whenever an output-size estimate reaches the
// limit computed at entry (see (SP) below).
//
// tmp layout (5120 bytes, zeroed on entry):
//   0(AX)    .. 4095: 1024 x uint32, indexed by a 10-bit hash of 6 input bytes
//   4096(AX) .. 5119:  256 x uint32, indexed by an 8-bit hash of 4 input bytes
// Each entry holds a src index.
//
// Registers in the main loop:
//   AX = tmp base, BX = src base, CX = dst write pointer, DX = current src index
//
// Stack slots:
//   0(SP)  (8 bytes) dst limit = dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)  src_len - 8; searching stops once the position reaches this
//   12(SP) src index of the first byte that has not been emitted yet
//   16(SP) last match offset; written here but never read in this function
//   20(SP) src index to try next if the current position yields no match
17398 TEXT ·encodeSnappyBetterBlockAsm8B(SB), $24-64
17399 MOVQ tmp+48(FP), AX
17400 MOVQ dst_base+0(FP), CX
// 0x28 (40) iterations x 128 bytes per iteration = 5120 bytes cleared.
17401 MOVQ $0x00000028, DX
17402 MOVQ AX, BX
17403 PXOR X0, X0
17404
17405 zero_loop_encodeSnappyBetterBlockAsm8B:
17406 MOVOU X0, (BX)
17407 MOVOU X0, 16(BX)
17408 MOVOU X0, 32(BX)
17409 MOVOU X0, 48(BX)
17410 MOVOU X0, 64(BX)
17411 MOVOU X0, 80(BX)
17412 MOVOU X0, 96(BX)
17413 MOVOU X0, 112(BX)
17414 ADDQ $0x80, BX
17415 DECQ DX
17416 JNZ zero_loop_encodeSnappyBetterBlockAsm8B
// Initialise the stack slots described in the header comment.
17417 MOVL $0x00000000, 12(SP)
17418 MOVQ src_len+32(FP), DX
17419 LEAQ -9(DX), BX
17420 LEAQ -8(DX), SI
17421 MOVL SI, 8(SP)
17422 SHRQ $0x05, DX
17423 SUBL DX, BX
17424 LEAQ (CX)(BX*1), BX
17425 MOVQ BX, (SP)
// Start searching at src index 1; from here on BX is the src base pointer.
17426 MOVL $0x00000001, DX
17427 MOVL $0x00000000, 16(SP)
17428 MOVQ src_base+24(FP), BX
17429
17430 search_loop_encodeSnappyBetterBlockAsm8B:
// Next position to try = DX + 1 + ((DX - 12(SP)) >> 4): the step grows with
// the number of bytes scanned since the last emit. Leave the loop once that
// position reaches 8(SP).
17431 MOVL DX, SI
17432 SUBL 12(SP), SI
17433 SHRL $0x04, SI
17434 LEAL 1(DX)(SI*1), SI
17435 CMPL SI, 8(SP)
17436 JAE emit_remainder_encodeSnappyBetterBlockAsm8B
// DI = 8 bytes of src at DX. Two table indexes are derived from it:
//   R10 = ((DI << 16) * 0xcf1bbcdcbf9b) >> 54   (10 bits, from the low 6 bytes)
//   R11 = ((DI << 32) * 0x9e3779b1)     >> 56   ( 8 bits, from the low 4 bytes)
17437 MOVQ (BX)(DX*1), DI
17438 MOVL SI, 20(SP)
17439 MOVQ $0x0000cf1bbcdcbf9b, R9
17440 MOVQ $0x9e3779b1, SI
17441 MOVQ DI, R10
17442 MOVQ DI, R11
17443 SHLQ $0x10, R10
17444 IMULQ R9, R10
17445 SHRQ $0x36, R10
17446 SHLQ $0x20, R11
17447 IMULQ SI, R11
17448 SHRQ $0x38, R11
// Fetch the previous occupants of both slots (SI from the 6-byte table, R8
// from the 4-byte table), then record the current position in both.
17449 MOVL (AX)(R10*4), SI
17450 MOVL 4096(AX)(R11*4), R8
17451 MOVL DX, (AX)(R10*4)
17452 MOVL DX, 4096(AX)(R11*4)
// First look for a full 8-byte agreement with either candidate.
17453 MOVQ (BX)(SI*1), R10
17454 MOVQ (BX)(R8*1), R11
17455 CMPQ R10, DI
17456 JEQ candidate_match_encodeSnappyBetterBlockAsm8B
17457 CMPQ R11, DI
17458 JNE no_short_found_encodeSnappyBetterBlockAsm8B
17459 MOVL R8, SI
17460 JMP candidate_match_encodeSnappyBetterBlockAsm8B
17461
17462 no_short_found_encodeSnappyBetterBlockAsm8B:
// Fall back to comparing only the low 4 bytes of each candidate.
17463 CMPL R10, DI
17464 JEQ candidate_match_encodeSnappyBetterBlockAsm8B
17465 CMPL R11, DI
17466 JEQ candidateS_match_encodeSnappyBetterBlockAsm8B
// No match at this position: continue at the position saved in 20(SP).
17467 MOVL 20(SP), DX
17468 JMP search_loop_encodeSnappyBetterBlockAsm8B
17469
17470 candidateS_match_encodeSnappyBetterBlockAsm8B:
// Only the 4-byte-table candidate (R8) matched. Before accepting it, probe
// the 6-byte table for position DX+1 (DI is shifted down by one byte) and
// store DX+1 there. If that entry matches on 4 bytes, use it with DX advanced
// by one; otherwise restore DX and take R8.
17471 SHRQ $0x08, DI
17472 MOVQ DI, R10
17473 SHLQ $0x10, R10
17474 IMULQ R9, R10
17475 SHRQ $0x36, R10
17476 MOVL (AX)(R10*4), SI
17477 INCL DX
17478 MOVL DX, (AX)(R10*4)
17479 CMPL (BX)(SI*1), DI
17480 JEQ candidate_match_encodeSnappyBetterBlockAsm8B
17481 DECL DX
17482 MOVL R8, SI
17483
17484 candidate_match_encodeSnappyBetterBlockAsm8B:
// SI = candidate index, DX = current index. Backward extension is skipped
// when the candidate is at index 0.
17485 MOVL 12(SP), DI
17486 TESTL SI, SI
17487 JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B
17488
17489 match_extend_back_loop_encodeSnappyBetterBlockAsm8B:
// Step both indexes back while the preceding bytes agree, DX stays above
// 12(SP) (held in DI) and SI has not reached 0.
17490 CMPL DX, DI
17491 JBE match_extend_back_end_encodeSnappyBetterBlockAsm8B
17492 MOVB -1(BX)(SI*1), R8
17493 MOVB -1(BX)(DX*1), R9
17494 CMPB R8, R9
17495 JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B
17496 LEAL -1(DX), DX
17497 DECL SI
17498 JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B
17499 JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B
17500
17501 match_extend_back_end_encodeSnappyBetterBlockAsm8B:
// Output bound check: CX + 3 + (pending literal bytes) must stay below the
// limit in (SP); otherwise return 0.
17502 MOVL DX, DI
17503 SUBL 12(SP), DI
17504 LEAQ 3(CX)(DI*1), DI
17505 CMPQ DI, (SP)
17506 JB match_dst_size_check_encodeSnappyBetterBlockAsm8B
17507 MOVQ $0x00000000, ret+56(FP)
17508 RET
17509
17510 match_dst_size_check_encodeSnappyBetterBlockAsm8B:
// DI = match start. Both indexes are advanced by 4 before the forward
// comparison. R8 = bytes left in src after DX, R9 / R10 = pointers to the
// current and candidate data.
17511 MOVL DX, DI
17512 ADDL $0x04, DX
17513 ADDL $0x04, SI
17514 MOVQ src_len+32(FP), R8
17515 SUBL DX, R8
17516 LEAQ (BX)(DX*1), R9
17517 LEAQ (BX)(SI*1), R10
17518
17519 // matchLen
// R12 counts matching bytes beyond the first 4; R8 is the remaining budget.
17520 XORL R12, R12
17521
17522 matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B:
// Compare 16 bytes per iteration while at least 16 remain. A non-zero XOR
// marks the first differing 8-byte word.
17523 CMPL R8, $0x10
17524 JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B
17525 MOVQ (R9)(R12*1), R11
17526 MOVQ 8(R9)(R12*1), R13
17527 XORQ (R10)(R12*1), R11
17528 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B
17529 XORQ 8(R10)(R12*1), R13
17530 JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B
17531 LEAL -16(R8), R8
17532 LEAL 16(R12), R12
17533 JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B
17534
17535 matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B:
// Mismatch in the second word: index of the lowest set bit / 8 gives the
// number of equal bytes in that word; add 8 for the first word.
17536 #ifdef GOAMD64_v3
17537 TZCNTQ R13, R13
17538
17539 #else
17540 BSFQ R13, R13
17541
17542 #endif
17543 SARQ $0x03, R13
17544 LEAL 8(R12)(R13*1), R12
17545 JMP match_nolit_end_encodeSnappyBetterBlockAsm8B
17546
17547 matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B:
// Fewer than 16 bytes left: try one 8-byte word.
17548 CMPL R8, $0x08
17549 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B
17550 MOVQ (R9)(R12*1), R11
17551 XORQ (R10)(R12*1), R11
17552 JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B
17553 LEAL -8(R8), R8
17554 LEAL 8(R12), R12
17555 JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B
17556
17557 matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B:
// Mismatch inside the word held in R11: add (lowest set bit index / 8).
17558 #ifdef GOAMD64_v3
17559 TZCNTQ R11, R11
17560
17561 #else
17562 BSFQ R11, R11
17563
17564 #endif
17565 SARQ $0x03, R11
17566 LEAL (R12)(R11*1), R12
17567 JMP match_nolit_end_encodeSnappyBetterBlockAsm8B
17568
17569 matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B:
// Tail handling: 4 bytes, then 2, then 1. A failed 4- or 2-byte compare
// falls through to the next smaller width.
17570 CMPL R8, $0x04
17571 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
17572 MOVL (R9)(R12*1), R11
17573 CMPL (R10)(R12*1), R11
17574 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B
17575 LEAL -4(R8), R8
17576 LEAL 4(R12), R12
17577
17578 matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B:
// R8 == 1 goes straight to the single-byte compare; R8 == 0 ends matching.
17579 CMPL R8, $0x01
17580 JE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
17581 JB match_nolit_end_encodeSnappyBetterBlockAsm8B
17582 MOVW (R9)(R12*1), R11
17583 CMPW (R10)(R12*1), R11
17584 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B
17585 LEAL 2(R12), R12
17586 SUBL $0x02, R8
17587 JZ match_nolit_end_encodeSnappyBetterBlockAsm8B
17588
17589 matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B:
17590 MOVB (R9)(R12*1), R11
17591 CMPB (R10)(R12*1), R11
17592 JNE match_nolit_end_encodeSnappyBetterBlockAsm8B
17593 LEAL 1(R12), R12
17594
17595 match_nolit_end_encodeSnappyBetterBlockAsm8B:
// R8 = match offset (current index - candidate index).
17596 MOVL DX, R8
17597 SUBL SI, R8
17598
17599 // Check if repeat
// The offset is only stored to 16(SP) here; nothing in this function reads
// it back. If there are no pending literals (12(SP) == match start in DI),
// go straight to the copy emission.
17600 MOVL R8, 16(SP)
17601 MOVL 12(SP), SI
17602 CMPL SI, DI
17603 JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
// Emit src[12(SP):DI] as a literal. R10 = source pointer, R9 = length,
// SI = length - 1, which selects the header size.
17604 MOVL DI, R9
17605 MOVL DI, 12(SP)
17606 LEAQ (BX)(SI*1), R10
17607 SUBL SI, R9
17608 LEAL -1(R9), SI
17609 CMPL SI, $0x3c
17610 JB one_byte_match_emit_encodeSnappyBetterBlockAsm8B
17611 CMPL SI, $0x00000100
17612 JB two_bytes_match_emit_encodeSnappyBetterBlockAsm8B
// NOTE(review): the next JB tests the same flags as the JB just above, so it
// can never be taken; the three-byte case is reached by fall-through.
17613 JB three_bytes_match_emit_encodeSnappyBetterBlockAsm8B
17614
17615 three_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
// Header: tag byte 0xf4 followed by the low 16 bits of (length - 1).
17616 MOVB $0xf4, (CX)
17617 MOVW SI, 1(CX)
17618 ADDQ $0x03, CX
17619 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
17620
17621 two_bytes_match_emit_encodeSnappyBetterBlockAsm8B:
// Header: tag byte 0xf0 followed by one byte holding (length - 1). Payloads
// with length - 1 < 64 use the short copy, larger ones the long copy.
17622 MOVB $0xf0, (CX)
17623 MOVB SI, 1(CX)
17624 ADDQ $0x02, CX
17625 CMPL SI, $0x40
17626 JB memmove_match_emit_encodeSnappyBetterBlockAsm8B
17627 JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B
17628
17629 one_byte_match_emit_encodeSnappyBetterBlockAsm8B:
// Header: a single byte, (length - 1) << 2.
17630 SHLB $0x02, SI
17631 MOVB SI, (CX)
17632 ADDQ $0x01, CX
17633
17634 memmove_match_emit_encodeSnappyBetterBlockAsm8B:
// SI = dst pointer after the payload; CX is set to it once the copy is done.
17635 LEAQ (CX)(R9*1), SI
17636
17637 // genMemMoveShort
// Dispatch on R9 (payload length, at most 64 on this path).
17638 CMPQ R9, $0x08
17639 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8
17640 CMPQ R9, $0x10
17641 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
17642 CMPQ R9, $0x20
17643 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
17644 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
17645
17646 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8:
// Always moves a full 8 bytes even when R9 < 8; only R9 of them count because
// CX advances by R9. NOTE(review): presumably the margins set up at entry
// keep the over-read and over-write in bounds - confirm against the generator.
17647 MOVQ (R10), R11
17648 MOVQ R11, (CX)
17649 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
17650
17651 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
// First 8 and last 8 bytes; the two moves overlap when R9 < 16.
17652 MOVQ (R10), R11
17653 MOVQ -8(R10)(R9*1), R10
17654 MOVQ R11, (CX)
17655 MOVQ R10, -8(CX)(R9*1)
17656 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
17657
17658 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
// First 16 and last 16 bytes.
17659 MOVOU (R10), X0
17660 MOVOU -16(R10)(R9*1), X1
17661 MOVOU X0, (CX)
17662 MOVOU X1, -16(CX)(R9*1)
17663 JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B
17664
17665 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
// First 32 and last 32 bytes.
17666 MOVOU (R10), X0
17667 MOVOU 16(R10), X1
17668 MOVOU -32(R10)(R9*1), X2
17669 MOVOU -16(R10)(R9*1), X3
17670 MOVOU X0, (CX)
17671 MOVOU X1, 16(CX)
17672 MOVOU X2, -32(CX)(R9*1)
17673 MOVOU X3, -16(CX)(R9*1)
17674
17675 memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B:
17676 MOVQ SI, CX
17677 JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B
17678
17679 memmove_long_match_emit_encodeSnappyBetterBlockAsm8B:
17680 LEAQ (CX)(R9*1), SI
17681
17682 // genMemMoveLong
// The first and last 32 source bytes are loaded into X0-X3 up front and
// stored last with unaligned moves. In between, 32-byte chunks are copied
// using aligned stores (MOVOA): R14 = 64 - (CX & 31) is the running offset,
// which makes CX + R14 - 32 a multiple of 32. R13 = R9 >> 5 is the chunk
// counter used by the DECQ / JA / JNA sequence below.
17683 MOVOU (R10), X0
17684 MOVOU 16(R10), X1
17685 MOVOU -32(R10)(R9*1), X2
17686 MOVOU -16(R10)(R9*1), X3
17687 MOVQ R9, R13
17688 SHRQ $0x05, R13
17689 MOVQ CX, R11
17690 ANDL $0x0000001f, R11
17691 MOVQ $0x00000040, R14
17692 SUBQ R11, R14
17693 DECQ R13
17694 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
17695 LEAQ -32(R10)(R14*1), R11
17696 LEAQ -32(CX)(R14*1), R15
17697
17698 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
17699 MOVOU (R11), X4
17700 MOVOU 16(R11), X5
17701 MOVOA X4, (R15)
17702 MOVOA X5, 16(R15)
17703 ADDQ $0x20, R15
17704 ADDQ $0x20, R11
17705 ADDQ $0x20, R14
17706 DECQ R13
17707 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
17708
17709 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
// Copy the chunk ending at offset R14, then advance; repeat while R9 >= R14.
17710 MOVOU -32(R10)(R14*1), X4
17711 MOVOU -16(R10)(R14*1), X5
17712 MOVOA X4, -32(CX)(R14*1)
17713 MOVOA X5, -16(CX)(R14*1)
17714 ADDQ $0x20, R14
17715 CMPQ R9, R14
17716 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
// Write the saved head and tail, then move CX past the payload.
17717 MOVOU X0, (CX)
17718 MOVOU X1, 16(CX)
17719 MOVOU X2, -32(CX)(R9*1)
17720 MOVOU X3, -16(CX)(R9*1)
17721 MOVQ SI, CX
17722
17723 emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B:
// DX moves to the end of the match, R12 becomes the full match length
// (including the first 4 bytes), and 12(SP) is updated to the new DX.
17724 ADDL R12, DX
17725 ADDL $0x04, R12
17726 MOVL DX, 12(SP)
17727
17728 // emitCopy
17729 two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B:
// While more than 64 bytes remain: write tag 0xee plus the 16-bit offset
// (3 bytes) and reduce the remaining length by 60.
17730 CMPL R12, $0x40
17731 JBE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B
17732 MOVB $0xee, (CX)
17733 MOVW R8, 1(CX)
17734 LEAL -60(R12), R12
17735 ADDQ $0x03, CX
17736 JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B
17737
17738 two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B:
// Remaining length <= 64. Lengths below 12 use the 2-byte form:
//   byte 0 = (4*length - 15) | ((offset >> 8) << 5), byte 1 = offset & 0xff
// Lengths of 12 and above use the 3-byte form below. Only the length is
// tested here; there is no offset range check on this path.
17739 MOVL R12, SI
17740 SHLL $0x02, SI
17741 CMPL R12, $0x0c
17742 JAE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B
17743 LEAL -15(SI), SI
17744 MOVB R8, 1(CX)
17745 SHRL $0x08, R8
17746 SHLL $0x05, R8
17747 ORL R8, SI
17748 MOVB SI, (CX)
17749 ADDQ $0x02, CX
17750 JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B
17751
17752 emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B:
// 3-byte form: byte 0 = 4*length - 2, bytes 1-2 = 16-bit offset.
17753 LEAL -2(SI), SI
17754 MOVB SI, (CX)
17755 MOVW R8, 1(CX)
17756 ADDQ $0x03, CX
17757
17758 match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B:
// Stop searching once DX has reached 8(SP); return 0 if CX has reached the
// dst limit in (SP).
17759 CMPL DX, 8(SP)
17760 JAE emit_remainder_encodeSnappyBetterBlockAsm8B
17761 CMPQ CX, (SP)
17762 JB match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B
17763 MOVQ $0x00000000, ret+56(FP)
17764 RET
17765
17766 match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B:
// Add positions covered by the match to the tables. DI = match start + 1 and
// R9 = DX - 2 go into the 6-byte table; DI + 1 and R9 + 1 go into the 4-byte
// table, each hashed from the bytes at that position.
17767 MOVQ $0x0000cf1bbcdcbf9b, SI
17768 MOVQ $0x9e3779b1, R8
17769 LEAQ 1(DI), DI
17770 LEAQ -2(DX), R9
17771 MOVQ (BX)(DI*1), R10
17772 MOVQ 1(BX)(DI*1), R11
17773 MOVQ (BX)(R9*1), R12
17774 MOVQ 1(BX)(R9*1), R13
17775 SHLQ $0x10, R10
17776 IMULQ SI, R10
17777 SHRQ $0x36, R10
17778 SHLQ $0x20, R11
17779 IMULQ R8, R11
17780 SHRQ $0x38, R11
17781 SHLQ $0x10, R12
17782 IMULQ SI, R12
17783 SHRQ $0x36, R12
17784 SHLQ $0x20, R13
17785 IMULQ R8, R13
17786 SHRQ $0x38, R13
17787 LEAQ 1(DI), R8
17788 LEAQ 1(R9), R14
17789 MOVL DI, (AX)(R10*4)
17790 MOVL R9, (AX)(R12*4)
17791 MOVL R8, 4096(AX)(R11*4)
17792 MOVL R14, 4096(AX)(R13*4)
// R8 = (DI + R9 + 1) >> 1, the midpoint of the range; then DI steps forward
// by one and R9 back by one to bound the loop below.
17793 LEAQ 1(R9)(DI*1), R8
17794 SHRQ $0x01, R8
17795 ADDQ $0x01, DI
17796 SUBQ $0x01, R9
17797
17798 index_loop_encodeSnappyBetterBlockAsm8B:
// While R8 < R9: insert positions DI and R8 into the 6-byte table, advancing
// both by 2. Afterwards resume the search at DX.
17799 CMPQ R8, R9
17800 JAE search_loop_encodeSnappyBetterBlockAsm8B
17801 MOVQ (BX)(DI*1), R10
17802 MOVQ (BX)(R8*1), R11
17803 SHLQ $0x10, R10
17804 IMULQ SI, R10
17805 SHRQ $0x36, R10
17806 SHLQ $0x10, R11
17807 IMULQ SI, R11
17808 SHRQ $0x36, R11
17809 MOVL DI, (AX)(R10*4)
17810 MOVL R8, (AX)(R11*4)
17811 ADDQ $0x02, DI
17812 ADDQ $0x02, R8
17813 JMP index_loop_encodeSnappyBetterBlockAsm8B
17814
17815 emit_remainder_encodeSnappyBetterBlockAsm8B:
// From here AX no longer holds the table base. Bound check for the trailing
// literal: CX + 3 + (src_len - 12(SP)) must be below (SP), else return 0.
17816 MOVQ src_len+32(FP), AX
17817 SUBL 12(SP), AX
17818 LEAQ 3(CX)(AX*1), AX
17819 CMPQ AX, (SP)
17820 JB emit_remainder_ok_encodeSnappyBetterBlockAsm8B
17821 MOVQ $0x00000000, ret+56(FP)
17822 RET
17823
17824 emit_remainder_ok_encodeSnappyBetterBlockAsm8B:
// Emit src[12(SP):src_len] as a final literal, unless it is empty.
// AX = source pointer, SI = length, DX = length - 1.
17825 MOVQ src_len+32(FP), AX
17826 MOVL 12(SP), DX
17827 CMPL DX, AX
17828 JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
17829 MOVL AX, SI
17830 MOVL AX, 12(SP)
17831 LEAQ (BX)(DX*1), AX
17832 SUBL DX, SI
17833 LEAL -1(SI), DX
17834 CMPL DX, $0x3c
17835 JB one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B
17836 CMPL DX, $0x00000100
17837 JB two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B
// NOTE(review): as in the match path, this second JB can never be taken; the
// three-byte case is reached by fall-through.
17838 JB three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B
17839
17840 three_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:
17841 MOVB $0xf4, (CX)
17842 MOVW DX, 1(CX)
17843 ADDQ $0x03, CX
17844 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
17845
17846 two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B:
17847 MOVB $0xf0, (CX)
17848 MOVB DL, 1(CX)
17849 ADDQ $0x02, CX
17850 CMPL DX, $0x40
17851 JB memmove_emit_remainder_encodeSnappyBetterBlockAsm8B
17852 JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B
17853
17854 one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B:
17855 SHLB $0x02, DL
17856 MOVB DL, (CX)
17857 ADDQ $0x01, CX
17858
17859 memmove_emit_remainder_encodeSnappyBetterBlockAsm8B:
// DX = dst pointer after the payload; BX is reused as the byte count (the
// src base is no longer needed, AX already points at the data).
17860 LEAQ (CX)(SI*1), DX
17861 MOVL SI, BX
17862
17863 // genMemMoveShort
// Unlike the match path, this dispatch has exact cases for 1-2, 3 and 4-7
// bytes, so nothing is read or written beyond the payload.
17864 CMPQ BX, $0x03
17865 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2
17866 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3
17867 CMPQ BX, $0x08
17868 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7
17869 CMPQ BX, $0x10
17870 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16
17871 CMPQ BX, $0x20
17872 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32
17873 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64
17874
17875 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2:
// First byte and last byte (the same byte when BX == 1).
17876 MOVB (AX), SI
17877 MOVB -1(AX)(BX*1), AL
17878 MOVB SI, (CX)
17879 MOVB AL, -1(CX)(BX*1)
17880 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
17881
17882 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3:
17883 MOVW (AX), SI
17884 MOVB 2(AX), AL
17885 MOVW SI, (CX)
17886 MOVB AL, 2(CX)
17887 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
17888
17889 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7:
// First 4 and last 4 bytes, overlapping as needed.
17890 MOVL (AX), SI
17891 MOVL -4(AX)(BX*1), AX
17892 MOVL SI, (CX)
17893 MOVL AX, -4(CX)(BX*1)
17894 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
17895
17896 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16:
17897 MOVQ (AX), SI
17898 MOVQ -8(AX)(BX*1), AX
17899 MOVQ SI, (CX)
17900 MOVQ AX, -8(CX)(BX*1)
17901 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
17902
17903 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32:
17904 MOVOU (AX), X0
17905 MOVOU -16(AX)(BX*1), X1
17906 MOVOU X0, (CX)
17907 MOVOU X1, -16(CX)(BX*1)
17908 JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B
17909
17910 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64:
17911 MOVOU (AX), X0
17912 MOVOU 16(AX), X1
17913 MOVOU -32(AX)(BX*1), X2
17914 MOVOU -16(AX)(BX*1), X3
17915 MOVOU X0, (CX)
17916 MOVOU X1, 16(CX)
17917 MOVOU X2, -32(CX)(BX*1)
17918 MOVOU X3, -16(CX)(BX*1)
17919
17920 memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B:
17921 MOVQ DX, CX
17922 JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B
17923
17924 memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B:
17925 LEAQ (CX)(SI*1), DX
17926 MOVL SI, BX
17927
17928 // genMemMoveLong
// Same scheme as the long copy in the match path, with AX = source,
// BX = length, DI = chunk counter and R8 = running offset: head and tail are
// kept in X0-X3, the middle is copied in 32-byte chunks with aligned stores.
17929 MOVOU (AX), X0
17930 MOVOU 16(AX), X1
17931 MOVOU -32(AX)(BX*1), X2
17932 MOVOU -16(AX)(BX*1), X3
17933 MOVQ BX, DI
17934 SHRQ $0x05, DI
17935 MOVQ CX, SI
17936 ANDL $0x0000001f, SI
17937 MOVQ $0x00000040, R8
17938 SUBQ SI, R8
17939 DECQ DI
17940 JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
17941 LEAQ -32(AX)(R8*1), SI
17942 LEAQ -32(CX)(R8*1), R9
17943
17944 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back:
17945 MOVOU (SI), X4
17946 MOVOU 16(SI), X5
17947 MOVOA X4, (R9)
17948 MOVOA X5, 16(R9)
17949 ADDQ $0x20, R9
17950 ADDQ $0x20, SI
17951 ADDQ $0x20, R8
17952 DECQ DI
17953 JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back
17954
17955 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32:
17956 MOVOU -32(AX)(R8*1), X4
17957 MOVOU -16(AX)(R8*1), X5
17958 MOVOA X4, -32(CX)(R8*1)
17959 MOVOA X5, -16(CX)(R8*1)
17960 ADDQ $0x20, R8
17961 CMPQ BX, R8
17962 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32
17963 MOVOU X0, (CX)
17964 MOVOU X1, 16(CX)
17965 MOVOU X2, -32(CX)(BX*1)
17966 MOVOU X3, -16(CX)(BX*1)
17967 MOVQ DX, CX
17968
17969 emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B:
// Return value = bytes written = CX - dst_base.
17970 MOVQ dst_base+0(FP), AX
17971 SUBQ AX, CX
17972 MOVQ CX, ret+56(FP)
17973 RET
17974
17975 // func calcBlockSize(src []byte, tmp *[32768]byte) int
17976 // Requires: BMI, SSE2
17977 TEXT ·calcBlockSize(SB), $24-40
17978 MOVQ tmp+24(FP), AX
17979 XORQ CX, CX
17980 MOVQ $0x00000100, DX
17981 MOVQ AX, BX
17982 PXOR X0, X0
17983
17984 zero_loop_calcBlockSize:
17985 MOVOU X0, (BX)
17986 MOVOU X0, 16(BX)
17987 MOVOU X0, 32(BX)
17988 MOVOU X0, 48(BX)
17989 MOVOU X0, 64(BX)
17990 MOVOU X0, 80(BX)
17991 MOVOU X0, 96(BX)
17992 MOVOU X0, 112(BX)
17993 ADDQ $0x80, BX
17994 DECQ DX
17995 JNZ zero_loop_calcBlockSize
17996 MOVL $0x00000000, 12(SP)
17997 MOVQ src_len+8(FP), DX
17998 LEAQ -9(DX), BX
17999 LEAQ -8(DX), SI
18000 MOVL SI, 8(SP)
18001 SHRQ $0x05, DX
18002 SUBL DX, BX
18003 LEAQ (CX)(BX*1), BX
18004 MOVQ BX, (SP)
18005 MOVL $0x00000001, DX
18006 MOVL DX, 16(SP)
18007 MOVQ src_base+0(FP), BX
18008
18009 search_loop_calcBlockSize:
18010 MOVL DX, SI
18011 SUBL 12(SP), SI
18012 SHRL $0x05, SI
18013 LEAL 4(DX)(SI*1), SI
18014 CMPL SI, 8(SP)
18015 JAE emit_remainder_calcBlockSize
18016 MOVQ (BX)(DX*1), DI
18017 MOVL SI, 20(SP)
18018 MOVQ $0x0000cf1bbcdcbf9b, R9
18019 MOVQ DI, R10
18020 MOVQ DI, R11
18021 SHRQ $0x08, R11
18022 SHLQ $0x10, R10
18023 IMULQ R9, R10
18024 SHRQ $0x33, R10
18025 SHLQ $0x10, R11
18026 IMULQ R9, R11
18027 SHRQ $0x33, R11
18028 MOVL (AX)(R10*4), SI
18029 MOVL (AX)(R11*4), R8
18030 MOVL DX, (AX)(R10*4)
18031 LEAL 1(DX), R10
18032 MOVL R10, (AX)(R11*4)
18033 MOVQ DI, R10
18034 SHRQ $0x10, R10
18035 SHLQ $0x10, R10
18036 IMULQ R9, R10
18037 SHRQ $0x33, R10
18038 MOVL DX, R9
18039 SUBL 16(SP), R9
18040 MOVL 1(BX)(R9*1), R11
18041 MOVQ DI, R9
18042 SHRQ $0x08, R9
18043 CMPL R9, R11
18044 JNE no_repeat_found_calcBlockSize
18045 LEAL 1(DX), DI
18046 MOVL 12(SP), SI
18047 MOVL DI, R8
18048 SUBL 16(SP), R8
18049 JZ repeat_extend_back_end_calcBlockSize
18050
18051 repeat_extend_back_loop_calcBlockSize:
18052 CMPL DI, SI
18053 JBE repeat_extend_back_end_calcBlockSize
18054 MOVB -1(BX)(R8*1), R9
18055 MOVB -1(BX)(DI*1), R10
18056 CMPB R9, R10
18057 JNE repeat_extend_back_end_calcBlockSize
18058 LEAL -1(DI), DI
18059 DECL R8
18060 JNZ repeat_extend_back_loop_calcBlockSize
18061
18062 repeat_extend_back_end_calcBlockSize:
18063 MOVL DI, SI
18064 SUBL 12(SP), SI
18065 LEAQ 5(CX)(SI*1), SI
18066 CMPQ SI, (SP)
18067 JB repeat_dst_size_check_calcBlockSize
18068 MOVQ $0x00000000, ret+32(FP)
18069 RET
18070
18071 repeat_dst_size_check_calcBlockSize:
18072 MOVL 12(SP), SI
18073 CMPL SI, DI
18074 JEQ emit_literal_done_repeat_emit_calcBlockSize
18075 MOVL DI, R8
18076 MOVL DI, 12(SP)
18077 LEAQ (BX)(SI*1), R9
18078 SUBL SI, R8
18079 LEAL -1(R8), SI
18080 CMPL SI, $0x3c
18081 JB one_byte_repeat_emit_calcBlockSize
18082 CMPL SI, $0x00000100
18083 JB two_bytes_repeat_emit_calcBlockSize
18084 CMPL SI, $0x00010000
18085 JB three_bytes_repeat_emit_calcBlockSize
18086 CMPL SI, $0x01000000
18087 JB four_bytes_repeat_emit_calcBlockSize
18088 ADDQ $0x05, CX
18089 JMP memmove_long_repeat_emit_calcBlockSize
18090
18091 four_bytes_repeat_emit_calcBlockSize:
18092 ADDQ $0x04, CX
18093 JMP memmove_long_repeat_emit_calcBlockSize
18094
18095 three_bytes_repeat_emit_calcBlockSize:
18096 ADDQ $0x03, CX
18097 JMP memmove_long_repeat_emit_calcBlockSize
18098
18099 two_bytes_repeat_emit_calcBlockSize:
18100 ADDQ $0x02, CX
18101 CMPL SI, $0x40
18102 JB memmove_repeat_emit_calcBlockSize
18103 JMP memmove_long_repeat_emit_calcBlockSize
18104
18105 one_byte_repeat_emit_calcBlockSize:
18106 ADDQ $0x01, CX
18107
18108 memmove_repeat_emit_calcBlockSize:
18109 LEAQ (CX)(R8*1), CX
18110 JMP emit_literal_done_repeat_emit_calcBlockSize
18111
18112 memmove_long_repeat_emit_calcBlockSize:
18113 LEAQ (CX)(R8*1), CX
18114
18115 emit_literal_done_repeat_emit_calcBlockSize:
18116 ADDL $0x05, DX
18117 MOVL DX, SI
18118 SUBL 16(SP), SI
18119 MOVQ src_len+8(FP), R8
18120 SUBL DX, R8
18121 LEAQ (BX)(DX*1), R9
18122 LEAQ (BX)(SI*1), SI
18123
18124 // matchLen
18125 XORL R11, R11
18126
18127 matchlen_loopback_16_repeat_extend_calcBlockSize:
18128 CMPL R8, $0x10
18129 JB matchlen_match8_repeat_extend_calcBlockSize
18130 MOVQ (R9)(R11*1), R10
18131 MOVQ 8(R9)(R11*1), R12
18132 XORQ (SI)(R11*1), R10
18133 JNZ matchlen_bsf_8_repeat_extend_calcBlockSize
18134 XORQ 8(SI)(R11*1), R12
18135 JNZ matchlen_bsf_16repeat_extend_calcBlockSize
18136 LEAL -16(R8), R8
18137 LEAL 16(R11), R11
18138 JMP matchlen_loopback_16_repeat_extend_calcBlockSize
18139
18140 matchlen_bsf_16repeat_extend_calcBlockSize:
18141 #ifdef GOAMD64_v3
18142 TZCNTQ R12, R12
18143
18144 #else
18145 BSFQ R12, R12
18146
18147 #endif
18148 SARQ $0x03, R12
18149 LEAL 8(R11)(R12*1), R11
18150 JMP repeat_extend_forward_end_calcBlockSize
18151
18152 matchlen_match8_repeat_extend_calcBlockSize:
18153 CMPL R8, $0x08
18154 JB matchlen_match4_repeat_extend_calcBlockSize
18155 MOVQ (R9)(R11*1), R10
18156 XORQ (SI)(R11*1), R10
18157 JNZ matchlen_bsf_8_repeat_extend_calcBlockSize
18158 LEAL -8(R8), R8
18159 LEAL 8(R11), R11
18160 JMP matchlen_match4_repeat_extend_calcBlockSize
18161
18162 matchlen_bsf_8_repeat_extend_calcBlockSize:
18163 #ifdef GOAMD64_v3
18164 TZCNTQ R10, R10
18165
18166 #else
18167 BSFQ R10, R10
18168
18169 #endif
18170 SARQ $0x03, R10
18171 LEAL (R11)(R10*1), R11
18172 JMP repeat_extend_forward_end_calcBlockSize
18173
18174 matchlen_match4_repeat_extend_calcBlockSize:
18175 CMPL R8, $0x04
18176 JB matchlen_match2_repeat_extend_calcBlockSize
18177 MOVL (R9)(R11*1), R10
18178 CMPL (SI)(R11*1), R10
18179 JNE matchlen_match2_repeat_extend_calcBlockSize
18180 LEAL -4(R8), R8
18181 LEAL 4(R11), R11
18182
18183 matchlen_match2_repeat_extend_calcBlockSize:
18184 CMPL R8, $0x01
18185 JE matchlen_match1_repeat_extend_calcBlockSize
18186 JB repeat_extend_forward_end_calcBlockSize
18187 MOVW (R9)(R11*1), R10
18188 CMPW (SI)(R11*1), R10
18189 JNE matchlen_match1_repeat_extend_calcBlockSize
18190 LEAL 2(R11), R11
18191 SUBL $0x02, R8
18192 JZ repeat_extend_forward_end_calcBlockSize
18193
18194 matchlen_match1_repeat_extend_calcBlockSize:
18195 MOVB (R9)(R11*1), R10
18196 CMPB (SI)(R11*1), R10
18197 JNE repeat_extend_forward_end_calcBlockSize
18198 LEAL 1(R11), R11
18199
18200 repeat_extend_forward_end_calcBlockSize:
18201 ADDL R11, DX
18202 MOVL DX, SI
18203 SUBL DI, SI
18204 MOVL 16(SP), DI
18205
18206 // emitCopy
18207 CMPL DI, $0x00010000
18208 JB two_byte_offset_repeat_as_copy_calcBlockSize
18209
18210 four_bytes_loop_back_repeat_as_copy_calcBlockSize:
18211 CMPL SI, $0x40
18212 JBE four_bytes_remain_repeat_as_copy_calcBlockSize
18213 LEAL -64(SI), SI
18214 ADDQ $0x05, CX
18215 CMPL SI, $0x04
18216 JB four_bytes_remain_repeat_as_copy_calcBlockSize
18217 JMP four_bytes_loop_back_repeat_as_copy_calcBlockSize
18218
18219 four_bytes_remain_repeat_as_copy_calcBlockSize:
18220 TESTL SI, SI
18221 JZ repeat_end_emit_calcBlockSize
18222 XORL SI, SI
18223 ADDQ $0x05, CX
18224 JMP repeat_end_emit_calcBlockSize
18225
18226 two_byte_offset_repeat_as_copy_calcBlockSize:
18227 CMPL SI, $0x40
18228 JBE two_byte_offset_short_repeat_as_copy_calcBlockSize
18229 LEAL -60(SI), SI
18230 ADDQ $0x03, CX
18231 JMP two_byte_offset_repeat_as_copy_calcBlockSize
18232
18233 two_byte_offset_short_repeat_as_copy_calcBlockSize:
18234 MOVL SI, R8
18235 SHLL $0x02, R8
18236 CMPL SI, $0x0c
18237 JAE emit_copy_three_repeat_as_copy_calcBlockSize
18238 CMPL DI, $0x00000800
18239 JAE emit_copy_three_repeat_as_copy_calcBlockSize
18240 ADDQ $0x02, CX
18241 JMP repeat_end_emit_calcBlockSize
18242
18243 emit_copy_three_repeat_as_copy_calcBlockSize:
18244 ADDQ $0x03, CX
18245
18246 repeat_end_emit_calcBlockSize:
18247 MOVL DX, 12(SP)
18248 JMP search_loop_calcBlockSize
18249
18250 no_repeat_found_calcBlockSize:
18251 CMPL (BX)(SI*1), DI
18252 JEQ candidate_match_calcBlockSize
18253 SHRQ $0x08, DI
18254 MOVL (AX)(R10*4), SI
18255 LEAL 2(DX), R9
18256 CMPL (BX)(R8*1), DI
18257 JEQ candidate2_match_calcBlockSize
18258 MOVL R9, (AX)(R10*4)
18259 SHRQ $0x08, DI
18260 CMPL (BX)(SI*1), DI
18261 JEQ candidate3_match_calcBlockSize
18262 MOVL 20(SP), DX
18263 JMP search_loop_calcBlockSize
18264
18265 candidate3_match_calcBlockSize:
18266 ADDL $0x02, DX
18267 JMP candidate_match_calcBlockSize
18268
18269 candidate2_match_calcBlockSize:
18270 MOVL R9, (AX)(R10*4)
18271 INCL DX
18272 MOVL R8, SI
18273
18274 candidate_match_calcBlockSize:
18275 MOVL 12(SP), DI
18276 TESTL SI, SI
18277 JZ match_extend_back_end_calcBlockSize
18278
18279 match_extend_back_loop_calcBlockSize:
18280 CMPL DX, DI
18281 JBE match_extend_back_end_calcBlockSize
18282 MOVB -1(BX)(SI*1), R8
18283 MOVB -1(BX)(DX*1), R9
18284 CMPB R8, R9
18285 JNE match_extend_back_end_calcBlockSize
18286 LEAL -1(DX), DX
18287 DECL SI
18288 JZ match_extend_back_end_calcBlockSize
18289 JMP match_extend_back_loop_calcBlockSize
18290
18291 match_extend_back_end_calcBlockSize:
18292 MOVL DX, DI
18293 SUBL 12(SP), DI
18294 LEAQ 5(CX)(DI*1), DI
18295 CMPQ DI, (SP)
18296 JB match_dst_size_check_calcBlockSize
18297 MOVQ $0x00000000, ret+32(FP)
18298 RET
18299
18300 match_dst_size_check_calcBlockSize:
18301 MOVL DX, DI
18302 MOVL 12(SP), R8
18303 CMPL R8, DI
18304 JEQ emit_literal_done_match_emit_calcBlockSize
18305 MOVL DI, R9
18306 MOVL DI, 12(SP)
18307 LEAQ (BX)(R8*1), DI
18308 SUBL R8, R9
18309 LEAL -1(R9), DI
18310 CMPL DI, $0x3c
18311 JB one_byte_match_emit_calcBlockSize
18312 CMPL DI, $0x00000100
18313 JB two_bytes_match_emit_calcBlockSize
18314 CMPL DI, $0x00010000
18315 JB three_bytes_match_emit_calcBlockSize
18316 CMPL DI, $0x01000000
18317 JB four_bytes_match_emit_calcBlockSize
18318 ADDQ $0x05, CX
18319 JMP memmove_long_match_emit_calcBlockSize
18320
18321 four_bytes_match_emit_calcBlockSize:
18322 ADDQ $0x04, CX
18323 JMP memmove_long_match_emit_calcBlockSize
18324
18325 three_bytes_match_emit_calcBlockSize:
18326 ADDQ $0x03, CX
18327 JMP memmove_long_match_emit_calcBlockSize
18328
18329 two_bytes_match_emit_calcBlockSize:
18330 ADDQ $0x02, CX
18331 CMPL DI, $0x40
18332 JB memmove_match_emit_calcBlockSize
18333 JMP memmove_long_match_emit_calcBlockSize
18334
18335 one_byte_match_emit_calcBlockSize:
18336 ADDQ $0x01, CX
18337
18338 memmove_match_emit_calcBlockSize:
18339 LEAQ (CX)(R9*1), CX
18340 JMP emit_literal_done_match_emit_calcBlockSize
18341
18342 memmove_long_match_emit_calcBlockSize:
18343 LEAQ (CX)(R9*1), CX
18344
18345 emit_literal_done_match_emit_calcBlockSize:
18346 match_nolit_loop_calcBlockSize:
18347 MOVL DX, DI
18348 SUBL SI, DI
18349 MOVL DI, 16(SP)
18350 ADDL $0x04, DX
18351 ADDL $0x04, SI
18352 MOVQ src_len+8(FP), DI
18353 SUBL DX, DI
18354 LEAQ (BX)(DX*1), R8
18355 LEAQ (BX)(SI*1), SI
18356
18357 // matchLen
18358 XORL R10, R10
18359
18360 matchlen_loopback_16_match_nolit_calcBlockSize:
18361 CMPL DI, $0x10
18362 JB matchlen_match8_match_nolit_calcBlockSize
18363 MOVQ (R8)(R10*1), R9
18364 MOVQ 8(R8)(R10*1), R11
18365 XORQ (SI)(R10*1), R9
18366 JNZ matchlen_bsf_8_match_nolit_calcBlockSize
18367 XORQ 8(SI)(R10*1), R11
18368 JNZ matchlen_bsf_16match_nolit_calcBlockSize
18369 LEAL -16(DI), DI
18370 LEAL 16(R10), R10
18371 JMP matchlen_loopback_16_match_nolit_calcBlockSize
18372
18373 matchlen_bsf_16match_nolit_calcBlockSize:
18374 #ifdef GOAMD64_v3
18375 TZCNTQ R11, R11
18376
18377 #else
18378 BSFQ R11, R11
18379
18380 #endif
18381 SARQ $0x03, R11
18382 LEAL 8(R10)(R11*1), R10
18383 JMP match_nolit_end_calcBlockSize
18384
18385 matchlen_match8_match_nolit_calcBlockSize:
18386 CMPL DI, $0x08
18387 JB matchlen_match4_match_nolit_calcBlockSize
18388 MOVQ (R8)(R10*1), R9
18389 XORQ (SI)(R10*1), R9
18390 JNZ matchlen_bsf_8_match_nolit_calcBlockSize
18391 LEAL -8(DI), DI
18392 LEAL 8(R10), R10
18393 JMP matchlen_match4_match_nolit_calcBlockSize
18394
18395 matchlen_bsf_8_match_nolit_calcBlockSize:
18396 #ifdef GOAMD64_v3
18397 TZCNTQ R9, R9
18398
18399 #else
18400 BSFQ R9, R9
18401
18402 #endif
18403 SARQ $0x03, R9
18404 LEAL (R10)(R9*1), R10
18405 JMP match_nolit_end_calcBlockSize
18406
18407 matchlen_match4_match_nolit_calcBlockSize:
18408 CMPL DI, $0x04
18409 JB matchlen_match2_match_nolit_calcBlockSize
18410 MOVL (R8)(R10*1), R9
18411 CMPL (SI)(R10*1), R9
18412 JNE matchlen_match2_match_nolit_calcBlockSize
18413 LEAL -4(DI), DI
18414 LEAL 4(R10), R10
18415
18416 matchlen_match2_match_nolit_calcBlockSize:
18417 CMPL DI, $0x01
18418 JE matchlen_match1_match_nolit_calcBlockSize
18419 JB match_nolit_end_calcBlockSize
18420 MOVW (R8)(R10*1), R9
18421 CMPW (SI)(R10*1), R9
18422 JNE matchlen_match1_match_nolit_calcBlockSize
18423 LEAL 2(R10), R10
18424 SUBL $0x02, DI
18425 JZ match_nolit_end_calcBlockSize
18426
18427 matchlen_match1_match_nolit_calcBlockSize:
18428 MOVB (R8)(R10*1), R9
18429 CMPB (SI)(R10*1), R9
18430 JNE match_nolit_end_calcBlockSize
18431 LEAL 1(R10), R10
18432
18433 match_nolit_end_calcBlockSize:
18434 ADDL R10, DX
18435 MOVL 16(SP), SI
18436 ADDL $0x04, R10
18437 MOVL DX, 12(SP)
18438
18439 // emitCopy
18440 CMPL SI, $0x00010000
18441 JB two_byte_offset_match_nolit_calcBlockSize
18442
18443 four_bytes_loop_back_match_nolit_calcBlockSize:
18444 CMPL R10, $0x40
18445 JBE four_bytes_remain_match_nolit_calcBlockSize
18446 LEAL -64(R10), R10
18447 ADDQ $0x05, CX
18448 CMPL R10, $0x04
18449 JB four_bytes_remain_match_nolit_calcBlockSize
18450 JMP four_bytes_loop_back_match_nolit_calcBlockSize
18451
18452 four_bytes_remain_match_nolit_calcBlockSize:
18453 TESTL R10, R10
18454 JZ match_nolit_emitcopy_end_calcBlockSize
18455 XORL SI, SI
18456 ADDQ $0x05, CX
18457 JMP match_nolit_emitcopy_end_calcBlockSize
18458
18459 two_byte_offset_match_nolit_calcBlockSize:
18460 CMPL R10, $0x40
18461 JBE two_byte_offset_short_match_nolit_calcBlockSize
18462 LEAL -60(R10), R10
18463 ADDQ $0x03, CX
18464 JMP two_byte_offset_match_nolit_calcBlockSize
18465
18466 two_byte_offset_short_match_nolit_calcBlockSize:
18467 MOVL R10, DI
18468 SHLL $0x02, DI
18469 CMPL R10, $0x0c
18470 JAE emit_copy_three_match_nolit_calcBlockSize
18471 CMPL SI, $0x00000800
18472 JAE emit_copy_three_match_nolit_calcBlockSize
18473 ADDQ $0x02, CX
18474 JMP match_nolit_emitcopy_end_calcBlockSize
18475
18476 emit_copy_three_match_nolit_calcBlockSize:
18477 ADDQ $0x03, CX
18478
18479 match_nolit_emitcopy_end_calcBlockSize:
18480 CMPL DX, 8(SP)
18481 JAE emit_remainder_calcBlockSize
18482 MOVQ -2(BX)(DX*1), DI
18483 CMPQ CX, (SP)
18484 JB match_nolit_dst_ok_calcBlockSize
18485 MOVQ $0x00000000, ret+32(FP)
18486 RET
18487
18488 match_nolit_dst_ok_calcBlockSize:
18489 MOVQ $0x0000cf1bbcdcbf9b, R9
18490 MOVQ DI, R8
18491 SHRQ $0x10, DI
18492 MOVQ DI, SI
18493 SHLQ $0x10, R8
18494 IMULQ R9, R8
18495 SHRQ $0x33, R8
18496 SHLQ $0x10, SI
18497 IMULQ R9, SI
18498 SHRQ $0x33, SI
18499 LEAL -2(DX), R9
18500 LEAQ (AX)(SI*4), R10
18501 MOVL (R10), SI
18502 MOVL R9, (AX)(R8*4)
18503 MOVL DX, (R10)
18504 CMPL (BX)(SI*1), DI
18505 JEQ match_nolit_loop_calcBlockSize
18506 INCL DX
18507 JMP search_loop_calcBlockSize
18508
18509 emit_remainder_calcBlockSize:
18510 MOVQ src_len+8(FP), AX
18511 SUBL 12(SP), AX
18512 LEAQ 5(CX)(AX*1), AX
18513 CMPQ AX, (SP)
18514 JB emit_remainder_ok_calcBlockSize
18515 MOVQ $0x00000000, ret+32(FP)
18516 RET
18517
18518 emit_remainder_ok_calcBlockSize:
18519 MOVQ src_len+8(FP), AX
18520 MOVL 12(SP), DX
18521 CMPL DX, AX
18522 JEQ emit_literal_done_emit_remainder_calcBlockSize
18523 MOVL AX, SI
18524 MOVL AX, 12(SP)
18525 LEAQ (BX)(DX*1), AX
18526 SUBL DX, SI
18527 LEAL -1(SI), AX
18528 CMPL AX, $0x3c
18529 JB one_byte_emit_remainder_calcBlockSize
18530 CMPL AX, $0x00000100
18531 JB two_bytes_emit_remainder_calcBlockSize
18532 CMPL AX, $0x00010000
18533 JB three_bytes_emit_remainder_calcBlockSize
18534 CMPL AX, $0x01000000
18535 JB four_bytes_emit_remainder_calcBlockSize
18536 ADDQ $0x05, CX
18537 JMP memmove_long_emit_remainder_calcBlockSize
18538
18539 four_bytes_emit_remainder_calcBlockSize:
18540 ADDQ $0x04, CX
18541 JMP memmove_long_emit_remainder_calcBlockSize
18542
18543 three_bytes_emit_remainder_calcBlockSize:
18544 ADDQ $0x03, CX
18545 JMP memmove_long_emit_remainder_calcBlockSize
18546
18547 two_bytes_emit_remainder_calcBlockSize:
18548 ADDQ $0x02, CX
18549 CMPL AX, $0x40
18550 JB memmove_emit_remainder_calcBlockSize
18551 JMP memmove_long_emit_remainder_calcBlockSize
18552
18553 one_byte_emit_remainder_calcBlockSize:
18554 ADDQ $0x01, CX
18555
18556 memmove_emit_remainder_calcBlockSize:
18557 LEAQ (CX)(SI*1), AX
18558 MOVQ AX, CX
18559 JMP emit_literal_done_emit_remainder_calcBlockSize
18560
18561 memmove_long_emit_remainder_calcBlockSize:
18562 LEAQ (CX)(SI*1), AX
18563 MOVQ AX, CX
18564
18565 emit_literal_done_emit_remainder_calcBlockSize:
18566 MOVQ CX, ret+32(FP)
18567 RET
18568
18569 // func calcBlockSizeSmall(src []byte, tmp *[2048]byte) int
18570 // Requires: BMI, SSE2
//
// calcBlockSizeSmall scans src for matches and accumulates, in CX, the number
// of bytes that the literal headers, literal bytes and copy operations would
// occupy. Nothing is written to an output buffer; only the count is returned.
// The function returns 0 as soon as the running count would reach the limit
// stored at (SP), which is len(src) - 9 - len(src)>>5.
//
// tmp is cleared on entry and then used as a table of 512 32-bit source
// positions, indexed by a 9-bit hash of 4 input bytes.
//
// Register roles in the main loop:
//   AX = tmp (hash table), BX = src base, CX = running size, DX = position s.
// Stack slots:
//   0(SP)  size limit (8 bytes)
//   8(SP)  len(src) - 8, the last position at which the search may start
//   12(SP) start of the literal bytes that have not been counted yet
//   16(SP) offset of the most recent match (initially 1)
//   20(SP) position to continue from when nothing matches at s
//
// NOTE(review): the 8-byte loads at src[s] are only guarded by 8(SP);
// presumably callers guarantee a minimum len(src) - confirm against callers.
18571 TEXT ·calcBlockSizeSmall(SB), $24-40
18572 MOVQ tmp+24(FP), AX // AX = tmp
18573 XORQ CX, CX // CX = 0: running size
18574 MOVQ $0x00000010, DX // 16 iterations of 128 bytes clear all 2048 bytes of tmp
18575 MOVQ AX, BX
18576 PXOR X0, X0
18577
18578 zero_loop_calcBlockSizeSmall:
18579 MOVOU X0, (BX)
18580 MOVOU X0, 16(BX)
18581 MOVOU X0, 32(BX)
18582 MOVOU X0, 48(BX)
18583 MOVOU X0, 64(BX)
18584 MOVOU X0, 80(BX)
18585 MOVOU X0, 96(BX)
18586 MOVOU X0, 112(BX)
18587 ADDQ $0x80, BX
18588 DECQ DX
18589 JNZ zero_loop_calcBlockSizeSmall
18590 MOVL $0x00000000, 12(SP) // no literals counted yet: pending literals start at 0
18591 MOVQ src_len+8(FP), DX
18592 LEAQ -9(DX), BX
18593 LEAQ -8(DX), SI
18594 MOVL SI, 8(SP) // 8(SP) = len(src) - 8
18595 SHRQ $0x05, DX
18596 SUBL DX, BX // BX = len(src) - 9 - len(src)>>5
18597 LEAQ (CX)(BX*1), BX // CX is still 0 here, so this leaves BX unchanged
18598 MOVQ BX, (SP) // (SP) = size limit
18599 MOVL $0x00000001, DX // s = 1
18600 MOVL DX, 16(SP) // initial match offset = 1
18601 MOVQ src_base+0(FP), BX // BX = src base
18602
18603 search_loop_calcBlockSizeSmall:
18604 MOVL DX, SI
18605 SUBL 12(SP), SI
18606 SHRL $0x04, SI
18607 LEAL 4(DX)(SI*1), SI // SI = s + 4 + (s - pendingStart)>>4: skip grows with the unmatched run
18608 CMPL SI, 8(SP)
18609 JAE emit_remainder_calcBlockSizeSmall // next position would pass the search limit
18610 MOVQ (BX)(DX*1), DI // DI = 8 bytes at src[s]
18611 MOVL SI, 20(SP) // remember where to continue if nothing matches
18612 MOVQ $0x9e3779b1, R9 // hash multiplier
18613 MOVQ DI, R10
18614 MOVQ DI, R11
18615 SHRQ $0x08, R11 // R11 = bytes starting at s+1
18616 SHLQ $0x20, R10 // keep only the low 4 bytes
18617 IMULQ R9, R10
18618 SHRQ $0x37, R10 // top 9 bits: R10 = hash(src[s:s+4])
18619 SHLQ $0x20, R11
18620 IMULQ R9, R11
18621 SHRQ $0x37, R11 // R11 = hash(src[s+1:s+5])
18622 MOVL (AX)(R10*4), SI // SI = candidate for s
18623 MOVL (AX)(R11*4), R8 // R8 = candidate for s+1
18624 MOVL DX, (AX)(R10*4) // table[hash(s)] = s
18625 LEAL 1(DX), R10
18626 MOVL R10, (AX)(R11*4) // table[hash(s+1)] = s+1
18627 MOVQ DI, R10
18628 SHRQ $0x10, R10
18629 SHLQ $0x20, R10
18630 IMULQ R9, R10
18631 SHRQ $0x37, R10 // R10 = hash(src[s+2:s+6])
18632 MOVL DX, R9
18633 SUBL 16(SP), R9
18634 MOVL 1(BX)(R9*1), R11 // 4 bytes at s + 1 - offset
18635 MOVQ DI, R9
18636 SHRQ $0x08, R9
18637 CMPL R9, R11 // do the 4 bytes at s+1 repeat at the previous offset?
18638 JNE no_repeat_found_calcBlockSizeSmall
18639 LEAL 1(DX), DI // DI = s + 1: start of the repeat
18640 MOVL 12(SP), SI // SI = start of pending literals
18641 MOVL DI, R8
18642 SUBL 16(SP), R8 // R8 = DI - offset: position being repeated
18643 JZ repeat_extend_back_end_calcBlockSizeSmall // source position is 0: nothing before it
18644
18645 repeat_extend_back_loop_calcBlockSizeSmall:
18646 CMPL DI, SI
18647 JBE repeat_extend_back_end_calcBlockSizeSmall // never extend into bytes that were already counted
18648 MOVB -1(BX)(R8*1), R9
18649 MOVB -1(BX)(DI*1), R10
18650 CMPB R9, R10
18651 JNE repeat_extend_back_end_calcBlockSizeSmall
18652 LEAL -1(DI), DI
18653 DECL R8
18654 JNZ repeat_extend_back_loop_calcBlockSizeSmall
18655
18656 repeat_extend_back_end_calcBlockSizeSmall:
18657 MOVL DI, SI
18658 SUBL 12(SP), SI // SI = number of pending literal bytes
18659 LEAQ 3(CX)(SI*1), SI // size after the literals plus 3 more bytes
18660 CMPQ SI, (SP)
18661 JB repeat_dst_size_check_calcBlockSizeSmall
18662 MOVQ $0x00000000, ret+32(FP) // limit reached: return 0
18663 RET
18664
18665 repeat_dst_size_check_calcBlockSizeSmall:
18666 MOVL 12(SP), SI
18667 CMPL SI, DI
18668 JEQ emit_literal_done_repeat_emit_calcBlockSizeSmall // no pending literals
18669 MOVL DI, R8
18670 MOVL DI, 12(SP) // pending literals now start at the repeat
18671 LEAQ (BX)(SI*1), R9 // address of the literals; R9 is overwritten below before it is read
18672 SUBL SI, R8 // R8 = literal length
18673 LEAL -1(R8), SI // SI = literal length - 1
18674 CMPL SI, $0x3c
18675 JB one_byte_repeat_emit_calcBlockSizeSmall // length-1 < 60: count 1 header byte
18676 CMPL SI, $0x00000100
18677 JB two_bytes_repeat_emit_calcBlockSizeSmall // length-1 < 256: count 2 header bytes
18678 JB three_bytes_repeat_emit_calcBlockSizeSmall // same flags as the JB above, so not taken here; falls through to the same label
18679
18680 three_bytes_repeat_emit_calcBlockSizeSmall:
18681 ADDQ $0x03, CX
18682 JMP memmove_long_repeat_emit_calcBlockSizeSmall
18683
18684 two_bytes_repeat_emit_calcBlockSizeSmall:
18685 ADDQ $0x02, CX
18686 CMPL SI, $0x40
18687 JB memmove_repeat_emit_calcBlockSizeSmall
18688 JMP memmove_long_repeat_emit_calcBlockSizeSmall
18689
18690 one_byte_repeat_emit_calcBlockSizeSmall:
18691 ADDQ $0x01, CX
18692
18693 memmove_repeat_emit_calcBlockSizeSmall:
18694 LEAQ (CX)(R8*1), CX // count the literal bytes themselves
18695 JMP emit_literal_done_repeat_emit_calcBlockSizeSmall
18696
18697 memmove_long_repeat_emit_calcBlockSizeSmall:
18698 LEAQ (CX)(R8*1), CX // count the literal bytes themselves
18699
18700 emit_literal_done_repeat_emit_calcBlockSizeSmall:
18701 ADDL $0x05, DX // s += 5: past the 4 compared bytes that start at s+1
18702 MOVL DX, SI
18703 SUBL 16(SP), SI
18704 MOVQ src_len+8(FP), R8
18705 SUBL DX, R8 // R8 = bytes remaining after s
18706 LEAQ (BX)(DX*1), R9 // R9 = &src[s]
18707 LEAQ (BX)(SI*1), SI // SI = &src[s - offset]
18708
18709 // matchLen: R11 = number of equal bytes at R9 and SI, at most R8
18710 XORL R11, R11
18711
18712 matchlen_loopback_16_repeat_extend_calcBlockSizeSmall:
18713 CMPL R8, $0x10
18714 JB matchlen_match8_repeat_extend_calcBlockSizeSmall
18715 MOVQ (R9)(R11*1), R10
18716 MOVQ 8(R9)(R11*1), R12
18717 XORQ (SI)(R11*1), R10 // non-zero when the first 8 bytes differ
18718 JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall
18719 XORQ 8(SI)(R11*1), R12 // non-zero when the second 8 bytes differ
18720 JNZ matchlen_bsf_16repeat_extend_calcBlockSizeSmall
18721 LEAL -16(R8), R8
18722 LEAL 16(R11), R11
18723 JMP matchlen_loopback_16_repeat_extend_calcBlockSizeSmall
18724
18725 matchlen_bsf_16repeat_extend_calcBlockSizeSmall:
18726 #ifdef GOAMD64_v3
18727 TZCNTQ R12, R12
18728
18729 #else
18730 BSFQ R12, R12
18731
18732 #endif
18733 SARQ $0x03, R12 // bit index of the first difference -> byte index
18734 LEAL 8(R11)(R12*1), R11 // plus the 8 bytes that matched in the first word
18735 JMP repeat_extend_forward_end_calcBlockSizeSmall
18736
18737 matchlen_match8_repeat_extend_calcBlockSizeSmall:
18738 CMPL R8, $0x08
18739 JB matchlen_match4_repeat_extend_calcBlockSizeSmall
18740 MOVQ (R9)(R11*1), R10
18741 XORQ (SI)(R11*1), R10
18742 JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall
18743 LEAL -8(R8), R8
18744 LEAL 8(R11), R11
18745 JMP matchlen_match4_repeat_extend_calcBlockSizeSmall
18746
18747 matchlen_bsf_8_repeat_extend_calcBlockSizeSmall:
18748 #ifdef GOAMD64_v3
18749 TZCNTQ R10, R10
18750
18751 #else
18752 BSFQ R10, R10
18753
18754 #endif
18755 SARQ $0x03, R10 // bit index of the first difference -> byte index
18756 LEAL (R11)(R10*1), R11
18757 JMP repeat_extend_forward_end_calcBlockSizeSmall
18758
18759 matchlen_match4_repeat_extend_calcBlockSizeSmall:
18760 CMPL R8, $0x04
18761 JB matchlen_match2_repeat_extend_calcBlockSizeSmall
18762 MOVL (R9)(R11*1), R10
18763 CMPL (SI)(R11*1), R10
18764 JNE matchlen_match2_repeat_extend_calcBlockSizeSmall
18765 LEAL -4(R8), R8
18766 LEAL 4(R11), R11
18767
18768 matchlen_match2_repeat_extend_calcBlockSizeSmall:
18769 CMPL R8, $0x01
18770 JE matchlen_match1_repeat_extend_calcBlockSizeSmall // exactly one byte left
18771 JB repeat_extend_forward_end_calcBlockSizeSmall // nothing left
18772 MOVW (R9)(R11*1), R10
18773 CMPW (SI)(R11*1), R10
18774 JNE matchlen_match1_repeat_extend_calcBlockSizeSmall
18775 LEAL 2(R11), R11
18776 SUBL $0x02, R8
18777 JZ repeat_extend_forward_end_calcBlockSizeSmall
18778
18779 matchlen_match1_repeat_extend_calcBlockSizeSmall:
18780 MOVB (R9)(R11*1), R10
18781 CMPB (SI)(R11*1), R10
18782 JNE repeat_extend_forward_end_calcBlockSizeSmall
18783 LEAL 1(R11), R11
18784
18785 repeat_extend_forward_end_calcBlockSizeSmall:
18786 ADDL R11, DX // s += matched length
18787 MOVL DX, SI
18788 SUBL DI, SI // SI = s - repeat start: total length to copy
18789 MOVL 16(SP), DI // offset; DI is overwritten below without being read in this variant
18790
18791 // emitCopy: count 3 bytes per 60-byte chunk while length > 64, then 2 or 3 bytes for the rest
18792 two_byte_offset_repeat_as_copy_calcBlockSizeSmall:
18793 CMPL SI, $0x40
18794 JBE two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall
18795 LEAL -60(SI), SI
18796 ADDQ $0x03, CX
18797 JMP two_byte_offset_repeat_as_copy_calcBlockSizeSmall
18798
18799 two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall:
18800 MOVL SI, DI
18801 SHLL $0x02, DI // result is not read afterwards
18802 CMPL SI, $0x0c
18803 JAE emit_copy_three_repeat_as_copy_calcBlockSizeSmall // length >= 12: count 3 bytes
18804 ADDQ $0x02, CX // length < 12: count 2 bytes
18805 JMP repeat_end_emit_calcBlockSizeSmall
18806
18807 emit_copy_three_repeat_as_copy_calcBlockSizeSmall:
18808 ADDQ $0x03, CX
18809
18810 repeat_end_emit_calcBlockSizeSmall:
18811 MOVL DX, 12(SP) // everything up to s has been counted
18812 JMP search_loop_calcBlockSizeSmall
18813
18814 no_repeat_found_calcBlockSizeSmall:
18815 CMPL (BX)(SI*1), DI // candidate for s: compare 4 bytes
18816 JEQ candidate_match_calcBlockSizeSmall
18817 SHRQ $0x08, DI // DI = bytes starting at s+1
18818 MOVL (AX)(R10*4), SI // SI = candidate for s+2 (read before the slot is updated)
18819 LEAL 2(DX), R9 // R9 = s + 2
18820 CMPL (BX)(R8*1), DI // candidate for s+1: compare 4 bytes
18821 JEQ candidate2_match_calcBlockSizeSmall
18822 MOVL R9, (AX)(R10*4) // table[hash(s+2)] = s+2
18823 SHRQ $0x08, DI // DI = bytes starting at s+2
18824 CMPL (BX)(SI*1), DI // candidate for s+2: compare 4 bytes
18825 JEQ candidate3_match_calcBlockSizeSmall
18826 MOVL 20(SP), DX // nothing matched: continue at the saved next position
18827 JMP search_loop_calcBlockSizeSmall
18828
18829 candidate3_match_calcBlockSizeSmall:
18830 ADDL $0x02, DX // match starts at s+2
18831 JMP candidate_match_calcBlockSizeSmall
18832
18833 candidate2_match_calcBlockSizeSmall:
18834 MOVL R9, (AX)(R10*4) // table[hash(s+2)] = s+2
18835 INCL DX // match starts at s+1
18836 MOVL R8, SI // SI = matching candidate
18837
18838 candidate_match_calcBlockSizeSmall:
18839 MOVL 12(SP), DI // DI = start of pending literals
18840 TESTL SI, SI
18841 JZ match_extend_back_end_calcBlockSizeSmall // candidate at position 0: nothing before it
18842
18843 match_extend_back_loop_calcBlockSizeSmall:
18844 CMPL DX, DI
18845 JBE match_extend_back_end_calcBlockSizeSmall // never extend into bytes that were already counted
18846 MOVB -1(BX)(SI*1), R8
18847 MOVB -1(BX)(DX*1), R9
18848 CMPB R8, R9
18849 JNE match_extend_back_end_calcBlockSizeSmall
18850 LEAL -1(DX), DX
18851 DECL SI
18852 JZ match_extend_back_end_calcBlockSizeSmall
18853 JMP match_extend_back_loop_calcBlockSizeSmall
18854
18855 match_extend_back_end_calcBlockSizeSmall:
18856 MOVL DX, DI
18857 SUBL 12(SP), DI // DI = number of pending literal bytes
18858 LEAQ 3(CX)(DI*1), DI // size after the literals plus 3 more bytes
18859 CMPQ DI, (SP)
18860 JB match_dst_size_check_calcBlockSizeSmall
18861 MOVQ $0x00000000, ret+32(FP) // limit reached: return 0
18862 RET
18863
18864 match_dst_size_check_calcBlockSizeSmall:
18865 MOVL DX, DI
18866 MOVL 12(SP), R8
18867 CMPL R8, DI
18868 JEQ emit_literal_done_match_emit_calcBlockSizeSmall // no pending literals
18869 MOVL DI, R9
18870 MOVL DI, 12(SP) // pending literals now start at the match
18871 LEAQ (BX)(R8*1), DI // address of the literals; DI is overwritten on the next-but-one line
18872 SUBL R8, R9 // R9 = literal length
18873 LEAL -1(R9), DI // DI = literal length - 1
18874 CMPL DI, $0x3c
18875 JB one_byte_match_emit_calcBlockSizeSmall // length-1 < 60: count 1 header byte
18876 CMPL DI, $0x00000100
18877 JB two_bytes_match_emit_calcBlockSizeSmall // length-1 < 256: count 2 header bytes
18878 JB three_bytes_match_emit_calcBlockSizeSmall // same flags as the JB above, so not taken here; falls through to the same label
18879
18880 three_bytes_match_emit_calcBlockSizeSmall:
18881 ADDQ $0x03, CX
18882 JMP memmove_long_match_emit_calcBlockSizeSmall
18883
18884 two_bytes_match_emit_calcBlockSizeSmall:
18885 ADDQ $0x02, CX
18886 CMPL DI, $0x40
18887 JB memmove_match_emit_calcBlockSizeSmall
18888 JMP memmove_long_match_emit_calcBlockSizeSmall
18889
18890 one_byte_match_emit_calcBlockSizeSmall:
18891 ADDQ $0x01, CX
18892
18893 memmove_match_emit_calcBlockSizeSmall:
18894 LEAQ (CX)(R9*1), CX // count the literal bytes themselves
18895 JMP emit_literal_done_match_emit_calcBlockSizeSmall
18896
18897 memmove_long_match_emit_calcBlockSizeSmall:
18898 LEAQ (CX)(R9*1), CX // count the literal bytes themselves
18899
18900 emit_literal_done_match_emit_calcBlockSizeSmall:
18901 match_nolit_loop_calcBlockSizeSmall:
18902 MOVL DX, DI
18903 SUBL SI, DI
18904 MOVL DI, 16(SP) // offset = s - candidate
18905 ADDL $0x04, DX // skip the 4 bytes already known to be equal
18906 ADDL $0x04, SI
18907 MOVQ src_len+8(FP), DI
18908 SUBL DX, DI // DI = bytes remaining after s
18909 LEAQ (BX)(DX*1), R8 // R8 = &src[s]
18910 LEAQ (BX)(SI*1), SI // SI = &src[candidate]
18911
18912 // matchLen: R10 = number of equal bytes at R8 and SI, at most DI
18913 XORL R10, R10
18914
18915 matchlen_loopback_16_match_nolit_calcBlockSizeSmall:
18916 CMPL DI, $0x10
18917 JB matchlen_match8_match_nolit_calcBlockSizeSmall
18918 MOVQ (R8)(R10*1), R9
18919 MOVQ 8(R8)(R10*1), R11
18920 XORQ (SI)(R10*1), R9 // non-zero when the first 8 bytes differ
18921 JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall
18922 XORQ 8(SI)(R10*1), R11 // non-zero when the second 8 bytes differ
18923 JNZ matchlen_bsf_16match_nolit_calcBlockSizeSmall
18924 LEAL -16(DI), DI
18925 LEAL 16(R10), R10
18926 JMP matchlen_loopback_16_match_nolit_calcBlockSizeSmall
18927
18928 matchlen_bsf_16match_nolit_calcBlockSizeSmall:
18929 #ifdef GOAMD64_v3
18930 TZCNTQ R11, R11
18931
18932 #else
18933 BSFQ R11, R11
18934
18935 #endif
18936 SARQ $0x03, R11 // bit index of the first difference -> byte index
18937 LEAL 8(R10)(R11*1), R10 // plus the 8 bytes that matched in the first word
18938 JMP match_nolit_end_calcBlockSizeSmall
18939
18940 matchlen_match8_match_nolit_calcBlockSizeSmall:
18941 CMPL DI, $0x08
18942 JB matchlen_match4_match_nolit_calcBlockSizeSmall
18943 MOVQ (R8)(R10*1), R9
18944 XORQ (SI)(R10*1), R9
18945 JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall
18946 LEAL -8(DI), DI
18947 LEAL 8(R10), R10
18948 JMP matchlen_match4_match_nolit_calcBlockSizeSmall
18949
18950 matchlen_bsf_8_match_nolit_calcBlockSizeSmall:
18951 #ifdef GOAMD64_v3
18952 TZCNTQ R9, R9
18953
18954 #else
18955 BSFQ R9, R9
18956
18957 #endif
18958 SARQ $0x03, R9 // bit index of the first difference -> byte index
18959 LEAL (R10)(R9*1), R10
18960 JMP match_nolit_end_calcBlockSizeSmall
18961
18962 matchlen_match4_match_nolit_calcBlockSizeSmall:
18963 CMPL DI, $0x04
18964 JB matchlen_match2_match_nolit_calcBlockSizeSmall
18965 MOVL (R8)(R10*1), R9
18966 CMPL (SI)(R10*1), R9
18967 JNE matchlen_match2_match_nolit_calcBlockSizeSmall
18968 LEAL -4(DI), DI
18969 LEAL 4(R10), R10
18970
18971 matchlen_match2_match_nolit_calcBlockSizeSmall:
18972 CMPL DI, $0x01
18973 JE matchlen_match1_match_nolit_calcBlockSizeSmall // exactly one byte left
18974 JB match_nolit_end_calcBlockSizeSmall // nothing left
18975 MOVW (R8)(R10*1), R9
18976 CMPW (SI)(R10*1), R9
18977 JNE matchlen_match1_match_nolit_calcBlockSizeSmall
18978 LEAL 2(R10), R10
18979 SUBL $0x02, DI
18980 JZ match_nolit_end_calcBlockSizeSmall
18981
18982 matchlen_match1_match_nolit_calcBlockSizeSmall:
18983 MOVB (R8)(R10*1), R9
18984 CMPB (SI)(R10*1), R9
18985 JNE match_nolit_end_calcBlockSizeSmall
18986 LEAL 1(R10), R10
18987
18988 match_nolit_end_calcBlockSizeSmall:
18989 ADDL R10, DX // s += matched length
18990 MOVL 16(SP), SI // offset; SI is overwritten below without being read in this variant
18991 ADDL $0x04, R10 // copy length includes the 4 bytes skipped above
18992 MOVL DX, 12(SP) // everything up to s has been counted
18993
18994 // emitCopy: count 3 bytes per 60-byte chunk while length > 64, then 2 or 3 bytes for the rest
18995 two_byte_offset_match_nolit_calcBlockSizeSmall:
18996 CMPL R10, $0x40
18997 JBE two_byte_offset_short_match_nolit_calcBlockSizeSmall
18998 LEAL -60(R10), R10
18999 ADDQ $0x03, CX
19000 JMP two_byte_offset_match_nolit_calcBlockSizeSmall
19001
19002 two_byte_offset_short_match_nolit_calcBlockSizeSmall:
19003 MOVL R10, SI
19004 SHLL $0x02, SI // result is not read afterwards
19005 CMPL R10, $0x0c
19006 JAE emit_copy_three_match_nolit_calcBlockSizeSmall // length >= 12: count 3 bytes
19007 ADDQ $0x02, CX // length < 12: count 2 bytes
19008 JMP match_nolit_emitcopy_end_calcBlockSizeSmall
19009
19010 emit_copy_three_match_nolit_calcBlockSizeSmall:
19011 ADDQ $0x03, CX
19012
19013 match_nolit_emitcopy_end_calcBlockSizeSmall:
19014 CMPL DX, 8(SP)
19015 JAE emit_remainder_calcBlockSizeSmall // s reached the search limit
19016 MOVQ -2(BX)(DX*1), DI // DI = 8 bytes at src[s-2]
19017 CMPQ CX, (SP)
19018 JB match_nolit_dst_ok_calcBlockSizeSmall
19019 MOVQ $0x00000000, ret+32(FP) // limit reached: return 0
19020 RET
19021
19022 match_nolit_dst_ok_calcBlockSizeSmall:
19023 MOVQ $0x9e3779b1, R9 // hash multiplier
19024 MOVQ DI, R8
19025 SHRQ $0x10, DI // DI = bytes starting at s
19026 MOVQ DI, SI
19027 SHLQ $0x20, R8
19028 IMULQ R9, R8
19029 SHRQ $0x37, R8 // R8 = hash(src[s-2:s+2])
19030 SHLQ $0x20, SI
19031 IMULQ R9, SI
19032 SHRQ $0x37, SI // SI = hash(src[s:s+4])
19033 LEAL -2(DX), R9
19034 LEAQ (AX)(SI*4), R10
19035 MOVL (R10), SI // SI = candidate for s
19036 MOVL R9, (AX)(R8*4) // table[hash(s-2)] = s-2
19037 MOVL DX, (R10) // table[hash(s)] = s
19038 CMPL (BX)(SI*1), DI
19039 JEQ match_nolit_loop_calcBlockSizeSmall // immediate match at s: no literals in between
19040 INCL DX
19041 JMP search_loop_calcBlockSizeSmall
19042
19043 emit_remainder_calcBlockSizeSmall:
19044 MOVQ src_len+8(FP), AX
19045 SUBL 12(SP), AX // AX = trailing literal length
19046 LEAQ 3(CX)(AX*1), AX // size after the trailing literals plus 3 more bytes
19047 CMPQ AX, (SP)
19048 JB emit_remainder_ok_calcBlockSizeSmall
19049 MOVQ $0x00000000, ret+32(FP) // limit reached: return 0
19050 RET
19051
19052 emit_remainder_ok_calcBlockSizeSmall:
19053 MOVQ src_len+8(FP), AX
19054 MOVL 12(SP), DX
19055 CMPL DX, AX
19056 JEQ emit_literal_done_emit_remainder_calcBlockSizeSmall // nothing left to count
19057 MOVL AX, SI
19058 MOVL AX, 12(SP)
19059 LEAQ (BX)(DX*1), AX // address of the literals; AX is overwritten on the next-but-one line
19060 SUBL DX, SI // SI = literal length
19061 LEAL -1(SI), AX // AX = literal length - 1
19062 CMPL AX, $0x3c
19063 JB one_byte_emit_remainder_calcBlockSizeSmall // length-1 < 60: count 1 header byte
19064 CMPL AX, $0x00000100
19065 JB two_bytes_emit_remainder_calcBlockSizeSmall // length-1 < 256: count 2 header bytes
19066 JB three_bytes_emit_remainder_calcBlockSizeSmall // same flags as the JB above, so not taken here; falls through to the same label
19067
19068 three_bytes_emit_remainder_calcBlockSizeSmall:
19069 ADDQ $0x03, CX
19070 JMP memmove_long_emit_remainder_calcBlockSizeSmall
19071
19072 two_bytes_emit_remainder_calcBlockSizeSmall:
19073 ADDQ $0x02, CX
19074 CMPL AX, $0x40
19075 JB memmove_emit_remainder_calcBlockSizeSmall
19076 JMP memmove_long_emit_remainder_calcBlockSizeSmall
19077
19078 one_byte_emit_remainder_calcBlockSizeSmall:
19079 ADDQ $0x01, CX
19080
19081 memmove_emit_remainder_calcBlockSizeSmall:
19082 LEAQ (CX)(SI*1), AX // count the literal bytes themselves
19083 MOVQ AX, CX
19084 JMP emit_literal_done_emit_remainder_calcBlockSizeSmall
19085
19086 memmove_long_emit_remainder_calcBlockSizeSmall:
19087 LEAQ (CX)(SI*1), AX // count the literal bytes themselves
19088 MOVQ AX, CX
19089
19090 emit_literal_done_emit_remainder_calcBlockSizeSmall:
19091 MOVQ CX, ret+32(FP) // return the accumulated size
19092 RET
19093
19094 // func emitLiteral(dst []byte, lit []byte) int
19095 // Requires: SSE2
//
// emitLiteral writes a length header followed by the bytes of lit to dst and
// returns the number of bytes written (header size + len(lit)). It returns 0
// and writes nothing when lit is empty.
//
// The header encodes len(lit)-1 and is 1 to 5 bytes long:
//   len-1 < 60       1 byte:  (len-1)<<2
//   len-1 < 1<<8     2 bytes: 0xf0, then 1 length byte
//   len-1 < 1<<16    3 bytes: 0xf4, then 2 length bytes
//   len-1 < 1<<24    4 bytes: 0xf8, then 3 length bytes
//   otherwise        5 bytes: 0xfc, then 4 length bytes
//
// Neither the length nor the capacity of dst is read here, so the caller must
// provide enough room for the header and the literal bytes.
//
// Register roles: AX = write pointer into dst, CX = lit base, DX = len(lit),
// BX = return value.
19096 TEXT ·emitLiteral(SB), NOSPLIT, $0-56
19097 MOVQ lit_len+32(FP), DX // DX = len(lit)
19098 MOVQ dst_base+0(FP), AX // AX = dst
19099 MOVQ lit_base+24(FP), CX // CX = lit
19100 TESTQ DX, DX
19101 JZ emit_literal_end_standalone_skip // empty literal: return 0
19102 MOVL DX, BX // BX = len(lit); the header size is added below
19103 LEAL -1(DX), SI // SI = len(lit) - 1, the value stored in the header
19104 CMPL SI, $0x3c
19105 JB one_byte_standalone
19106 CMPL SI, $0x00000100
19107 JB two_bytes_standalone
19108 CMPL SI, $0x00010000
19109 JB three_bytes_standalone
19110 CMPL SI, $0x01000000
19111 JB four_bytes_standalone
19112 MOVB $0xfc, (AX) // 5-byte header: tag 0xfc + 32-bit length-1
19113 MOVL SI, 1(AX)
19114 ADDQ $0x05, BX
19115 ADDQ $0x05, AX
19116 JMP memmove_long_standalone
19117
19118 four_bytes_standalone:
19119 MOVL SI, DI
19120 SHRL $0x10, DI // DI = bits 16..23 of length-1
19121 MOVB $0xf8, (AX) // 4-byte header: tag 0xf8 + 24-bit length-1
19122 MOVW SI, 1(AX)
19123 MOVB DI, 3(AX)
19124 ADDQ $0x04, BX
19125 ADDQ $0x04, AX
19126 JMP memmove_long_standalone
19127
19128 three_bytes_standalone:
19129 MOVB $0xf4, (AX) // 3-byte header: tag 0xf4 + 16-bit length-1
19130 MOVW SI, 1(AX)
19131 ADDQ $0x03, BX
19132 ADDQ $0x03, AX
19133 JMP memmove_long_standalone
19134
19135 two_bytes_standalone:
19136 MOVB $0xf0, (AX) // 2-byte header: tag 0xf0 + 8-bit length-1
19137 MOVB SI, 1(AX)
19138 ADDQ $0x02, BX
19139 ADDQ $0x02, AX
19140 CMPL SI, $0x40
19141 JB memmove_standalone // len(lit) <= 64: use the short copy
19142 JMP memmove_long_standalone
19143
19144 one_byte_standalone:
19145 SHLB $0x02, SI // 1-byte header: (length-1)<<2
19146 MOVB SI, (AX)
19147 ADDQ $0x01, BX
19148 ADDQ $0x01, AX
19149
19150 memmove_standalone:
19151 // genMemMoveShort: copy DX (1..64) bytes from CX to AX; each case loads from both ends before storing, so the two halves may overlap
19152 CMPQ DX, $0x03
19153 JB emit_lit_memmove_standalone_memmove_move_1or2
19154 JE emit_lit_memmove_standalone_memmove_move_3
19155 CMPQ DX, $0x08
19156 JB emit_lit_memmove_standalone_memmove_move_4through7
19157 CMPQ DX, $0x10
19158 JBE emit_lit_memmove_standalone_memmove_move_8through16
19159 CMPQ DX, $0x20
19160 JBE emit_lit_memmove_standalone_memmove_move_17through32
19161 JMP emit_lit_memmove_standalone_memmove_move_33through64
19162
19163 emit_lit_memmove_standalone_memmove_move_1or2:
19164 MOVB (CX), SI // first byte
19165 MOVB -1(CX)(DX*1), CL // last byte (the same byte when DX == 1)
19166 MOVB SI, (AX)
19167 MOVB CL, -1(AX)(DX*1)
19168 JMP emit_literal_end_standalone
19169
19170 emit_lit_memmove_standalone_memmove_move_3:
19171 MOVW (CX), SI
19172 MOVB 2(CX), CL
19173 MOVW SI, (AX)
19174 MOVB CL, 2(AX)
19175 JMP emit_literal_end_standalone
19176
19177 emit_lit_memmove_standalone_memmove_move_4through7:
19178 MOVL (CX), SI // first 4 bytes
19179 MOVL -4(CX)(DX*1), CX // last 4 bytes; CX is no longer needed as a pointer
19180 MOVL SI, (AX)
19181 MOVL CX, -4(AX)(DX*1)
19182 JMP emit_literal_end_standalone
19183
19184 emit_lit_memmove_standalone_memmove_move_8through16:
19185 MOVQ (CX), SI // first 8 bytes
19186 MOVQ -8(CX)(DX*1), CX // last 8 bytes; CX is no longer needed as a pointer
19187 MOVQ SI, (AX)
19188 MOVQ CX, -8(AX)(DX*1)
19189 JMP emit_literal_end_standalone
19190
19191 emit_lit_memmove_standalone_memmove_move_17through32:
19192 MOVOU (CX), X0 // first 16 bytes
19193 MOVOU -16(CX)(DX*1), X1 // last 16 bytes
19194 MOVOU X0, (AX)
19195 MOVOU X1, -16(AX)(DX*1)
19196 JMP emit_literal_end_standalone
19197
19198 emit_lit_memmove_standalone_memmove_move_33through64:
19199 MOVOU (CX), X0 // first 32 bytes
19200 MOVOU 16(CX), X1
19201 MOVOU -32(CX)(DX*1), X2 // last 32 bytes
19202 MOVOU -16(CX)(DX*1), X3
19203 MOVOU X0, (AX)
19204 MOVOU X1, 16(AX)
19205 MOVOU X2, -32(AX)(DX*1)
19206 MOVOU X3, -16(AX)(DX*1)
19207 JMP emit_literal_end_standalone
19208 JMP emit_literal_end_standalone // unreachable: directly follows an unconditional JMP
19209
19210 memmove_long_standalone:
19211 // genMemMoveLong: copy DX bytes from CX to AX; the first and last 32 bytes are kept in X0..X3 and stored last
19212 MOVOU (CX), X0 // first 32 bytes
19213 MOVOU 16(CX), X1
19214 MOVOU -32(CX)(DX*1), X2 // last 32 bytes
19215 MOVOU -16(CX)(DX*1), X3
19216 MOVQ DX, DI
19217 SHRQ $0x05, DI // DI = len / 32
19218 MOVQ AX, SI
19219 ANDL $0x0000001f, SI // SI = dst & 31
19220 MOVQ $0x00000040, R8
19221 SUBQ SI, R8 // R8 = 64 - (dst & 31), so AX + R8 - 32 is 32-byte aligned
19222 DECQ DI
19223 JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 // NOTE(review): DECQ does not write CF, so this tests CF from the SUBQ above plus ZF from DECQ - confirm intent
19224 LEAQ -32(CX)(R8*1), SI
19225 LEAQ -32(AX)(R8*1), R9
19226
19227 emit_lit_memmove_long_standalonelarge_big_loop_back:
19228 MOVOU (SI), X4
19229 MOVOU 16(SI), X5
19230 MOVOA X4, (R9) // aligned stores to the destination
19231 MOVOA X5, 16(R9)
19232 ADDQ $0x20, R9
19233 ADDQ $0x20, SI
19234 ADDQ $0x20, R8
19235 DECQ DI
19236 JNA emit_lit_memmove_long_standalonelarge_big_loop_back // NOTE(review): loops while CF (from the ADDQ above) or ZF (from DECQ) is set - confirm intent
19237
19238 emit_lit_memmove_long_standalonelarge_forward_sse_loop_32:
19239 MOVOU -32(CX)(R8*1), X4 // unaligned load of 32 source bytes at offset R8-32
19240 MOVOU -16(CX)(R8*1), X5
19241 MOVOA X4, -32(AX)(R8*1) // aligned stores at the same offset in dst
19242 MOVOA X5, -16(AX)(R8*1)
19243 ADDQ $0x20, R8
19244 CMPQ DX, R8
19245 JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32 // continue while R8 <= len
19246 MOVOU X0, (AX) // write the saved first 32 bytes
19247 MOVOU X1, 16(AX)
19248 MOVOU X2, -32(AX)(DX*1) // write the saved last 32 bytes
19249 MOVOU X3, -16(AX)(DX*1)
19250 JMP emit_literal_end_standalone
19251 JMP emit_literal_end_standalone // unreachable: directly follows an unconditional JMP
19252
19253 emit_literal_end_standalone_skip:
19254 XORQ BX, BX // empty literal: return 0
19255
19256 emit_literal_end_standalone:
19257 MOVQ BX, ret+48(FP) // return header size + len(lit), or 0
19258 RET
19259
// func emitRepeat(dst []byte, offset int, length int) int
//
// emitRepeat writes a repeat op covering length bytes to the start of dst and
// returns the number of bytes written. No offset bytes are stored except in
// the short form that packs offset bits 8-10 into the tag; otherwise offset is
// only used to pick the form. dst_len and dst_cap are never read, so the
// caller presumably guarantees room for the output.
//
// Registers: AX = write cursor, BX = bytes written (result), CX = offset,
// DX = length (carries a -4 bias inside the loop), SI = unbiased length.
TEXT ·emitRepeat(SB), NOSPLIT, $0-48
	MOVQ length+32(FP), DX
	MOVQ offset+24(FP), CX
	MOVQ dst_base+0(FP), AX
	XORL BX, BX

rep_next:
	// SI keeps the unbiased length for the form selection; DX = length-4.
	MOVL DX, SI
	SUBL $0x04, DX
	CMPL SI, $0x08
	JBE  rep_len_in_tag
	CMPL SI, $0x0c
	JAE  rep_extended
	CMPL CX, $0x00000800
	JB   rep_len_and_offset_in_tag

rep_extended:
	// Pick the narrowest length extension that can hold the biased length.
	CMPL DX, $0x00000104
	JB   rep_ext8
	CMPL DX, $0x00010100
	JB   rep_ext16
	CMPL DX, $0x0100ffff
	JB   rep_ext24

	// Too long for a single op: store the fixed bytes 1d 00 fb ff ff, take
	// 0x0100fffb off the biased length and encode what is left.
	MOVB $0xff, 4(AX)
	MOVL $0xfffb001d, (AX)
	SUBL $0x0100fffb, DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  rep_next

rep_ext24:
	// 5 bytes: tag word 0x001d, then the low 24 bits of (biased length - 65536).
	SUBL $0x00010000, DX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SHRL $0x10, DX
	MOVB DL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP  rep_done

rep_ext16:
	// 4 bytes: tag word 0x0019, then 16 bits of (biased length - 256).
	SUBL $0x00000100, DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP  rep_done

rep_ext8:
	// 3 bytes: tag word 0x0015, then 8 bits of (biased length - 4).
	SUBL $0x04, DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  rep_done

rep_len_in_tag:
	// 2 bytes: the 16-bit word (biased length << 2) + 1.
	SHLL $0x02, DX
	INCL DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  rep_done

rep_len_and_offset_in_tag:
	// 2 bytes: tag = (biased length << 2) | 1 | (offset>>8)<<5, followed by
	// the low offset byte. Only reached with offset < 0x800.
	SHLL $0x02, DX
	ORL  $0x01, DX
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX

rep_done:
	MOVQ BX, ret+40(FP)
	RET
19341
19342 // func emitCopy(dst []byte, offset int, length int) int
//
// emitCopy encodes a copy of `length` bytes at distance `offset` into dst and
// returns the number of bytes written. When the length does not fit one copy
// op, the remainder is encoded by an inlined copy of the repeat encoder.
//
// Register roles (set up by the first four instructions):
//   AX = write cursor into dst      BX = bytes written so far (the result)
//   CX = offset                     DX = length still to encode
//   SI, DI = scratch
//
// dst_len and dst_cap are never read, so nothing here bounds-checks the
// stores; presumably callers guarantee enough room - TODO confirm.
// offset and length are only ever compared with 32-bit instructions.
19343 TEXT ·emitCopy(SB), NOSPLIT, $0-48
19344 XORQ BX, BX
19345 MOVQ dst_base+0(FP), AX
19346 MOVQ offset+24(FP), CX
19347 MOVQ length+32(FP), DX
19348
19349 // emitCopy
// Offsets below 0x10000 go to the encodings that store at most 16 offset bits.
19350 CMPL CX, $0x00010000
19351 JB two_byte_offset_standalone
// Large offset, length <= 64: a single 5-byte op covers it.
19352 CMPL DX, $0x40
19353 JBE four_bytes_remain_standalone
// Large offset, length > 64: store tag 0xff and the 32-bit offset (0xff is the
// 4*len-1 tag of four_bytes_remain_standalone for len = 64), take 64 off the
// length and let the repeat encoder below handle the rest.
19354 MOVB $0xff, (AX)
19355 MOVL CX, 1(AX)
19356 LEAL -64(DX), DX
19357 ADDQ $0x05, BX
19358 ADDQ $0x05, AX
// Fewer than 4 bytes left: finish with another 5-byte op instead
// (four_bytes_remain_standalone writes nothing when DX is 0).
19359 CMPL DX, $0x04
19360 JB four_bytes_remain_standalone
19361
19362 // emitRepeat
// Inlined repeat encoder. SI = remaining length, DX = remaining length - 4.
// Form selection: SI <= 8 -> 2 bytes; SI < 12 and offset < 0x800 -> 2 bytes
// with offset bits; otherwise 3, 4 or 5 bytes depending on DX.
19363 emit_repeat_again_standalone_emit_copy:
19364 MOVL DX, SI
19365 LEAL -4(DX), DX
19366 CMPL SI, $0x08
19367 JBE repeat_two_standalone_emit_copy
19368 CMPL SI, $0x0c
19369 JAE cant_repeat_two_offset_standalone_emit_copy
19370 CMPL CX, $0x00000800
19371 JB repeat_two_offset_standalone_emit_copy
19372
19373 cant_repeat_two_offset_standalone_emit_copy:
19374 CMPL DX, $0x00000104
19375 JB repeat_three_standalone_emit_copy
19376 CMPL DX, $0x00010100
19377 JB repeat_four_standalone_emit_copy
19378 CMPL DX, $0x0100ffff
19379 JB repeat_five_standalone_emit_copy
// Too long for one op: store the fixed bytes 1d 00 fb ff ff, subtract
// 16842747 (0x0100fffb) from the biased length and go round again.
19380 LEAL -16842747(DX), DX
19381 MOVL $0xfffb001d, (AX)
19382 MOVB $0xff, 4(AX)
19383 ADDQ $0x05, AX
19384 ADDQ $0x05, BX
19385 JMP emit_repeat_again_standalone_emit_copy
19386
// 5 bytes: tag word 0x001d, then the low 24 bits of DX-65536.
// CX (the offset) is reused as scratch here; it is not needed afterwards.
19387 repeat_five_standalone_emit_copy:
19388 LEAL -65536(DX), DX
19389 MOVL DX, CX
19390 MOVW $0x001d, (AX)
19391 MOVW DX, 2(AX)
19392 SARL $0x10, CX
19393 MOVB CL, 4(AX)
19394 ADDQ $0x05, BX
19395 ADDQ $0x05, AX
19396 JMP gen_emit_copy_end
19397
// 4 bytes: tag word 0x0019, then 16 bits of DX-256.
19398 repeat_four_standalone_emit_copy:
19399 LEAL -256(DX), DX
19400 MOVW $0x0019, (AX)
19401 MOVW DX, 2(AX)
19402 ADDQ $0x04, BX
19403 ADDQ $0x04, AX
19404 JMP gen_emit_copy_end
19405
// 3 bytes: tag word 0x0015, then 8 bits of DX-4.
19406 repeat_three_standalone_emit_copy:
19407 LEAL -4(DX), DX
19408 MOVW $0x0015, (AX)
19409 MOVB DL, 2(AX)
19410 ADDQ $0x03, BX
19411 ADDQ $0x03, AX
19412 JMP gen_emit_copy_end
19413
// 2 bytes: the 16-bit word (DX<<2)|1.
19414 repeat_two_standalone_emit_copy:
19415 SHLL $0x02, DX
19416 ORL $0x01, DX
19417 MOVW DX, (AX)
19418 ADDQ $0x02, BX
19419 ADDQ $0x02, AX
19420 JMP gen_emit_copy_end
19421
// 2 bytes: tag = 1 + DX*4 + ((offset>>8)<<5), followed by the low offset byte.
// SI is zeroed only to serve as the LEAL base register.
19422 repeat_two_offset_standalone_emit_copy:
19423 XORQ SI, SI
19424 LEAL 1(SI)(DX*4), DX
19425 MOVB CL, 1(AX)
19426 SARL $0x08, CX
19427 SHLL $0x05, CX
19428 ORL CX, DX
19429 MOVB DL, (AX)
19430 ADDQ $0x02, BX
19431 ADDQ $0x02, AX
19432 JMP gen_emit_copy_end
19433
// 5-byte op: tag = 4*len-1, then the 32-bit offset. Nothing is written when
// the remaining length is 0.
19434 four_bytes_remain_standalone:
19435 TESTL DX, DX
19436 JZ gen_emit_copy_end
19437 XORL SI, SI
19438 LEAL -1(SI)(DX*4), DX
19439 MOVB DL, (AX)
19440 MOVL CX, 1(AX)
19441 ADDQ $0x05, BX
19442 ADDQ $0x05, AX
19443 JMP gen_emit_copy_end
19444
// offset < 0x10000.
19445 two_byte_offset_standalone:
19446 CMPL DX, $0x40
19447 JBE two_byte_offset_short_standalone
19448 CMPL CX, $0x00000800
19449 JAE long_offset_short_standalone
// length > 64 and offset < 0x800: emit a 2-byte op for 8 bytes. SI ends up as
// 17, which is the 4*len-15 tag of two_byte_offset_short_standalone for
// len = 8; offset bits 8-10 are OR-ed into tag bits 5-7.
19450 MOVL $0x00000001, SI
19451 LEAL 16(SI), SI
19452 MOVB CL, 1(AX)
19453 MOVL CX, DI
19454 SHRL $0x08, DI
19455 SHLL $0x05, DI
19456 ORL DI, SI
19457 MOVB SI, (AX)
19458 ADDQ $0x02, BX
19459 ADDQ $0x02, AX
19460 SUBL $0x08, DX
19461
19462 // emitRepeat
// Enter the repeat encoder past its short-form checks: the -4 bias is applied
// here and control jumps straight to the 3/4/5-byte selection (the remaining
// length is above 56 on this path, so the 2-byte forms could not apply).
19463 LEAL -4(DX), DX
19464 JMP cant_repeat_two_offset_standalone_emit_copy_short_2b
19465
// Second expansion of the repeat encoder; same logic as the first one above.
19466 emit_repeat_again_standalone_emit_copy_short_2b:
19467 MOVL DX, SI
19468 LEAL -4(DX), DX
19469 CMPL SI, $0x08
19470 JBE repeat_two_standalone_emit_copy_short_2b
19471 CMPL SI, $0x0c
19472 JAE cant_repeat_two_offset_standalone_emit_copy_short_2b
19473 CMPL CX, $0x00000800
19474 JB repeat_two_offset_standalone_emit_copy_short_2b
19475
19476 cant_repeat_two_offset_standalone_emit_copy_short_2b:
19477 CMPL DX, $0x00000104
19478 JB repeat_three_standalone_emit_copy_short_2b
19479 CMPL DX, $0x00010100
19480 JB repeat_four_standalone_emit_copy_short_2b
19481 CMPL DX, $0x0100ffff
19482 JB repeat_five_standalone_emit_copy_short_2b
19483 LEAL -16842747(DX), DX
19484 MOVL $0xfffb001d, (AX)
19485 MOVB $0xff, 4(AX)
19486 ADDQ $0x05, AX
19487 ADDQ $0x05, BX
19488 JMP emit_repeat_again_standalone_emit_copy_short_2b
19489
19490 repeat_five_standalone_emit_copy_short_2b:
19491 LEAL -65536(DX), DX
19492 MOVL DX, CX
19493 MOVW $0x001d, (AX)
19494 MOVW DX, 2(AX)
19495 SARL $0x10, CX
19496 MOVB CL, 4(AX)
19497 ADDQ $0x05, BX
19498 ADDQ $0x05, AX
19499 JMP gen_emit_copy_end
19500
19501 repeat_four_standalone_emit_copy_short_2b:
19502 LEAL -256(DX), DX
19503 MOVW $0x0019, (AX)
19504 MOVW DX, 2(AX)
19505 ADDQ $0x04, BX
19506 ADDQ $0x04, AX
19507 JMP gen_emit_copy_end
19508
19509 repeat_three_standalone_emit_copy_short_2b:
19510 LEAL -4(DX), DX
19511 MOVW $0x0015, (AX)
19512 MOVB DL, 2(AX)
19513 ADDQ $0x03, BX
19514 ADDQ $0x03, AX
19515 JMP gen_emit_copy_end
19516
19517 repeat_two_standalone_emit_copy_short_2b:
19518 SHLL $0x02, DX
19519 ORL $0x01, DX
19520 MOVW DX, (AX)
19521 ADDQ $0x02, BX
19522 ADDQ $0x02, AX
19523 JMP gen_emit_copy_end
19524
19525 repeat_two_offset_standalone_emit_copy_short_2b:
19526 XORQ SI, SI
19527 LEAL 1(SI)(DX*4), DX
19528 MOVB CL, 1(AX)
19529 SARL $0x08, CX
19530 SHLL $0x05, CX
19531 ORL CX, DX
19532 MOVB DL, (AX)
19533 ADDQ $0x02, BX
19534 ADDQ $0x02, AX
19535 JMP gen_emit_copy_end
19536
// length > 64 and 0x800 <= offset < 0x10000: emit a 3-byte op for 60 bytes
// (0xee is the 4*len-2 tag of emit_copy_three_standalone for len = 60), then
// encode the remaining length with the repeat encoder.
19537 long_offset_short_standalone:
19538 MOVB $0xee, (AX)
19539 MOVW CX, 1(AX)
19540 LEAL -60(DX), DX
19541 ADDQ $0x03, AX
19542 ADDQ $0x03, BX
19543
19544 // emitRepeat
// Third expansion of the repeat encoder; same logic as the first one above.
19545 emit_repeat_again_standalone_emit_copy_short:
19546 MOVL DX, SI
19547 LEAL -4(DX), DX
19548 CMPL SI, $0x08
19549 JBE repeat_two_standalone_emit_copy_short
19550 CMPL SI, $0x0c
19551 JAE cant_repeat_two_offset_standalone_emit_copy_short
19552 CMPL CX, $0x00000800
19553 JB repeat_two_offset_standalone_emit_copy_short
19554
19555 cant_repeat_two_offset_standalone_emit_copy_short:
19556 CMPL DX, $0x00000104
19557 JB repeat_three_standalone_emit_copy_short
19558 CMPL DX, $0x00010100
19559 JB repeat_four_standalone_emit_copy_short
19560 CMPL DX, $0x0100ffff
19561 JB repeat_five_standalone_emit_copy_short
19562 LEAL -16842747(DX), DX
19563 MOVL $0xfffb001d, (AX)
19564 MOVB $0xff, 4(AX)
19565 ADDQ $0x05, AX
19566 ADDQ $0x05, BX
19567 JMP emit_repeat_again_standalone_emit_copy_short
19568
19569 repeat_five_standalone_emit_copy_short:
19570 LEAL -65536(DX), DX
19571 MOVL DX, CX
19572 MOVW $0x001d, (AX)
19573 MOVW DX, 2(AX)
19574 SARL $0x10, CX
19575 MOVB CL, 4(AX)
19576 ADDQ $0x05, BX
19577 ADDQ $0x05, AX
19578 JMP gen_emit_copy_end
19579
19580 repeat_four_standalone_emit_copy_short:
19581 LEAL -256(DX), DX
19582 MOVW $0x0019, (AX)
19583 MOVW DX, 2(AX)
19584 ADDQ $0x04, BX
19585 ADDQ $0x04, AX
19586 JMP gen_emit_copy_end
19587
19588 repeat_three_standalone_emit_copy_short:
19589 LEAL -4(DX), DX
19590 MOVW $0x0015, (AX)
19591 MOVB DL, 2(AX)
19592 ADDQ $0x03, BX
19593 ADDQ $0x03, AX
19594 JMP gen_emit_copy_end
19595
19596 repeat_two_standalone_emit_copy_short:
19597 SHLL $0x02, DX
19598 ORL $0x01, DX
19599 MOVW DX, (AX)
19600 ADDQ $0x02, BX
19601 ADDQ $0x02, AX
19602 JMP gen_emit_copy_end
19603
19604 repeat_two_offset_standalone_emit_copy_short:
19605 XORQ SI, SI
19606 LEAL 1(SI)(DX*4), DX
19607 MOVB CL, 1(AX)
19608 SARL $0x08, CX
19609 SHLL $0x05, CX
19610 ORL CX, DX
19611 MOVB DL, (AX)
19612 ADDQ $0x02, BX
19613 ADDQ $0x02, AX
19614 JMP gen_emit_copy_end
19615
// length <= 64 and offset < 0x10000. SI = 4*len is the base of both tags.
// length < 12 and offset < 0x800 -> 2-byte op: tag = 4*len-15 with offset
// bits 8-10 in tag bits 5-7, then the low offset byte. Anything else takes the
// 3-byte op below.
19616 two_byte_offset_short_standalone:
19617 MOVL DX, SI
19618 SHLL $0x02, SI
19619 CMPL DX, $0x0c
19620 JAE emit_copy_three_standalone
19621 CMPL CX, $0x00000800
19622 JAE emit_copy_three_standalone
19623 LEAL -15(SI), SI
19624 MOVB CL, 1(AX)
19625 SHRL $0x08, CX
19626 SHLL $0x05, CX
19627 ORL CX, SI
19628 MOVB SI, (AX)
19629 ADDQ $0x02, BX
19630 ADDQ $0x02, AX
19631 JMP gen_emit_copy_end
19632
// 3-byte op: tag = 4*len-2, then the 16-bit offset.
19633 emit_copy_three_standalone:
19634 LEAL -2(SI), SI
19635 MOVB SI, (AX)
19636 MOVW CX, 1(AX)
19637 ADDQ $0x03, BX
19638 ADDQ $0x03, AX
19639
// Common exit: return the byte count accumulated in BX.
19640 gen_emit_copy_end:
19641 MOVQ BX, ret+40(FP)
19642 RET
19643
// func emitCopyNoRepeat(dst []byte, offset int, length int) int
//
// emitCopyNoRepeat encodes a copy of length bytes at distance offset using
// only copy ops that carry the offset (no repeat ops), and returns the number
// of bytes written to dst. dst_len and dst_cap are never read, so the caller
// presumably guarantees room for the output.
//
// Registers: AX = write cursor, BX = bytes written (result), CX = offset,
// DX = length still to encode.
TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48
	MOVQ length+32(FP), DX
	MOVQ offset+24(FP), CX
	MOVQ dst_base+0(FP), AX
	XORL BX, BX

	// Offsets of 0x10000 and above need the 5-byte ops.
	CMPL CX, $0x00010000
	JAE  wide_chunk

narrow_chunk:
	// While more than 64 bytes remain, emit a 3-byte op for 60 bytes:
	// tag 0xee followed by the 16-bit offset.
	CMPL DX, $0x40
	JBE  narrow_tail
	MOVW CX, 1(AX)
	MOVB $0xee, (AX)
	SUBL $0x3c, DX
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  narrow_chunk

narrow_tail:
	// At most 64 bytes remain. The 2-byte op needs length < 12 and
	// offset < 0x800; everything else takes the 3-byte op.
	CMPL DX, $0x0c
	JAE  narrow_three
	CMPL CX, $0x00000800
	JAE  narrow_three

	// 2 bytes: tag = 4*len-15 with offset bits 8-10 in tag bits 5-7,
	// followed by the low offset byte.
	SHLL $0x02, DX
	SUBL $0x0f, DX
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL  CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP  copy_done

narrow_three:
	// 3 bytes: tag = 4*len-2, followed by the 16-bit offset.
	SHLL $0x02, DX
	SUBL $0x02, DX
	MOVB DL, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP  copy_done

wide_chunk:
	// While more than 64 bytes remain, emit a 5-byte op for 64 bytes:
	// tag 0xff followed by the 32-bit offset.
	CMPL DX, $0x40
	JBE  wide_tail
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	SUBL $0x40, DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	CMPL DX, $0x04
	JAE  wide_chunk

wide_tail:
	// 5 bytes: tag = 4*len-1, followed by the 32-bit offset. Nothing is
	// written when no length remains.
	TESTL DX, DX
	JZ    copy_done
	SHLL  $0x02, DX
	DECL  DX
	MOVB  DL, (AX)
	MOVL  CX, 1(AX)
	ADDQ  $0x05, BX
	ADDQ  $0x05, AX

copy_done:
	MOVQ BX, ret+40(FP)
	RET
19715
// func matchLen(a []byte, b []byte) int
// Requires: BMI
//
// matchLen returns the number of leading bytes on which a and b agree,
// looking at no more than len(a) bytes. Only a's length is loaded; b's length
// is never read, so b is presumably at least as long as a.
//
// Registers: AX = a, CX = b, DX = bytes left to compare (32-bit compares),
// SI = bytes matched so far (also the load index), R8/R9 = scratch.
TEXT ·matchLen(SB), NOSPLIT, $0-56
	MOVQ a_len+8(FP), DX
	MOVQ b_base+24(FP), CX
	MOVQ a_base+0(FP), AX
	XORL SI, SI

cmp16:
	// Compare 16 bytes per round while at least 16 remain.
	CMPL DX, $0x10
	JB   cmp8
	MOVQ 8(AX)(SI*1), R9
	MOVQ (AX)(SI*1), R8
	XORQ (CX)(SI*1), R8
	JNZ  diff_in_low8
	XORQ 8(CX)(SI*1), R9
	JNZ  diff_in_high8
	SUBL $0x10, DX
	ADDL $0x10, SI
	JMP  cmp16

diff_in_high8:
	// The first 8 bytes matched; the lowest set bit of the XOR marks the
	// first differing byte of the second 8.
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SHRQ $0x03, R9
	LEAL 8(SI)(R9*1), SI
	JMP  match_done

cmp8:
	CMPL DX, $0x08
	JB   cmp4
	MOVQ (AX)(SI*1), R8
	XORQ (CX)(SI*1), R8
	JNZ  diff_in_low8
	SUBL $0x08, DX
	ADDL $0x08, SI
	JMP  cmp4

diff_in_low8:
	// Bit index of the first difference divided by 8 = matching bytes.
#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SHRQ $0x03, R8
	ADDL R8, SI
	JMP  match_done

cmp4:
	// A 4-byte mismatch falls through to the 2- and 1-byte checks, which
	// then locate the exact position.
	CMPL DX, $0x04
	JB   cmp2
	MOVL (AX)(SI*1), R8
	CMPL (CX)(SI*1), R8
	JNE  cmp2
	SUBL $0x04, DX
	ADDL $0x04, SI

cmp2:
	TESTL DX, DX
	JZ    match_done
	CMPL  DX, $0x01
	JE    cmp1
	MOVW  (AX)(SI*1), R8
	CMPW  (CX)(SI*1), R8
	JNE   cmp1
	ADDL  $0x02, SI
	SUBL  $0x02, DX
	JZ    match_done

cmp1:
	MOVB (AX)(SI*1), R8
	CMPB (CX)(SI*1), R8
	JNE  match_done
	INCL SI

match_done:
	MOVQ SI, ret+48(FP)
	RET
19802
19803 // func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
19804 // Requires: SSE2
//
// cvtLZ4BlockAsm walks the LZ4 sequences in src (token byte, optional length
// extension bytes, literals, 16-bit offset, optional match-length extension
// bytes) and re-emits each one into dst using the literal / copy / repeat
// encoders inlined below.
//
// Results:
//   success: uncompressed = sum of all literal and match lengths,
//            dstUsed      = number of bytes written to dst.
//   failure: uncompressed = -1 (lz4_s2_corrupt) or -2 (lz4_s2_dstfull).
//   NOTE(review): dstUsed+56(FP) is not stored on the two failure paths.
//
// Register roles:
//   AX  = write cursor into dst
//   CX  = dst_base + dst_len - 8 (output limit, 8 bytes short of the end)
//   DX  = read cursor into src       BX = src_base + src_len (end of input)
//   SI  = running uncompressed size
//   DI  = offset of the most recent copy op (0 until the first match)
//   R8  = scratch / src position after the literals
//   R9  = literal length, later the match offset
//   R10 = match length (token low nibble + 4, plus extension bytes)
//   R11-R15, X0-X5 = scratch for the literal header and the memmoves
19805 TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $0-64
19806 XORQ SI, SI
19807 MOVQ dst_base+0(FP), AX
19808 MOVQ dst_len+8(FP), CX
19809 MOVQ src_base+24(FP), DX
19810 MOVQ src_len+32(FP), BX
19811 LEAQ (DX)(BX*1), BX
19812 LEAQ -8(AX)(CX*1), CX
19813 XORQ DI, DI
19814
// One iteration per LZ4 sequence. The loop only exits through lz4_s2_done,
// lz4_s2_corrupt or lz4_s2_dstfull.
19815 lz4_s2_loop:
19816 CMPQ DX, BX
19817 JAE lz4_s2_corrupt
19818 CMPQ AX, CX
19819 JAE lz4_s2_dstfull
// Token byte: high nibble -> R9 (literal length), low nibble -> R10.
// A high nibble of 15 (token >= 0xf0) means extension bytes follow.
19820 MOVBQZX (DX), R8
19821 MOVQ R8, R9
19822 MOVQ R8, R10
19823 SHRQ $0x04, R9
19824 ANDQ $0x0f, R10
19825 CMPQ R8, $0xf0
19826 JB lz4_s2_ll_end
19827
// Add extension bytes to the literal length until one is not 0xff.
19828 lz4_s2_ll_loop:
19829 INCQ DX
19830 CMPQ DX, BX
19831 JAE lz4_s2_corrupt
19832 MOVBQZX (DX), R8
19833 ADDQ R8, R9
19834 CMPQ R8, $0xff
19835 JEQ lz4_s2_ll_loop
19836
// DX still points at the last length byte. After the two INCQ below DX is the
// first literal byte and R8 is the first byte after the literals.
// R10 becomes the match length (low nibble + 4).
19837 lz4_s2_ll_end:
19838 LEAQ (DX)(R9*1), R8
19839 ADDQ $0x04, R10
19840 CMPQ R8, BX
19841 JAE lz4_s2_corrupt
19842 INCQ DX
19843 INCQ R8
19844 TESTQ R9, R9
19845 JZ lz4_s2_lits_done
// Literals must end before the output limit in CX.
19846 LEAQ (AX)(R9*1), R11
19847 CMPQ R11, CX
19848 JAE lz4_s2_dstfull
19849 ADDQ R9, SI
// Literal header: R11 = length-1 selects a 1-, 2-, 3-, 4- or 5-byte header
// (R11<<2, or tag 0xf0 / 0xf4 / 0xf8 / 0xfc followed by 1-4 length bytes).
19850 LEAL -1(R9), R11
19851 CMPL R11, $0x3c
19852 JB one_byte_lz4_s2
19853 CMPL R11, $0x00000100
19854 JB two_bytes_lz4_s2
19855 CMPL R11, $0x00010000
19856 JB three_bytes_lz4_s2
19857 CMPL R11, $0x01000000
19858 JB four_bytes_lz4_s2
19859 MOVB $0xfc, (AX)
19860 MOVL R11, 1(AX)
19861 ADDQ $0x05, AX
19862 JMP memmove_long_lz4_s2
19863
19864 four_bytes_lz4_s2:
19865 MOVL R11, R12
19866 SHRL $0x10, R12
19867 MOVB $0xf8, (AX)
19868 MOVW R11, 1(AX)
19869 MOVB R12, 3(AX)
19870 ADDQ $0x04, AX
19871 JMP memmove_long_lz4_s2
19872
19873 three_bytes_lz4_s2:
19874 MOVB $0xf4, (AX)
19875 MOVW R11, 1(AX)
19876 ADDQ $0x03, AX
19877 JMP memmove_long_lz4_s2
19878
// With a 2-byte header, up to 64 literal bytes (R11 < 0x40) still use the
// short move; longer runs take the long move.
19879 two_bytes_lz4_s2:
19880 MOVB $0xf0, (AX)
19881 MOVB R11, 1(AX)
19882 ADDQ $0x02, AX
19883 CMPL R11, $0x40
19884 JB memmove_lz4_s2
19885 JMP memmove_long_lz4_s2
19886
19887 one_byte_lz4_s2:
19888 SHLB $0x02, R11
19889 MOVB R11, (AX)
19890 ADDQ $0x01, AX
19891
// Short literal copy (1..64 bytes). R11 = write cursor after the literals.
19892 memmove_lz4_s2:
19893 LEAQ (AX)(R9*1), R11
19894
19895 // genMemMoveShort
19896 CMPQ R9, $0x08
19897 JBE emit_lit_memmove_lz4_s2_memmove_move_8
19898 CMPQ R9, $0x10
19899 JBE emit_lit_memmove_lz4_s2_memmove_move_8through16
19900 CMPQ R9, $0x20
19901 JBE emit_lit_memmove_lz4_s2_memmove_move_17through32
19902 JMP emit_lit_memmove_lz4_s2_memmove_move_33through64
19903
// 1..8 bytes: always moves a full 8 bytes; only R9 of them count because AX
// is reset to R11 afterwards. Presumably the 8 bytes held back in CX make the
// over-long store safe - TODO confirm, and confirm the 8-byte source read too.
19904 emit_lit_memmove_lz4_s2_memmove_move_8:
19905 MOVQ (DX), R12
19906 MOVQ R12, (AX)
19907 JMP memmove_end_copy_lz4_s2
19908
// 9..16 bytes: two 8-byte moves that may overlap. DX is used as scratch here;
// it is reloaded from R8 at lz4_s2_lits_emit_done.
19909 emit_lit_memmove_lz4_s2_memmove_move_8through16:
19910 MOVQ (DX), R12
19911 MOVQ -8(DX)(R9*1), DX
19912 MOVQ R12, (AX)
19913 MOVQ DX, -8(AX)(R9*1)
19914 JMP memmove_end_copy_lz4_s2
19915
// 17..32 bytes: two 16-byte moves that may overlap.
19916 emit_lit_memmove_lz4_s2_memmove_move_17through32:
19917 MOVOU (DX), X0
19918 MOVOU -16(DX)(R9*1), X1
19919 MOVOU X0, (AX)
19920 MOVOU X1, -16(AX)(R9*1)
19921 JMP memmove_end_copy_lz4_s2
19922
// 33..64 bytes: first 32 and last 32 bytes, which may overlap.
19923 emit_lit_memmove_lz4_s2_memmove_move_33through64:
19924 MOVOU (DX), X0
19925 MOVOU 16(DX), X1
19926 MOVOU -32(DX)(R9*1), X2
19927 MOVOU -16(DX)(R9*1), X3
19928 MOVOU X0, (AX)
19929 MOVOU X1, 16(AX)
19930 MOVOU X2, -32(AX)(R9*1)
19931 MOVOU X3, -16(AX)(R9*1)
19932
19933 memmove_end_copy_lz4_s2:
19934 MOVQ R11, AX
19935 JMP lz4_s2_lits_emit_done
19936
// Long literal copy. R11 = write cursor after the literals.
19937 memmove_long_lz4_s2:
19938 LEAQ (AX)(R9*1), R11
19939
19940 // genMemMoveLong
// The first and last 32 source bytes are loaded into X0-X3 up front and stored
// last. In between, 32-byte chunks are written with aligned stores (MOVOA),
// starting at offset R14-32 where R14 = 64 - (AX & 31). R13 = (R9 >> 5) - 1.
19941 MOVOU (DX), X0
19942 MOVOU 16(DX), X1
19943 MOVOU -32(DX)(R9*1), X2
19944 MOVOU -16(DX)(R9*1), X3
19945 MOVQ R9, R13
19946 SHRQ $0x05, R13
19947 MOVQ AX, R12
19948 ANDL $0x0000001f, R12
19949 MOVQ $0x00000040, R14
19950 SUBQ R12, R14
// NOTE(review): DECQ does not update CF, so the JA below sees CF from the
// SUBQ above and ZF from DECQ - confirm this is what the generator intends.
19951 DECQ R13
19952 JA emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32
19953 LEAQ -32(DX)(R14*1), R12
19954 LEAQ -32(AX)(R14*1), R15
19955
// NOTE(review): same CF caveat for the JNA that closes this loop (CF comes
// from the ADDQ on R14, ZF from DECQ).
19956 emit_lit_memmove_long_lz4_s2large_big_loop_back:
19957 MOVOU (R12), X4
19958 MOVOU 16(R12), X5
19959 MOVOA X4, (R15)
19960 MOVOA X5, 16(R15)
19961 ADDQ $0x20, R15
19962 ADDQ $0x20, R12
19963 ADDQ $0x20, R14
19964 DECQ R13
19965 JNA emit_lit_memmove_long_lz4_s2large_big_loop_back
19966
// Copy 32 bytes per round while R14 <= R9, then store the saved head and tail.
19967 emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32:
19968 MOVOU -32(DX)(R14*1), X4
19969 MOVOU -16(DX)(R14*1), X5
19970 MOVOA X4, -32(AX)(R14*1)
19971 MOVOA X5, -16(AX)(R14*1)
19972 ADDQ $0x20, R14
19973 CMPQ R9, R14
19974 JAE emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32
19975 MOVOU X0, (AX)
19976 MOVOU X1, 16(AX)
19977 MOVOU X2, -32(AX)(R9*1)
19978 MOVOU X3, -16(AX)(R9*1)
19979 MOVQ R11, AX
19980
// Move the read cursor past the literals.
19981 lz4_s2_lits_emit_done:
19982 MOVQ R8, DX
19983
// Input exhausted right after the literals: that is the normal end only when
// the token's match nibble was 0 (R10 == 4); anything else is corrupt.
19984 lz4_s2_lits_done:
19985 CMPQ DX, BX
19986 JNE lz4_s2_match
19987 CMPQ R10, $0x04
19988 JEQ lz4_s2_done
19989 JMP lz4_s2_corrupt
19990
// Read the 16-bit match offset into R9. Corrupt if the two offset bytes reach
// the end of the input, if the offset is 0, or if it exceeds the number of
// bytes produced so far (SI).
19991 lz4_s2_match:
19992 LEAQ 2(DX), R8
19993 CMPQ R8, BX
19994 JAE lz4_s2_corrupt
19995 MOVWQZX (DX), R9
19996 MOVQ R8, DX
19997 TESTQ R9, R9
19998 JZ lz4_s2_corrupt
19999 CMPQ R9, SI
20000 JA lz4_s2_corrupt
// 0x13 = 15 + 4: the match nibble was 15, so extension bytes follow.
20001 CMPQ R10, $0x13
20002 JNE lz4_s2_ml_done
20003
// Add extension bytes to the match length until one is not 0xff.
20004 lz4_s2_ml_loop:
20005 MOVBQZX (DX), R8
20006 INCQ DX
20007 ADDQ R8, R10
20008 CMPQ DX, BX
20009 JAE lz4_s2_corrupt
20010 CMPQ R8, $0xff
20011 JEQ lz4_s2_ml_loop
20012
// Account for the match, then choose the op: same offset as the previous copy
// (DI) -> repeat op, otherwise a copy op. DI starts at 0 and R9 is never 0
// here, so the first match always takes the copy path.
20013 lz4_s2_ml_done:
20014 ADDQ R10, SI
20015 CMPQ R9, DI
20016 JNE lz4_s2_docopy
20017
20018 // emitRepeat
// Inlined repeat encoder. R8 = match length, R10 = match length - 4.
// Form selection: R8 <= 8 -> 2 bytes; R8 < 12 and offset < 0x800 -> 2 bytes
// with offset bits; otherwise 3, 4 or 5 bytes depending on R10.
// Every terminal form jumps back to lz4_s2_loop.
20019 emit_repeat_again_lz4_s2:
20020 MOVL R10, R8
20021 LEAL -4(R10), R10
20022 CMPL R8, $0x08
20023 JBE repeat_two_lz4_s2
20024 CMPL R8, $0x0c
20025 JAE cant_repeat_two_offset_lz4_s2
20026 CMPL R9, $0x00000800
20027 JB repeat_two_offset_lz4_s2
20028
20029 cant_repeat_two_offset_lz4_s2:
20030 CMPL R10, $0x00000104
20031 JB repeat_three_lz4_s2
20032 CMPL R10, $0x00010100
20033 JB repeat_four_lz4_s2
20034 CMPL R10, $0x0100ffff
20035 JB repeat_five_lz4_s2
// Too long for one op: store the fixed bytes 1d 00 fb ff ff, subtract
// 16842747 (0x0100fffb) from the biased length and go round again.
20036 LEAL -16842747(R10), R10
20037 MOVL $0xfffb001d, (AX)
20038 MOVB $0xff, 4(AX)
20039 ADDQ $0x05, AX
20040 JMP emit_repeat_again_lz4_s2
20041
// 5 bytes: tag word 0x001d, then the low 24 bits of R10-65536.
20042 repeat_five_lz4_s2:
20043 LEAL -65536(R10), R10
20044 MOVL R10, R9
20045 MOVW $0x001d, (AX)
20046 MOVW R10, 2(AX)
20047 SARL $0x10, R9
20048 MOVB R9, 4(AX)
20049 ADDQ $0x05, AX
20050 JMP lz4_s2_loop
20051
// 4 bytes: tag word 0x0019, then 16 bits of R10-256.
20052 repeat_four_lz4_s2:
20053 LEAL -256(R10), R10
20054 MOVW $0x0019, (AX)
20055 MOVW R10, 2(AX)
20056 ADDQ $0x04, AX
20057 JMP lz4_s2_loop
20058
// 3 bytes: tag word 0x0015, then 8 bits of R10-4.
20059 repeat_three_lz4_s2:
20060 LEAL -4(R10), R10
20061 MOVW $0x0015, (AX)
20062 MOVB R10, 2(AX)
20063 ADDQ $0x03, AX
20064 JMP lz4_s2_loop
20065
// 2 bytes: the 16-bit word (R10<<2)|1.
20066 repeat_two_lz4_s2:
20067 SHLL $0x02, R10
20068 ORL $0x01, R10
20069 MOVW R10, (AX)
20070 ADDQ $0x02, AX
20071 JMP lz4_s2_loop
20072
// 2 bytes: tag = 1 + R10*4 + ((offset>>8)<<5), followed by the low offset byte.
// R8 is zeroed only to serve as the LEAL base register.
20073 repeat_two_offset_lz4_s2:
20074 XORQ R8, R8
20075 LEAL 1(R8)(R10*4), R10
20076 MOVB R9, 1(AX)
20077 SARL $0x08, R9
20078 SHLL $0x05, R9
20079 ORL R9, R10
20080 MOVB R10, (AX)
20081 ADDQ $0x02, AX
20082 JMP lz4_s2_loop
20083
// New offset: remember it in DI for the repeat test of the next match.
20084 lz4_s2_docopy:
20085 MOVQ R9, DI
20086
20087 // emitCopy
// R9 was loaded as a zero-extended 16-bit value, so only forms that store at
// most 16 offset bits appear here. A length up to 64 is one op; longer
// matches emit a first copy op and encode the rest as a repeat.
20088 CMPL R10, $0x40
20089 JBE two_byte_offset_short_lz4_s2
20090 CMPL R9, $0x00000800
20091 JAE long_offset_short_lz4_s2
// length > 64 and offset < 0x800: 2-byte op for 8 bytes. R8 ends up as 17, the
// 4*len-15 tag of two_byte_offset_short_lz4_s2 for len = 8; offset bits 8-10
// are OR-ed into tag bits 5-7.
20092 MOVL $0x00000001, R8
20093 LEAL 16(R8), R8
20094 MOVB R9, 1(AX)
20095 MOVL R9, R11
20096 SHRL $0x08, R11
20097 SHLL $0x05, R11
20098 ORL R11, R8
20099 MOVB R8, (AX)
20100 ADDQ $0x02, AX
20101 SUBL $0x08, R10
20102
20103 // emitRepeat
// Enter the repeat encoder past its short-form checks: the -4 bias is applied
// here and control jumps straight to the 3/4/5-byte selection.
20104 LEAL -4(R10), R10
20105 JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
20106
// Second expansion of the repeat encoder; same logic as emit_repeat_again_lz4_s2.
20107 emit_repeat_again_lz4_s2_emit_copy_short_2b:
20108 MOVL R10, R8
20109 LEAL -4(R10), R10
20110 CMPL R8, $0x08
20111 JBE repeat_two_lz4_s2_emit_copy_short_2b
20112 CMPL R8, $0x0c
20113 JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
20114 CMPL R9, $0x00000800
20115 JB repeat_two_offset_lz4_s2_emit_copy_short_2b
20116
20117 cant_repeat_two_offset_lz4_s2_emit_copy_short_2b:
20118 CMPL R10, $0x00000104
20119 JB repeat_three_lz4_s2_emit_copy_short_2b
20120 CMPL R10, $0x00010100
20121 JB repeat_four_lz4_s2_emit_copy_short_2b
20122 CMPL R10, $0x0100ffff
20123 JB repeat_five_lz4_s2_emit_copy_short_2b
20124 LEAL -16842747(R10), R10
20125 MOVL $0xfffb001d, (AX)
20126 MOVB $0xff, 4(AX)
20127 ADDQ $0x05, AX
20128 JMP emit_repeat_again_lz4_s2_emit_copy_short_2b
20129
20130 repeat_five_lz4_s2_emit_copy_short_2b:
20131 LEAL -65536(R10), R10
20132 MOVL R10, R9
20133 MOVW $0x001d, (AX)
20134 MOVW R10, 2(AX)
20135 SARL $0x10, R9
20136 MOVB R9, 4(AX)
20137 ADDQ $0x05, AX
20138 JMP lz4_s2_loop
20139
20140 repeat_four_lz4_s2_emit_copy_short_2b:
20141 LEAL -256(R10), R10
20142 MOVW $0x0019, (AX)
20143 MOVW R10, 2(AX)
20144 ADDQ $0x04, AX
20145 JMP lz4_s2_loop
20146
20147 repeat_three_lz4_s2_emit_copy_short_2b:
20148 LEAL -4(R10), R10
20149 MOVW $0x0015, (AX)
20150 MOVB R10, 2(AX)
20151 ADDQ $0x03, AX
20152 JMP lz4_s2_loop
20153
20154 repeat_two_lz4_s2_emit_copy_short_2b:
20155 SHLL $0x02, R10
20156 ORL $0x01, R10
20157 MOVW R10, (AX)
20158 ADDQ $0x02, AX
20159 JMP lz4_s2_loop
20160
20161 repeat_two_offset_lz4_s2_emit_copy_short_2b:
20162 XORQ R8, R8
20163 LEAL 1(R8)(R10*4), R10
20164 MOVB R9, 1(AX)
20165 SARL $0x08, R9
20166 SHLL $0x05, R9
20167 ORL R9, R10
20168 MOVB R10, (AX)
20169 ADDQ $0x02, AX
20170 JMP lz4_s2_loop
20171
// length > 64 and offset >= 0x800: 3-byte op for 60 bytes (0xee is the 4*len-2
// tag of emit_copy_three_lz4_s2 for len = 60), then the rest as a repeat.
20172 long_offset_short_lz4_s2:
20173 MOVB $0xee, (AX)
20174 MOVW R9, 1(AX)
20175 LEAL -60(R10), R10
20176 ADDQ $0x03, AX
20177
20178 // emitRepeat
// Third expansion of the repeat encoder; same logic as emit_repeat_again_lz4_s2.
20179 emit_repeat_again_lz4_s2_emit_copy_short:
20180 MOVL R10, R8
20181 LEAL -4(R10), R10
20182 CMPL R8, $0x08
20183 JBE repeat_two_lz4_s2_emit_copy_short
20184 CMPL R8, $0x0c
20185 JAE cant_repeat_two_offset_lz4_s2_emit_copy_short
20186 CMPL R9, $0x00000800
20187 JB repeat_two_offset_lz4_s2_emit_copy_short
20188
20189 cant_repeat_two_offset_lz4_s2_emit_copy_short:
20190 CMPL R10, $0x00000104
20191 JB repeat_three_lz4_s2_emit_copy_short
20192 CMPL R10, $0x00010100
20193 JB repeat_four_lz4_s2_emit_copy_short
20194 CMPL R10, $0x0100ffff
20195 JB repeat_five_lz4_s2_emit_copy_short
20196 LEAL -16842747(R10), R10
20197 MOVL $0xfffb001d, (AX)
20198 MOVB $0xff, 4(AX)
20199 ADDQ $0x05, AX
20200 JMP emit_repeat_again_lz4_s2_emit_copy_short
20201
20202 repeat_five_lz4_s2_emit_copy_short:
20203 LEAL -65536(R10), R10
20204 MOVL R10, R9
20205 MOVW $0x001d, (AX)
20206 MOVW R10, 2(AX)
20207 SARL $0x10, R9
20208 MOVB R9, 4(AX)
20209 ADDQ $0x05, AX
20210 JMP lz4_s2_loop
20211
20212 repeat_four_lz4_s2_emit_copy_short:
20213 LEAL -256(R10), R10
20214 MOVW $0x0019, (AX)
20215 MOVW R10, 2(AX)
20216 ADDQ $0x04, AX
20217 JMP lz4_s2_loop
20218
20219 repeat_three_lz4_s2_emit_copy_short:
20220 LEAL -4(R10), R10
20221 MOVW $0x0015, (AX)
20222 MOVB R10, 2(AX)
20223 ADDQ $0x03, AX
20224 JMP lz4_s2_loop
20225
20226 repeat_two_lz4_s2_emit_copy_short:
20227 SHLL $0x02, R10
20228 ORL $0x01, R10
20229 MOVW R10, (AX)
20230 ADDQ $0x02, AX
20231 JMP lz4_s2_loop
20232
20233 repeat_two_offset_lz4_s2_emit_copy_short:
20234 XORQ R8, R8
20235 LEAL 1(R8)(R10*4), R10
20236 MOVB R9, 1(AX)
20237 SARL $0x08, R9
20238 SHLL $0x05, R9
20239 ORL R9, R10
20240 MOVB R10, (AX)
20241 ADDQ $0x02, AX
20242 JMP lz4_s2_loop
20243
// length <= 64. R8 = 4*len is the base of both tags.
// length < 12 and offset < 0x800 -> 2-byte op: tag = 4*len-15 with offset
// bits 8-10 in tag bits 5-7, then the low offset byte. Anything else takes the
// 3-byte op below.
20244 two_byte_offset_short_lz4_s2:
20245 MOVL R10, R8
20246 SHLL $0x02, R8
20247 CMPL R10, $0x0c
20248 JAE emit_copy_three_lz4_s2
20249 CMPL R9, $0x00000800
20250 JAE emit_copy_three_lz4_s2
20251 LEAL -15(R8), R8
20252 MOVB R9, 1(AX)
20253 SHRL $0x08, R9
20254 SHLL $0x05, R9
20255 ORL R9, R8
20256 MOVB R8, (AX)
20257 ADDQ $0x02, AX
20258 JMP lz4_s2_loop
20259
// 3-byte op: tag = 4*len-2, then the 16-bit offset.
20260 emit_copy_three_lz4_s2:
20261 LEAL -2(R8), R8
20262 MOVB R8, (AX)
20263 MOVW R9, 1(AX)
20264 ADDQ $0x03, AX
20265 JMP lz4_s2_loop
20266
// Success: dstUsed = AX - dst_base, uncompressed = SI.
20267 lz4_s2_done:
20268 MOVQ dst_base+0(FP), CX
20269 SUBQ CX, AX
20270 MOVQ SI, uncompressed+48(FP)
20271 MOVQ AX, dstUsed+56(FP)
20272 RET
20273
// Malformed input: uncompressed = -1 (dstUsed is left unwritten).
20274 lz4_s2_corrupt:
20275 XORQ AX, AX
20276 LEAQ -1(AX), SI
20277 MOVQ SI, uncompressed+48(FP)
20278 RET
20279
// Output limit reached: uncompressed = -2 (dstUsed is left unwritten).
20280 lz4_s2_dstfull:
20281 XORQ AX, AX
20282 LEAQ -2(AX), SI
20283 MOVQ SI, uncompressed+48(FP)
20284 RET
20285
20286 // func cvtLZ4sBlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
20287 // Requires: SSE2
20288 TEXT ·cvtLZ4sBlockAsm(SB), NOSPLIT, $0-64
20289 XORQ SI, SI
20290 MOVQ dst_base+0(FP), AX
20291 MOVQ dst_len+8(FP), CX
20292 MOVQ src_base+24(FP), DX
20293 MOVQ src_len+32(FP), BX
20294 LEAQ (DX)(BX*1), BX
20295 LEAQ -8(AX)(CX*1), CX
20296 XORQ DI, DI
20297
20298 lz4s_s2_loop:
20299 CMPQ DX, BX
20300 JAE lz4s_s2_corrupt
20301 CMPQ AX, CX
20302 JAE lz4s_s2_dstfull
20303 MOVBQZX (DX), R8
20304 MOVQ R8, R9
20305 MOVQ R8, R10
20306 SHRQ $0x04, R9
20307 ANDQ $0x0f, R10
20308 CMPQ R8, $0xf0
20309 JB lz4s_s2_ll_end
20310
20311 lz4s_s2_ll_loop:
20312 INCQ DX
20313 CMPQ DX, BX
20314 JAE lz4s_s2_corrupt
20315 MOVBQZX (DX), R8
20316 ADDQ R8, R9
20317 CMPQ R8, $0xff
20318 JEQ lz4s_s2_ll_loop
20319
20320 lz4s_s2_ll_end:
20321 LEAQ (DX)(R9*1), R8
20322 ADDQ $0x03, R10
20323 CMPQ R8, BX
20324 JAE lz4s_s2_corrupt
20325 INCQ DX
20326 INCQ R8
20327 TESTQ R9, R9
20328 JZ lz4s_s2_lits_done
20329 LEAQ (AX)(R9*1), R11
20330 CMPQ R11, CX
20331 JAE lz4s_s2_dstfull
20332 ADDQ R9, SI
20333 LEAL -1(R9), R11
20334 CMPL R11, $0x3c
20335 JB one_byte_lz4s_s2
20336 CMPL R11, $0x00000100
20337 JB two_bytes_lz4s_s2
20338 CMPL R11, $0x00010000
20339 JB three_bytes_lz4s_s2
20340 CMPL R11, $0x01000000
20341 JB four_bytes_lz4s_s2
20342 MOVB $0xfc, (AX)
20343 MOVL R11, 1(AX)
20344 ADDQ $0x05, AX
20345 JMP memmove_long_lz4s_s2
20346
20347 four_bytes_lz4s_s2:
20348 MOVL R11, R12
20349 SHRL $0x10, R12
20350 MOVB $0xf8, (AX)
20351 MOVW R11, 1(AX)
20352 MOVB R12, 3(AX)
20353 ADDQ $0x04, AX
20354 JMP memmove_long_lz4s_s2
20355
20356 three_bytes_lz4s_s2:
20357 MOVB $0xf4, (AX)
20358 MOVW R11, 1(AX)
20359 ADDQ $0x03, AX
20360 JMP memmove_long_lz4s_s2
20361
20362 two_bytes_lz4s_s2:
20363 MOVB $0xf0, (AX)
20364 MOVB R11, 1(AX)
20365 ADDQ $0x02, AX
20366 CMPL R11, $0x40
20367 JB memmove_lz4s_s2
20368 JMP memmove_long_lz4s_s2
20369
20370 one_byte_lz4s_s2:
20371 SHLB $0x02, R11
20372 MOVB R11, (AX)
20373 ADDQ $0x01, AX
20374
20375 memmove_lz4s_s2:
20376 LEAQ (AX)(R9*1), R11
20377
20378 // genMemMoveShort
20379 CMPQ R9, $0x08
20380 JBE emit_lit_memmove_lz4s_s2_memmove_move_8
20381 CMPQ R9, $0x10
20382 JBE emit_lit_memmove_lz4s_s2_memmove_move_8through16
20383 CMPQ R9, $0x20
20384 JBE emit_lit_memmove_lz4s_s2_memmove_move_17through32
20385 JMP emit_lit_memmove_lz4s_s2_memmove_move_33through64
20386
20387 emit_lit_memmove_lz4s_s2_memmove_move_8:
20388 MOVQ (DX), R12
20389 MOVQ R12, (AX)
20390 JMP memmove_end_copy_lz4s_s2
20391
20392 emit_lit_memmove_lz4s_s2_memmove_move_8through16:
20393 MOVQ (DX), R12
20394 MOVQ -8(DX)(R9*1), DX
20395 MOVQ R12, (AX)
20396 MOVQ DX, -8(AX)(R9*1)
20397 JMP memmove_end_copy_lz4s_s2
20398
20399 emit_lit_memmove_lz4s_s2_memmove_move_17through32:
20400 MOVOU (DX), X0
20401 MOVOU -16(DX)(R9*1), X1
20402 MOVOU X0, (AX)
20403 MOVOU X1, -16(AX)(R9*1)
20404 JMP memmove_end_copy_lz4s_s2
20405
20406 emit_lit_memmove_lz4s_s2_memmove_move_33through64:
20407 MOVOU (DX), X0
20408 MOVOU 16(DX), X1
20409 MOVOU -32(DX)(R9*1), X2
20410 MOVOU -16(DX)(R9*1), X3
20411 MOVOU X0, (AX)
20412 MOVOU X1, 16(AX)
20413 MOVOU X2, -32(AX)(R9*1)
20414 MOVOU X3, -16(AX)(R9*1)
20415
20416 memmove_end_copy_lz4s_s2:
20417 MOVQ R11, AX
20418 JMP lz4s_s2_lits_emit_done
20419
20420 memmove_long_lz4s_s2:
20421 LEAQ (AX)(R9*1), R11
20422
20423 // genMemMoveLong
20424 MOVOU (DX), X0
20425 MOVOU 16(DX), X1
20426 MOVOU -32(DX)(R9*1), X2
20427 MOVOU -16(DX)(R9*1), X3
20428 MOVQ R9, R13
20429 SHRQ $0x05, R13
20430 MOVQ AX, R12
20431 ANDL $0x0000001f, R12
20432 MOVQ $0x00000040, R14
20433 SUBQ R12, R14
20434 DECQ R13
20435 JA emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32
20436 LEAQ -32(DX)(R14*1), R12
20437 LEAQ -32(AX)(R14*1), R15
20438
20439 emit_lit_memmove_long_lz4s_s2large_big_loop_back:
20440 MOVOU (R12), X4
20441 MOVOU 16(R12), X5
20442 MOVOA X4, (R15)
20443 MOVOA X5, 16(R15)
20444 ADDQ $0x20, R15
20445 ADDQ $0x20, R12
20446 ADDQ $0x20, R14
20447 DECQ R13
20448 JNA emit_lit_memmove_long_lz4s_s2large_big_loop_back
20449
20450 emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32:
20451 MOVOU -32(DX)(R14*1), X4
20452 MOVOU -16(DX)(R14*1), X5
20453 MOVOA X4, -32(AX)(R14*1)
20454 MOVOA X5, -16(AX)(R14*1)
20455 ADDQ $0x20, R14
20456 CMPQ R9, R14
20457 JAE emit_lit_memmove_long_lz4s_s2large_forward_sse_loop_32
20458 MOVOU X0, (AX)
20459 MOVOU X1, 16(AX)
20460 MOVOU X2, -32(AX)(R9*1)
20461 MOVOU X3, -16(AX)(R9*1)
20462 MOVQ R11, AX
20463
20464 lz4s_s2_lits_emit_done:
20465 MOVQ R8, DX
20466
20467 lz4s_s2_lits_done:
20468 CMPQ DX, BX
20469 JNE lz4s_s2_match
20470 CMPQ R10, $0x03
20471 JEQ lz4s_s2_done
20472 JMP lz4s_s2_corrupt
20473
20474 lz4s_s2_match:
20475 CMPQ R10, $0x03
20476 JEQ lz4s_s2_loop
20477 LEAQ 2(DX), R8
20478 CMPQ R8, BX
20479 JAE lz4s_s2_corrupt
20480 MOVWQZX (DX), R9
20481 MOVQ R8, DX
20482 TESTQ R9, R9
20483 JZ lz4s_s2_corrupt
20484 CMPQ R9, SI
20485 JA lz4s_s2_corrupt
20486 CMPQ R10, $0x12
20487 JNE lz4s_s2_ml_done
20488
20489 lz4s_s2_ml_loop:
20490 MOVBQZX (DX), R8
20491 INCQ DX
20492 ADDQ R8, R10
20493 CMPQ DX, BX
20494 JAE lz4s_s2_corrupt
20495 CMPQ R8, $0xff
20496 JEQ lz4s_s2_ml_loop
20497
20498 lz4s_s2_ml_done:
20499 ADDQ R10, SI
20500 CMPQ R9, DI
20501 JNE lz4s_s2_docopy
20502
20503 // emitRepeat
20504 emit_repeat_again_lz4_s2:
20505 MOVL R10, R8
20506 LEAL -4(R10), R10
20507 CMPL R8, $0x08
20508 JBE repeat_two_lz4_s2
20509 CMPL R8, $0x0c
20510 JAE cant_repeat_two_offset_lz4_s2
20511 CMPL R9, $0x00000800
20512 JB repeat_two_offset_lz4_s2
20513
20514 cant_repeat_two_offset_lz4_s2:
20515 CMPL R10, $0x00000104
20516 JB repeat_three_lz4_s2
20517 CMPL R10, $0x00010100
20518 JB repeat_four_lz4_s2
20519 CMPL R10, $0x0100ffff
20520 JB repeat_five_lz4_s2
20521 LEAL -16842747(R10), R10
20522 MOVL $0xfffb001d, (AX)
20523 MOVB $0xff, 4(AX)
20524 ADDQ $0x05, AX
20525 JMP emit_repeat_again_lz4_s2
20526
20527 repeat_five_lz4_s2:
20528 LEAL -65536(R10), R10
20529 MOVL R10, R9
20530 MOVW $0x001d, (AX)
20531 MOVW R10, 2(AX)
20532 SARL $0x10, R9
20533 MOVB R9, 4(AX)
20534 ADDQ $0x05, AX
20535 JMP lz4s_s2_loop
20536
20537 repeat_four_lz4_s2:
20538 LEAL -256(R10), R10
20539 MOVW $0x0019, (AX)
20540 MOVW R10, 2(AX)
20541 ADDQ $0x04, AX
20542 JMP lz4s_s2_loop
20543
20544 repeat_three_lz4_s2:
20545 LEAL -4(R10), R10
20546 MOVW $0x0015, (AX)
20547 MOVB R10, 2(AX)
20548 ADDQ $0x03, AX
20549 JMP lz4s_s2_loop
20550
20551 repeat_two_lz4_s2:
20552 SHLL $0x02, R10
20553 ORL $0x01, R10
20554 MOVW R10, (AX)
20555 ADDQ $0x02, AX
20556 JMP lz4s_s2_loop
20557
20558 repeat_two_offset_lz4_s2:
20559 XORQ R8, R8
20560 LEAL 1(R8)(R10*4), R10
20561 MOVB R9, 1(AX)
20562 SARL $0x08, R9
20563 SHLL $0x05, R9
20564 ORL R9, R10
20565 MOVB R10, (AX)
20566 ADDQ $0x02, AX
20567 JMP lz4s_s2_loop
20568
20569 lz4s_s2_docopy:
20570 MOVQ R9, DI
20571
20572 // emitCopy
20573 CMPL R10, $0x40
20574 JBE two_byte_offset_short_lz4_s2
20575 CMPL R9, $0x00000800
20576 JAE long_offset_short_lz4_s2
20577 MOVL $0x00000001, R8
20578 LEAL 16(R8), R8
20579 MOVB R9, 1(AX)
20580 MOVL R9, R11
20581 SHRL $0x08, R11
20582 SHLL $0x05, R11
20583 ORL R11, R8
20584 MOVB R8, (AX)
20585 ADDQ $0x02, AX
20586 SUBL $0x08, R10
20587
20588 // emitRepeat
20589 LEAL -4(R10), R10
20590 JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
20591
20592 emit_repeat_again_lz4_s2_emit_copy_short_2b:
20593 MOVL R10, R8
20594 LEAL -4(R10), R10
20595 CMPL R8, $0x08
20596 JBE repeat_two_lz4_s2_emit_copy_short_2b
20597 CMPL R8, $0x0c
20598 JAE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
20599 CMPL R9, $0x00000800
20600 JB repeat_two_offset_lz4_s2_emit_copy_short_2b
20601
20602 cant_repeat_two_offset_lz4_s2_emit_copy_short_2b:
20603 CMPL R10, $0x00000104
20604 JB repeat_three_lz4_s2_emit_copy_short_2b
20605 CMPL R10, $0x00010100
20606 JB repeat_four_lz4_s2_emit_copy_short_2b
20607 CMPL R10, $0x0100ffff
20608 JB repeat_five_lz4_s2_emit_copy_short_2b
20609 LEAL -16842747(R10), R10
20610 MOVL $0xfffb001d, (AX)
20611 MOVB $0xff, 4(AX)
20612 ADDQ $0x05, AX
20613 JMP emit_repeat_again_lz4_s2_emit_copy_short_2b
20614
20615 repeat_five_lz4_s2_emit_copy_short_2b:
20616 LEAL -65536(R10), R10
20617 MOVL R10, R9
20618 MOVW $0x001d, (AX)
20619 MOVW R10, 2(AX)
20620 SARL $0x10, R9
20621 MOVB R9, 4(AX)
20622 ADDQ $0x05, AX
20623 JMP lz4s_s2_loop
20624
20625 repeat_four_lz4_s2_emit_copy_short_2b:
20626 LEAL -256(R10), R10
20627 MOVW $0x0019, (AX)
20628 MOVW R10, 2(AX)
20629 ADDQ $0x04, AX
20630 JMP lz4s_s2_loop
20631
20632 repeat_three_lz4_s2_emit_copy_short_2b:
20633 LEAL -4(R10), R10
20634 MOVW $0x0015, (AX)
20635 MOVB R10, 2(AX)
20636 ADDQ $0x03, AX
20637 JMP lz4s_s2_loop
20638
20639 repeat_two_lz4_s2_emit_copy_short_2b:
20640 SHLL $0x02, R10
20641 ORL $0x01, R10
20642 MOVW R10, (AX)
20643 ADDQ $0x02, AX
20644 JMP lz4s_s2_loop
20645
20646 repeat_two_offset_lz4_s2_emit_copy_short_2b:
20647 XORQ R8, R8
20648 LEAL 1(R8)(R10*4), R10
20649 MOVB R9, 1(AX)
20650 SARL $0x08, R9
20651 SHLL $0x05, R9
20652 ORL R9, R10
20653 MOVB R10, (AX)
20654 ADDQ $0x02, AX
20655 JMP lz4s_s2_loop
20656
20657 long_offset_short_lz4_s2:
20658 MOVB $0xee, (AX)
20659 MOVW R9, 1(AX)
20660 LEAL -60(R10), R10
20661 ADDQ $0x03, AX
20662
20663 // emitRepeat
20664 emit_repeat_again_lz4_s2_emit_copy_short:
20665 MOVL R10, R8
20666 LEAL -4(R10), R10
20667 CMPL R8, $0x08
20668 JBE repeat_two_lz4_s2_emit_copy_short
20669 CMPL R8, $0x0c
20670 JAE cant_repeat_two_offset_lz4_s2_emit_copy_short
20671 CMPL R9, $0x00000800
20672 JB repeat_two_offset_lz4_s2_emit_copy_short
20673
20674 cant_repeat_two_offset_lz4_s2_emit_copy_short:
20675 CMPL R10, $0x00000104
20676 JB repeat_three_lz4_s2_emit_copy_short
20677 CMPL R10, $0x00010100
20678 JB repeat_four_lz4_s2_emit_copy_short
20679 CMPL R10, $0x0100ffff
20680 JB repeat_five_lz4_s2_emit_copy_short
20681 LEAL -16842747(R10), R10
20682 MOVL $0xfffb001d, (AX)
20683 MOVB $0xff, 4(AX)
20684 ADDQ $0x05, AX
20685 JMP emit_repeat_again_lz4_s2_emit_copy_short
20686
20687 repeat_five_lz4_s2_emit_copy_short:
20688 LEAL -65536(R10), R10
20689 MOVL R10, R9
20690 MOVW $0x001d, (AX)
20691 MOVW R10, 2(AX)
20692 SARL $0x10, R9
20693 MOVB R9, 4(AX)
20694 ADDQ $0x05, AX
20695 JMP lz4s_s2_loop
20696
20697 repeat_four_lz4_s2_emit_copy_short:
20698 LEAL -256(R10), R10
20699 MOVW $0x0019, (AX)
20700 MOVW R10, 2(AX)
20701 ADDQ $0x04, AX
20702 JMP lz4s_s2_loop
20703
20704 repeat_three_lz4_s2_emit_copy_short:
20705 LEAL -4(R10), R10
20706 MOVW $0x0015, (AX)
20707 MOVB R10, 2(AX)
20708 ADDQ $0x03, AX
20709 JMP lz4s_s2_loop
20710
20711 repeat_two_lz4_s2_emit_copy_short:
20712 SHLL $0x02, R10
20713 ORL $0x01, R10
20714 MOVW R10, (AX)
20715 ADDQ $0x02, AX
20716 JMP lz4s_s2_loop
20717
20718 repeat_two_offset_lz4_s2_emit_copy_short:
20719 XORQ R8, R8
20720 LEAL 1(R8)(R10*4), R10
20721 MOVB R9, 1(AX)
20722 SARL $0x08, R9
20723 SHLL $0x05, R9
20724 ORL R9, R10
20725 MOVB R10, (AX)
20726 ADDQ $0x02, AX
20727 JMP lz4s_s2_loop
20728
20729 two_byte_offset_short_lz4_s2:
20730 MOVL R10, R8
20731 SHLL $0x02, R8
20732 CMPL R10, $0x0c
20733 JAE emit_copy_three_lz4_s2
20734 CMPL R9, $0x00000800
20735 JAE emit_copy_three_lz4_s2
20736 LEAL -15(R8), R8
20737 MOVB R9, 1(AX)
20738 SHRL $0x08, R9
20739 SHLL $0x05, R9
20740 ORL R9, R8
20741 MOVB R8, (AX)
20742 ADDQ $0x02, AX
20743 JMP lz4s_s2_loop
20744
20745 emit_copy_three_lz4_s2:
20746 LEAL -2(R8), R8
20747 MOVB R8, (AX)
20748 MOVW R9, 1(AX)
20749 ADDQ $0x03, AX
20750 JMP lz4s_s2_loop
20751
20752 lz4s_s2_done:
20753 MOVQ dst_base+0(FP), CX
20754 SUBQ CX, AX
20755 MOVQ SI, uncompressed+48(FP)
20756 MOVQ AX, dstUsed+56(FP)
20757 RET
20758
20759 lz4s_s2_corrupt:
20760 XORQ AX, AX
20761 LEAQ -1(AX), SI
20762 MOVQ SI, uncompressed+48(FP)
20763 RET
20764
20765 lz4s_s2_dstfull:
20766 XORQ AX, AX
20767 LEAQ -2(AX), SI
20768 MOVQ SI, uncompressed+48(FP)
20769 RET
20770
20771 // func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
20772 // Requires: SSE2
//
// Re-encodes the token-prefixed sequences found in src (a literal run followed
// by a 16-bit offset and a match length; presumably an LZ4 block, per the
// function name) into dst as literal records and copy records.
//
// Results:
//   uncompressed  sum of every literal length and match length on success,
//                 -1 when leaving through lz4_snappy_corrupt,
//                 -2 when leaving through lz4_snappy_dstfull.
//   dstUsed       number of bytes written to dst on success. It is NOT stored
//                 on either error path.
//
// Register roles:
//   AX  dst write cursor
//   CX  dst_base + dst_len - 8 (write limit)
//   DX  src read cursor
//   BX  src_base + src_len (end of input)
//   SI  running uncompressed size
//   R8  literal length, later reused for the match offset
//   R9  match length (token low nibble + 4, plus any extension bytes)
//   DI, R10-R14, X0-X5  scratch
20773 TEXT ·cvtLZ4BlockSnappyAsm(SB), NOSPLIT, $0-64
20774 XORQ SI, SI
20775 MOVQ dst_base+0(FP), AX
20776 MOVQ dst_len+8(FP), CX
20777 MOVQ src_base+24(FP), DX
20778 MOVQ src_len+32(FP), BX
// Turn the two lengths into end pointers; the dst limit keeps 8 bytes of slack.
20779 LEAQ (DX)(BX*1), BX
20780 LEAQ -8(AX)(CX*1), CX
20781
// Main loop: one iteration per input sequence.
20782 lz4_snappy_loop:
// Running out of input here is an error; the only success exit is in
// lz4_snappy_lits_done.
20783 CMPQ DX, BX
20784 JAE lz4_snappy_corrupt
20785 CMPQ AX, CX
20786 JAE lz4_snappy_dstfull
// Split the token byte: high nibble -> R8 (literal length),
// low nibble -> R9 (match length before the +4 bias).
20787 MOVBQZX (DX), DI
20788 MOVQ DI, R8
20789 MOVQ DI, R9
20790 SHRQ $0x04, R8
20791 ANDQ $0x0f, R9
// A token below 0xf0 has a high nibble other than 15: no length extension bytes.
20792 CMPQ DI, $0xf0
20793 JB lz4_snappy_ll_end
20794
// Literal length extension: add each following byte to R8, continuing while
// the byte just read is 0xff.
20795 lz4_snappy_ll_loop:
20796 INCQ DX
20797 CMPQ DX, BX
20798 JAE lz4_snappy_corrupt
20799 MOVBQZX (DX), DI
20800 ADDQ DI, R8
20801 CMPQ DI, $0xff
20802 JEQ lz4_snappy_ll_loop
20803
20804 lz4_snappy_ll_end:
// DX still points at the last length byte consumed. After the two INCQs below,
// DX is the first literal byte and DI is the first byte after the literals.
20805 LEAQ (DX)(R8*1), DI
20806 ADDQ $0x04, R9
20807 CMPQ DI, BX
20808 JAE lz4_snappy_corrupt
20809 INCQ DX
20810 INCQ DI
// LEAQ/INCQ/CMPQ above do not matter here: TESTQ sets the flags for the JZ.
20811 TESTQ R8, R8
20812 JZ lz4_snappy_lits_done
// The literal payload must end below the write limit.
20813 LEAQ (AX)(R8*1), R10
20814 CMPQ R10, CX
20815 JAE lz4_snappy_dstfull
20816 ADDQ R8, SI
// Literal header. R10 = length-1 selects the width:
//   < 0x3c       one byte, (length-1)<<2
//   < 0x100      0xf0 + 1 length byte
//   < 0x10000    0xf4 + 2 length bytes
//   < 0x1000000  0xf8 + 3 length bytes
//   otherwise    0xfc + 4 length bytes
20817 LEAL -1(R8), R10
20818 CMPL R10, $0x3c
20819 JB one_byte_lz4_snappy
20820 CMPL R10, $0x00000100
20821 JB two_bytes_lz4_snappy
20822 CMPL R10, $0x00010000
20823 JB three_bytes_lz4_snappy
20824 CMPL R10, $0x01000000
20825 JB four_bytes_lz4_snappy
20826 MOVB $0xfc, (AX)
20827 MOVL R10, 1(AX)
20828 ADDQ $0x05, AX
20829 JMP memmove_long_lz4_snappy
20830
20831 four_bytes_lz4_snappy:
// The 24-bit length is written as a 16-bit store plus one byte for bits 16-23.
20832 MOVL R10, R11
20833 SHRL $0x10, R11
20834 MOVB $0xf8, (AX)
20835 MOVW R10, 1(AX)
20836 MOVB R11, 3(AX)
20837 ADDQ $0x04, AX
20838 JMP memmove_long_lz4_snappy
20839
20840 three_bytes_lz4_snappy:
20841 MOVB $0xf4, (AX)
20842 MOVW R10, 1(AX)
20843 ADDQ $0x03, AX
20844 JMP memmove_long_lz4_snappy
20845
20846 two_bytes_lz4_snappy:
20847 MOVB $0xf0, (AX)
20848 MOVB R10, 1(AX)
20849 ADDQ $0x02, AX
// Lengths up to 64 (R10 < 0x40) still fit the short copy below.
20850 CMPL R10, $0x40
20851 JB memmove_lz4_snappy
20852 JMP memmove_long_lz4_snappy
20853
20854 one_byte_lz4_snappy:
20855 SHLB $0x02, R10
20856 MOVB R10, (AX)
20857 ADDQ $0x01, AX
20858
// Short literal copy (at most 64 bytes). R10 = dst cursor after the payload.
20859 memmove_lz4_snappy:
20860 LEAQ (AX)(R8*1), R10
20861
20862 // genMemMoveShort
20863 CMPQ R8, $0x08
20864 JBE emit_lit_memmove_lz4_snappy_memmove_move_8
20865 CMPQ R8, $0x10
20866 JBE emit_lit_memmove_lz4_snappy_memmove_move_8through16
20867 CMPQ R8, $0x20
20868 JBE emit_lit_memmove_lz4_snappy_memmove_move_17through32
20869 JMP emit_lit_memmove_lz4_snappy_memmove_move_33through64
20870
20871 emit_lit_memmove_lz4_snappy_memmove_move_8:
// Always moves 8 bytes whatever R8 is; only R8 of them are kept because AX is
// set to R10 afterwards.
// NOTE(review): the 8-byte load can read past the literal run when R8 < 8 -
// confirm the caller guarantees readable bytes after it.
20872 MOVQ (DX), R11
20873 MOVQ R11, (AX)
20874 JMP memmove_end_copy_lz4_snappy
20875
20876 emit_lit_memmove_lz4_snappy_memmove_move_8through16:
// Two 8-byte moves (first 8 and last 8 bytes) that may overlap.
// DX is used as a temporary here; it is reloaded from DI at
// lz4_snappy_lits_emit_done.
20877 MOVQ (DX), R11
20878 MOVQ -8(DX)(R8*1), DX
20879 MOVQ R11, (AX)
20880 MOVQ DX, -8(AX)(R8*1)
20881 JMP memmove_end_copy_lz4_snappy
20882
20883 emit_lit_memmove_lz4_snappy_memmove_move_17through32:
// Two 16-byte moves (first 16 and last 16 bytes) that may overlap.
20884 MOVOU (DX), X0
20885 MOVOU -16(DX)(R8*1), X1
20886 MOVOU X0, (AX)
20887 MOVOU X1, -16(AX)(R8*1)
20888 JMP memmove_end_copy_lz4_snappy
20889
20890 emit_lit_memmove_lz4_snappy_memmove_move_33through64:
// First 32 and last 32 bytes; all loads are issued before any store.
20891 MOVOU (DX), X0
20892 MOVOU 16(DX), X1
20893 MOVOU -32(DX)(R8*1), X2
20894 MOVOU -16(DX)(R8*1), X3
20895 MOVOU X0, (AX)
20896 MOVOU X1, 16(AX)
20897 MOVOU X2, -32(AX)(R8*1)
20898 MOVOU X3, -16(AX)(R8*1)
20899
20900 memmove_end_copy_lz4_snappy:
20901 MOVQ R10, AX
20902 JMP lz4_snappy_lits_emit_done
20903
// Long literal copy. R10 = dst cursor after the payload.
20904 memmove_long_lz4_snappy:
20905 LEAQ (AX)(R8*1), R10
20906
20907 // genMemMoveLong
// Keep the first and last 32 source bytes in X0-X3; they are stored last so
// that the unaligned head and tail end up correct.
20908 MOVOU (DX), X0
20909 MOVOU 16(DX), X1
20910 MOVOU -32(DX)(R8*1), X2
20911 MOVOU -16(DX)(R8*1), X3
// R12 = number of whole 32-byte blocks. R13 = 64 - (AX & 31), so that
// AX + R13 - 32 is a multiple of 32, as the MOVOA stores below require.
20912 MOVQ R8, R12
20913 SHRQ $0x05, R12
20914 MOVQ AX, R11
20915 ANDL $0x0000001f, R11
20916 MOVQ $0x00000040, R13
20917 SUBQ R11, R13
// NOTE(review): DECQ leaves CF untouched, so JA tests CF from the SUBQ above
// (no borrow, since R11 <= 31) together with ZF from the DECQ. Every path into
// this label has R8-1 >= 0x40, so R12 >= 2 and the branch looks always taken,
// which would make the big_loop_back block unreachable - confirm.
20918 DECQ R12
20919 JA emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32
20920 LEAQ -32(DX)(R13*1), R11
20921 LEAQ -32(AX)(R13*1), R14
20922
20923 emit_lit_memmove_long_lz4_snappylarge_big_loop_back:
20924 MOVOU (R11), X4
20925 MOVOU 16(R11), X5
20926 MOVOA X4, (R14)
20927 MOVOA X5, 16(R14)
20928 ADDQ $0x20, R14
20929 ADDQ $0x20, R11
20930 ADDQ $0x20, R13
20931 DECQ R12
20932 JNA emit_lit_memmove_long_lz4_snappylarge_big_loop_back
20933
// Copy 32 bytes per iteration at offset R13-32, continuing while R8 >= R13.
// Loads are unaligned, stores are aligned.
20934 emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32:
20935 MOVOU -32(DX)(R13*1), X4
20936 MOVOU -16(DX)(R13*1), X5
20937 MOVOA X4, -32(AX)(R13*1)
20938 MOVOA X5, -16(AX)(R13*1)
20939 ADDQ $0x20, R13
20940 CMPQ R8, R13
20941 JAE emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32
// Store the saved head and tail.
20942 MOVOU X0, (AX)
20943 MOVOU X1, 16(AX)
20944 MOVOU X2, -32(AX)(R8*1)
20945 MOVOU X3, -16(AX)(R8*1)
20946 MOVQ R10, AX
20947
20948 lz4_snappy_lits_emit_done:
// Move the src cursor past the literal run.
20949 MOVQ DI, DX
20950
20951 lz4_snappy_lits_done:
// Input fully consumed after the literals: success only if the match nibble
// was zero (R9 == 0 + 4), otherwise corrupt.
20952 CMPQ DX, BX
20953 JNE lz4_snappy_match
20954 CMPQ R9, $0x04
20955 JEQ lz4_snappy_done
20956 JMP lz4_snappy_corrupt
20957
20958 lz4_snappy_match:
// Read the 16-bit offset into R8. At least one byte must remain after it.
20959 LEAQ 2(DX), DI
20960 CMPQ DI, BX
20961 JAE lz4_snappy_corrupt
20962 MOVWQZX (DX), R8
20963 MOVQ DI, DX
// Reject a zero offset and an offset larger than what has been produced so far.
20964 TESTQ R8, R8
20965 JZ lz4_snappy_corrupt
20966 CMPQ R8, SI
20967 JA lz4_snappy_corrupt
// 0x13 = nibble 15 + 4: only then do match length extension bytes follow.
20968 CMPQ R9, $0x13
20969 JNE lz4_snappy_ml_done
20970
// Match length extension: add bytes to R9 while the byte read is 0xff.
// The bounds check runs after the increment, so the extension may not consume
// the final input byte.
20971 lz4_snappy_ml_loop:
20972 MOVBQZX (DX), DI
20973 INCQ DX
20974 ADDQ DI, R9
20975 CMPQ DX, BX
20976 JAE lz4_snappy_corrupt
20977 CMPQ DI, $0xff
20978 JEQ lz4_snappy_ml_loop
20979
20980 lz4_snappy_ml_done:
20981 ADDQ R9, SI
20982
20983 // emitCopy
// While more than 64 bytes remain, emit a 3-byte record (0xee + 16-bit offset)
// that accounts for 60 bytes of the match.
20984 two_byte_offset_lz4_s2:
20985 CMPL R9, $0x40
20986 JBE two_byte_offset_short_lz4_s2
20987 MOVB $0xee, (AX)
20988 MOVW R8, 1(AX)
20989 LEAL -60(R9), R9
20990 ADDQ $0x03, AX
// At or past the write limit: jump back to the main loop, whose checks run
// again. The remainder of this match in R9 is not emitted on that path.
20991 CMPQ AX, CX
20992 JAE lz4_snappy_loop
20993 JMP two_byte_offset_lz4_s2
20994
20995 two_byte_offset_short_lz4_s2:
// DI = length*4. The 2-byte form is used only when length < 12 and
// offset < 2048; everything else takes the 3-byte form.
20996 MOVL R9, DI
20997 SHLL $0x02, DI
20998 CMPL R9, $0x0c
20999 JAE emit_copy_three_lz4_s2
21000 CMPL R8, $0x00000800
21001 JAE emit_copy_three_lz4_s2
// 2-byte form: first byte = (length*4 - 15) | (offset>>8)<<5,
// second byte = low 8 bits of the offset.
21002 LEAL -15(DI), DI
21003 MOVB R8, 1(AX)
21004 SHRL $0x08, R8
21005 SHLL $0x05, R8
21006 ORL R8, DI
21007 MOVB DI, (AX)
21008 ADDQ $0x02, AX
21009 JMP lz4_snappy_loop
21010
21011 emit_copy_three_lz4_s2:
// 3-byte form: first byte = length*4 - 2, followed by the 16-bit offset.
21012 LEAL -2(DI), DI
21013 MOVB DI, (AX)
21014 MOVW R8, 1(AX)
21015 ADDQ $0x03, AX
21016 JMP lz4_snappy_loop
21017
21018 lz4_snappy_done:
// Success: dstUsed = AX - dst_base, uncompressed = SI.
21019 MOVQ dst_base+0(FP), CX
21020 SUBQ CX, AX
21021 MOVQ SI, uncompressed+48(FP)
21022 MOVQ AX, dstUsed+56(FP)
21023 RET
21024
21025 lz4_snappy_corrupt:
// uncompressed = -1. dstUsed is left unwritten.
21026 XORQ AX, AX
21027 LEAQ -1(AX), SI
21028 MOVQ SI, uncompressed+48(FP)
21029 RET
21030
21031 lz4_snappy_dstfull:
// uncompressed = -2. dstUsed is left unwritten.
21032 XORQ AX, AX
21033 LEAQ -2(AX), SI
21034 MOVQ SI, uncompressed+48(FP)
21035 RET
21036
21037 // func cvtLZ4sBlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
21038 // Requires: SSE2
//
// Re-encodes the token-prefixed sequences found in src (presumably an LZ4s
// block, per the function name) into dst as literal records and copy records.
// Compared with cvtLZ4BlockSnappyAsm, the match length bias is 3 instead of 4,
// and a sequence whose match nibble is zero carries no offset and no match.
//
// Results:
//   uncompressed  sum of every literal length and match length on success,
//                 -1 when leaving through lz4s_snappy_corrupt,
//                 -2 when leaving through lz4s_snappy_dstfull.
//   dstUsed       number of bytes written to dst on success. It is NOT stored
//                 on either error path.
//
// Register roles:
//   AX  dst write cursor
//   CX  dst_base + dst_len - 8 (write limit)
//   DX  src read cursor
//   BX  src_base + src_len (end of input)
//   SI  running uncompressed size
//   R8  literal length, later reused for the match offset
//   R9  match length (token low nibble + 3, plus any extension bytes)
//   DI, R10-R14, X0-X5  scratch
21039 TEXT ·cvtLZ4sBlockSnappyAsm(SB), NOSPLIT, $0-64
21040 XORQ SI, SI
21041 MOVQ dst_base+0(FP), AX
21042 MOVQ dst_len+8(FP), CX
21043 MOVQ src_base+24(FP), DX
21044 MOVQ src_len+32(FP), BX
// Turn the two lengths into end pointers; the dst limit keeps 8 bytes of slack.
21045 LEAQ (DX)(BX*1), BX
21046 LEAQ -8(AX)(CX*1), CX
21047
// Main loop: one iteration per input sequence.
21048 lz4s_snappy_loop:
// Running out of input here is an error; the only success exit is in
// lz4s_snappy_lits_done.
21049 CMPQ DX, BX
21050 JAE lz4s_snappy_corrupt
21051 CMPQ AX, CX
21052 JAE lz4s_snappy_dstfull
// Split the token byte: high nibble -> R8 (literal length),
// low nibble -> R9 (match length before the +3 bias).
21053 MOVBQZX (DX), DI
21054 MOVQ DI, R8
21055 MOVQ DI, R9
21056 SHRQ $0x04, R8
21057 ANDQ $0x0f, R9
// A token below 0xf0 has a high nibble other than 15: no length extension bytes.
21058 CMPQ DI, $0xf0
21059 JB lz4s_snappy_ll_end
21060
// Literal length extension: add each following byte to R8, continuing while
// the byte just read is 0xff.
21061 lz4s_snappy_ll_loop:
21062 INCQ DX
21063 CMPQ DX, BX
21064 JAE lz4s_snappy_corrupt
21065 MOVBQZX (DX), DI
21066 ADDQ DI, R8
21067 CMPQ DI, $0xff
21068 JEQ lz4s_snappy_ll_loop
21069
21070 lz4s_snappy_ll_end:
// DX still points at the last length byte consumed. After the two INCQs below,
// DX is the first literal byte and DI is the first byte after the literals.
21071 LEAQ (DX)(R8*1), DI
21072 ADDQ $0x03, R9
21073 CMPQ DI, BX
21074 JAE lz4s_snappy_corrupt
21075 INCQ DX
21076 INCQ DI
21077 TESTQ R8, R8
21078 JZ lz4s_snappy_lits_done
// The literal payload must end below the write limit.
21079 LEAQ (AX)(R8*1), R10
21080 CMPQ R10, CX
21081 JAE lz4s_snappy_dstfull
21082 ADDQ R8, SI
// Literal header. R10 = length-1 selects the width:
//   < 0x3c       one byte, (length-1)<<2
//   < 0x100      0xf0 + 1 length byte
//   < 0x10000    0xf4 + 2 length bytes
//   < 0x1000000  0xf8 + 3 length bytes
//   otherwise    0xfc + 4 length bytes
21083 LEAL -1(R8), R10
21084 CMPL R10, $0x3c
21085 JB one_byte_lz4s_snappy
21086 CMPL R10, $0x00000100
21087 JB two_bytes_lz4s_snappy
21088 CMPL R10, $0x00010000
21089 JB three_bytes_lz4s_snappy
21090 CMPL R10, $0x01000000
21091 JB four_bytes_lz4s_snappy
21092 MOVB $0xfc, (AX)
21093 MOVL R10, 1(AX)
21094 ADDQ $0x05, AX
21095 JMP memmove_long_lz4s_snappy
21096
21097 four_bytes_lz4s_snappy:
// The 24-bit length is written as a 16-bit store plus one byte for bits 16-23.
21098 MOVL R10, R11
21099 SHRL $0x10, R11
21100 MOVB $0xf8, (AX)
21101 MOVW R10, 1(AX)
21102 MOVB R11, 3(AX)
21103 ADDQ $0x04, AX
21104 JMP memmove_long_lz4s_snappy
21105
21106 three_bytes_lz4s_snappy:
21107 MOVB $0xf4, (AX)
21108 MOVW R10, 1(AX)
21109 ADDQ $0x03, AX
21110 JMP memmove_long_lz4s_snappy
21111
21112 two_bytes_lz4s_snappy:
21113 MOVB $0xf0, (AX)
21114 MOVB R10, 1(AX)
21115 ADDQ $0x02, AX
// Lengths up to 64 (R10 < 0x40) still fit the short copy below.
21116 CMPL R10, $0x40
21117 JB memmove_lz4s_snappy
21118 JMP memmove_long_lz4s_snappy
21119
21120 one_byte_lz4s_snappy:
21121 SHLB $0x02, R10
21122 MOVB R10, (AX)
21123 ADDQ $0x01, AX
21124
// Short literal copy (at most 64 bytes). R10 = dst cursor after the payload.
21125 memmove_lz4s_snappy:
21126 LEAQ (AX)(R8*1), R10
21127
21128 // genMemMoveShort
21129 CMPQ R8, $0x08
21130 JBE emit_lit_memmove_lz4s_snappy_memmove_move_8
21131 CMPQ R8, $0x10
21132 JBE emit_lit_memmove_lz4s_snappy_memmove_move_8through16
21133 CMPQ R8, $0x20
21134 JBE emit_lit_memmove_lz4s_snappy_memmove_move_17through32
21135 JMP emit_lit_memmove_lz4s_snappy_memmove_move_33through64
21136
21137 emit_lit_memmove_lz4s_snappy_memmove_move_8:
// Always moves 8 bytes whatever R8 is; only R8 of them are kept because AX is
// set to R10 afterwards.
// NOTE(review): the 8-byte load can read past the literal run when R8 < 8 -
// confirm the caller guarantees readable bytes after it.
21138 MOVQ (DX), R11
21139 MOVQ R11, (AX)
21140 JMP memmove_end_copy_lz4s_snappy
21141
21142 emit_lit_memmove_lz4s_snappy_memmove_move_8through16:
// Two 8-byte moves (first 8 and last 8 bytes) that may overlap.
// DX is used as a temporary here; it is reloaded from DI at
// lz4s_snappy_lits_emit_done.
21143 MOVQ (DX), R11
21144 MOVQ -8(DX)(R8*1), DX
21145 MOVQ R11, (AX)
21146 MOVQ DX, -8(AX)(R8*1)
21147 JMP memmove_end_copy_lz4s_snappy
21148
21149 emit_lit_memmove_lz4s_snappy_memmove_move_17through32:
// Two 16-byte moves (first 16 and last 16 bytes) that may overlap.
21150 MOVOU (DX), X0
21151 MOVOU -16(DX)(R8*1), X1
21152 MOVOU X0, (AX)
21153 MOVOU X1, -16(AX)(R8*1)
21154 JMP memmove_end_copy_lz4s_snappy
21155
21156 emit_lit_memmove_lz4s_snappy_memmove_move_33through64:
// First 32 and last 32 bytes; all loads are issued before any store.
21157 MOVOU (DX), X0
21158 MOVOU 16(DX), X1
21159 MOVOU -32(DX)(R8*1), X2
21160 MOVOU -16(DX)(R8*1), X3
21161 MOVOU X0, (AX)
21162 MOVOU X1, 16(AX)
21163 MOVOU X2, -32(AX)(R8*1)
21164 MOVOU X3, -16(AX)(R8*1)
21165
21166 memmove_end_copy_lz4s_snappy:
21167 MOVQ R10, AX
21168 JMP lz4s_snappy_lits_emit_done
21169
// Long literal copy. R10 = dst cursor after the payload.
21170 memmove_long_lz4s_snappy:
21171 LEAQ (AX)(R8*1), R10
21172
21173 // genMemMoveLong
// Keep the first and last 32 source bytes in X0-X3; they are stored last so
// that the unaligned head and tail end up correct.
21174 MOVOU (DX), X0
21175 MOVOU 16(DX), X1
21176 MOVOU -32(DX)(R8*1), X2
21177 MOVOU -16(DX)(R8*1), X3
// R12 = number of whole 32-byte blocks. R13 = 64 - (AX & 31), so that
// AX + R13 - 32 is a multiple of 32, as the MOVOA stores below require.
21178 MOVQ R8, R12
21179 SHRQ $0x05, R12
21180 MOVQ AX, R11
21181 ANDL $0x0000001f, R11
21182 MOVQ $0x00000040, R13
21183 SUBQ R11, R13
// NOTE(review): DECQ leaves CF untouched, so JA tests CF from the SUBQ above
// (no borrow, since R11 <= 31) together with ZF from the DECQ. Every path into
// this label has R8-1 >= 0x40, so R12 >= 2 and the branch looks always taken,
// which would make the big_loop_back block unreachable - confirm.
21184 DECQ R12
21185 JA emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32
21186 LEAQ -32(DX)(R13*1), R11
21187 LEAQ -32(AX)(R13*1), R14
21188
21189 emit_lit_memmove_long_lz4s_snappylarge_big_loop_back:
21190 MOVOU (R11), X4
21191 MOVOU 16(R11), X5
21192 MOVOA X4, (R14)
21193 MOVOA X5, 16(R14)
21194 ADDQ $0x20, R14
21195 ADDQ $0x20, R11
21196 ADDQ $0x20, R13
21197 DECQ R12
21198 JNA emit_lit_memmove_long_lz4s_snappylarge_big_loop_back
21199
// Copy 32 bytes per iteration at offset R13-32, continuing while R8 >= R13.
// Loads are unaligned, stores are aligned.
21200 emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32:
21201 MOVOU -32(DX)(R13*1), X4
21202 MOVOU -16(DX)(R13*1), X5
21203 MOVOA X4, -32(AX)(R13*1)
21204 MOVOA X5, -16(AX)(R13*1)
21205 ADDQ $0x20, R13
21206 CMPQ R8, R13
21207 JAE emit_lit_memmove_long_lz4s_snappylarge_forward_sse_loop_32
// Store the saved head and tail.
21208 MOVOU X0, (AX)
21209 MOVOU X1, 16(AX)
21210 MOVOU X2, -32(AX)(R8*1)
21211 MOVOU X3, -16(AX)(R8*1)
21212 MOVQ R10, AX
21213
21214 lz4s_snappy_lits_emit_done:
// Move the src cursor past the literal run.
21215 MOVQ DI, DX
21216
21217 lz4s_snappy_lits_done:
// Input fully consumed after the literals: success only if the match nibble
// was zero (R9 == 0 + 3), otherwise corrupt.
21218 CMPQ DX, BX
21219 JNE lz4s_snappy_match
21220 CMPQ R9, $0x03
21221 JEQ lz4s_snappy_done
21222 JMP lz4s_snappy_corrupt
21223
21224 lz4s_snappy_match:
// A zero match nibble (R9 == 3) means this sequence has no offset and no
// match: go straight to the next token.
21225 CMPQ R9, $0x03
21226 JEQ lz4s_snappy_loop
// Read the 16-bit offset into R8. At least one byte must remain after it.
21227 LEAQ 2(DX), DI
21228 CMPQ DI, BX
21229 JAE lz4s_snappy_corrupt
21230 MOVWQZX (DX), R8
21231 MOVQ DI, DX
// Reject a zero offset and an offset larger than what has been produced so far.
21232 TESTQ R8, R8
21233 JZ lz4s_snappy_corrupt
21234 CMPQ R8, SI
21235 JA lz4s_snappy_corrupt
// 0x12 = nibble 15 + 3: only then do match length extension bytes follow.
21236 CMPQ R9, $0x12
21237 JNE lz4s_snappy_ml_done
21238
// Match length extension: add bytes to R9 while the byte read is 0xff.
// The bounds check runs after the increment, so the extension may not consume
// the final input byte.
21239 lz4s_snappy_ml_loop:
21240 MOVBQZX (DX), DI
21241 INCQ DX
21242 ADDQ DI, R9
21243 CMPQ DX, BX
21244 JAE lz4s_snappy_corrupt
21245 CMPQ DI, $0xff
21246 JEQ lz4s_snappy_ml_loop
21247
21248 lz4s_snappy_ml_done:
21249 ADDQ R9, SI
21250
21251 // emitCopy
// While more than 64 bytes remain, emit a 3-byte record (0xee + 16-bit offset)
// that accounts for 60 bytes of the match.
21252 two_byte_offset_lz4_s2:
21253 CMPL R9, $0x40
21254 JBE two_byte_offset_short_lz4_s2
21255 MOVB $0xee, (AX)
21256 MOVW R8, 1(AX)
21257 LEAL -60(R9), R9
21258 ADDQ $0x03, AX
// At or past the write limit: jump back to the main loop, whose checks run
// again. The remainder of this match in R9 is not emitted on that path.
21259 CMPQ AX, CX
21260 JAE lz4s_snappy_loop
21261 JMP two_byte_offset_lz4_s2
21262
21263 two_byte_offset_short_lz4_s2:
// DI = length*4. The 2-byte form is used only when length < 12 and
// offset < 2048; everything else takes the 3-byte form.
21264 MOVL R9, DI
21265 SHLL $0x02, DI
21266 CMPL R9, $0x0c
21267 JAE emit_copy_three_lz4_s2
21268 CMPL R8, $0x00000800
21269 JAE emit_copy_three_lz4_s2
// 2-byte form: first byte = (length*4 - 15) | (offset>>8)<<5,
// second byte = low 8 bits of the offset.
21270 LEAL -15(DI), DI
21271 MOVB R8, 1(AX)
21272 SHRL $0x08, R8
21273 SHLL $0x05, R8
21274 ORL R8, DI
21275 MOVB DI, (AX)
21276 ADDQ $0x02, AX
21277 JMP lz4s_snappy_loop
21278
21279 emit_copy_three_lz4_s2:
// 3-byte form: first byte = length*4 - 2, followed by the 16-bit offset.
21280 LEAL -2(DI), DI
21281 MOVB DI, (AX)
21282 MOVW R8, 1(AX)
21283 ADDQ $0x03, AX
21284 JMP lz4s_snappy_loop
21285
21286 lz4s_snappy_done:
// Success: dstUsed = AX - dst_base, uncompressed = SI.
21287 MOVQ dst_base+0(FP), CX
21288 SUBQ CX, AX
21289 MOVQ SI, uncompressed+48(FP)
21290 MOVQ AX, dstUsed+56(FP)
21291 RET
21292
21293 lz4s_snappy_corrupt:
// uncompressed = -1. dstUsed is left unwritten.
21294 XORQ AX, AX
21295 LEAQ -1(AX), SI
21296 MOVQ SI, uncompressed+48(FP)
21297 RET
21298
21299 lz4s_snappy_dstfull:
// uncompressed = -2. dstUsed is left unwritten.
21300 XORQ AX, AX
21301 LEAQ -2(AX), SI
21302 MOVQ SI, uncompressed+48(FP)
21303 RET
21304