gnarl_hash_amd64.s raw
1 #include "textflag.h"
2
// func gnarlAccumulateAVX2(acc *[32]uint32, basis *[12][27][32]uint16, block *[41]byte)
//
// ABI: Go ABI0 (arguments on the stack, addressed through FP), leaf
// function, no stack frame.
//
// Computes, for the 324-bit string held in block,
//
//     acc[k] = sum over every set bit n of uint32(row(n)[k]),  k = 0..31
//
// where bit n is bit (n % 8) of block[n / 8] (least significant bit
// first) and row(n) is the 64-byte row at byte offset n*64 from basis,
// i.e. basis[n/27][n%27]. The previous contents of acc are NOT read: the
// sums start from zero and the result overwrites acc.
//
// 324 = 40*8 + 4, so the upper four bits of block[40] are padding. They
// are masked off once when that byte is loaded, so the per-bit loop
// needs no bounds check.
//
// Each lane can receive at most 324 * 65535 < 2^32, so the uint32 sums
// cannot wrap.
//
// CPU requirements: AVX2 (VPMOVZXWD / VPADDD on Y registers) and BMI1
// (TZCNT, BLSR). The caller is responsible for the feature check.
//
// Register allocation:
//   BX     = acc pointer
//   SI     = basis pointer
//   DI     = block pointer
//   R8     = byte index (0..40)
//   R9     = bits of the current byte still to be processed
//   R10    = flat index of bit 0 of the current byte (R8 * 8)
//   R11    = flat bit index, then pointer to the selected basis row
//   AX     = TZCNT result (bit position within the byte)
//   Y0-Y3  = running sums for acc[0..7], [8..15], [16..23], [24..31]
//   Y4-Y7  = zero-extended quarters of the current basis row

// Number of bytes in *block.
#define GNARL_BLOCK_BYTES 41
// Index of the final block byte, the only one that contains padding bits.
#define GNARL_LAST_BYTE 40
// Valid bits of the final byte: 324 - 40*8 = 4 low bits.
#define GNARL_TAIL_MASK 0x0F
// log2 of the basis row size in bytes (32 * uint16 = 64).
#define GNARL_ROW_SHIFT 6

TEXT ·gnarlAccumulateAVX2(SB), NOSPLIT, $0-24
	MOVQ acc+0(FP), BX
	MOVQ basis+8(FP), SI
	MOVQ block+16(FP), DI

	// Sums start at zero; acc is only written, never read.
	VPXOR Y0, Y0, Y0
	VPXOR Y1, Y1, Y1
	VPXOR Y2, Y2, Y2
	VPXOR Y3, Y3, Y3

	XORQ R8, R8                       // byte index = 0

byteloop:
	// Invariant: 0 <= R8 < GNARL_BLOCK_BYTES.
	MOVBQZX (DI)(R8*1), R9

	// Only the final byte carries padding; strip it here so that every
	// bit seen by bitloop is a valid flat index (< 324).
	CMPQ R8, $GNARL_LAST_BYTE
	JNE  havebyte
	ANDQ $GNARL_TAIL_MASK, R9

havebyte:
	// Nothing to add when no bit is set in this byte.
	TESTQ R9, R9
	JZ    nextbyte

	// R10 = flat index of this byte's bit 0.
	MOVQ R8, R10
	SHLQ $3, R10

bitloop:
	// Invariant: R9 != 0. AX = position of its lowest set bit.
	TZCNTQ R9, AX

	// R11 = basis + (R10 + AX) * 64: address of the selected row.
	LEAQ (R10)(AX*1), R11
	SHLQ $GNARL_ROW_SHIFT, R11
	ADDQ SI, R11

	// Widen the row's 32 uint16 to uint32, eight lanes at a time, and
	// accumulate into the matching running sum.
	VPMOVZXWD (R11), Y4
	VPMOVZXWD 16(R11), Y5
	VPMOVZXWD 32(R11), Y6
	VPMOVZXWD 48(R11), Y7
	VPADDD    Y4, Y0, Y0
	VPADDD    Y5, Y1, Y1
	VPADDD    Y6, Y2, Y2
	VPADDD    Y7, Y3, Y3

	// Clear the bit just handled; BLSR sets ZF when none remain.
	BLSRQ R9, R9
	JNZ   bitloop

nextbyte:
	INCQ R8
	CMPQ R8, $GNARL_BLOCK_BYTES
	JB   byteloop                     // unsigned: index < byte count

	// Write the 32 sums back to *acc.
	VMOVDQU Y0, (BX)
	VMOVDQU Y1, 32(BX)
	VMOVDQU Y2, 64(BX)
	VMOVDQU Y3, 96(BX)

	// Avoid AVX/SSE transition penalties in the caller.
	VZEROUPPER
	RET
96 RET
97