gnarl_hash_amd64.s raw

   1  #include "textflag.h"
   2  
   3  // func gnarlAccumulateAVX2(acc *[32]uint32, basis *[12][27][32]uint16, block *[41]byte)
   4  //
   5  // Scans 324 bits in block (41 bytes, LSB-first within each byte). For each set
   6  // bit at flat index n, sums basis row n (32 uint16, zero-extended to uint32)
   7  // with AVX2 VPMOVZXWD + VPADDD; the sums start at zero and overwrite acc[0..31].
   8  //
   9  // Processes block byte-by-byte. Within each byte, uses TZCNT+BLSR (BMI1) to
  10  // iterate only over set bits (~162 of 324 if the bits are uniformly random).
  11  //
  12  // Register allocation:
  13  //   BX  = acc pointer (only written, at done)
  14  //   SI  = basis pointer (row n starts at SI + n*64)
  15  //   DI  = block pointer
  16  //   R8  = byte index (0..40)
  17  //   R9  = current byte value (set bits not yet processed)
  18  //   R10 = bit base for current byte (R8 * 8)
  19  //   R11 = scratch (row pointer)
  20  //   AX  = TZCNT result (bit position within byte)
  21  //   CX  = flat bit index
  22  //   Y0-Y3 = running sums for acc[0..7], acc[8..15], acc[16..23], acc[24..31]
  23  //   Y4-Y7 = scratch for VPMOVZXWD
  24  TEXT ·gnarlAccumulateAVX2(SB), NOSPLIT, $0-24 // no local frame; 24 bytes of arguments (3 pointers)
  25  	MOVQ acc+0(FP), BX
  26  	MOVQ basis+8(FP), SI
  27  	MOVQ block+16(FP), DI
  28  
  29  	// Start the running sums at zero; acc's previous contents are never loaded.
  30  	VPXOR Y0, Y0, Y0
  31  	VPXOR Y1, Y1, Y1
  32  	VPXOR Y2, Y2, Y2
  33  	VPXOR Y3, Y3, Y3
  34  
  35  	XORQ R8, R8          // byteIdx = 0
  36  
  37  byteloop:
  38  	CMPQ R8, $41         // all 41 block bytes consumed?
  39  	JGE done
  40  
  41  	// Load block[R8] zero-extended; a zero byte has no set bits, so skip it.
  42  	MOVBQZX (DI)(R8*1), R9
  43  	TESTQ R9, R9
  44  	JZ nextbyte
  45  
  46  	// R10 = flat index of bit 0 of this byte = R8 * 8.
  47  	MOVQ R8, R10
  48  	SHLQ $3, R10
  49  
  50  bitloop:
  51  	// Find lowest set bit in R9. R9 is non-zero on every entry to bitloop.
  52  	TZCNTQ R9, AX        // AX = position of lowest set bit
  53  
  54  	// Flat bit index = R10 + AX.
  55  	LEAQ (R10)(AX*1), CX
  56  
  57  	// Bounds check: bits 324..327 (upper 4 bits of byte 40) are skipped without an add.
  58  	CMPQ CX, $324
  59  	JGE clearbits
  60  
  61  	// Row pointer = basis + CX * 64 (one row = 32 uint16 = 64 bytes).
  62  	MOVQ CX, R11
  63  	SHLQ $6, R11
  64  	ADDQ SI, R11
  65  
  66  	// Load 4 groups of 8 uint16, zero-extend to uint32, add to the running sums.
  67  	VPMOVZXWD (R11), Y4      // row[0..7]
  68  	VPADDD Y4, Y0, Y0
  69  
  70  	VPMOVZXWD 16(R11), Y5    // row[8..15]
  71  	VPADDD Y5, Y1, Y1
  72  
  73  	VPMOVZXWD 32(R11), Y6    // row[16..23]
  74  	VPADDD Y6, Y2, Y2
  75  
  76  	VPMOVZXWD 48(R11), Y7    // row[24..31]
  77  	VPADDD Y7, Y3, Y3
  78  
  79  clearbits:
  80  	// Clear lowest set bit: R9 = R9 & (R9 - 1). BLSR's zero flag drives the JNZ below.
  81  	BLSRQ R9, R9
  82  	JNZ bitloop           // more bits remaining
  83  
  84  nextbyte:
  85  	INCQ R8
  86  	JMP byteloop
  87  
  88  done:
  89  	// Store the four sum registers over acc[0..31] (unaligned stores, 128 bytes total).
  90  	VMOVDQU Y0, (BX)
  91  	VMOVDQU Y1, 32(BX)
  92  	VMOVDQU Y2, 64(BX)
  93  	VMOVDQU Y3, 96(BX)
  94  
  95  	VZEROUPPER            // clear upper YMM halves before returning to non-AVX code
  96  	RET
  97