xhex_amd64.s raw

   1  #include "textflag.h"
   2  
   3  #define dst R8
   4  #define src R9
   5  #define len R11
   6  #define mask Y0
   7  #define rmask Y1
   8  #define tbl Y2
   9  
  10  // HEX_TBL doubles normal hex table for AVX2 register (expand 16Bytes to 32Bytes)
  11  DATA HEX_TBL<>+0x00(SB)/32, $"0123456789abcdef0123456789abcdef"
  12  GLOBL HEX_TBL<>(SB), RODATA, $32
  13  
  14  // func encodeAVX2(dst, src *byte, n int)
  15  TEXT ·encodeAVX2(SB), NOSPLIT, $0
  16  	MOVQ  d+0(FP), dst
  17  	MOVQ  s+8(FP), src
  18  	MOVQ  n+16(FP), len
  19  	SHRQ  $4, len       // n / 16.
  20  	TESTQ len, len
  21  	JZ    ret           // If 0, return.
  22  
  23  	// Preparation.
  24  	// Make a 32Bytes mask filled with $0x0f.
  25  	MOVQ         $0x0f, DX
  26  	MOVQ         DX, X0
  27  	VPBROADCASTB X0, mask
  28  	MOVQ         ·replaceHighMask(SB), AX
  29  	VMOVDQU      (AX), rmask
  30  	VMOVDQU      HEX_TBL<>(SB), tbl
  31  
  32  loop16b:
  33  	VMOVDQU   (src), X3     // Load 16bytes source.
  34  	VPMOVZXBW X3, Y3        // Zero extend 16bytes to 32bytes.
  35  	VPSRLW    $4, Y3, Y4    // >> 4.
  36  	VPSHUFB   rmask, Y3, Y3
  37  	VPOR      Y4, Y3, Y4
  38  	VPAND     mask, Y4, Y4  // Clean high bits.
  39  	VPSHUFB   Y4, tbl, Y4
  40  	VMOVDQU   Y4, (dst)
  41  
  42  	ADDQ $16, src
  43  	ADDQ $32, dst
  44  	SUBQ $1, len
  45  	JNE  loop16b
  46  	VZEROUPPER
  47  
  48  ret:
  49  	RET
  50  
  51  #define mask15 X0
  52  #define maska X1
  53  #define maskb X2
  54  #define mask9 X10
  55  
  56  // func decodeAVX2(dst, src *byte, n int)
  57  TEXT ·decodeAVX2(SB), NOSPLIT, $0
  58  	MOVQ  d+0(FP), dst
  59  	MOVQ  s+8(FP), src
  60  	MOVQ  n+16(FP), len
  61  	SHRQ  $5, len       // n / 32.
  62  	TESTQ len, len
  63  	JZ    ret           // If 0, return.
  64  
  65  	MOVQ         $0x0f, DX
  66  	MOVQ         DX, X0
  67  	VPBROADCASTW X0, mask15
  68  	MOVQ         $9, DX
  69  	MOVQ         DX, X10
  70  	VPBROADCASTW X10, mask9
  71  
  72  	MOVQ    ·decodeMask1(SB), AX
  73  	VMOVDQU (AX), maska
  74  	MOVQ    ·decodeMask2(SB), BX
  75  	VMOVDQU (BX), maskb
  76  
  77  loop32b:
  78  	VMOVDQU (src), X3
  79  	VMOVDQU 16(src), X4
  80  
  81  	VPSHUFB maska, X3, X5
  82  	VPSHUFB maskb, X3, X6
  83  	VPSHUFB maska, X4, X7
  84  	VPSHUFB maskb, X4, X8
  85  
  86  	VPAND      mask15, X5, X11
  87  	VPSRAW     $6, X5, X12
  88  	VPMADDUBSW mask9, X12, X12
  89  	VPADDW     X11, X12, X5
  90  
  91  	VPAND      mask15, X6, X11
  92  	VPSRAW     $6, X6, X12
  93  	VPMADDUBSW mask9, X12, X12
  94  	VPADDW     X11, X12, X6
  95  
  96  	VPAND      mask15, X7, X11
  97  	VPSRAW     $6, X7, X12
  98  	VPMADDUBSW mask9, X12, X12
  99  	VPADDW     X11, X12, X7
 100  
 101  	VPAND      mask15, X8, X11
 102  	VPSRAW     $6, X8, X12
 103  	VPMADDUBSW mask9, X12, X12
 104  	VPADDW     X11, X12, X8
 105  
 106  	VPSLLW $4, X5, X5
 107  	VPSLLW $4, X7, X7
 108  	VPOR   X5, X6, X5
 109  	VPOR   X7, X8, X7
 110  
 111  	VPACKUSWB X5, X7, X9
 112  	VPSHUFD   $78, X9, X9
 113  
 114  	VMOVDQU X9, (dst)
 115  
 116  	ADDQ $32, src
 117  	ADDQ $16, dst
 118  	SUBQ $1, len
 119  	JNE  loop32b
 120  	VZEROUPPER
 121  
 122  ret:
 123  	RET
 124