mask_amd64.s raw

   1  #include "textflag.h"
   2  
   3  // func maskAsm(b *byte, len int, key uint32)
   4  TEXT ·maskAsm(SB), NOSPLIT, $0-28
   5  	// AX = b
   6  	// CX = len (left length)
   7  	// SI = key (uint32)
   8  	// DI = uint64(SI) | uint64(SI)<<32
   9  	MOVQ b+0(FP), AX
  10  	MOVQ len+8(FP), CX
  11  	MOVL key+16(FP), SI
  12  
  13  	// calculate the DI
  14  	// DI = SI<<32 | SI
  15  	MOVL SI, DI
  16  	MOVQ DI, DX
  17  	SHLQ $32, DI
  18  	ORQ  DX, DI
  19  
  20  	CMPQ  CX, $15
  21  	JLE   less_than_16
  22  	CMPQ  CX, $63
  23  	JLE   less_than_64
  24  	CMPQ  CX, $128
  25  	JLE   sse
  26  	TESTQ $31, AX
  27  	JNZ   unaligned
  28  
  29  unaligned_loop_1byte:
  30  	XORB  SI, (AX)
  31  	INCQ  AX
  32  	DECQ  CX
  33  	ROLL  $24, SI
  34  	TESTQ $7, AX
  35  	JNZ   unaligned_loop_1byte
  36  
  37  	// calculate DI again since SI was modified
  38  	// DI = SI<<32 | SI
  39  	MOVL SI, DI
  40  	MOVQ DI, DX
  41  	SHLQ $32, DI
  42  	ORQ  DX, DI
  43  
  44  	TESTQ $31, AX
  45  	JZ    sse
  46  
  47  unaligned:
  48  	TESTQ $7, AX               // AND $7 & len, if not zero jump to loop_1b.
  49  	JNZ   unaligned_loop_1byte
  50  
  51  unaligned_loop:
  52  	// we don't need to check the CX since we know it's above 128
  53  	XORQ  DI, (AX)
  54  	ADDQ  $8, AX
  55  	SUBQ  $8, CX
  56  	TESTQ $31, AX
  57  	JNZ   unaligned_loop
  58  	JMP   sse
  59  
  60  sse:
  61  	CMPQ       CX, $0x40
  62  	JL         less_than_64
  63  	MOVQ       DI, X0
  64  	PUNPCKLQDQ X0, X0
  65  
  66  sse_loop:
  67  	MOVOU 0*16(AX), X1
  68  	MOVOU 1*16(AX), X2
  69  	MOVOU 2*16(AX), X3
  70  	MOVOU 3*16(AX), X4
  71  	PXOR  X0, X1
  72  	PXOR  X0, X2
  73  	PXOR  X0, X3
  74  	PXOR  X0, X4
  75  	MOVOU X1, 0*16(AX)
  76  	MOVOU X2, 1*16(AX)
  77  	MOVOU X3, 2*16(AX)
  78  	MOVOU X4, 3*16(AX)
  79  	ADDQ  $0x40, AX
  80  	SUBQ  $0x40, CX
  81  	CMPQ  CX, $0x40
  82  	JAE   sse_loop
  83  
  84  less_than_64:
  85  	TESTQ $32, CX
  86  	JZ    less_than_32
  87  	XORQ  DI, (AX)
  88  	XORQ  DI, 8(AX)
  89  	XORQ  DI, 16(AX)
  90  	XORQ  DI, 24(AX)
  91  	ADDQ  $32, AX
  92  
  93  less_than_32:
  94  	TESTQ $16, CX
  95  	JZ    less_than_16
  96  	XORQ  DI, (AX)
  97  	XORQ  DI, 8(AX)
  98  	ADDQ  $16, AX
  99  
 100  less_than_16:
 101  	TESTQ $8, CX
 102  	JZ    less_than_8
 103  	XORQ  DI, (AX)
 104  	ADDQ  $8, AX
 105  
 106  less_than_8:
 107  	TESTQ $4, CX
 108  	JZ    less_than_4
 109  	XORL  SI, (AX)
 110  	ADDQ  $4, AX
 111  
 112  less_than_4:
 113  	TESTQ $2, CX
 114  	JZ    less_than_2
 115  	XORW  SI, (AX)
 116  	ROLL  $16, SI
 117  	ADDQ  $2, AX
 118  
 119  less_than_2:
 120  	TESTQ $1, CX
 121  	JZ    done
 122  	XORB  SI, (AX)
 123  	ROLL  $24, SI
 124  
 125  done:
 126  	MOVL SI, ret+24(FP)
 127  	RET
 128