// crc32_amd64.s

   1  // Copyright 2011 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  #include "textflag.h"
   6  
// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
//
// Register roles:
//   AX = running CRC value (Castagnoli polynomial, via the CRC32 insn)
//   SI = current read position in p
//   CX = bytes remaining
//   BX = scratch: number of bytes needed to 8-byte align SI
//
// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB),NOSPLIT,$0
	MOVL crc+0(FP), AX  // CRC value
	MOVQ p+8(FP), SI  // data pointer
	MOVQ p_len+16(FP), CX  // len(p)

	// If there are fewer than 8 bytes to process, skip alignment.
	CMPQ CX, $8
	JL less_than_8

	MOVQ SI, BX
	ANDQ $7, BX
	JZ aligned

	// Process the first few bytes to 8-byte align the input.

	// BX = 8 - BX. We need to process this many bytes to align.
	// For BX in [1,7], (BX-1) XOR 7 == 8-BX, avoiding a NEG/ADD pair.
	SUBQ $1, BX
	XORQ $7, BX

	// Consume 1, then 2, then 4 bytes according to the low bits of BX,
	// so SI ends up 8-byte aligned and CX is reduced to match.
	BTQ $0, BX
	JNC align_2

	CRC32B (SI), AX
	DECQ CX
	INCQ SI

align_2:
	BTQ $1, BX
	JNC align_4

	CRC32W (SI), AX

	SUBQ $2, CX
	ADDQ $2, SI

align_4:
	BTQ $2, BX
	JNC aligned

	CRC32L (SI), AX

	SUBQ $4, CX
	ADDQ $4, SI

aligned:
	// The input is now 8-byte aligned and we can process 8-byte chunks.
	CMPQ CX, $8
	JL less_than_8

	CRC32Q (SI), AX
	ADDQ $8, SI
	SUBQ $8, CX
	JMP aligned

less_than_8:
	// We may have some bytes left over; process 4 bytes, then 2, then 1.
	// CX < 8 here, so bits 2/1/0 of CX fully describe the tail length.
	BTQ $2, CX
	JNC less_than_4

	CRC32L (SI), AX
	ADDQ $4, SI

less_than_4:
	BTQ $1, CX
	JNC less_than_2

	CRC32W (SI), AX
	ADDQ $2, SI

less_than_2:
	BTQ $0, CX
	JNC done

	CRC32B (SI), AX

done:
	// ret+32: crc(4)+pad(4) + slice header ptr(8)+len(8)+cap(8) precede it.
	MOVL AX, ret+32(FP)
	RET
  88  
// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
// bytes from each buffer.
//
// Only the data pointers of the three slices are loaded (a+16, b+40, c+64:
// each slice header is 24 bytes); the lengths are never checked, so the
// caller must guarantee each slice holds at least 24*rounds bytes. The loop
// is do-while shaped (DECQ/JNZ), so it executes at least once: rounds must
// be nonzero.
//
// func castagnoliSSE42Triple(
//     crc1, crc2, crc3 uint32,
//     a, b, c []byte,
//     rounds uint32,
// ) (retA uint32, retB uint32, retC uint32)
TEXT ·castagnoliSSE42Triple(SB),NOSPLIT,$0
	MOVL crcA+0(FP), AX
	MOVL crcB+4(FP), CX
	MOVL crcC+8(FP), DX

	MOVQ a+16(FP), R8   // data pointer
	MOVQ b+40(FP), R9   // data pointer
	MOVQ c+64(FP), R10  // data pointer

	MOVL rounds+88(FP), R11

loop:
	// The three CRC streams are independent, so interleaving them lets
	// the CRC32 instruction latencies overlap; 24 bytes per buffer per
	// iteration.
	CRC32Q (R8), AX
	CRC32Q (R9), CX
	CRC32Q (R10), DX

	CRC32Q 8(R8), AX
	CRC32Q 8(R9), CX
	CRC32Q 8(R10), DX

	CRC32Q 16(R8), AX
	CRC32Q 16(R9), CX
	CRC32Q 16(R10), DX

	ADDQ $24, R8
	ADDQ $24, R9
	ADDQ $24, R10

	DECQ R11
	JNZ loop

	MOVL AX, retA+96(FP)
	MOVL CX, retB+100(FP)
	MOVL DX, retC+104(FP)
	RET
 132  
 133  // CRC32 polynomial data
 134  //
 135  // These constants are lifted from the
 136  // Linux kernel, since they avoid the costly
 137  // PSHUFB 16 byte reversal proposed in the
 138  // original Intel paper.
DATA r2r1<>+0(SB)/8, $0x154442bd4 // folding multipliers used by the 64-byte loop (see Intel paper)
DATA r2r1<>+8(SB)/8, $0x1c6e41596
DATA r4r3<>+0(SB)/8, $0x1751997d0 // folding multipliers used to reduce to a single 16-byte register
DATA r4r3<>+8(SB)/8, $0x0ccaa009e
DATA rupoly<>+0(SB)/8, $0x1db710641 // Barrett reduction: polynomial and its inverse — TODO confirm which half is which
DATA rupoly<>+8(SB)/8, $0x1f7011641
DATA r5<>+0(SB)/8, $0x163cd6124 // constant for the final 64->32 bit fold

GLOBL r2r1<>(SB),RODATA,$16
GLOBL r4r3<>(SB),RODATA,$16
GLOBL rupoly<>(SB),RODATA,$16
GLOBL r5<>(SB),RODATA,$8
 151  
 152  // Based on https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
 153  // len(p) must be at least 64, and must be a multiple of 16.
 154  
 155  // func ieeeCLMUL(crc uint32, p []byte) uint32
 156  TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
 157  	MOVL   crc+0(FP), X0             // Initial CRC value
 158  	MOVQ   p+8(FP), SI  	         // data pointer
 159  	MOVQ   p_len+16(FP), CX          // len(p)
 160  
 161  	MOVOU  (SI), X1
 162  	MOVOU  16(SI), X2
 163  	MOVOU  32(SI), X3
 164  	MOVOU  48(SI), X4
 165  	PXOR   X0, X1
 166  	ADDQ   $64, SI                  // buf+=64
 167  	SUBQ   $64, CX                  // len-=64
 168  	CMPQ   CX, $64                  // Less than 64 bytes left
 169  	JB     remain64
 170  
 171  	MOVOA  r2r1<>+0(SB), X0
 172  loopback64:
 173  	MOVOA  X1, X5
 174  	MOVOA  X2, X6
 175  	MOVOA  X3, X7
 176  	MOVOA  X4, X8
 177  
 178  	PCLMULQDQ $0, X0, X1
 179  	PCLMULQDQ $0, X0, X2
 180  	PCLMULQDQ $0, X0, X3
 181  	PCLMULQDQ $0, X0, X4
 182  
 183  	/* Load next early */
 184  	MOVOU    (SI), X11
 185  	MOVOU    16(SI), X12
 186  	MOVOU    32(SI), X13
 187  	MOVOU    48(SI), X14
 188  
 189  	PCLMULQDQ $0x11, X0, X5
 190  	PCLMULQDQ $0x11, X0, X6
 191  	PCLMULQDQ $0x11, X0, X7
 192  	PCLMULQDQ $0x11, X0, X8
 193  
 194  	PXOR     X5, X1
 195  	PXOR     X6, X2
 196  	PXOR     X7, X3
 197  	PXOR     X8, X4
 198  
 199  	PXOR     X11, X1
 200  	PXOR     X12, X2
 201  	PXOR     X13, X3
 202  	PXOR     X14, X4
 203  
 204  	ADDQ    $0x40, DI
 205  	ADDQ    $64, SI      // buf+=64
 206  	SUBQ    $64, CX      // len-=64
 207  	CMPQ    CX, $64      // Less than 64 bytes left?
 208  	JGE     loopback64
 209  
 210  	/* Fold result into a single register (X1) */
 211  remain64:
 212  	MOVOA       r4r3<>+0(SB), X0
 213  
 214  	MOVOA       X1, X5
 215  	PCLMULQDQ   $0, X0, X1
 216  	PCLMULQDQ   $0x11, X0, X5
 217  	PXOR        X5, X1
 218  	PXOR        X2, X1
 219  
 220  	MOVOA       X1, X5
 221  	PCLMULQDQ   $0, X0, X1
 222  	PCLMULQDQ   $0x11, X0, X5
 223  	PXOR        X5, X1
 224  	PXOR        X3, X1
 225  
 226  	MOVOA       X1, X5
 227  	PCLMULQDQ   $0, X0, X1
 228  	PCLMULQDQ   $0x11, X0, X5
 229  	PXOR        X5, X1
 230  	PXOR        X4, X1
 231  
 232  	/* If there is less than 16 bytes left we are done */
 233  	CMPQ        CX, $16
 234  	JB          finish
 235  
 236  	/* Encode 16 bytes */
 237  remain16:
 238  	MOVOU       (SI), X10
 239  	MOVOA       X1, X5
 240  	PCLMULQDQ   $0, X0, X1
 241  	PCLMULQDQ   $0x11, X0, X5
 242  	PXOR        X5, X1
 243  	PXOR        X10, X1
 244  	SUBQ        $16, CX
 245  	ADDQ        $16, SI
 246  	CMPQ        CX, $16
 247  	JGE         remain16
 248  
 249  finish:
 250  	/* Fold final result into 32 bits and return it */
 251  	PCMPEQB     X3, X3
 252  	PCLMULQDQ   $1, X1, X0
 253  	PSRLDQ      $8, X1
 254  	PXOR        X0, X1
 255  
 256  	MOVOA       X1, X2
 257  	MOVQ        r5<>+0(SB), X0
 258  
 259  	/* Creates 32 bit mask. Note that we don't care about upper half. */
 260  	PSRLQ       $32, X3
 261  
 262  	PSRLDQ      $4, X2
 263  	PAND        X3, X1
 264  	PCLMULQDQ   $0, X0, X1
 265  	PXOR        X2, X1
 266  
 267  	MOVOA       rupoly<>+0(SB), X0
 268  
 269  	MOVOA       X1, X2
 270  	PAND        X3, X1
 271  	PCLMULQDQ   $0x10, X0, X1
 272  	PAND        X3, X1
 273  	PCLMULQDQ   $0, X0, X1
 274  	PXOR        X2, X1
 275  
 276  	PEXTRD	$1, X1, AX
 277  	MOVL        AX, ret+32(FP)
 278  
 279  	RET
 280