crc32_s390x.s raw

   1  // Copyright 2016 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  #include "textflag.h"
   6  
   7  // Vector register range containing CRC-32 constants
      //
      // Each alias names the vector register that receives one 16-byte entry
      // of the constant pool.  The registers are consecutive (V9..V14) and in
      // pool order because vectorizedBody fills them with a single VLM from
      // CONST_PERM_LE2BE through CONST_CRC_POLY.
   8  
   9  #define CONST_PERM_LE2BE        V9
  10  #define CONST_R2R1              V10
  11  #define CONST_R4R3              V11
  12  #define CONST_R5                V12
  13  #define CONST_RU_POLY           V13
  14  #define CONST_CRC_POLY          V14
  15  
  16  
  17  // The CRC-32 constant block contains reduction constants to fold and
  18  // process particular chunks of the input data stream in parallel.
  19  //
  20  // Note that the constant definitions below are extended in order to compute
  21  // intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
  22  // The rightmost doubleword can be 0 to prevent contribution to the result or
  23  // can be multiplied by 1 to perform an XOR without the need for a separate
  24  // VECTOR EXCLUSIVE OR instruction.
  25  //
  26  // The polynomials used are bit-reflected:
  27  //
  28  //            IEEE: P'(x) = 0x0edb88320
  29  //      Castagnoli: P'(x) = 0x082f63b78
  30  
  31  
  32  // IEEE polynomial constants
      //
      // Six 16-byte entries (96 bytes of initialised data), laid out in the
      // order in which vectorizedBody loads them into V9..V14:
      // byte-reversal mask, R2|R1, R4|R3, 0|R5, 0|u', 0|(P'(x) << 1).
  33  DATA    ·crclecons+0(SB)/8,  $0x0F0E0D0C0B0A0908       // LE-to-BE mask
  34  DATA    ·crclecons+8(SB)/8,  $0x0706050403020100
  35  DATA    ·crclecons+16(SB)/8, $0x00000001c6e41596       // R2
  36  DATA    ·crclecons+24(SB)/8, $0x0000000154442bd4       // R1
  37  DATA    ·crclecons+32(SB)/8, $0x00000000ccaa009e       // R4
  38  DATA    ·crclecons+40(SB)/8, $0x00000001751997d0       // R3
  39  DATA    ·crclecons+48(SB)/8, $0x0000000000000000       // zero half of the R5 entry
  40  DATA    ·crclecons+56(SB)/8, $0x0000000163cd6124       // R5
  41  DATA    ·crclecons+64(SB)/8, $0x0000000000000000       // zero half of the u' entry
  42  DATA    ·crclecons+72(SB)/8, $0x00000001F7011641       // u'
  43  DATA    ·crclecons+80(SB)/8, $0x0000000000000000       // zero half of the polynomial entry
  44  DATA    ·crclecons+88(SB)/8, $0x00000001DB710641       // P'(x) << 1
  45  
      // NOTE(review): the symbol is declared as 144 bytes, but only offsets
      // 0..95 are initialised above and only 96 bytes are loaded by the VLM
      // in vectorizedBody.  Confirm whether the extra 48 bytes are intended.
  46  GLOBL    ·crclecons(SB),RODATA, $144
  47  
  48  // Castagnoli polynomial constants
      //
      // Same layout as the IEEE pool: six 16-byte entries (96 bytes of
      // initialised data) in the order in which vectorizedBody loads them
      // into V9..V14: byte-reversal mask, R2|R1, R4|R3, 0|R5, 0|u',
      // 0|(P'(x) << 1).
  49  DATA    ·crcclecons+0(SB)/8,  $0x0F0E0D0C0B0A0908      // LE-to-BE mask
  50  DATA    ·crcclecons+8(SB)/8,  $0x0706050403020100
  51  DATA    ·crcclecons+16(SB)/8, $0x000000009e4addf8      // R2
  52  DATA    ·crcclecons+24(SB)/8, $0x00000000740eef02      // R1
  53  DATA    ·crcclecons+32(SB)/8, $0x000000014cd00bd6      // R4
  54  DATA    ·crcclecons+40(SB)/8, $0x00000000f20c0dfe      // R3
  55  DATA    ·crcclecons+48(SB)/8, $0x0000000000000000      // zero half of the R5 entry
  56  DATA    ·crcclecons+56(SB)/8, $0x00000000dd45aab8      // R5
  57  DATA    ·crcclecons+64(SB)/8, $0x0000000000000000      // zero half of the u' entry
  58  DATA    ·crcclecons+72(SB)/8, $0x00000000dea713f1      // u'
  59  DATA    ·crcclecons+80(SB)/8, $0x0000000000000000      // zero half of the polynomial entry
  60  DATA    ·crcclecons+88(SB)/8, $0x0000000105ec76f0      // P'(x) << 1
  61  
      // NOTE(review): the symbol is declared as 144 bytes, but only offsets
      // 0..95 are initialised above and only 96 bytes are loaded by the VLM
      // in vectorizedBody.  Confirm whether the extra 48 bytes are intended.
  62  GLOBL   ·crcclecons(SB),RODATA, $144
  63  
  64  // The CRC-32 function(s) use these calling conventions:
  65  //
  66  // Parameters:
  67  //
  68  //      R2:    Initial CRC value, typically ~0; and final CRC (return) value.
  69  //      R3:    Input buffer pointer, performance might be improved if the
  70  //             buffer is on a doubleword boundary.
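  71  //      R4:    Length of the buffer in bytes, must be 64 or greater.  Only
      //             whole 16-byte chunks are consumed; a trailing remainder
      //             of fewer than 16 bytes is left unprocessed.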
  72  //
  73  // Register usage:
  74  //
  75  //      R5:     CRC-32 constant pool base pointer.
  76  //      V0:     Initial CRC value and intermediate constants and results.
  77  //      V1..V4: Data for CRC computation.
  78  //      V5..V8: Next data chunks that are fetched from the input buffer.
  79  //
  80  //      V9..V14: CRC-32 constants.
  81  
  82  // func vectorizedIEEE(crc uint32, p []byte) uint32
  83  TEXT ·vectorizedIEEE(SB),NOSPLIT,$0
  84  	// Pick the IEEE constant pool, then marshal the Go arguments.
  85  	MOVD    $·crclecons(SB), R5  // R5 = constant pool base
  86  	MOVD    p_len+16(FP), R4     // R4 = len(p)
  87  	MOVD    p+8(FP), R3          // R3 = address of the first byte of p
  88  	MOVWZ   crc+0(FP), R2        // R2 = crc, zero-extended
  89  	BR      vectorizedBody<>(SB) // shared body stores ret and returns
  90  
  91  // func vectorizedCastagnoli(crc uint32, p []byte) uint32
  92  TEXT ·vectorizedCastagnoli(SB),NOSPLIT,$0
  93  	// Pick the Castagnoli constant pool, then marshal the Go arguments.
  94  	MOVD    $·crcclecons(SB), R5 // R5 = constant pool base
  95  
  96  	MOVD    p_len+16(FP), R4     // R4 = len(p)
  97  	MOVD    p+8(FP), R3          // R3 = address of the first byte of p
  98  	MOVWZ   crc+0(FP), R2        // R2 = crc, zero-extended
  99  	BR      vectorizedBody<>(SB) // shared body stores ret and returns
 100  
 101  TEXT vectorizedBody<>(SB),NOSPLIT,$0
      	// Shared CRC-32 kernel.  It is entered with BR from vectorizedIEEE
      	// and vectorizedCastagnoli, so the result is stored directly into
      	// the wrapper's return slot (ret+32(FP)) before RET.
      	//
      	// On entry: R2 = crc, R3 = buffer pointer, R4 = length in bytes,
      	//           R5 = constant pool base.
      	//
      	// The length must be at least 64, otherwise the crash path is taken.
      	// Data is consumed in 64-byte and then 16-byte steps; a remainder of
      	// fewer than 16 bytes is never loaded.
 102  	XOR     $0xffffffff, R2 // NOTW R2 (complement the incoming CRC)
 103  	VLM     0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY // six constants into V9..V14
 104  
 105  	// Load the initial CRC value into the rightmost word of V0
 106  	VZERO   V0
 107  	VLVGF   $3, R2, V0
 108  
 109  	// Crash if the input size is less than 64-bytes.
 110  	CMP     R4, $64
 111  	BLT     crash
 112  
 113  	// Load a 64-byte data chunk and XOR with CRC
 114  	VLM     0(R3), V1, V4    // 64-bytes into V1..V4
 115  
 116  	// Reverse the byte order of each 16-byte vector (the mask in
      	// CONST_PERM_LE2BE selects bytes 15..0) because the CRC operation
      	// is in the bit-reflected domain
 117  	VPERM   V1, V1, CONST_PERM_LE2BE, V1
 118  	VPERM   V2, V2, CONST_PERM_LE2BE, V2
 119  	VPERM   V3, V3, CONST_PERM_LE2BE, V3
 120  	VPERM   V4, V4, CONST_PERM_LE2BE, V4
 121  
 122  	VX      V0, V1, V1     // V1 ^= CRC
 123  	ADD     $64, R3        // BUF = BUF + 64
 124  	ADD     $(-64), R4     // LEN = LEN - 64
 125  
 126  	// Check remaining buffer size and jump to proper folding method
 127  	CMP     R4, $64
 128  	BLT     less_than_64bytes
 129  
 130  fold_64bytes_loop:
 131  	// Load the next 64-byte data chunk into V5 to V8
 132  	VLM     0(R3), V5, V8
 133  	VPERM   V5, V5, CONST_PERM_LE2BE, V5
 134  	VPERM   V6, V6, CONST_PERM_LE2BE, V6
 135  	VPERM   V7, V7, CONST_PERM_LE2BE, V7
 136  	VPERM   V8, V8, CONST_PERM_LE2BE, V8
 137  
 138  
 139  	// Perform a GF(2) multiplication of the doublewords in V1 with
 140  	// the reduction constants in CONST_R2R1.  The intermediate result
 141  	// is then folded (accumulated) with the next data chunk in V5 and
 142  	// stored in V1.  Repeat this step for the register contents
 143  	// in V2, V3, and V4 respectively.
 144  
 145  	VGFMAG  CONST_R2R1, V1, V5, V1
 146  	VGFMAG  CONST_R2R1, V2, V6, V2
 147  	VGFMAG  CONST_R2R1, V3, V7, V3
 148  	VGFMAG  CONST_R2R1, V4, V8 ,V4
 149  
 150  	// Adjust buffer pointer and length for next loop
 151  	ADD     $64, R3                  // BUF = BUF + 64
 152  	ADD     $(-64), R4               // LEN = LEN - 64
 153  
 154  	CMP     R4, $64
 155  	BGE     fold_64bytes_loop
 156  
 157  less_than_64bytes:
 158  	// Fold V1 to V4 into a single 128-bit value in V1, using the
      	// constants in CONST_R4R3
 159  	VGFMAG  CONST_R4R3, V1, V2, V1
 160  	VGFMAG  CONST_R4R3, V1, V3, V1
 161  	VGFMAG  CONST_R4R3, V1, V4, V1
 162  
 163  	// If fewer than 16 bytes remain, skip the 16-byte loop and go
      	// straight to the final fold
 164  	CMP R4, $16
 165  	BLT final_fold
 166  
 167  fold_16bytes_loop:
 168  	VL      0(R3), V2               // Load next data chunk
 169  	VPERM   V2, V2, CONST_PERM_LE2BE, V2
 170  
 171  	VGFMAG  CONST_R4R3, V1, V2, V1  // Fold next data chunk
 172  
 173  	// Adjust buffer pointer and size for folding next data chunk
 174  	ADD     $16, R3
 175  	ADD     $-16, R4
 176  
 177  	// Process remaining data chunks
 178  	CMP     R4 ,$16
 179  	BGE     fold_16bytes_loop
 180  
 181  final_fold:
      	// From here on V9 (CONST_PERM_LE2BE) is reused as the byte-shift
      	// count for VSRLB; no VPERM follows, so the mask is no longer
      	// needed.  The count is placed in byte element 7: 0x40 shifts by
      	// 8 bytes, 0x20 (further below) shifts by one word.
 182  	VLEIB   $7, $0x40, V9
      	// Build the multiplier in V0: CONST_R4R3 shifted right by 8 bytes,
      	// then doubleword element 0 set to 1.
 183  	VSRLB   V9, CONST_R4R3, V0
 184  	VLEIG   $0, $1, V0
 185  
      	// GF(2) multiply V1 by V0 without accumulation.  The doubleword
      	// that is multiplied by 1 is thereby simply XORed into the product
      	// of the other doubleword.
 186  	VGFMG   V0, V1, V1
 187  
      	// Final fold with R5: V2 keeps V1 shifted right by one word, V1 is
      	// replaced by its rightmost doubleword split into two zero-extended
      	// words, and a single VGFMAG multiplies and accumulates.
 188  	VLEIB   $7, $0x20, V9         // Shift by words
 189  	VSRLB   V9, V1, V2            // Store remaining bits in V2
 190  	VUPLLF  V1, V1                // Split rightmost doubleword
 191  	VGFMAG  CONST_R5, V1, V2, V1  // V1 = (V1 * R5) XOR V2
 192  
 193  
 194  	// The input values to the Barrett reduction are the degree-63 polynomial
 195  	// in V1 (R(x)), degree-32 generator polynomial, and the reduction
 196  	// constant u.  The Barrett reduction result is the CRC value of R(x) mod
 197  	// P(x).
 198  	//
 199  	// The Barrett reduction algorithm is defined as:
 200  	//
 201  	//    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
 202  	//    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
 203  	//    3. C(x)  = R(x) XOR T2(x) mod x^32
 204  	//
 205  	// Note: To compensate the division by x^32, use the vector unpack
 206  	// instruction to move the leftmost word into the leftmost doubleword
 207  	// of the vector register.  The rightmost doubleword is multiplied
 208  	// with zero to not contribute to the intermediate results.
 209  
 210  
 211  	// T1(x) = floor( R(x) / x^32 ) GF2MUL u
 212  	VUPLLF  V1, V2
 213  	VGFMG   CONST_RU_POLY, V2, V2
 214  
 215  
 216  	// Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY
 217  	// with T1(x) in V2 and XOR the intermediate result, T2(x), with the
 218  	// value in V1.  The CRC is then read from word element 2 of V2.
 219  
 220  	VUPLLF  V2, V2
 221  	VGFMAG  CONST_CRC_POLY, V2, V1, V2
 222  
      // No branch in this function targets done; execution falls through.
 223  done:
 224  	VLGVF   $2, V2, R2      // extract word element 2 of V2
 225  	XOR     $0xffffffff, R2 // NOTW R2 (complement the result)
 226  	MOVWZ   R2, ret + 32(FP) // store into the wrapper's return slot
 227  	RET
 228  
      // Reached only when the length in R4 was below 64 on entry.  The store
      // through R0 is meant to fault (presumably R0 addresses location
      // zero -- TODO confirm the R0 convention for this port).
 229  crash:
 230  	MOVD    $0, (R0) // input size is less than 64-bytes
 231