// crc32_ppc64le.s

   1  // Copyright 2017 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  // The vectorized implementation found below is a derived work
   6  // from code written by Anton Blanchard <anton@au.ibm.com> found
   7  // at https://github.com/antonblanchard/crc32-vpmsum.  The original
   8  // is dual licensed under GPL and Apache 2.  As the copyright holder
   9  // for the work, IBM has contributed this new work under
  10  // the golang license.
  11  
  12  // Changes include porting to Go assembler with modifications for
  13  // the Go ABI for ppc64le.
  14  
  15  #include "textflag.h"
  16  
  17  #define POWER8_OFFSET 132
  18  
  19  #define off16	R16
  20  #define off32	R17
  21  #define off48	R18
  22  #define off64	R19
  23  #define off80	R20
  24  #define off96	R21
  25  #define	off112	R22
  26  
  27  #define const1	V24
  28  #define const2	V25
  29  
  30  #define byteswap	V26
  31  #define mask_32bit	V27
  32  #define mask_64bit	V28
  33  #define zeroes		V29
  34  
  35  #define MAX_SIZE	32*1024
  36  #define REFLECT
  37  
   38  TEXT ·ppc64SlicingUpdateBy8(SB), NOSPLIT|NOFRAME, $0-44
	// func ppc64SlicingUpdateBy8(crc uint32, table8 *slicing8Table, p []byte) uint32
	// Slicing-by-8 CRC32: each iteration of "loop" consumes 8 bytes of p,
	// indexing 8 consecutive 1KB (256 x uint32) tables starting at R4.
	// Inputs shorter than 16 bytes, and any tail of 1-7 bytes, go through
	// the byte-at-a-time loop at "short".
   39  	MOVWZ	crc+0(FP), R3   // incoming crc
   40  	MOVD    table8+8(FP), R4   // *Table
   41  	MOVD    p+16(FP), R5
   42  	MOVD    p_len+24(FP), R6 // p len
   43  
   44  	CMP     $0,R6           // len == 0?
   45  	BNE     start
   46  	MOVW    R3,ret+40(FP)   // return crc
   47  	RET
   48  
   49  start:
   50  	NOR     R3,R3,R7        // ^crc
   51  	MOVWZ	R7,R7		// 32 bits
   52  	CMP	R6,$16
   53  	MOVD	R6,CTR		// assume short case: CTR = byte count
   54  	BLT	short
   55  	SRAD    $3,R6,R8        // 8 byte chunks
   56  	MOVD    R8,CTR
   57  
   58  loop:
   59  	MOVWZ	0(R5),R8	// 0-3 bytes of p ?Endian?
   60  	MOVWZ	4(R5),R9	// 4-7 bytes of p
   61  	MOVD	R4,R10		// &tab[0]
   62  	XOR	R7,R8,R7	// crc ^= byte[0:3]
   63  	RLDICL	$40,R9,$56,R17	// p[7]
   64  	SLD	$2,R17,R17	// p[7]*4
   65  	RLDICL	$40,R7,$56,R8	// crc>>24
   66  	SLD	$2,R8,R8	// crc>>24*4
   67  	RLDICL	$48,R9,$56,R18	// p[6]
   68  	SLD	$2,R18,R18	// p[6]*4
   69  	MOVWZ	(R10)(R17),R21	// tab[0][p[7]]
   70  	ADD	$1024,R10,R10	// tab[1]
   71  	RLDICL	$56,R9,$56,R19	// p[5]
   72  	SLD	$2,R19,R19	// p[5]*4
   73  	MOVWZ	(R10)(R18),R22	// tab[1][p[6]]
   74  	ADD	$1024,R10,R10	// tab[2]
   75  	XOR	R21,R22,R21	// xor done R22
   76  	CLRLSLDI $56,R9,$2,R20	// p[4]*4 (clear high 56 bits, shift left 2)
   77  	MOVWZ	(R10)(R19),R23	// tab[2][p[5]]
   78  	ADD	$1024,R10,R10	// &tab[3]
   79  	XOR	R21,R23,R21	// xor done R23
   80  	MOVWZ	(R10)(R20),R24	// tab[3][p[4]]
   81  	ADD 	$1024,R10,R10   // &tab[4]
   82  	XOR	R21,R24,R21	// xor done R24
   83  	MOVWZ	(R10)(R8),R25	// tab[4][crc>>24]
   84  	RLDICL	$48,R7,$56,R24	// crc>>16&0xFF
   85  	XOR	R21,R25,R21	// xor done R25
   86  	ADD	$1024,R10,R10	// &tab[5]
   87  	SLD	$2,R24,R24	// crc>>16&0xFF*4
   88  	MOVWZ	(R10)(R24),R26	// tab[5][crc>>16&0xFF]
   89  	XOR	R21,R26,R21	// xor done R26
   90  	RLDICL	$56,R7,$56,R25	// crc>>8
   91  	ADD	$1024,R10,R10	// &tab[6]
   92  	SLD	$2,R25,R25	// crc>>8&0xFF*4
   93  	MOVBZ   R7,R26          // crc&0xFF
   94  	MOVWZ	(R10)(R25),R27	// tab[6][crc>>8&0xFF]
   95  	ADD 	$1024,R10,R10   // &tab[7]
   96  	SLD	$2,R26,R26	// crc&0xFF*4
   97  	XOR	R21,R27,R21	// xor done R27
   98  	ADD     $8,R5           // p = p[8:]
   99  	MOVWZ	(R10)(R26),R28	// tab[7][crc&0xFF]
  100  	XOR	R21,R28,R21	// xor done R28
  101  	MOVWZ	R21,R7		// crc for next round
  102  	BDNZ 	loop
  103  	ANDCC	$7,R6,R8	// any leftover bytes
  104  	BEQ	done		// none --> done
  105  	MOVD	R8,CTR		// byte count
  106  	PCALIGN $16             // align short loop
  107  short:
  108  	MOVBZ 	0(R5),R8	// get v
  109  	XOR 	R8,R7,R8	// byte(crc)^v -> R8
  110  	RLDIC	$2,R8,$54,R8	// ((crc^v)&0xFF)*4  (rldicl r8,r8,2,22)
  111  	SRD 	$8,R7,R14	// crc>>8
  112  	MOVWZ	(R4)(R8),R10	// tab[0][(crc^v)&0xFF]
  113  	ADD	$1,R5
  114  	XOR 	R10,R14,R7	// loop crc in R7
  115  	BDNZ 	short
  116  done:
  117  	NOR     R7,R7,R7        // ^crc
  118  	MOVW    R7,ret+40(FP)   // return crc
  119  	RET
 120  
  121  #ifdef BYTESWAP_DATA
	// 16-byte index constant (bytes 0x00..0x0f); loaded into the
	// "byteswap" vector register in vectorCrc32 when BYTESWAP_DATA
	// is defined (see the LVX under the matching #ifdef below).
  122  DATA ·byteswapcons+0(SB)/8,$0x0706050403020100
  123  DATA ·byteswapcons+8(SB)/8,$0x0f0e0d0c0b0a0908
  124  
  125  GLOBL ·byteswapcons+0(SB),RODATA,$16
  126  #endif
 127  
  128  TEXT ·vectorCrc32(SB), NOSPLIT|NOFRAME, $0-36
	// func vectorCrc32(crc uint32, ctab uint32, p []byte) uint32
	// VPMSUM-based CRC kernel.  Processes the buffer in up to 32KB
	// chunks, 128 bytes per main-loop iteration, then performs a
	// Barrett reduction of the folded 128-bit remainder.  ctab (R14)
	// selects between the IEEE and Castagnoli constant tables.
  129  	MOVWZ	crc+0(FP), R3   // incoming crc
  130  	MOVWZ	ctab+4(FP), R14   // crc poly id
  131  	MOVD    p+8(FP), R4
  132  	MOVD    p_len+16(FP), R5 // p len
  133  
  134  	// R3 = incoming crc
  135  	// R14 = constant table identifier
  136  	// R4 = address of bytes
  137  	// R5 = length of bytes
  138  
  139  	// defines for index loads
  140  
  141  	MOVD	$16,off16
  142  	MOVD	$32,off32
  143  	MOVD	$48,off48
  144  	MOVD	$64,off64
  145  	MOVD	$80,off80
  146  	MOVD	$96,off96
  147  	MOVD	$112,off112
  148  	MOVD	$0,R15		// R15 = 0 until the first chunk's warm-up pass is done
  149  
  150  	MOVD	R3,R10	// save initial crc
  151  
  152  	NOR	R3,R3,R3  // ^crc
  153  	MOVWZ	R3,R3	// 32 bits
  154  	VXOR	zeroes,zeroes,zeroes  // clear the V reg
  155  	VSPLTISW $-1,V0
  156  	VSLDOI	$4,V29,V0,mask_32bit
  157  	VSLDOI	$8,V29,V0,mask_64bit
  158  
  159  	VXOR	V8,V8,V8
  160  	MTVSRD	R3,VS40	// crc initial value VS40 = V8
  161  
  162  #ifdef REFLECT
  163  	VSLDOI	$8,zeroes,V8,V8  // or: VSLDOI V29,V8,V27,4 for top 32 bits?
  164  #else
  165  	VSLDOI	$4,V8,zeroes,V8
  166  #endif
  167  
  168  #ifdef BYTESWAP_DATA
  169  	MOVD    $·byteswapcons(SB),R3
  170  	LVX	(R3),byteswap
  171  #endif
  172  
  173  	CMPU	R5,$256		// length of bytes
  174  	BLT	short
  175  
  176  	RLDICR	$0,R5,$56,R6 // chunk to process
  177  
  178  	// First step for larger sizes
  179  l1:	MOVD	$32768,R7
  180  	MOVD	R7,R9
  181  	CMP	R6,R7   // compare R6, R7 (MAX SIZE)
  182  	BGT	top	// less than MAX, just do remainder
  183  	MOVD	R6,R7
  184  top:
  185  	SUB	R7,R6,R6	// R6 = bytes left after this chunk
  186  
  187  	// mainloop does 128 bytes at a time
  188  	SRD	$7,R7
  189  
  190  	// determine the offset into the constants table to start with.
  191  	// Each constant is 128 bytes, used against 16 bytes of data.
  192  	SLD	$4,R7,R8
  193  	SRD	$3,R9,R9
  194  	SUB	R8,R9,R8
  195  
  196  	// The last iteration is reduced in a separate step
  197  	ADD	$-1,R7
  198  	MOVD	R7,CTR
  199  
  200  	// Determine which constant table (depends on poly)
  201  	CMP	R14,$1
  202  	BNE	castTable
  203  	MOVD	$·IEEEConst(SB),R3
  204  	BR	startConst
  205  castTable:
  206  	MOVD	$·CastConst(SB),R3
  207  
  208  startConst:
  209  	ADD	R3,R8,R3	// starting point in constants table
  210  
  211  	VXOR	V0,V0,V0	// clear the V regs
  212  	VXOR	V1,V1,V1
  213  	VXOR	V2,V2,V2
  214  	VXOR	V3,V3,V3
  215  	VXOR	V4,V4,V4
  216  	VXOR	V5,V5,V5
  217  	VXOR	V6,V6,V6
  218  	VXOR	V7,V7,V7
  219  
  220  	LVX	(R3),const1	// loading constant values
  221  
  222  	CMP	R15,$1		// Identify warm up pass
  223  	BEQ	next
  224  
  225  	// First warm up pass: load the bytes to process
  226  	LVX	(R4),V16
  227  	LVX	(R4+off16),V17
  228  	LVX	(R4+off32),V18
  229  	LVX	(R4+off48),V19
  230  	LVX	(R4+off64),V20
  231  	LVX	(R4+off80),V21
  232  	LVX	(R4+off96),V22
  233  	LVX	(R4+off112),V23
  234  	ADD	$128,R4		// bump up to next 128 bytes in buffer
  235  
  236  	VXOR	V16,V8,V16	// xor in initial CRC in V8
  237  
  238  next:
  239  	BC	18,0,first_warm_up_done	// bdz: decrement CTR, branch if CTR == 0
  240  
  241  	ADD	$16,R3		// bump up to next constants
  242  	LVX	(R3),const2	// table values
  243  
  244  	VPMSUMD	V16,const1,V8 // second warm up pass
  245  	LVX	(R4),V16	// load from buffer
  246  	OR	$0,R2,R2	// no-op; NOTE(review): appears to be a scheduling aid from the original C/asm implementation
  247  
  248  	VPMSUMD	V17,const1,V9	// vpmsumd with constants
  249  	LVX	(R4+off16),V17	// load next from buffer
  250  	OR	$0,R2,R2
  251  
  252  	VPMSUMD	V18,const1,V10	// vpmsumd with constants
  253  	LVX	(R4+off32),V18	// load next from buffer
  254  	OR	$0,R2,R2
  255  
  256  	VPMSUMD	V19,const1,V11	// vpmsumd with constants
  257  	LVX	(R4+off48),V19	// load next from buffer
  258  	OR	$0,R2,R2
  259  
  260  	VPMSUMD	V20,const1,V12	// vpmsumd with constants
  261  	LVX	(R4+off64),V20	// load next from buffer
  262  	OR	$0,R2,R2
  263  
  264  	VPMSUMD	V21,const1,V13	// vpmsumd with constants
  265  	LVX	(R4+off80),V21	// load next from buffer
  266  	OR	$0,R2,R2
  267  
  268  	VPMSUMD	V22,const1,V14	// vpmsumd with constants
  269  	LVX	(R4+off96),V22	// load next from buffer
  270  	OR	$0,R2,R2
  271  
  272  	VPMSUMD	V23,const1,V15	// vpmsumd with constants
  273  	LVX	(R4+off112),V23	// load next from buffer
  274  
  275  	ADD	$128,R4		// bump up to next 128 bytes in buffer
  276  
  277  	BC	18,0,first_cool_down	// bdz: decrement CTR, branch if CTR == 0
  278  
  279  cool_top:
  280  	LVX	(R3),const1	// constants
  281  	ADD	$16,R3		// inc to next constants
  282  	OR	$0,R2,R2
  283  
  284  	VXOR	V0,V8,V0	// xor in previous vpmsumd
  285  	VPMSUMD	V16,const2,V8	// vpmsumd with constants
  286  	LVX	(R4),V16	// buffer
  287  	OR	$0,R2,R2
  288  
  289  	VXOR	V1,V9,V1	// xor in previous
  290  	VPMSUMD	V17,const2,V9	// vpmsumd with constants
  291  	LVX	(R4+off16),V17	// next in buffer
  292  	OR	$0,R2,R2
  293  
  294  	VXOR	V2,V10,V2	// xor in previous
  295  	VPMSUMD	V18,const2,V10	// vpmsumd with constants
  296  	LVX	(R4+off32),V18	// next in buffer
  297  	OR	$0,R2,R2
  298  
  299  	VXOR	V3,V11,V3	// xor in previous
  300  	VPMSUMD	V19,const2,V11	// vpmsumd with constants
  301  	LVX	(R4+off48),V19	// next in buffer
  302  	LVX	(R3),const2	// get next constant
  303  	OR	$0,R2,R2
  304  
  305  	VXOR	V4,V12,V4	// xor in previous
  306  	VPMSUMD	V20,const1,V12	// vpmsumd with constants
  307  	LVX	(R4+off64),V20	// next in buffer
  308  	OR	$0,R2,R2
  309  
  310  	VXOR	V5,V13,V5	// xor in previous
  311  	VPMSUMD	V21,const1,V13	// vpmsumd with constants
  312  	LVX	(R4+off80),V21	// next in buffer
  313  	OR	$0,R2,R2
  314  
  315  	VXOR	V6,V14,V6	// xor in previous
  316  	VPMSUMD	V22,const1,V14	// vpmsumd with constants
  317  	LVX	(R4+off96),V22	// next in buffer
  318  	OR	$0,R2,R2
  319  
  320  	VXOR	V7,V15,V7	// xor in previous
  321  	VPMSUMD	V23,const1,V15	// vpmsumd with constants
  322  	LVX	(R4+off112),V23	// next in buffer
  323  
  324  	ADD	$128,R4		// bump up buffer pointer
  325  	BDNZ	cool_top	// are we done?
  326  
  327  first_cool_down:
  328  
  329  	// load the constants
  330  	// xor in the previous value
  331  	// vpmsumd the result with constants
  332  
  333  	LVX	(R3),const1
  334  	ADD	$16,R3
  335  
  336  	VXOR	V0,V8,V0
  337  	VPMSUMD V16,const1,V8
  338  	OR	$0,R2,R2
  339  
  340  	VXOR	V1,V9,V1
  341  	VPMSUMD	V17,const1,V9
  342  	OR	$0,R2,R2
  343  
  344  	VXOR	V2,V10,V2
  345  	VPMSUMD	V18,const1,V10
  346  	OR	$0,R2,R2
  347  
  348  	VXOR	V3,V11,V3
  349  	VPMSUMD	V19,const1,V11
  350  	OR	$0,R2,R2
  351  
  352  	VXOR	V4,V12,V4
  353  	VPMSUMD	V20,const1,V12
  354  	OR	$0,R2,R2
  355  
  356  	VXOR	V5,V13,V5
  357  	VPMSUMD	V21,const1,V13
  358  	OR	$0,R2,R2
  359  
  360  	VXOR	V6,V14,V6
  361  	VPMSUMD	V22,const1,V14
  362  	OR	$0,R2,R2
  363  
  364  	VXOR	V7,V15,V7
  365  	VPMSUMD	V23,const1,V15
  366  	OR	$0,R2,R2
  367  
  368  second_cool_down:
  369  
  370  	VXOR    V0,V8,V0
  371  	VXOR    V1,V9,V1
  372  	VXOR    V2,V10,V2
  373  	VXOR    V3,V11,V3
  374  	VXOR    V4,V12,V4
  375  	VXOR    V5,V13,V5
  376  	VXOR    V6,V14,V6
  377  	VXOR    V7,V15,V7
  378  
  379  #ifdef REFLECT
  380  	VSLDOI  $4,V0,zeroes,V0
  381  	VSLDOI  $4,V1,zeroes,V1
  382  	VSLDOI  $4,V2,zeroes,V2
  383  	VSLDOI  $4,V3,zeroes,V3
  384  	VSLDOI  $4,V4,zeroes,V4
  385  	VSLDOI  $4,V5,zeroes,V5
  386  	VSLDOI  $4,V6,zeroes,V6
  387  	VSLDOI  $4,V7,zeroes,V7
  388  #endif
  389  
  390  	LVX	(R4),V8
  391  	LVX	(R4+off16),V9
  392  	LVX	(R4+off32),V10
  393  	LVX	(R4+off48),V11
  394  	LVX	(R4+off64),V12
  395  	LVX	(R4+off80),V13
  396  	LVX	(R4+off96),V14
  397  	LVX	(R4+off112),V15
  398  
  399  	ADD	$128,R4
  400  
  401  	VXOR	V0,V8,V16
  402  	VXOR	V1,V9,V17
  403  	VXOR	V2,V10,V18
  404  	VXOR	V3,V11,V19
  405  	VXOR	V4,V12,V20
  406  	VXOR	V5,V13,V21
  407  	VXOR	V6,V14,V22
  408  	VXOR	V7,V15,V23
  409  
  410  	MOVD    $1,R15		// mark warm-up done for subsequent chunks
  411  	CMP     $0,R6		// more chunks to process?
  412  	ADD     $128,R6
  413  
  414  	BNE	l1
  415  	ANDCC   $127,R5		// bytes remaining mod 128
  416  	SUBC	R5,$128,R6
  417  	ADD	R3,R6,R3
  418  
  419  	SRD	$4,R5,R7	// remaining 16-byte vectors
  420  	MOVD	R7,CTR
  421  	LVX	(R3),V0
  422  	LVX	(R3+off16),V1
  423  	LVX	(R3+off32),V2
  424  	LVX	(R3+off48),V3
  425  	LVX	(R3+off64),V4
  426  	LVX	(R3+off80),V5
  427  	LVX	(R3+off96),V6
  428  	LVX	(R3+off112),V7
  429  
  430  	ADD	$128,R3
  431  
  432  	VPMSUMW	V16,V0,V0
  433  	VPMSUMW	V17,V1,V1
  434  	VPMSUMW	V18,V2,V2
  435  	VPMSUMW	V19,V3,V3
  436  	VPMSUMW	V20,V4,V4
  437  	VPMSUMW	V21,V5,V5
  438  	VPMSUMW	V22,V6,V6
  439  	VPMSUMW	V23,V7,V7
  440  
  441  	// now reduce the tail
  442  
  443  	CMP	$0,R7
  444  	BEQ	next1
  445  
  446  	LVX	(R4),V16
  447  	LVX	(R3),V17
  448  	VPMSUMW	V16,V17,V16
  449  	VXOR	V0,V16,V0
  450  	BC	18,0,next1	// bdz: decrement CTR, branch if CTR == 0
  451  
  452  	LVX	(R4+off16),V16
  453  	LVX	(R3+off16),V17
  454  	VPMSUMW	V16,V17,V16
  455  	VXOR	V0,V16,V0
  456  	BC	18,0,next1
  457  
  458  	LVX	(R4+off32),V16
  459  	LVX	(R3+off32),V17
  460  	VPMSUMW	V16,V17,V16
  461  	VXOR	V0,V16,V0
  462  	BC	18,0,next1
  463  
  464  	LVX	(R4+off48),V16
  465  	LVX	(R3+off48),V17
  466  	VPMSUMW	V16,V17,V16
  467  	VXOR	V0,V16,V0
  468  	BC	18,0,next1
  469  
  470  	LVX	(R4+off64),V16
  471  	LVX	(R3+off64),V17
  472  	VPMSUMW	V16,V17,V16
  473  	VXOR	V0,V16,V0
  474  	BC	18,0,next1
  475  
  476  	LVX	(R4+off80),V16
  477  	LVX	(R3+off80),V17
  478  	VPMSUMW	V16,V17,V16
  479  	VXOR	V0,V16,V0
  480  	BC	18,0,next1
  481  
  482  	LVX	(R4+off96),V16
  483  	LVX	(R3+off96),V17
  484  	VPMSUMW	V16,V17,V16
  485  	VXOR	V0,V16,V0
  486  
  487  next1:
  488  	VXOR	V0,V1,V0	// fold the 8 accumulators down to one
  489  	VXOR	V2,V3,V2
  490  	VXOR	V4,V5,V4
  491  	VXOR	V6,V7,V6
  492  	VXOR	V0,V2,V0
  493  	VXOR	V4,V6,V4
  494  	VXOR	V0,V4,V0
  495  
  496  barrett_reduction:
  497  
  498  	CMP	R14,$1
  499  	BNE	barcstTable
  500  	MOVD	$·IEEEBarConst(SB),R3
  501  	BR	startbarConst
  502  barcstTable:
  503  	MOVD    $·CastBarConst(SB),R3
  504  
  505  startbarConst:
  506  	LVX	(R3),const1
  507  	LVX	(R3+off16),const2
  508  
  509  	VSLDOI	$8,V0,V0,V1
  510  	VXOR	V0,V1,V0	// xor two 64 bit results together
  511  
  512  #ifdef REFLECT
  513  	VSPLTISB $1,V1
  514  	VSL	V0,V1,V0	// shift left one bit
  515  #endif
  516  
  517  	VAND	V0,mask_64bit,V0
  518  
  519  #ifndef	REFLECT
  520  
  521  	VPMSUMD	V0,const1,V1
  522  	VSLDOI	$8,zeroes,V1,V1
  523  	VPMSUMD	V1,const2,V1
  524  	VXOR	V0,V1,V0
  525  	VSLDOI	$8,V0,zeroes,V0
  526  
  527  #else
  528  
  529  	VAND	V0,mask_32bit,V1
  530  	VPMSUMD	V1,const1,V1
  531  	VAND	V1,mask_32bit,V1
  532  	VPMSUMD	V1,const2,V1
  533  	VXOR	V0,V1,V0
  534  	VSLDOI  $4,V0,zeroes,V0
  535  
  536  #endif
  537  
  538  	MFVSRD	VS32,R3 // VS32 = V0
  539  
  540  	NOR	R3,R3,R3 // return ^crc
  541  	MOVW	R3,ret+32(FP)
  542  	RET
  543  
  544  first_warm_up_done:
  545  
  546  	LVX	(R3),const1
  547  	ADD	$16,R3
  548  
  549  	VPMSUMD	V16,const1,V8
  550  	VPMSUMD	V17,const1,V9
  551  	VPMSUMD	V18,const1,V10
  552  	VPMSUMD	V19,const1,V11
  553  	VPMSUMD	V20,const1,V12
  554  	VPMSUMD	V21,const1,V13
  555  	VPMSUMD	V22,const1,V14
  556  	VPMSUMD	V23,const1,V15
  557  
  558  	BR	second_cool_down
  559  
  560  short:
  561  	CMP	$0,R5
  562  	BEQ	zero
  563  
  564  	// compute short constants
  565  
  566  	CMP     R14,$1
  567  	BNE     castshTable
  568  	MOVD    $·IEEEConst(SB),R3
  569  	ADD	$4080,R3	// short constants begin at offset 4080 in the table
  570  	BR      startshConst
  571  castshTable:
  572  	MOVD    $·CastConst(SB),R3
  573  	ADD	$4080,R3
  574  
  575  startshConst:
  576  	SUBC	R5,$256,R6	// sub from 256
  577  	ADD	R3,R6,R3
  578  
  579  	// calculate where to start
  580  
  581  	SRD	$4,R5,R7
  582  	MOVD	R7,CTR
  583  
  584  	VXOR	V19,V19,V19
  585  	VXOR	V20,V20,V20
  586  
  587  	LVX	(R4),V0
  588  	LVX	(R3),V16
  589  	VXOR	V0,V8,V0	// xor in initial CRC
  590  	VPMSUMW	V0,V16,V0
  591  	BC	18,0,v0		// bdz: decrement CTR, branch if CTR == 0
  592  
  593  	LVX	(R4+off16),V1
  594  	LVX	(R3+off16),V17
  595  	VPMSUMW	V1,V17,V1
  596  	BC	18,0,v1
  597  
  598  	LVX	(R4+off32),V2
  599  	LVX	(R3+off32),V16
  600  	VPMSUMW	V2,V16,V2
  601  	BC	18,0,v2
  602  
  603  	LVX	(R4+off48),V3
  604  	LVX	(R3+off48),V17
  605  	VPMSUMW	V3,V17,V3
  606  	BC	18,0,v3
  607  
  608  	LVX	(R4+off64),V4
  609  	LVX	(R3+off64),V16
  610  	VPMSUMW	V4,V16,V4
  611  	BC	18,0,v4
  612  
  613  	LVX	(R4+off80),V5
  614  	LVX	(R3+off80),V17
  615  	VPMSUMW	V5,V17,V5
  616  	BC	18,0,v5
  617  
  618  	LVX	(R4+off96),V6
  619  	LVX	(R3+off96),V16
  620  	VPMSUMW	V6,V16,V6
  621  	BC	18,0,v6
  622  
  623  	LVX	(R4+off112),V7
  624  	LVX	(R3+off112),V17
  625  	VPMSUMW	V7,V17,V7
  626  	BC	18,0,v7
  627  
  628  	ADD	$128,R3
  629  	ADD	$128,R4
  630  
  631  	LVX	(R4),V8
  632  	LVX	(R3),V16
  633  	VPMSUMW	V8,V16,V8
  634  	BC	18,0,v8
  635  
  636  	LVX	(R4+off16),V9
  637  	LVX	(R3+off16),V17
  638  	VPMSUMW	V9,V17,V9
  639  	BC	18,0,v9
  640  
  641  	LVX	(R4+off32),V10
  642  	LVX	(R3+off32),V16
  643  	VPMSUMW	V10,V16,V10
  644  	BC	18,0,v10
  645  
  646  	LVX	(R4+off48),V11
  647  	LVX	(R3+off48),V17
  648  	VPMSUMW	V11,V17,V11
  649  	BC	18,0,v11
  650  
  651  	LVX	(R4+off64),V12
  652  	LVX	(R3+off64),V16
  653  	VPMSUMW	V12,V16,V12
  654  	BC	18,0,v12
  655  
  656  	LVX	(R4+off80),V13
  657  	LVX	(R3+off80),V17
  658  	VPMSUMW	V13,V17,V13
  659  	BC	18,0,v13
  660  
  661  	LVX	(R4+off96),V14
  662  	LVX	(R3+off96),V16
  663  	VPMSUMW	V14,V16,V14
  664  	BC	18,0,v14
  665  
  666  	LVX	(R4+off112),V15
  667  	LVX	(R3+off112),V17
  668  	VPMSUMW	V15,V17,V15
  669  
	// Fall through the vN labels, xor-accumulating however many
	// partial products were computed above into V19/V20.
  670  	VXOR	V19,V15,V19
  671  v14:	VXOR	V20,V14,V20
  672  v13:	VXOR	V19,V13,V19
  673  v12:	VXOR	V20,V12,V20
  674  v11:	VXOR	V19,V11,V19
  675  v10:	VXOR	V20,V10,V20
  676  v9:	VXOR	V19,V9,V19
  677  v8:	VXOR	V20,V8,V20
  678  v7:	VXOR	V19,V7,V19
  679  v6:	VXOR	V20,V6,V20
  680  v5:	VXOR	V19,V5,V19
  681  v4:	VXOR	V20,V4,V20
  682  v3:	VXOR	V19,V3,V19
  683  v2:	VXOR	V20,V2,V20
  684  v1:	VXOR	V19,V1,V19
  685  v0:	VXOR	V20,V0,V20
  686  
  687  	VXOR	V19,V20,V0
  688  
  689  	BR	barrett_reduction
  690  
  691  zero:
  692  	// This case is the original crc, so just return it
  693  	MOVW    R10,ret+32(FP)
  694  	RET
 695