nat_ppc64x.s raw

   1  // Copyright 2013 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  //go:build !purego && (ppc64 || ppc64le)
   6  
   7  #include "textflag.h"
   8  
   9  // func addMulVVW1024(z, x *uint, y uint) (c uint)
  10  TEXT ·addMulVVW1024(SB), $0-32 // z += x*y over 16 words (1024 bits); c = carry out
  11  	MOVD	$4, R6 // R6 = z_len/4 = 16 words / 4 words per loop iteration
  12  	JMP		addMulVVWx<>(SB) // tail jump: shared loop reads z, x, y via FP and stores c
  13  
  14  // func addMulVVW1536(z, x *uint, y uint) (c uint)
  15  TEXT ·addMulVVW1536(SB), $0-32 // z += x*y over 24 words (1536 bits); c = carry out
  16  	MOVD	$6, R6 // R6 = z_len/4 = 24 words / 4 words per loop iteration
  17  	JMP		addMulVVWx<>(SB) // tail jump: shared loop reads z, x, y via FP and stores c
  18  
  19  // func addMulVVW2048(z, x *uint, y uint) (c uint)
  20  TEXT ·addMulVVW2048(SB), $0-32 // z += x*y over 32 words (2048 bits); c = carry out
  21  	MOVD	$8, R6 // R6 = z_len/4 = 32 words / 4 words per loop iteration
  22  	JMP		addMulVVWx<>(SB) // tail jump: shared loop reads z, x, y via FP and stores c
  23  
  24  // addMulVVWx computes z[i] += x[i]*y with carry over 4*R6 64-bit words and
  25  // stores the final carry word in c. It is entered only by JMP from the
  26  // callers above, so z, x, y and c are those callers' own FP arguments.
  27  // R6 contains the z length/4 (4 words per loop iteration) and must be > 0:
  28  // BDNZ decrements CTR before testing it, so there is no zero-length path.
  29  // If other callers are added this function might need to change.
  30  TEXT addMulVVWx<>(SB), NOSPLIT, $0 // uses R3-R6, R9-R11, R14-R21, CTR and the carry bit
  31  	MOVD	z+0(FP), R3	// R3 = &z[0], advanced by 32 bytes per iteration
  32  	MOVD	x+8(FP), R4	// R4 = &x[0], advanced by 32 bytes per iteration
  33  	MOVD	y+16(FP), R5	// R5 = y, the single-word multiplier
  34  
  35  	MOVD	$0, R9		// R9 = c = 0
  36  	MOVD	R6, CTR		// Initialize loop counter
  37  	PCALIGN	$16		// align the loop entry to a 16-byte boundary
  38  
  39  loop:
  40  	MOVD	0(R4), R14	// x[i]
  41  	MOVD	8(R4), R16	// x[i+1]
  42  	MOVD	16(R4), R18	// x[i+2]
  43  	MOVD	24(R4), R20	// x[i+3]
  44  	MOVD	0(R3), R15	// z[i]
  45  	MOVD	8(R3), R17	// z[i+1]
  46  	MOVD	16(R3), R19	// z[i+2]
  47  	MOVD	24(R3), R21	// z[i+3]
  48  	MULLD	R5, R14, R10	// low x[i]*y
  49  	MULHDU	R5, R14, R11	// high x[i]*y
  50  	ADDC	R15, R10	// low += z[i], sets carry
  51  	ADDZE	R11		// high += carry
  52  	ADDC	R9, R10		// low += c (carry word from the previous word), sets carry
  53  	ADDZE	R11, R9		// c = high + carry; cannot overflow since x*y + z + c < 2^128
  54  	MULLD	R5, R16, R14	// low x[i+1]*y (R14 reused: x[i] is no longer needed)
  55  	MULHDU	R5, R16, R15	// high x[i+1]*y (R15 reused: z[i] is no longer needed)
  56  	ADDC	R17, R14	// low += z[i+1], sets carry
  57  	ADDZE	R15		// high += carry
  58  	ADDC	R9, R14		// low += c, sets carry
  59  	ADDZE	R15, R9		// c = high + carry
  60  	MULLD	R5, R18, R16	// low x[i+2]*y (R16 reused: x[i+1] is no longer needed)
  61  	MULHDU	R5, R18, R17	// high x[i+2]*y (R17 reused: z[i+1] is no longer needed)
  62  	ADDC	R19, R16	// low += z[i+2], sets carry
  63  	ADDZE	R17		// high += carry
  64  	ADDC	R9, R16		// low += c, sets carry
  65  	ADDZE	R17, R9		// c = high + carry
  66  	MULLD	R5, R20, R18	// low x[i+3]*y (R18 reused: x[i+2] is no longer needed)
  67  	MULHDU	R5, R20, R19	// high x[i+3]*y (R19 reused: z[i+2] is no longer needed)
  68  	ADDC	R21, R18	// low += z[i+3], sets carry
  69  	ADDZE	R19		// high += carry
  70  	ADDC	R9, R18		// low += c, sets carry
  71  	ADDZE	R19, R9		// c = high + carry, carried into the next iteration
  72  	MOVD	R10, 0(R3)	// z[i]
  73  	MOVD	R14, 8(R3)	// z[i+1]
  74  	MOVD	R16, 16(R3)	// z[i+2]
  75  	MOVD	R18, 24(R3)	// z[i+3]
  76  	ADD	$32, R3		// advance z by 4 words
  77  	ADD	$32, R4		// advance x by 4 words
  78  	BDNZ	loop		// CTR--; repeat while CTR != 0
  79  
  80  done:
  81  	MOVD	R9, c+24(FP)	// return the final carry word
  82  	RET
  83