nat_arm64.s raw

   1  // Copyright 2013 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  //go:build !purego
   6  
   7  #include "textflag.h"
   8  
   9  // func addMulVVW1024(z, x *uint, y uint) (c uint)
  10  TEXT ·addMulVVW1024(SB), $0-32
  11  	MOVD	$16, R0
  12  	JMP		addMulVVWx(SB)
  13  
  14  // func addMulVVW1536(z, x *uint, y uint) (c uint)
  15  TEXT ·addMulVVW1536(SB), $0-32
  16  	MOVD	$24, R0
  17  	JMP		addMulVVWx(SB)
  18  
  19  // func addMulVVW2048(z, x *uint, y uint) (c uint)
  20  TEXT ·addMulVVW2048(SB), $0-32
  21  	MOVD	$32, R0
  22  	JMP		addMulVVWx(SB)
  23  
  24  TEXT addMulVVWx(SB), NOFRAME|NOSPLIT, $0
  25  	MOVD	z+0(FP), R1
  26  	MOVD	x+8(FP), R2
  27  	MOVD	y+16(FP), R3
  28  	MOVD	$0, R4
  29  
  30  // The main loop of this code operates on a block of 4 words every iteration
  31  // performing [R4:R12:R11:R10:R9] = R4 + R3 * [R8:R7:R6:R5] + [R12:R11:R10:R9]
  32  // where R4 is carried from the previous iteration, R8:R7:R6:R5 hold the next
  33  // 4 words of x, R3 is y and R12:R11:R10:R9 are part of the result z.
  34  loop:
  35  	CBZ	R0, done
  36  
  37  	LDP.P	16(R2), (R5, R6)
  38  	LDP.P	16(R2), (R7, R8)
  39  
  40  	LDP	(R1), (R9, R10)
  41  	ADDS	R4, R9
  42  	MUL	R6, R3, R14
  43  	ADCS	R14, R10
  44  	MUL	R7, R3, R15
  45  	LDP	16(R1), (R11, R12)
  46  	ADCS	R15, R11
  47  	MUL	R8, R3, R16
  48  	ADCS	R16, R12
  49  	UMULH	R8, R3, R20
  50  	ADC	$0, R20
  51  
  52  	MUL	R5, R3, R13
  53  	ADDS	R13, R9
  54  	UMULH	R5, R3, R17
  55  	ADCS	R17, R10
  56  	UMULH	R6, R3, R21
  57  	STP.P	(R9, R10), 16(R1)
  58  	ADCS	R21, R11
  59  	UMULH	R7, R3, R19
  60  	ADCS	R19, R12
  61  	STP.P	(R11, R12), 16(R1)
  62  	ADC	$0, R20, R4
  63  
  64  	SUB	$4, R0
  65  	B	loop
  66  
  67  done:
  68  	MOVD	R4, c+24(FP)
  69  	RET
  70