sha3_arm64.s raw

   1  // Copyright 2022 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  //go:build !purego
   6  
   7  #include "textflag.h"
   8  
   9  // func keccakF1600NEON(a *[200]byte)
      //
      // keccakF1600NEON applies the 24-round Keccak-f[1600] permutation in place
      // to the 200-byte state at a. The state is treated as 25 64-bit lanes;
      // lane i is the 8 bytes at a+8*i, and lane index i = x + 5*y for column x
      // and row y.
      //
      // The round body uses VEOR3, VRAX1, VXAR and VBCAX, i.e. the EOR3, RAX1,
      // XAR and BCAX instructions of the Arm SHA3 extension.
      // NOTE(review): nothing in this function checks for that extension;
      // presumably the Go caller gates on a CPU feature test - confirm.
      //
      // Register use:
      //   R0       pointer into a (moved by the post-increment loads and stores)
      //   R1       cursor into round_consts<>, advanced 8 bytes per round
      //   R2       rounds remaining, counts 24 down to 0
      //   V0-V24   state lanes 0-24
      //   V25-V31  scratch
      //
      // Lanes are loaded and stored with the .D1 arrangement, so only the low
      // 64 bits of each vector register carry state. The .D2/.B16 arithmetic
      // also processes the upper 64 bits, but that half is never stored.
      //
      // NOTE(review): the TEXT directive reserves a 200-byte frame, yet no
      // instruction below references SP or a local slot - confirm whether the
      // frame is actually needed.
  10  TEXT ·keccakF1600NEON(SB), $200-8
  11  	MOVD	a+0(FP), R0
  12  	MOVD	$round_consts<>(SB), R1
  13  	MOVD	$24, R2 // rounds remaining; decremented at the bottom of loop
  14  
      	// Load lanes 0-23 two at a time (each VLD1.P advances R0 by 16), then
      	// lane 24 on its own without post-increment.
  15  	VLD1.P	16(R0), [V0.D1, V1.D1]
  16  	VLD1.P	16(R0), [V2.D1, V3.D1]
  17  	VLD1.P	16(R0), [V4.D1, V5.D1]
  18  	VLD1.P	16(R0), [V6.D1, V7.D1]
  19  	VLD1.P	16(R0), [V8.D1, V9.D1]
  20  	VLD1.P	16(R0), [V10.D1, V11.D1]
  21  	VLD1.P	16(R0), [V12.D1, V13.D1]
  22  	VLD1.P	16(R0), [V14.D1, V15.D1]
  23  	VLD1.P	16(R0), [V16.D1, V17.D1]
  24  	VLD1.P	16(R0), [V18.D1, V19.D1]
  25  	VLD1.P	16(R0), [V20.D1, V21.D1]
  26  	VLD1.P	16(R0), [V22.D1, V23.D1]
  27  	VLD1	(R0), [V24.D1]
  28  
      	// The twelve post-increment loads moved R0 forward by 12*16 = 192 bytes;
      	// rewind it to a so the stores after the loop can reuse it.
  29  	SUB	$192, R0, R0
  30  
  31  loop:
  32  	// theta, step 1: column parities
      	//   C[x] = lane[x] ^ lane[x+5] ^ lane[x+10] ^ lane[x+15] ^ lane[x+20].
      	// VEOR3 xors three registers, so each C[x] takes two instructions:
      	// rows 2-4 first, then rows 0-1 folded in. C[0..4] end up in V25..V29.
  33  	VEOR3	 V20.B16, V15.B16, V10.B16, V25.B16
  34  	VEOR3	 V21.B16, V16.B16, V11.B16, V26.B16
  35  	VEOR3	 V22.B16, V17.B16, V12.B16, V27.B16
  36  	VEOR3	 V23.B16, V18.B16, V13.B16, V28.B16
  37  	VEOR3	 V24.B16, V19.B16, V14.B16, V29.B16
  38  	VEOR3	 V25.B16, V5.B16, V0.B16, V25.B16
  39  	VEOR3	 V26.B16, V6.B16, V1.B16, V26.B16
  40  	VEOR3	 V27.B16, V7.B16, V2.B16, V27.B16
  41  	VEOR3	 V28.B16, V8.B16, V3.B16, V28.B16
  42  	VEOR3	 V29.B16, V9.B16, V4.B16, V29.B16
  43  
      	// theta, step 2: D[x] = C[x-1] ^ rotl(C[x+1], 1), indices mod 5.
      	// VRAX1 Vm, Vn, Vd computes Vd = Vn ^ rotl(Vm, 1). The last three
      	// results are written in place; the order guarantees that each C[x] is
      	// overwritten only after its final read.
  44  	VRAX1	V27.D2, V25.D2, V30.D2 // V30 = D[1]
  45  	VRAX1	V28.D2, V26.D2, V31.D2 // V31 = D[2]
  46  	VRAX1	V29.D2, V27.D2, V27.D2 // V27 = D[3]
  47  	VRAX1	V25.D2, V28.D2, V28.D2 // V28 = D[4]
  48  	VRAX1	V26.D2, V29.D2, V29.D2 // V29 = D[0]
  49  
  50  	// theta (apply D), rho and pi, fused.
      	// Lane 0 is neither rotated nor moved, so it only needs the xor with D[0].
      	// For every other lane, VXAR $n, Vm, Vn, Vd computes
      	// Vd = rotr(Vn ^ Vm, n): Vm is the D value of the lane's column, n is
      	// 64 minus the lane's left-rotation amount, and the choice of Vd
      	// performs the pi move. The instructions are ordered so that every
      	// destination is either scratch, a D register at its last use, or a
      	// register whose old lane has already been consumed.
      	// Trailing comments read: source lane -> destination lane, left rotation.
  51  	VEOR	V29.B16, V0.B16, V0.B16
  52  
  53  	VXAR	$63, V30.D2, V1.D2, V25.D2 // lane 1 -> lane 10 (held in V25), rotl 1
  54  
  55  	VXAR	$20, V30.D2, V6.D2, V1.D2 // lane 6 -> lane 1, rotl 44
  56  	VXAR	$44, V28.D2, V9.D2, V6.D2 // lane 9 -> lane 6, rotl 20
  57  	VXAR	$3, V31.D2, V22.D2, V9.D2 // lane 22 -> lane 9, rotl 61
  58  	VXAR	$25, V28.D2, V14.D2, V22.D2 // lane 14 -> lane 22, rotl 39
  59  	VXAR	$46, V29.D2, V20.D2, V14.D2 // lane 20 -> lane 14, rotl 18
  60  
  61  	VXAR	$2, V31.D2, V2.D2, V26.D2 // lane 2 -> lane 20 (held in V26), rotl 62
  62  
  63  	VXAR	$21, V31.D2, V12.D2, V2.D2 // lane 12 -> lane 2, rotl 43
  64  	VXAR	$39, V27.D2, V13.D2, V12.D2 // lane 13 -> lane 12, rotl 25
  65  	VXAR	$56, V28.D2, V19.D2, V13.D2 // lane 19 -> lane 13, rotl 8
  66  	VXAR	$8, V27.D2, V23.D2, V19.D2 // lane 23 -> lane 19, rotl 56
  67  	VXAR	$23, V29.D2, V15.D2, V23.D2 // lane 15 -> lane 23, rotl 41
  68  
  69  	VXAR	$37, V28.D2, V4.D2, V15.D2 // lane 4 -> lane 15, rotl 27
  70  
  71  	VXAR	$50, V28.D2, V24.D2, V28.D2 // lane 24 -> lane 4 (held in V28), rotl 14; last use of D[4]
  72  	VXAR	$62, V30.D2, V21.D2, V24.D2 // lane 21 -> lane 24, rotl 2
  73  	VXAR	$9, V27.D2, V8.D2, V8.D2 // lane 8 -> lane 21 (held in V8), rotl 55
  74  	VXAR	$19, V30.D2, V16.D2, V4.D2 // lane 16 -> lane 8 (held in V4), rotl 45
  75  	VXAR	$28, V29.D2, V5.D2, V16.D2 // lane 5 -> lane 16, rotl 36
  76  
  77  	VXAR	$36, V27.D2, V3.D2, V5.D2 // lane 3 -> lane 5, rotl 28
  78  
  79  	VXAR	$43, V27.D2, V18.D2, V27.D2 // lane 18 -> lane 3 (held in V27), rotl 21; last use of D[3]
  80  	VXAR	$49, V31.D2, V17.D2, V3.D2 // lane 17 -> lane 18 (held in V3), rotl 15
  81  	VXAR	$54, V30.D2, V11.D2, V30.D2 // lane 11 -> lane 17 (held in V30), rotl 10; last use of D[1]
  82  	VXAR	$58, V31.D2, V7.D2, V31.D2 // lane 7 -> lane 11 (held in V31), rotl 6; last use of D[2]
  83  	VXAR	$61, V29.D2, V10.D2, V29.D2 // lane 10 -> lane 7 (held in V29), rotl 3; last use of D[0]
  84  
      	// At this point permuted lane i is in Vi, except for ten lanes parked
      	// elsewhere: 3 in V27, 4 in V28, 7 in V29, 8 in V4, 10 in V25,
      	// 11 in V31, 17 in V30, 18 in V3, 20 in V26, 21 in V8.
      	//
  85  	// chi and iota.
      	// chi, per row of five lanes: out[x] = in[x] ^ (in[x+2] &^ in[x+1]),
      	// x taken mod 5 (&^ is and-not). VBCAX Va, Vm, Vn, Vd computes
      	// Vd = Vn ^ (Vm &^ Va). Rows are processed 4, 3, 2, 1, 0. Within a row
      	// the outputs that land in a register other than their input come
      	// first, so the in-place ones never destroy an input still needed.
      	// After chi every lane i is back in Vi.
      	//
      	// Row 4 (lanes 20-24); inputs in V26, V8, V22, V23, V24.
  86  	VBCAX	V8.B16, V22.B16, V26.B16, V20.B16
  87  	VBCAX	V22.B16, V23.B16, V8.B16, V21.B16
  88  	VBCAX	V23.B16, V24.B16, V22.B16, V22.B16
  89  	VBCAX	V24.B16, V26.B16, V23.B16, V23.B16
  90  	VBCAX	V26.B16, V8.B16, V24.B16, V24.B16
  91  
      	// V26 is free now that row 4 is done: load this round's constant into
      	// both 64-bit lanes of V26 and advance R1 by 8 to the next constant.
  92  	VLD1R.P	8(R1), [V26.D2]
  93  
      	// Row 3 (lanes 15-19); inputs in V15, V16, V30, V3, V19.
  94  	VBCAX	V3.B16, V19.B16, V30.B16, V17.B16
  95  	VBCAX	V19.B16, V15.B16, V3.B16, V18.B16
  96  	VBCAX	V15.B16, V16.B16, V19.B16, V19.B16
  97  	VBCAX	V16.B16, V30.B16, V15.B16, V15.B16
  98  	VBCAX	V30.B16, V3.B16, V16.B16, V16.B16
  99  
      	// Row 2 (lanes 10-14); inputs in V25, V31, V12, V13, V14.
 100  	VBCAX	V31.B16, V12.B16, V25.B16, V10.B16
 101  	VBCAX	V12.B16, V13.B16, V31.B16, V11.B16
 102  	VBCAX	V13.B16, V14.B16, V12.B16, V12.B16
 103  	VBCAX	V14.B16, V25.B16, V13.B16, V13.B16
 104  	VBCAX	V25.B16, V31.B16, V14.B16, V14.B16
 105  
      	// Row 1 (lanes 5-9); inputs in V5, V6, V29, V4, V9.
 106  	VBCAX	V4.B16, V9.B16, V29.B16, V7.B16
 107  	VBCAX	V9.B16, V5.B16, V4.B16, V8.B16
 108  	VBCAX	V5.B16, V6.B16, V9.B16, V9.B16
 109  	VBCAX	V6.B16, V29.B16, V5.B16, V5.B16
 110  	VBCAX	V29.B16, V4.B16, V6.B16, V6.B16
 111  
      	// Row 0 (lanes 0-4); inputs in V0, V1, V2, V27, V28.
 112  	VBCAX	V28.B16, V0.B16, V27.B16, V3.B16
 113  	VBCAX	V0.B16, V1.B16, V28.B16, V4.B16
 114  
 115  	VBCAX	V1.B16, V2.B16, V0.B16, V0.B16  // chi for lane 0; iota is applied to this value below
 116  
 117  	VBCAX	V2.B16, V27.B16, V1.B16, V1.B16
 118  	VBCAX	V27.B16, V28.B16, V2.B16, V2.B16
 119  
 120  	VEOR	V26.B16, V0.B16, V0.B16 // iota: lane 0 ^= this round's constant
 121  
 122  	SUB		$1, R2, R2
 123  	CBNZ	R2, loop
 124  
      	// Write lanes 0-24 back over the input, mirroring the load sequence.
      	// R0 was rewound to a before the loop; it is left at a+192 on return.
 125  	VST1.P	[V0.D1, V1.D1], 16(R0)
 126  	VST1.P	[V2.D1, V3.D1], 16(R0)
 127  	VST1.P	[V4.D1, V5.D1], 16(R0)
 128  	VST1.P	[V6.D1, V7.D1], 16(R0)
 129  	VST1.P	[V8.D1, V9.D1], 16(R0)
 130  	VST1.P	[V10.D1, V11.D1], 16(R0)
 131  	VST1.P	[V12.D1, V13.D1], 16(R0)
 132  	VST1.P	[V14.D1, V15.D1], 16(R0)
 133  	VST1.P	[V16.D1, V17.D1], 16(R0)
 134  	VST1.P	[V18.D1, V19.D1], 16(R0)
 135  	VST1.P	[V20.D1, V21.D1], 16(R0)
 136  	VST1.P	[V22.D1, V23.D1], 16(R0)
 137  	VST1	[V24.D1], (R0)
 138  
 139  	RET
 140  // round_consts<> is the table of 24 64-bit Keccak-f[1600] round constants.
      // keccakF1600NEON walks it front to back, loading one 8-byte entry per
      // round and xoring it into lane 0 (the iota step), so the order of the
      // entries is significant. The <> suffix makes the symbol file-local.
 141  DATA	round_consts<>+0x00(SB)/8, $0x0000000000000001 // round 0
 142  DATA	round_consts<>+0x08(SB)/8, $0x0000000000008082 // round 1
 143  DATA	round_consts<>+0x10(SB)/8, $0x800000000000808a // round 2
 144  DATA	round_consts<>+0x18(SB)/8, $0x8000000080008000 // round 3
 145  DATA	round_consts<>+0x20(SB)/8, $0x000000000000808b // round 4
 146  DATA	round_consts<>+0x28(SB)/8, $0x0000000080000001 // round 5
 147  DATA	round_consts<>+0x30(SB)/8, $0x8000000080008081 // round 6
 148  DATA	round_consts<>+0x38(SB)/8, $0x8000000000008009 // round 7
 149  DATA	round_consts<>+0x40(SB)/8, $0x000000000000008a // round 8
 150  DATA	round_consts<>+0x48(SB)/8, $0x0000000000000088 // round 9
 151  DATA	round_consts<>+0x50(SB)/8, $0x0000000080008009 // round 10
 152  DATA	round_consts<>+0x58(SB)/8, $0x000000008000000a // round 11
 153  DATA	round_consts<>+0x60(SB)/8, $0x000000008000808b // round 12
 154  DATA	round_consts<>+0x68(SB)/8, $0x800000000000008b // round 13
 155  DATA	round_consts<>+0x70(SB)/8, $0x8000000000008089 // round 14
 156  DATA	round_consts<>+0x78(SB)/8, $0x8000000000008003 // round 15
 157  DATA	round_consts<>+0x80(SB)/8, $0x8000000000008002 // round 16
 158  DATA	round_consts<>+0x88(SB)/8, $0x8000000000000080 // round 17
 159  DATA	round_consts<>+0x90(SB)/8, $0x000000000000800a // round 18
 160  DATA	round_consts<>+0x98(SB)/8, $0x800000008000000a // round 19
 161  DATA	round_consts<>+0xA0(SB)/8, $0x8000000080008081 // round 20
 162  DATA	round_consts<>+0xA8(SB)/8, $0x8000000000008080 // round 21
 163  DATA	round_consts<>+0xB0(SB)/8, $0x0000000080000001 // round 22
 164  DATA	round_consts<>+0xB8(SB)/8, $0x8000000080008008 // round 23
 165  GLOBL	round_consts<>(SB), NOPTR|RODATA, $192 // 24 entries * 8 bytes; read-only, pointer-free
 166