// aes_arm64.s

// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

#include "textflag.h"
// rotInvSRows: VTBL byte-shuffle indices composing RotWord with the
// inverse of AESE's built-in ShiftRows. Permuting a word through this
// table and then running AESE with an all-zero round key (AddRoundKey
// with 0, ShiftRows, SubBytes) yields SubWord(RotWord(x)) — the AES key
// schedule "g" function — using the hardware S-box.
DATA rotInvSRows<>+0x00(SB)/8, $0x080f0205040b0e01
DATA rotInvSRows<>+0x08(SB)/8, $0x00070a0d0c030609
GLOBL rotInvSRows<>(SB), (NOPTR+RODATA), $16
// invSRows: inverse-ShiftRows indices alone (no rotation). With a
// zero-key AESE this yields plain SubWord(x); used by the AES-256 key
// schedule, which applies SubWord without RotWord every other step.
DATA invSRows<>+0x00(SB)/8, $0x0b0e0104070a0d00
DATA invSRows<>+0x08(SB)/8, $0x0306090c0f020508
GLOBL invSRows<>(SB), (NOPTR+RODATA), $16
// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
//
// Encrypts one 16-byte block: dst = AES-encrypt(src) with the expanded
// schedule at xk (nr = 10, 12 or 14 rounds; round keys stored as
// uint128s, see expandKeyAsm). AESE computes
// SubBytes(ShiftRows(state ^ key)), so each round key is consumed at
// the *start* of its round; the final round runs AESE without AESMC
// (no MixColumns) and then XORs in the last round key.
TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
	MOVD	nr+0(FP), R9            // R9  = round count
	MOVD	xk+8(FP), R10           // R10 = expanded key schedule
	MOVD	dst+16(FP), R11         // R11 = output block
	MOVD	src+24(FP), R12         // R12 = input block

	VLD1	(R12), [V0.B16]         // V0 = plaintext state

	// Dispatch on round count: AES-256/192 run 4/2 extra leading
	// rounds, then fall through to the common 10-round tail.
	CMP	$12, R9
	BLT	enc128
	BEQ	enc192
enc256:
	VLD1.P	32(R10), [V1.B16, V2.B16]
	AESE	V1.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V2.B16, V0.B16
	AESMC	V0.B16, V0.B16
enc192:
	VLD1.P	32(R10), [V3.B16, V4.B16]
	AESE	V3.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V4.B16, V0.B16
	AESMC	V0.B16, V0.B16
enc128:
	// Load the remaining 11 round keys into V5..V15 and run the last
	// 10 rounds fully unrolled.
	VLD1.P	64(R10), [V5.B16, V6.B16, V7.B16, V8.B16]
	VLD1.P	64(R10), [V9.B16, V10.B16, V11.B16, V12.B16]
	VLD1.P	48(R10), [V13.B16, V14.B16, V15.B16]
	AESE	V5.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V6.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V7.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V8.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V9.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V10.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V11.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V12.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V13.B16, V0.B16
	AESMC	V0.B16, V0.B16
	AESE	V14.B16, V0.B16         // final round: no MixColumns
	VEOR    V0.B16, V15.B16, V0.B16 // add the last round key
	VST1	[V0.B16], (R11)         // write ciphertext
	RET
  64  
// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
//
// Decrypts one 16-byte block: dst = AES-decrypt(src). Mirror of
// encryptBlockAsm using AESD (AddRoundKey, InvShiftRows, InvSubBytes)
// and AESIMC. xk must be the *decryption* schedule produced by
// expandKeyAsm: encryption round keys in reverse order with
// InvMixColumns pre-applied to the interior keys (equivalent-inverse
// cipher form), which is what lets AESD/AESIMC be chained the same way
// as AESE/AESMC.
TEXT ·decryptBlockAsm(SB),NOSPLIT,$0
	MOVD	nr+0(FP), R9            // R9  = round count
	MOVD	xk+8(FP), R10           // R10 = decryption key schedule
	MOVD	dst+16(FP), R11         // R11 = output block
	MOVD	src+24(FP), R12         // R12 = input block

	VLD1	(R12), [V0.B16]         // V0 = ciphertext state

	// Extra leading rounds for AES-256/192, then the shared tail.
	CMP	$12, R9
	BLT	dec128
	BEQ	dec192
dec256:
	VLD1.P	32(R10), [V1.B16, V2.B16]
	AESD	V1.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V2.B16, V0.B16
	AESIMC	V0.B16, V0.B16
dec192:
	VLD1.P	32(R10), [V3.B16, V4.B16]
	AESD	V3.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V4.B16, V0.B16
	AESIMC	V0.B16, V0.B16
dec128:
	// Remaining 11 round keys in V5..V15; last 10 rounds unrolled.
	VLD1.P	64(R10), [V5.B16, V6.B16, V7.B16, V8.B16]
	VLD1.P	64(R10), [V9.B16, V10.B16, V11.B16, V12.B16]
	VLD1.P	48(R10), [V13.B16, V14.B16, V15.B16]
	AESD	V5.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V6.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V7.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V8.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V9.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V10.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V11.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V12.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V13.B16, V0.B16
	AESIMC	V0.B16, V0.B16
	AESD	V14.B16, V0.B16         // final round: no InvMixColumns
	VEOR    V0.B16, V15.B16, V0.B16 // add the last round key
	VST1	[V0.B16], (R11)         // write plaintext
	RET
 115  
// func expandKeyAsm(nr int, key *byte, enc, dec *uint32) {
// Note that round keys are stored in uint128 format, not uint32
//
// Expands 'key' into nr+1 round keys at 'enc' and, when dec is
// non-nil, also writes the equivalent-inverse (decryption) schedule at
// 'dec': the encryption round keys in reverse order with InvMixColumns
// (AESIMC) applied to every key except the first and last.
//
// SubWord/RotWord are computed with a zero-key AESE after a VTBL
// permutation (rotInvSRows / invSRows) that cancels AESE's ShiftRows,
// leaving the pure S-box — see the table comments above.
TEXT ·expandKeyAsm(SB),NOSPLIT,$0
	MOVD	nr+0(FP), R8            // R8  = round count (10, 12 or 14)
	MOVD	key+8(FP), R9           // R9  = raw key bytes
	MOVD	enc+16(FP), R10         // R10 = encryption schedule out
	MOVD	dec+24(FP), R11         // R11 = decryption schedule out (may be nil)
	LDP	rotInvSRows<>(SB), (R0, R1)
	VMOV	R0, V3.D[0]             // V3 = RotWord∘InvShiftRows shuffle
	VMOV	R1, V3.D[1]
	VEOR	V0.B16, V0.B16, V0.B16 // All zeroes
	MOVW	$1, R13                 // R13 = Rcon, starts at 1
	// Dispatch on nr: bit 1 is clear only for 12 (0b1100), bit 2 is
	// set for 12 and 14 but 12 was already taken, so TBNZ catches 14;
	// nr=10 (0b1010) falls through to the AES-128 path.
	TBZ	$1, R8, ks192
	TBNZ	$2, R8, ks256
	// AES-128: copy the 4 key words out as round key 0, then derive
	// 4 words per loop iteration (R8 = 10 iterations → 44 words).
	LDPW	(R9), (R4, R5)
	LDPW	8(R9), (R6, R7)
	STPW.P	(R4, R5), 8(R10)
	STPW.P	(R6, R7), 8(R10)
	MOVW	$0x1b, R14              // Rcon reduction constant (x^8 mod m(x))
ks128Loop:
		VMOV	R7, V2.S[0]       // last word of previous round key
		VTBL	V3.B16, [V2.B16], V2.B16 // RotWord + undo AESE's ShiftRows
		AESE	V0.B16, V2.B16    // Use AES to compute the SBOX
		EORW	R13, R4           // w[i] ^= Rcon
		LSLW	$1, R13           // Compute next Rcon
		ANDSW	$0x100, R13, ZR   // did Rcon overflow GF(2^8)?
		CSELW	NE, R14, R13, R13 // Fake modulo
		SUBS	$1, R8
		VMOV	V2.S[0], R0       // R0 = SubWord(RotWord(w[i-1]))
		EORW	R0, R4            // then chain the XORs through the 4 words
		EORW	R4, R5
		EORW	R5, R6
		EORW	R6, R7
		STPW.P	(R4, R5), 8(R10)
		STPW.P	(R6, R7), 8(R10)
	BNE	ks128Loop
	CBZ	R11, ksDone       // If dec is nil we are done
	SUB	$176, R10               // rewind past the 11 keys just written
	// Decryption keys are encryption keys with InverseMixColumns applied
	// (first and last keys excepted), stored in reverse order.
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	VMOV	V0.B16, V7.B16          // enc key 0 → last dec key, no IMC
	AESIMC	V1.B16, V6.B16
	AESIMC	V2.B16, V5.B16
	AESIMC	V3.B16, V4.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V11.B16
	AESIMC	V1.B16, V10.B16
	AESIMC	V2.B16, V9.B16
	AESIMC	V3.B16, V8.B16
	VLD1	(R10), [V0.B16, V1.B16, V2.B16]
	AESIMC	V0.B16, V14.B16
	AESIMC	V1.B16, V13.B16
	VMOV	V2.B16, V12.B16         // last enc key → first dec key, no IMC
	VST1.P	[V12.B16, V13.B16, V14.B16], 48(R11)
	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
	B	ksDone
ks192:
	// AES-192: 6 key words; each iteration emits 6 words
	// (8 iterations → 52 words = 13 round keys).
	LDPW	(R9), (R2, R3)
	LDPW	8(R9), (R4, R5)
	LDPW	16(R9), (R6, R7)
	STPW.P	(R2, R3), 8(R10)
	STPW.P	(R4, R5), 8(R10)
	SUB	$4, R8                  // 12 - 4 = 8 iterations
ks192Loop:
		STPW.P	(R6, R7), 8(R10)
		VMOV	R7, V2.S[0]
		VTBL	V3.B16, [V2.B16], V2.B16
		AESE	V0.B16, V2.B16    // SubWord(RotWord(...)) via zero-key AESE
		EORW	R13, R2
		LSLW	$1, R13           // next Rcon; max 0x80 here, no reduction needed
		SUBS	$1, R8
		VMOV	V2.S[0], R0
		EORW	R0, R2
		EORW	R2, R3
		EORW	R3, R4
		EORW	R4, R5
		EORW	R5, R6
		EORW	R6, R7
		STPW.P	(R2, R3), 8(R10)
		STPW.P	(R4, R5), 8(R10)
	BNE	ks192Loop
	CBZ	R11, ksDone
	SUB	$208, R10               // rewind past the 13 keys
	// Build the reversed, InvMixColumns'd decryption schedule.
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	VMOV	V0.B16, V7.B16          // enc key 0 → last dec key, no IMC
	AESIMC	V1.B16, V6.B16
	AESIMC	V2.B16, V5.B16
	AESIMC	V3.B16, V4.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V11.B16
	AESIMC	V1.B16, V10.B16
	AESIMC	V2.B16, V9.B16
	AESIMC	V3.B16, V8.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V15.B16
	AESIMC	V1.B16, V14.B16
	AESIMC	V2.B16, V13.B16
	AESIMC	V3.B16, V12.B16
	VLD1	(R10), [V0.B16]
	VST1.P	[V0.B16], 16(R11)       // last enc key → first dec key, no IMC
	VST1.P	[V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
	B	ksDone
ks256:
	// AES-256: 8 key words; each iteration emits 8 words using two
	// S-box steps — RotWord+SubWord for the first half, SubWord only
	// (invSRows table in V4) for the second (7 iterations → 60 words).
	LDP	invSRows<>(SB), (R0, R1)
	VMOV	R0, V4.D[0]
	VMOV	R1, V4.D[1]
	LDPW	(R9), (R0, R1)
	LDPW	8(R9), (R2, R3)
	LDPW	16(R9), (R4, R5)
	LDPW	24(R9), (R6, R7)
	STPW.P	(R0, R1), 8(R10)
	STPW.P	(R2, R3), 8(R10)
	SUB	$7, R8                  // 14 - 7 = 7 iterations
ks256Loop:
		STPW.P	(R4, R5), 8(R10)
		STPW.P	(R6, R7), 8(R10)
		VMOV	R7, V2.S[0]
		VTBL	V3.B16, [V2.B16], V2.B16
		AESE	V0.B16, V2.B16    // SubWord(RotWord(...))
		EORW	R13, R0
		LSLW	$1, R13           // next Rcon; max 0x40 here, no reduction needed
		SUBS	$1, R8
		VMOV	V2.S[0], R9       // key bytes fully consumed; R9 is scratch now
		EORW	R9, R0
		EORW	R0, R1
		EORW	R1, R2
		EORW	R2, R3
		VMOV	R3, V2.S[0]
		VTBL	V4.B16, [V2.B16], V2.B16 // SubWord only, no rotation, no Rcon
		AESE	V0.B16, V2.B16
		VMOV	V2.S[0], R9
		EORW	R9, R4
		EORW	R4, R5
		EORW	R5, R6
		EORW	R6, R7
		STPW.P	(R0, R1), 8(R10)
		STPW.P	(R2, R3), 8(R10)
	BNE	ks256Loop
	CBZ	R11, ksDone
	SUB	$240, R10               // rewind past the 15 keys
	// Build the reversed, InvMixColumns'd decryption schedule.
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	VMOV	V0.B16, V7.B16          // enc key 0 → last dec key, no IMC
	AESIMC	V1.B16, V6.B16
	AESIMC	V2.B16, V5.B16
	AESIMC	V3.B16, V4.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V11.B16
	AESIMC	V1.B16, V10.B16
	AESIMC	V2.B16, V9.B16
	AESIMC	V3.B16, V8.B16
	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
	AESIMC	V0.B16, V15.B16
	AESIMC	V1.B16, V14.B16
	AESIMC	V2.B16, V13.B16
	AESIMC	V3.B16, V12.B16
	VLD1	(R10), [V0.B16, V1.B16, V2.B16]
	AESIMC	V0.B16, V18.B16
	AESIMC	V1.B16, V17.B16
	VMOV	V2.B16, V16.B16         // last enc key → first dec key, no IMC
	VST1.P	[V16.B16, V17.B16, V18.B16], 48(R11)
	VST1.P	[V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
ksDone:
	RET
 284