// gcm_arm64.s

   1  // Copyright 2018 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  //go:build !purego
   6  
#include "textflag.h"

// Data / counter blocks. In the bulk loops B0..B7 hold eight CTR blocks
// (and later eight ciphertext blocks folded into GHASH); in the singles
// paths B0 is the active block and B1..B4 double as spill slots for the
// extra AES-192/256 round keys.
#define B0 V0
#define B1 V1
#define B2 V2
#define B3 V3
#define B4 V4
#define B5 V5
#define B6 V6
#define B7 V7

// GHASH accumulators: ACC1:ACC0 hold the high/low 128-bit halves of the
// carry-less product, ACCM the Karatsuba middle term.
#define ACC0 V8
#define ACC1 V9
#define ACCM V10

// Scratch registers.
#define T0 V11
#define T1 V12
#define T2 V13
#define T3 V14

#define POLY V15 // GCM polynomial constant: {0xC2<<56, 1}
#define ZERO V16 // all-zero vector
#define INC V17  // counter increment vector ({0,0,0,1} in S lanes)
#define CTR V18  // current (byte-swapped) CTR block

// AES round keys. K0..K10 plus KLAST cover AES-128 directly; the
// AES-192/256 paths shuffle extra keys through K8..K11 (see the
// "round keys occupy" comments in gcmAesEnc/gcmAesDec).
#define K0 V19
#define K1 V20
#define K2 V21
#define K3 V22
#define K4 V23
#define K5 V24
#define K6 V25
#define K7 V26
#define K8 V27
#define K9 V28
#define K10 V29
#define K11 V30
#define KLAST V31
  45  
// reduce folds the 256-bit carry-less product held in ACC1:ACC0 (with the
// Karatsuba middle term in ACCM) back to a 128-bit GHASH value in ACC0.
// It first merges ACCM into the high/low halves, then performs two
// folding multiplications by POLY (the GF(2^128) reduction polynomial).
// Clobbers: T0, ACCM, ACC1. Result: ACC0.
#define reduce() \
	VEOR	ACC0.B16, ACCM.B16, ACCM.B16     \
	VEOR	ACC1.B16, ACCM.B16, ACCM.B16     \
	VEXT	$8, ZERO.B16, ACCM.B16, T0.B16   \
	VEXT	$8, ACCM.B16, ZERO.B16, ACCM.B16 \
	VEOR	ACCM.B16, ACC0.B16, ACC0.B16     \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEXT	$8, ACC0.B16, ACC0.B16, ACC0.B16 \
	VEOR	T0.B16, ACC0.B16, ACC0.B16       \
	VPMULL	POLY.D1, ACC0.D1, T0.Q1          \
	VEOR	T0.B16, ACC1.B16, ACC1.B16       \
	VEXT	$8, ACC1.B16, ACC1.B16, ACC1.B16 \
	VEOR	ACC1.B16, ACC0.B16, ACC0.B16     \
  60  
// func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
//
// Finalizes the GCM tag: folds the length block (bit lengths of the
// additional data and the plaintext) into the running GHASH state at *T,
// XORs in the tag mask, and writes the result back to *T.
TEXT ·gcmAesFinish(SB),NOSPLIT,$0
#define pTbl R0
#define tMsk R1
#define tPtr R2
#define plen R3
#define dlen R4

	// Build POLY = {0xC2<<56, 1} and clear ZERO.
	MOVD	$0xC2, R1
	LSL	$56, R1
	MOVD	$1, R0
	VMOV	R1, POLY.D[0]
	VMOV	R0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	tagMask+8(FP), tMsk
	MOVD	T+16(FP), tPtr
	MOVD	pLen+24(FP), plen
	MOVD	dLen+32(FP), dlen

	VLD1	(tPtr), [ACC0.B16] // current GHASH state
	VLD1	(tMsk), [B1.B16]   // tag mask (encrypted J0)

	// Convert byte counts to bit counts for the GCM length block.
	LSL	$3, plen
	LSL	$3, dlen

	VMOV	dlen, B0.D[0]
	VMOV	plen, B0.D[1]

	// H and its Karatsuba precomputation live at offset 14*16 of the table.
	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]

	VEOR	ACC0.B16, B0.B16, B0.B16

	// One Karatsuba carry-less multiply of the length block by H.
	VEXT	$8, B0.B16, B0.B16, T0.B16
	VEOR	B0.B16, T0.B16, T0.B16
	VPMULL	B0.D1, T1.D1, ACC1.Q1
	VPMULL2	B0.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1

	reduce()

	// Back to byte order, then mask to produce the final tag.
	VREV64	ACC0.B16, ACC0.B16
	VEOR	B1.B16, ACC0.B16, ACC0.B16

	VST1	[ACC0.B16], (tPtr)
	RET
#undef pTbl
#undef tMsk
#undef tPtr
#undef plen
#undef dlen
 114  
// func gcmAesInit(productTable *[256]byte, ks []uint32)
//
// Derives the GHASH key H by encrypting the zero block with the AES key
// schedule ks, then precomputes H^1..H^8 (each stored together with its
// Karatsuba fold B^i = hi(H^i) ^ lo(H^i)) into productTable, highest
// power first. len(ks) selects AES-128/192/256 via the TBZ bit tests.
TEXT ·gcmAesInit(SB),NOSPLIT,$0
#define pTbl R0
#define KS R1
#define NR R2
#define I R3
	MOVD	productTable+0(FP), pTbl
	MOVD	ks_base+8(FP), KS
	MOVD	ks_len+16(FP), NR

	// POLY = {0xC2<<56, 1}; ZERO = 0.
	MOVD	$0xC2, I
	LSL	$56, I
	VMOV	I, POLY.D[0]
	MOVD	$1, I
	VMOV	I, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16

	// Encrypt block 0 with the AES key to generate the hash key H
	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	VEOR	B0.B16, B0.B16, B0.B16
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	VLD1.P	64(KS), [T0.B16, T1.B16, T2.B16, T3.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	// Bit 4 of len(ks) clear => AES-128 (no extra rounds).
	TBZ	$4, NR, initEncFinish
	VLD1.P	32(KS), [T0.B16, T1.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	// Bit 3 clear => AES-192; otherwise AES-256 (two more rounds).
	TBZ	$3, NR, initEncFinish
	VLD1.P	32(KS), [T0.B16, T1.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	AESMC	B0.B16, B0.B16
initEncFinish:
	// Final two rounds: last AESE has no MixColumns, then XOR last key.
	VLD1	(KS), [T0.B16, T1.B16, T2.B16]
	AESE	T0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	T1.B16, B0.B16
	VEOR	T2.B16, B0.B16, B0.B16

	VREV64	B0.B16, B0.B16

	// Multiply by 2 modulo P
	VMOV	B0.D[0], I
	ASR	$63, I                 // broadcast sign bit for conditional POLY fold
	VMOV	I, T1.D[0]
	VMOV	I, T1.D[1]
	VAND	POLY.B16, T1.B16, T1.B16
	VUSHR	$63, B0.D2, T2.D2      // carry bits between the two 64-bit lanes
	VEXT	$8, ZERO.B16, T2.B16, T2.B16
	VSHL	$1, B0.D2, B0.D2
	VEOR	T1.B16, B0.B16, B0.B16
	VEOR	T2.B16, B0.B16, B0.B16 // Can avoid this when VSLI is available

	// Karatsuba pre-computation
	VEXT	$8, B0.B16, B0.B16, B1.B16
	VEOR	B0.B16, B1.B16, B1.B16

	// Store H (and its fold) at the top of the table; lower powers follow
	// downward so the bulk loops can read H^8..H^1 with ascending loads.
	ADD	$14*16, pTbl
	VST1	[B0.B16, B1.B16], (pTbl)
	SUB	$2*16, pTbl

	VMOV	B0.B16, B2.B16
	VMOV	B1.B16, B3.B16

	MOVD	$7, I

initLoop:
	// Compute powers of H
	SUBS	$1, I

	// B2 = B2 * H via Karatsuba multiply + inline reduction by POLY.
	VPMULL	B0.D1, B2.D1, T1.Q1
	VPMULL2	B0.D2, B2.D2, T0.Q1
	VPMULL	B1.D1, B3.D1, T2.Q1
	VEOR	T0.B16, T2.B16, T2.B16
	VEOR	T1.B16, T2.B16, T2.B16
	VEXT	$8, ZERO.B16, T2.B16, T3.B16
	VEXT	$8, T2.B16, ZERO.B16, T2.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VEOR	T3.B16, T1.B16, T1.B16
	VPMULL	POLY.D1, T0.D1, T2.Q1
	VEXT	$8, T0.B16, T0.B16, T0.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VPMULL	POLY.D1, T0.D1, T2.Q1
	VEXT	$8, T0.B16, T0.B16, T0.B16
	VEOR	T2.B16, T0.B16, T0.B16
	VEOR	T1.B16, T0.B16, B2.B16
	// Recompute the Karatsuba fold for the new power.
	VMOV	B2.B16, B3.B16
	VEXT	$8, B2.B16, B2.B16, B2.B16
	VEOR	B2.B16, B3.B16, B3.B16

	VST1	[B2.B16, B3.B16], (pTbl)
	SUB	$2*16, pTbl

	BNE	initLoop
	RET
#undef I
#undef NR
#undef KS
#undef pTbl
 231  
// func gcmAesData(productTable *[256]byte, data []byte, T *[16]byte)
//
// Hashes the additional authenticated data into *T. Processes 8 blocks
// at a time (octetsLoop), then single blocks (singlesLoop), then a
// zero-padded partial block. A 13-byte input (common TLS AAD size) gets
// a dedicated fast path.
TEXT ·gcmAesData(SB),NOSPLIT,$0
#define pTbl R0
#define aut R1
#define tPtr R2
#define autLen R3
#define H0 R4
#define pTblSave R5

// mulRound folds one more block X into the ACC1:ACC0/ACCM accumulators,
// multiplying by the next (descending) power of H from the table.
// Clobbers T0..T3 and advances pTbl by 32.
#define mulRound(X) \
	VLD1.P	32(pTbl), [T1.B16, T2.B16] \
	VREV64	X.B16, X.B16               \
	VEXT	$8, X.B16, X.B16, T0.B16   \
	VEOR	X.B16, T0.B16, T0.B16      \
	VPMULL	X.D1, T1.D1, T3.Q1         \
	VEOR	T3.B16, ACC1.B16, ACC1.B16 \
	VPMULL2	X.D2, T1.D2, T3.Q1         \
	VEOR	T3.B16, ACC0.B16, ACC0.B16 \
	VPMULL	T0.D1, T2.D1, T3.Q1        \
	VEOR	T3.B16, ACCM.B16, ACCM.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	data_base+8(FP), aut
	MOVD	data_len+16(FP), autLen
	MOVD	T+32(FP), tPtr

	// Start from a zero GHASH state; empty AAD stores zero and returns.
	VEOR	ACC0.B16, ACC0.B16, ACC0.B16
	CBZ	autLen, dataBail

	MOVD	$0xC2, H0
	LSL	$56, H0
	VMOV	H0, POLY.D[0]
	MOVD	$1, H0
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	MOVD	pTbl, pTblSave

	// Dispatch: 13 bytes => TLS fast path; >=128 => 8-block loop.
	CMP	$13, autLen
	BEQ	dataTLS
	CMP	$128, autLen
	BLT	startSinglesLoop
	B	octetsLoop

dataTLS:
	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	VEOR	B0.B16, B0.B16, B0.B16

	// Assemble the 13 bytes (8 + 4 + 1) into B0, rest already zero.
	MOVD	(aut), H0
	VMOV	H0, B0.D[0]
	MOVW	8(aut), H0
	VMOV	H0, B0.S[2]
	MOVB	12(aut), H0
	VMOV	H0, B0.B[12]

	MOVD	$0, autLen
	B	dataMul

octetsLoop:
		CMP	$128, autLen
		BLT	startSinglesLoop
		SUB	$128, autLen

		VLD1.P	32(aut), [B0.B16, B1.B16]

		// First block multiplied by H^8 (top of table); start fresh
		// accumulators rather than folding into previous ones.
		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		mulRound(B1)
		VLD1.P  32(aut), [B2.B16, B3.B16]
		mulRound(B2)
		mulRound(B3)
		VLD1.P  32(aut), [B4.B16, B5.B16]
		mulRound(B4)
		mulRound(B5)
		VLD1.P  32(aut), [B6.B16, B7.B16]
		mulRound(B6)
		mulRound(B7)

		MOVD	pTblSave, pTbl // rewind table for the next 8 blocks
		reduce()
	B	octetsLoop

startSinglesLoop:

	// Single-block path multiplies by H (top table entry) each time.
	ADD	$14*16, pTbl
	VLD1.P	(pTbl), [T1.B16, T2.B16]

singlesLoop:

		CMP	$16, autLen
		BLT	dataEnd
		SUB	$16, autLen

		VLD1.P	16(aut), [B0.B16]
dataMul:
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16

		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		reduce()

	B	singlesLoop

dataEnd:

	CBZ	autLen, dataBail
	VEOR	B0.B16, B0.B16, B0.B16
	ADD	autLen, aut // point past the tail, then load it backwards

dataLoadLoop:
		// Shift B0 left one byte and insert the next tail byte at the
		// bottom; result is the tail left-justified, zero padded.
		MOVB.W	-1(aut), H0
		VEXT	$15, B0.B16, ZERO.B16, B0.B16
		VMOV	H0, B0.B[0]
		SUBS	$1, autLen
		BNE	dataLoadLoop
	B	dataMul

dataBail:
	VST1	[ACC0.B16], (tPtr)
	RET

#undef pTbl
#undef aut
#undef tPtr
#undef autLen
#undef H0
#undef pTblSave
 371  
// func gcmAesEnc(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
//
// CTR-encrypts src into dst and folds the resulting ciphertext into the
// GHASH state at *T. Eight blocks are processed per iteration while the
// input lasts, then single blocks, then a masked partial block. The key
// size is selected from len(ks) via TBZ bit tests (bit 4 clear => 128;
// bit 3 clear => 192; else 256).
TEXT ·gcmAesEnc(SB),NOSPLIT,$0
#define pTbl R0
#define dstPtr R1
#define ctrPtr R2
#define srcPtr R3
#define ks R4
#define tPtr R5
#define srcPtrLen R6
#define aluCTR R7
#define aluTMP R8
#define aluK R9
#define NR R10
#define H0 R11
#define H1 R12
#define curK R13
#define pTblSave R14

// One AES round (AESE+AESMC) with key K applied to all eight blocks.
#define aesrndx8(K) \
	AESE	K.B16, B0.B16    \
	AESMC	B0.B16, B0.B16   \
	AESE	K.B16, B1.B16    \
	AESMC	B1.B16, B1.B16   \
	AESE	K.B16, B2.B16    \
	AESMC	B2.B16, B2.B16   \
	AESE	K.B16, B3.B16    \
	AESMC	B3.B16, B3.B16   \
	AESE	K.B16, B4.B16    \
	AESMC	B4.B16, B4.B16   \
	AESE	K.B16, B5.B16    \
	AESMC	B5.B16, B5.B16   \
	AESE	K.B16, B6.B16    \
	AESMC	B6.B16, B6.B16   \
	AESE	K.B16, B7.B16    \
	AESMC	B7.B16, B7.B16

// Final AES round (no MixColumns) for all eight blocks.
#define aesrndlastx8(K) \
	AESE	K.B16, B0.B16    \
	AESE	K.B16, B1.B16    \
	AESE	K.B16, B2.B16    \
	AESE	K.B16, B3.B16    \
	AESE	K.B16, B4.B16    \
	AESE	K.B16, B5.B16    \
	AESE	K.B16, B6.B16    \
	AESE	K.B16, B7.B16

	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	ks_base+72(FP), ks
	MOVD	ks_len+80(FP), NR

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	// Compute NR from len(ks)
	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4
	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen

	MOVD	ks, H0
	// For AES-128 round keys are stored in: K0 .. K10, KLAST
	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16

	BLT	startSingles // branch on the CMP above; VLD1/VMOV leave flags intact
	// There are at least 8 blocks to encrypt
	TBZ	$4, NR, octetsLoop

	// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV	K8.B16, K10.B16
	VMOV	K9.B16, K11.B16
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	TBZ	$3, NR, octetsLoop
	// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	ADD	$10*16, ks, H0
	MOVD	H0, curK // curK points at the two AES-256 keys kept in memory

octetsLoop:
		SUB	$128, srcPtrLen

		// Materialize 8 consecutive counter blocks; CTR stays in
		// incremented (byte-swapped) form, blocks get VREV32'd back.
		VMOV	CTR.B16, B0.B16
		VADD	B0.S4, INC.S4, B1.S4
		VREV32	B0.B16, B0.B16
		VADD	B1.S4, INC.S4, B2.S4
		VREV32	B1.B16, B1.B16
		VADD	B2.S4, INC.S4, B3.S4
		VREV32	B2.B16, B2.B16
		VADD	B3.S4, INC.S4, B4.S4
		VREV32	B3.B16, B3.B16
		VADD	B4.S4, INC.S4, B5.S4
		VREV32	B4.B16, B4.B16
		VADD	B5.S4, INC.S4, B6.S4
		VREV32	B5.B16, B5.B16
		VADD	B6.S4, INC.S4, B7.S4
		VREV32	B6.B16, B6.B16
		VADD	B7.S4, INC.S4, CTR.S4
		VREV32	B7.B16, B7.B16

		aesrndx8(K0)
		aesrndx8(K1)
		aesrndx8(K2)
		aesrndx8(K3)
		aesrndx8(K4)
		aesrndx8(K5)
		aesrndx8(K6)
		aesrndx8(K7)
		TBZ	$4, NR, octetsFinish
		aesrndx8(K10)
		aesrndx8(K11)
		TBZ	$3, NR, octetsFinish
		// AES-256 only: two extra round keys streamed from memory.
		VLD1.P	32(curK), [T1.B16, T2.B16]
		aesrndx8(T1)
		aesrndx8(T2)
		MOVD	H0, curK
octetsFinish:
		aesrndx8(K8)
		aesrndlastx8(K9)

		VEOR	KLAST.B16, B0.B16, B0.B16
		VEOR	KLAST.B16, B1.B16, B1.B16
		VEOR	KLAST.B16, B2.B16, B2.B16
		VEOR	KLAST.B16, B3.B16, B3.B16
		VEOR	KLAST.B16, B4.B16, B4.B16
		VEOR	KLAST.B16, B5.B16, B5.B16
		VEOR	KLAST.B16, B6.B16, B6.B16
		VEOR	KLAST.B16, B7.B16, B7.B16

		// XOR keystream with plaintext, store ciphertext.
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B0.B16, T1.B16, B0.B16
		VEOR	B1.B16, T2.B16, B1.B16
		VST1.P  [B0.B16, B1.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B2.B16, T1.B16, B2.B16
		VEOR	B3.B16, T2.B16, B3.B16
		VST1.P  [B2.B16, B3.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B4.B16, T1.B16, B4.B16
		VEOR	B5.B16, T2.B16, B5.B16
		VST1.P  [B4.B16, B5.B16], 32(dstPtr)
		VLD1.P	32(srcPtr), [T1.B16, T2.B16]
		VEOR	B6.B16, T1.B16, B6.B16
		VEOR	B7.B16, T2.B16, B7.B16
		VST1.P  [B6.B16, B7.B16], 32(dstPtr)

		// GHASH the 8 ciphertext blocks: first against H^8 ...
		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		// ... then each following block against the next lower power.
		mulRound(B1)
		mulRound(B2)
		mulRound(B3)
		mulRound(B4)
		mulRound(B5)
		mulRound(B6)
		mulRound(B7)
		MOVD	pTblSave, pTbl
		reduce()

		CMP	$128, srcPtrLen
		BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD	$128, ks
	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
	TBZ	$4, NR, singlesLoop
	// Extra AES-192/256 keys are parked in B1..B4 (safe: singles path
	// only uses B0 as the working block).
	VLD1.P	32(ks), [B1.B16, B2.B16]
	VMOV	B2.B16, KLAST.B16
	TBZ	$3, NR, singlesLoop
	VLD1.P	32(ks), [B3.B16, B4.B16]
	VMOV	B4.B16, KLAST.B16

singlesLoop:
		CMP	$16, srcPtrLen
		BLT	tail
		SUB	$16, srcPtrLen

		// Fold KLAST into the plaintext up front so the last AESE
		// needs no separate final-key XOR.
		VLD1.P	16(srcPtr), [T0.B16]
		VEOR	KLAST.B16, T0.B16, T0.B16

		VREV32	CTR.B16, B0.B16
		VADD	CTR.S4, INC.S4, CTR.S4

		AESE	K0.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K1.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K3.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K4.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K5.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K6.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K7.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K8.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K9.B16, B0.B16
		TBZ	$4, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	K10.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B1.B16, B0.B16
		TBZ	$3, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	B2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B3.B16, B0.B16
singlesLast:
		VEOR	T0.B16, B0.B16, B0.B16 // ciphertext = keystream ^ (pt ^ KLAST)
encReduce:
		VST1.P	[B0.B16], 16(dstPtr)

		// GHASH the ciphertext block against H.
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16

		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1

		reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	// Gather the 1..15 remaining bytes into T0 (left-justified) and
	// build a matching byte mask in T3, driven by the bits of the length.
	VEOR	T0.B16, T0.B16, T0.B16
	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1 // H1 = all-ones, used to extend the mask
	ADD	srcPtrLen, srcPtr

	TBZ	$3, srcPtrLen, ld4
	MOVD.W	-8(srcPtr), H0
	VMOV	H0, T0.D[0]
	VMOV	H1, T3.D[0]
ld4:
	TBZ	$2, srcPtrLen, ld2
	MOVW.W	-4(srcPtr), H0
	VEXT	$12, T0.B16, ZERO.B16, T0.B16
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.S[0]
	VMOV	H1, T3.S[0]
ld2:
	TBZ	$1, srcPtrLen, ld1
	MOVH.W	-2(srcPtr), H0
	VEXT	$14, T0.B16, ZERO.B16, T0.B16
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.H[0]
	VMOV	H1, T3.H[0]
ld1:
	TBZ	$0, srcPtrLen, ld0
	MOVB.W	-1(srcPtr), H0
	VEXT	$15, T0.B16, ZERO.B16, T0.B16
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H0, T0.B[0]
	VMOV	H1, T3.B[0]
ld0:

	MOVD	ZR, srcPtrLen // mark input consumed so encReduce loops to done
	VEOR	KLAST.B16, T0.B16, T0.B16
	VREV32	CTR.B16, B0.B16

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16

tailLast:
	VEOR	T0.B16, B0.B16, B0.B16
	VAND	T3.B16, B0.B16, B0.B16 // zero keystream bytes beyond the input
	B	encReduce

done:
	VST1	[ACC0.B16], (tPtr)
	RET
 714  
// func gcmAesDec(productTable *[256]byte, dst, src []byte, ctr, T *[16]byte, ks []uint32)
//
// CTR-decrypts src into dst and folds the CIPHERTEXT (the input) into the
// GHASH state at *T — the mirror of gcmAesEnc, which hashes its output.
// Register aliases (pTbl, dstPtr, ...) and the aesrndx8/mulRound macros
// are reused from the definitions above.
TEXT ·gcmAesDec(SB),NOSPLIT,$0
	MOVD	productTable+0(FP), pTbl
	MOVD	dst+8(FP), dstPtr
	MOVD	src_base+32(FP), srcPtr
	MOVD	src_len+40(FP), srcPtrLen
	MOVD	ctr+56(FP), ctrPtr
	MOVD	T+64(FP), tPtr
	MOVD	ks_base+72(FP), ks
	MOVD	ks_len+80(FP), NR

	MOVD	$0xC2, H1
	LSL	$56, H1
	MOVD	$1, H0
	VMOV	H1, POLY.D[0]
	VMOV	H0, POLY.D[1]
	VEOR	ZERO.B16, ZERO.B16, ZERO.B16
	// Compute NR from len(ks)
	MOVD	pTbl, pTblSave
	// Current tag, after AAD
	VLD1	(tPtr), [ACC0.B16]
	VEOR	ACC1.B16, ACC1.B16, ACC1.B16
	VEOR	ACCM.B16, ACCM.B16, ACCM.B16
	// Prepare initial counter, and the increment vector
	VLD1	(ctrPtr), [CTR.B16]
	VEOR	INC.B16, INC.B16, INC.B16
	MOVD	$1, H0
	VMOV	H0, INC.S[3]
	VREV32	CTR.B16, CTR.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	MOVD	ks, H0
	// For AES-128 round keys are stored in: K0 .. K10, KLAST
	VLD1.P	64(H0), [K0.B16, K1.B16, K2.B16, K3.B16]
	VLD1.P	64(H0), [K4.B16, K5.B16, K6.B16, K7.B16]
	VLD1.P	48(H0), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16

	// Skip to <8 blocks loop
	CMP	$128, srcPtrLen
	BLT	startSingles
	// There are at least 8 blocks to encrypt
	TBZ	$4, NR, octetsLoop

	// For AES-192 round keys occupy: K0 .. K7, K10, K11, K8, K9, KLAST
	VMOV	K8.B16, K10.B16
	VMOV	K9.B16, K11.B16
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	TBZ	$3, NR, octetsLoop
	// For AES-256 round keys occupy: K0 .. K7, K10, K11, mem, mem, K8, K9, KLAST
	VMOV	KLAST.B16, K8.B16
	VLD1.P	16(H0), [K9.B16]
	VLD1.P  16(H0), [KLAST.B16]
	ADD	$10*16, ks, H0
	MOVD	H0, curK

octetsLoop:
		SUB	$128, srcPtrLen

		// Materialize 8 counter blocks (same pattern as gcmAesEnc).
		VMOV	CTR.B16, B0.B16
		VADD	B0.S4, INC.S4, B1.S4
		VREV32	B0.B16, B0.B16
		VADD	B1.S4, INC.S4, B2.S4
		VREV32	B1.B16, B1.B16
		VADD	B2.S4, INC.S4, B3.S4
		VREV32	B2.B16, B2.B16
		VADD	B3.S4, INC.S4, B4.S4
		VREV32	B3.B16, B3.B16
		VADD	B4.S4, INC.S4, B5.S4
		VREV32	B4.B16, B4.B16
		VADD	B5.S4, INC.S4, B6.S4
		VREV32	B5.B16, B5.B16
		VADD	B6.S4, INC.S4, B7.S4
		VREV32	B6.B16, B6.B16
		VADD	B7.S4, INC.S4, CTR.S4
		VREV32	B7.B16, B7.B16

		aesrndx8(K0)
		aesrndx8(K1)
		aesrndx8(K2)
		aesrndx8(K3)
		aesrndx8(K4)
		aesrndx8(K5)
		aesrndx8(K6)
		aesrndx8(K7)
		TBZ	$4, NR, octetsFinish
		aesrndx8(K10)
		aesrndx8(K11)
		TBZ	$3, NR, octetsFinish
		VLD1.P	32(curK), [T1.B16, T2.B16]
		aesrndx8(T1)
		aesrndx8(T2)
		MOVD	H0, curK
octetsFinish:
		aesrndx8(K8)
		aesrndlastx8(K9)

		// Keystream for blocks 0/1 goes to T1/T2 so B0/B1 can receive
		// the ciphertext (which must be both output-XORed and hashed).
		VEOR	KLAST.B16, B0.B16, T1.B16
		VEOR	KLAST.B16, B1.B16, T2.B16
		VEOR	KLAST.B16, B2.B16, B2.B16
		VEOR	KLAST.B16, B3.B16, B3.B16
		VEOR	KLAST.B16, B4.B16, B4.B16
		VEOR	KLAST.B16, B5.B16, B5.B16
		VEOR	KLAST.B16, B6.B16, B6.B16
		VEOR	KLAST.B16, B7.B16, B7.B16

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B0.B16, T1.B16, T1.B16
		VEOR	B1.B16, T2.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)

		// GHASH ciphertext block 0 against H^8, then the rest against
		// descending powers via mulRound.
		VLD1.P	32(pTbl), [T1.B16, T2.B16]
		VREV64	B0.B16, B0.B16
		VEOR	ACC0.B16, B0.B16, B0.B16
		VEXT	$8, B0.B16, B0.B16, T0.B16
		VEOR	B0.B16, T0.B16, T0.B16
		VPMULL	B0.D1, T1.D1, ACC1.Q1
		VPMULL2	B0.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B2.B16, B0.B16, T1.B16
		VEOR	B3.B16, B1.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B4.B16, B0.B16, T1.B16
		VEOR	B5.B16, B1.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		VLD1.P	32(srcPtr), [B0.B16, B1.B16]
		VEOR	B6.B16, B0.B16, T1.B16
		VEOR	B7.B16, B1.B16, T2.B16
		VST1.P  [T1.B16, T2.B16], 32(dstPtr)
		mulRound(B0)
		mulRound(B1)

		MOVD	pTblSave, pTbl
		reduce()

		CMP	$128, srcPtrLen
		BGE	octetsLoop

startSingles:
	CBZ	srcPtrLen, done
	ADD	$14*16, pTbl
	// Preload H and its Karatsuba precomp
	VLD1.P	(pTbl), [T1.B16, T2.B16]
	// Preload AES round keys
	ADD	$128, ks
	VLD1.P	48(ks), [K8.B16, K9.B16, K10.B16]
	VMOV	K10.B16, KLAST.B16
	TBZ	$4, NR, singlesLoop
	// Extra AES-192/256 keys parked in B1..B4 (B0/B5 are the only
	// working blocks in this path).
	VLD1.P	32(ks), [B1.B16, B2.B16]
	VMOV	B2.B16, KLAST.B16
	TBZ	$3, NR, singlesLoop
	VLD1.P	32(ks), [B3.B16, B4.B16]
	VMOV	B4.B16, KLAST.B16

singlesLoop:
		CMP	$16, srcPtrLen
		BLT	tail
		SUB	$16, srcPtrLen

		// Keep a byte-reversed copy of the ciphertext in B5 for GHASH;
		// fold KLAST into T0 so the last AESE needs no final-key XOR.
		VLD1.P	16(srcPtr), [T0.B16]
		VREV64	T0.B16, B5.B16
		VEOR	KLAST.B16, T0.B16, T0.B16

		VREV32	CTR.B16, B0.B16
		VADD	CTR.S4, INC.S4, CTR.S4

		AESE	K0.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K1.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K3.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K4.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K5.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K6.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K7.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K8.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	K9.B16, B0.B16
		TBZ	$4, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	K10.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B1.B16, B0.B16
		TBZ	$3, NR, singlesLast
		AESMC	B0.B16, B0.B16
		AESE	B2.B16, B0.B16
		AESMC	B0.B16, B0.B16
		AESE	B3.B16, B0.B16
singlesLast:
		VEOR	T0.B16, B0.B16, B0.B16 // plaintext = keystream ^ (ct ^ KLAST)

		VST1.P	[B0.B16], 16(dstPtr)

		// GHASH the ciphertext block (B5) against H.
		VEOR	ACC0.B16, B5.B16, B5.B16
		VEXT	$8, B5.B16, B5.B16, T0.B16
		VEOR	B5.B16, T0.B16, T0.B16
		VPMULL	B5.D1, T1.D1, ACC1.Q1
		VPMULL2	B5.D2, T1.D2, ACC0.Q1
		VPMULL	T0.D1, T2.D1, ACCM.Q1
		reduce()

	B	singlesLoop
tail:
	CBZ	srcPtrLen, done

	// Generate one keystream block for the 1..15 remaining bytes.
	VREV32	CTR.B16, B0.B16
	VADD	CTR.S4, INC.S4, CTR.S4

	AESE	K0.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K1.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K3.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K4.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K5.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K6.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K7.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K8.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	K9.B16, B0.B16
	TBZ	$4, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	K10.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B1.B16, B0.B16
	TBZ	$3, NR, tailLast
	AESMC	B0.B16, B0.B16
	AESE	B2.B16, B0.B16
	AESMC	B0.B16, B0.B16
	AESE	B3.B16, B0.B16
tailLast:
	VEOR	KLAST.B16, B0.B16, B0.B16

	// Assuming it is safe to load past dstPtr due to the presence of the tag
	VLD1	(srcPtr), [B5.B16]

	VEOR	B5.B16, B0.B16, B0.B16 // B0 = tail plaintext (plus garbage past len)

	// Build an all-ones mask in T3 covering exactly srcPtrLen bytes while
	// writing out the valid plaintext bytes, driven by the length bits.
	VEOR	T3.B16, T3.B16, T3.B16
	MOVD	$0, H1
	SUB	$1, H1 // H1 = all-ones, used to extend the mask

	TBZ	$3, srcPtrLen, ld4
	VMOV	B0.D[0], H0
	MOVD.P	H0, 8(dstPtr)
	VMOV	H1, T3.D[0]
	VEXT	$8, ZERO.B16, B0.B16, B0.B16
ld4:
	TBZ	$2, srcPtrLen, ld2
	VMOV	B0.S[0], H0
	MOVW.P	H0, 4(dstPtr)
	VEXT	$12, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.S[0]
	VEXT	$4, ZERO.B16, B0.B16, B0.B16
ld2:
	TBZ	$1, srcPtrLen, ld1
	VMOV	B0.H[0], H0
	MOVH.P	H0, 2(dstPtr)
	VEXT	$14, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.H[0]
	VEXT	$2, ZERO.B16, B0.B16, B0.B16
ld1:
	TBZ	$0, srcPtrLen, ld0
	VMOV	B0.B[0], H0
	MOVB.P	H0, 1(dstPtr)
	VEXT	$15, T3.B16, ZERO.B16, T3.B16
	VMOV	H1, T3.B[0]
ld0:

	// Mask the over-read ciphertext to the valid bytes, then GHASH it.
	VAND	T3.B16, B5.B16, B5.B16
	VREV64	B5.B16, B5.B16

	VEOR	ACC0.B16, B5.B16, B5.B16
	VEXT	$8, B5.B16, B5.B16, T0.B16
	VEOR	B5.B16, T0.B16, T0.B16
	VPMULL	B5.D1, T1.D1, ACC1.Q1
	VPMULL2	B5.D2, T1.D2, ACC0.Q1
	VPMULL	T0.D1, T2.D1, ACCM.Q1
	reduce()
done:
	VST1	[ACC0.B16], (tPtr)

	RET
1024