aes_ppc64x.s raw

   1  // Copyright 2016 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  //go:build (ppc64 || ppc64le) && !purego
   6  
   7  // Based on CRYPTOGAMS code with the following comment:
   8  // # ====================================================================
   9  // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10  // # project. The module is, however, dual licensed under OpenSSL and
  11  // # CRYPTOGAMS licenses depending on where you obtain it. For further
  12  // # details see http://www.openssl.org/~appro/cryptogams/.
  13  // # ====================================================================
  14  
  15  // Original code can be found at the link below:
  16  // https://github.com/dot-asm/cryptogams/blob/master/ppc/aesp8-ppc.pl
  17  
  18  // Some function names were changed to be consistent with Go function
  19  // names. For instance, function aes_p8_set_{en,de}crypt_key become
  20  // set{En,De}cryptKeyAsm. I also split setEncryptKeyAsm in two parts
  21  // and a new session was created (doEncryptKeyAsm). This was necessary to
  22  // avoid arguments overwriting when setDecryptKeyAsm calls setEncryptKeyAsm.
  23  // There were other modifications as well but kept the same functionality.
  24  
  25  #include "textflag.h"
  26  
  27  // For expandKeyAsm
      // GPR roles for the key-expansion routine below.
  28  #define INP     R3
  29  #define BITS    R4
  30  #define OUTENC  R5 // Pointer to next expanded encrypt key
  31  #define PTR     R6
  32  #define CNT     R7
  33  #define ROUNDS  R8
  34  #define OUTDEC  R9  // Pointer to next expanded decrypt key
  35  #define TEMP    R19
      // Vector register roles for the key schedule.
  36  #define ZERO    V0
  37  #define IN0     V1
  38  #define IN1     V2
  39  #define KEY     V3
  40  #define RCON    V4
  41  #define MASK    V5
  42  #define TMP     V6
  43  #define STAGE   V7
      // NOTE(review): OUTPERM/OUTMASK/OUTHEAD/OUTTAIL appear unused in this
      // file — presumably kept from the original CRYPTOGAMS code; confirm
      // before removing.
  44  #define OUTPERM V8
  45  #define OUTMASK V9
  46  #define OUTHEAD V10
  47  #define OUTTAIL V11
  48  
  49  // For P9 instruction emulation
  50  #define ESPERM  V21  // Endian swapping permute into BE
  51  #define TMP2    V22  // Temporary for P8_STXVB16X
  52  
  53  // For {en,de}cryptBlockAsm
  54  #define BLK_INP    R3
  55  #define BLK_OUT    R4
  56  #define BLK_KEY    R5
  57  #define BLK_ROUNDS R6
  58  #define BLK_IDX    R7
  59  
      // ·rcon layout (80 bytes):
      //   0x00: doubleword endian-swap permute (used to build ESPERM)
      //   0x10: RCON value 1, 0x20: RCON value 0x1b (round-constant doubling
      //         wraps via VADDUWM), 0x30: rotate-n-splat MASK, 0x40: zeros.
  60  DATA ·rcon+0x00(SB)/8, $0x0f0e0d0c0b0a0908 // Permute for vector doubleword endian swap
  61  DATA ·rcon+0x08(SB)/8, $0x0706050403020100
  62  DATA ·rcon+0x10(SB)/8, $0x0100000001000000 // RCON
  63  DATA ·rcon+0x18(SB)/8, $0x0100000001000000 // RCON
  64  DATA ·rcon+0x20(SB)/8, $0x1b0000001b000000
  65  DATA ·rcon+0x28(SB)/8, $0x1b0000001b000000
  66  DATA ·rcon+0x30(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
  67  DATA ·rcon+0x38(SB)/8, $0x0d0e0f0c0d0e0f0c // MASK
  68  DATA ·rcon+0x40(SB)/8, $0x0000000000000000
  69  DATA ·rcon+0x48(SB)/8, $0x0000000000000000
  70  GLOBL ·rcon(SB), RODATA, $80
  71  
      // Three configurations for BE-ordered 16-byte load/store:
      //  - ppc64le + POWER9: native byte-reversing LXVB16X/STXVB16X/XXBRD.
      //  - ppc64le + POWER8: emulate via LXVD2X/STXVD2X plus a VPERM through
      //    ESPERM (set up once by SETUP_ESPERM).
      //  - ppc64 (BE): plain LXVD2X/STXVD2X already give BE ordering.
  72  #ifdef GOARCH_ppc64le
  73  #  ifdef GOPPC64_power9
  74  #define P8_LXVB16X(RA,RB,VT)  LXVB16X	(RA+RB), VT
  75  #define P8_STXVB16X(VS,RA,RB) STXVB16X	VS, (RA+RB)
  76  #define XXBRD_ON_LE(VA,VT)    XXBRD	VA, VT
  77  #define SETUP_ESPERM(rtmp)
  78  #  else
  79  // On POWER8/ppc64le, emulate the POWER9 instructions by loading unaligned
  80  // doublewords and byte-swapping each doubleword to emulate BE load/stores.
  81  #define NEEDS_ESPERM
  82  #define P8_LXVB16X(RA,RB,VT) \
  83  	LXVD2X	(RA+RB), VT \
  84  	VPERM	VT, VT, ESPERM, VT
  85  
  86  #define P8_STXVB16X(VS,RA,RB) \
  87  	VPERM	VS, VS, ESPERM, TMP2 \
  88  	STXVD2X	TMP2, (RA+RB)
  89  
  90  #define XXBRD_ON_LE(VA,VT) \
  91  	VPERM	VA, VA, ESPERM, VT
  92  
  93  // Setup byte-swapping permute value in ESPERM for POWER9 instruction
  94  // emulation macros.
  95  #define SETUP_ESPERM(rtmp) \
  96  	MOVD	$·rcon(SB), rtmp \
  97  	LVX	(rtmp), ESPERM
  98  #  endif // defined(GOPPC64_power9)
  99  #else
 100  #define P8_LXVB16X(RA,RB,VT)  LXVD2X	(RA+RB), VT
 101  #define P8_STXVB16X(VS,RA,RB) STXVD2X	VS, (RA+RB)
 102  #define XXBRD_ON_LE(VA, VT)
 103  #define SETUP_ESPERM(rtmp)
 104  #endif // defined(GOARCH_ppc64le)
 105  
      // expandKeyAsm expands an AES key into both schedules at once:
      // the encryption schedule is stored ascending at enc, and the
      // decryption schedule is the same round keys stored descending at dec.
      // ROUNDS (nr) is 10, 12 or 14, selecting the 128/192/256-bit schedule.
 106  // func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32)
 107  TEXT ·expandKeyAsm(SB), NOSPLIT|NOFRAME, $0
 108  	// Load the arguments inside the registers
 109  	MOVD	nr+0(FP), ROUNDS
 110  	MOVD	key+8(FP), INP
 111  	MOVD	enc+16(FP), OUTENC
 112  	MOVD	dec+24(FP), OUTDEC
 113  
 114  #ifdef NEEDS_ESPERM
 115  	MOVD	$·rcon(SB), PTR // PTR points to rcon addr
 116  	LVX	(PTR), ESPERM
 117  	ADD	$0x10, PTR
 118  #else
 119  	MOVD	$·rcon+0x10(SB), PTR // PTR points to rcon addr (skipping permute vector)
 120  #endif
 121  
 122  	// Get key from memory and write aligned into VR
 123  	P8_LXVB16X(INP, R0, IN0)
 124  	ADD	$0x10, INP, INP
 125  	MOVD	$0x20, TEMP
 126  
      // CR0 from this compare selects the schedule: LT -> 10 rounds (128-bit),
      // EQ -> 12 rounds (192-bit), GT -> 14 rounds (256-bit).
 127  	CMPW	ROUNDS, $12
 128  	LVX	(PTR)(R0), RCON    // lvx   4,0,6      Load first 16 bytes into RCON
 129  	LVX	(PTR)(TEMP), MASK
 130  	ADD	$0x10, PTR, PTR    // addi  6,6,0x10   PTR to next 16 bytes of RCON
 131  	MOVD	$8, CNT            // li    7,8        CNT = 8
 132  	VXOR	ZERO, ZERO, ZERO   // vxor  0,0,0      Zero to be zero :)
 133  	MOVD	CNT, CTR           // mtctr 7          Set the counter to 8 (rounds)
 134  
 135  	// The expanded decrypt key is the expanded encrypt key stored in reverse order.
 136  	// Move OUTDEC to the last key location, and store in descending order.
 137  	ADD	$160, OUTDEC, OUTDEC
 138  	BLT	loop128
 139  	ADD	$32, OUTDEC, OUTDEC
 140  	BEQ	l192
 141  	ADD	$32, OUTDEC, OUTDEC
 142  	JMP	l256
 143  
 144  loop128:
 145  	// Key schedule (Round 1 to 8)
 146  	VPERM	IN0, IN0, MASK, KEY              // vperm 3,1,1,5         Rotate-n-splat
 147  	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
 148  	STXVD2X	IN0, (R0+OUTENC)
 149  	STXVD2X	IN0, (R0+OUTDEC)
 150  	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
 151  	ADD	$16, OUTENC, OUTENC
 152  	ADD	$-16, OUTDEC, OUTDEC
 153  
 154  	VXOR	IN0, TMP, IN0       // vxor 1,1,6
 155  	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
 156  	VXOR	IN0, TMP, IN0       // vxor 1,1,6
 157  	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
 158  	VXOR	IN0, TMP, IN0       // vxor 1,1,6
 159  	VADDUWM	RCON, RCON, RCON    // vadduwm 4,4,4
 160  	VXOR	IN0, KEY, IN0       // vxor 1,1,3
 161  	BDNZ	loop128
 162  
 163  	LVX	(PTR)(R0), RCON // lvx 4,0,6     Last two round keys
 164  
 165  	// Key schedule (Round 9)
 166  	VPERM	IN0, IN0, MASK, KEY              // vperm 3,1,1,5   Rotate-n-spat
 167  	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
 168  	STXVD2X	IN0, (R0+OUTENC)
 169  	STXVD2X	IN0, (R0+OUTDEC)
 170  	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
 171  	ADD	$16, OUTENC, OUTENC
 172  	ADD	$-16, OUTDEC, OUTDEC
 173  
 174  	// Key schedule (Round 10)
 175  	VXOR	IN0, TMP, IN0       // vxor 1,1,6
 176  	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
 177  	VXOR	IN0, TMP, IN0       // vxor 1,1,6
 178  	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
 179  	VXOR	IN0, TMP, IN0       // vxor 1,1,6
 180  	VADDUWM	RCON, RCON, RCON    // vadduwm 4,4,4
 181  	VXOR	IN0, KEY, IN0       // vxor 1,1,3
 182  
 183  	VPERM	IN0, IN0, MASK, KEY              // vperm 3,1,1,5   Rotate-n-splat
 184  	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
 185  	STXVD2X	IN0, (R0+OUTENC)
 186  	STXVD2X	IN0, (R0+OUTDEC)
 187  	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
 188  	ADD	$16, OUTENC, OUTENC
 189  	ADD	$-16, OUTDEC, OUTDEC
 190  
 191  	// Key schedule (Round 11)
 192  	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
 193  	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
 194  	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
 195  	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
 196  	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
 197  	VXOR	IN0, KEY, IN0                    // vxor 1,1,3
 198  	STXVD2X	IN0, (R0+OUTENC)
 199  	STXVD2X	IN0, (R0+OUTDEC)
 200  
 201  	RET
 202  
      // 192-bit key: the extra 8 key bytes are carried in IN1; each loop192
      // iteration produces three round keys from 1.5 input blocks.
 203  l192:
 204  	LXSDX	(INP+R0), IN1                    // Load next 8 bytes into upper half of VSR.
 205  	XXBRD_ON_LE(IN1, IN1)                    // and convert to BE ordering on LE hosts.
 206  	MOVD	$4, CNT                          // li 7,4
 207  	STXVD2X	IN0, (R0+OUTENC)
 208  	STXVD2X	IN0, (R0+OUTDEC)
 209  	ADD	$16, OUTENC, OUTENC
 210  	ADD	$-16, OUTDEC, OUTDEC
 211  	VSPLTISB	$8, KEY                  // vspltisb 3,8
 212  	MOVD	CNT, CTR                         // mtctr 7
 213  	VSUBUBM	MASK, KEY, MASK                  // vsububm 5,5,3
 214  
 215  loop192:
 216  	VPERM	IN1, IN1, MASK, KEY // vperm 3,2,2,5
 217  	VSLDOI	$12, ZERO, IN0, TMP // vsldoi 6,0,1,12
 218  	VCIPHERLAST	KEY, RCON, KEY      // vcipherlast 3,3,4
 219  
 220  	VXOR	IN0, TMP, IN0       // vxor 1,1,6
 221  	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
 222  	VXOR	IN0, TMP, IN0       // vxor 1,1,6
 223  	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
 224  	VXOR	IN0, TMP, IN0       // vxor 1,1,6
 225  
 226  	VSLDOI	$8, ZERO, IN1, STAGE  // vsldoi 7,0,2,8
 227  	VSPLTW	$3, IN0, TMP          // vspltw 6,1,3
 228  	VXOR	TMP, IN1, TMP         // vxor 6,6,2
 229  	VSLDOI	$12, ZERO, IN1, IN1   // vsldoi 2,0,2,12
 230  	VADDUWM	RCON, RCON, RCON      // vadduwm 4,4,4
 231  	VXOR	IN1, TMP, IN1         // vxor 2,2,6
 232  	VXOR	IN0, KEY, IN0         // vxor 1,1,3
 233  	VXOR	IN1, KEY, IN1         // vxor 2,2,3
 234  	VSLDOI	$8, STAGE, IN0, STAGE // vsldoi 7,7,1,8
 235  
 236  	VPERM	IN1, IN1, MASK, KEY              // vperm 3,2,2,5
 237  	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
 238  	STXVD2X	STAGE, (R0+OUTENC)
 239  	STXVD2X	STAGE, (R0+OUTDEC)
 240  	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
 241  	ADD	$16, OUTENC, OUTENC
 242  	ADD	$-16, OUTDEC, OUTDEC
 243  
 244  	VSLDOI	$8, IN0, IN1, STAGE              // vsldoi 7,1,2,8
 245  	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
 246  	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
 247  	STXVD2X	STAGE, (R0+OUTENC)
 248  	STXVD2X	STAGE, (R0+OUTDEC)
 249  	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
 250  	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
 251  	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
 252  	ADD	$16, OUTENC, OUTENC
 253  	ADD	$-16, OUTDEC, OUTDEC
 254  
 255  	VSPLTW	$3, IN0, TMP                     // vspltw 6,1,3
 256  	VXOR	TMP, IN1, TMP                    // vxor 6,6,2
 257  	VSLDOI	$12, ZERO, IN1, IN1              // vsldoi 2,0,2,12
 258  	VADDUWM	RCON, RCON, RCON                 // vadduwm 4,4,4
 259  	VXOR	IN1, TMP, IN1                    // vxor 2,2,6
 260  	VXOR	IN0, KEY, IN0                    // vxor 1,1,3
 261  	VXOR	IN1, KEY, IN1                    // vxor 2,2,3
 262  	STXVD2X	IN0, (R0+OUTENC)
 263  	STXVD2X	IN0, (R0+OUTDEC)
 264  	ADD	$16, OUTENC, OUTENC
 265  	ADD	$-16, OUTDEC, OUTDEC
 266  	BDNZ	loop192
 267  
 268  	RET
 269  
      // 256-bit key: second 16 key bytes live in IN1; odd round keys need an
      // extra SubWord step (VSBOX) per the AES-256 schedule.
 270  l256:
 271  	P8_LXVB16X(INP, R0, IN1)
 272  	MOVD	$7, CNT                          // li 7,7
 273  	STXVD2X	IN0, (R0+OUTENC)
 274  	STXVD2X	IN0, (R0+OUTDEC)
 275  	ADD	$16, OUTENC, OUTENC
 276  	ADD	$-16, OUTDEC, OUTDEC
 277  	MOVD	CNT, CTR                         // mtctr 7
 278  
 279  loop256:
 280  	VPERM	IN1, IN1, MASK, KEY              // vperm 3,2,2,5
 281  	VSLDOI	$12, ZERO, IN0, TMP              // vsldoi 6,0,1,12
 282  	STXVD2X	IN1, (R0+OUTENC)
 283  	STXVD2X	IN1, (R0+OUTDEC)
 284  	VCIPHERLAST	KEY, RCON, KEY           // vcipherlast 3,3,4
 285  	ADD	$16, OUTENC, OUTENC
 286  	ADD	$-16, OUTDEC, OUTDEC
 287  
 288  	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
 289  	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
 290  	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
 291  	VSLDOI	$12, ZERO, TMP, TMP              // vsldoi 6,0,6,12
 292  	VXOR	IN0, TMP, IN0                    // vxor 1,1,6
 293  	VADDUWM	RCON, RCON, RCON                 // vadduwm 4,4,4
 294  	VXOR	IN0, KEY, IN0                    // vxor 1,1,3
 295  	STXVD2X	IN0, (R0+OUTENC)
 296  	STXVD2X	IN0, (R0+OUTDEC)
 297  	ADD	$16, OUTENC, OUTENC
 298  	ADD	$-16, OUTDEC, OUTDEC
 299  	BDZ	done
 300  
 301  	VSPLTW	$3, IN0, KEY        // vspltw 3,1,3
 302  	VSLDOI	$12, ZERO, IN1, TMP // vsldoi 6,0,2,12
 303  	VSBOX	KEY, KEY            // vsbox 3,3
 304  
 305  	VXOR	IN1, TMP, IN1       // vxor 2,2,6
 306  	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
 307  	VXOR	IN1, TMP, IN1       // vxor 2,2,6
 308  	VSLDOI	$12, ZERO, TMP, TMP // vsldoi 6,0,6,12
 309  	VXOR	IN1, TMP, IN1       // vxor 2,2,6
 310  
 311  	VXOR	IN1, KEY, IN1 // vxor 2,2,3
 312  	JMP	loop256       // b .Loop256
 313  
 314  done:
 315  	RET
 316  
 317  // func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
      // Encrypts one 16-byte block: dst = AES-Encrypt(xk, src).
      // nr (10/12/14) selects how many rounds are applied via CR1/CR2/CR3.
 318  TEXT ·encryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
 319  	MOVD	nr+0(FP), R6   // Round count/Key size
 320  	MOVD	xk+8(FP), R5   // Key pointer
 321  	MOVD	dst+16(FP), R3 // Dest pointer
 322  	MOVD	src+24(FP), R4 // Src pointer
 323  	SETUP_ESPERM(R7)
 324  
 325  	// Set CR{1,2,3}EQ to hold the key size information.
 326  	CMPU	R6, $10, CR1
 327  	CMPU	R6, $12, CR2
 328  	CMPU	R6, $14, CR3
 329  
      // Constant byte offsets (in units of 16B round keys) reused below.
 330  	MOVD	$16, R6
 331  	MOVD	$32, R7
 332  	MOVD	$48, R8
 333  	MOVD	$64, R9
 334  	MOVD	$80, R10
 335  	MOVD	$96, R11
 336  	MOVD	$112, R12
 337  
 338  	// Load text in BE order
 339  	P8_LXVB16X(R4, R0, V0)
 340  
 341  	// V1, V2 will hold keys, V0 is a temp.
 342  	// At completion, V2 will hold the ciphertext.
 343  	// Load xk[0:3] and xor with text
 344  	LXVD2X	(R0+R5), V1
 345  	VXOR	V0, V1, V0
 346  
 347  	// Load xk[4:11] and cipher
 348  	LXVD2X	(R6+R5), V1
 349  	LXVD2X	(R7+R5), V2
 350  	VCIPHER	V0, V1, V0
 351  	VCIPHER	V0, V2, V0
 352  
 353  	// Load xk[12:19] and cipher
 354  	LXVD2X	(R8+R5), V1
 355  	LXVD2X	(R9+R5), V2
 356  	VCIPHER	V0, V1, V0
 357  	VCIPHER	V0, V2, V0
 358  
 359  	// Load xk[20:27] and cipher
 360  	LXVD2X	(R10+R5), V1
 361  	LXVD2X	(R11+R5), V2
 362  	VCIPHER	V0, V1, V0
 363  	VCIPHER	V0, V2, V0
 364  
 365  	// Increment xk pointer to reuse constant offsets in R6-R12.
 366  	ADD	$112, R5
 367  
 368  	// Load xk[28:35] and cipher
 369  	LXVD2X	(R0+R5), V1
 370  	LXVD2X	(R6+R5), V2
 371  	VCIPHER	V0, V1, V0
 372  	VCIPHER	V0, V2, V0
 373  
 374  	// Load xk[36:43] and cipher
 375  	LXVD2X	(R7+R5), V1
 376  	LXVD2X	(R8+R5), V2
 377  	BEQ	CR1, Lenc_tail // Key size 10?
 378  	VCIPHER	V0, V1, V0
 379  	VCIPHER	V0, V2, V0
 380  
 381  	// Load xk[44:51] and cipher
 382  	LXVD2X	(R9+R5), V1
 383  	LXVD2X	(R10+R5), V2
 384  	BEQ	CR2, Lenc_tail // Key size 12?
 385  	VCIPHER	V0, V1, V0
 386  	VCIPHER	V0, V2, V0
 387  
 388  	// Load xk[52:59] and cipher
 389  	LXVD2X	(R11+R5), V1
 390  	LXVD2X	(R12+R5), V2
 391  	BNE	CR3, Linvalid_key_len // Not key size 14?
 392  	// Fallthrough to final cipher
 393  
 394  Lenc_tail:
 395  	// Cipher last two keys such that key information is
 396  	// cleared from V1 and V2.
 397  	VCIPHER		V0, V1, V1
 398  	VCIPHERLAST	V1, V2, V2
 399  
 400  	// Store the result in BE order.
 401  	P8_STXVB16X(V2, R3, R0)
 402  	RET
 403  
 404  Linvalid_key_len:
 405  	// Segfault, this should never happen. Only 3 keys sizes are created/used.
 406  	MOVD	R0, 0(R0)
 407  	RET
 408  
 409  // func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
      // Decrypts one 16-byte block: dst = AES-Decrypt(xk, src).
      // xk is the decrypt schedule (round keys in reverse order, as written
      // by expandKeyAsm above), so it is consumed in ascending memory order
      // here with VNCIPHER/VNCIPHERLAST.
 410  TEXT ·decryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
 411  	MOVD	nr+0(FP), R6   // Round count/Key size
 412  	MOVD	xk+8(FP), R5   // Key pointer
 413  	MOVD	dst+16(FP), R3 // Dest pointer
 414  	MOVD	src+24(FP), R4 // Src pointer
 415  	SETUP_ESPERM(R7)
 416  
 417  	// Set CR{1,2,3}EQ to hold the key size information.
 418  	CMPU	R6, $10, CR1
 419  	CMPU	R6, $12, CR2
 420  	CMPU	R6, $14, CR3
 421  
      // Constant byte offsets reused for round-key loads below.
 422  	MOVD	$16, R6
 423  	MOVD	$32, R7
 424  	MOVD	$48, R8
 425  	MOVD	$64, R9
 426  	MOVD	$80, R10
 427  	MOVD	$96, R11
 428  	MOVD	$112, R12
 429  
 430  	// Load text in BE order
 431  	P8_LXVB16X(R4, R0, V0)
 432  
 433  	// V1, V2 will hold keys, V0 is a temp.
 434  	// At completion, V2 will hold the text.
 435  	// Load xk[0:3] and xor with ciphertext
 436  	LXVD2X	(R0+R5), V1
 437  	VXOR	V0, V1, V0
 438  
 439  	// Load xk[4:11] and cipher
 440  	LXVD2X	(R6+R5), V1
 441  	LXVD2X	(R7+R5), V2
 442  	VNCIPHER	V0, V1, V0
 443  	VNCIPHER	V0, V2, V0
 444  
 445  	// Load xk[12:19] and cipher
 446  	LXVD2X	(R8+R5), V1
 447  	LXVD2X	(R9+R5), V2
 448  	VNCIPHER	V0, V1, V0
 449  	VNCIPHER	V0, V2, V0
 450  
 451  	// Load xk[20:27] and cipher
 452  	LXVD2X	(R10+R5), V1
 453  	LXVD2X	(R11+R5), V2
 454  	VNCIPHER	V0, V1, V0
 455  	VNCIPHER	V0, V2, V0
 456  
 457  	// Increment xk pointer to reuse constant offsets in R6-R12.
 458  	ADD	$112, R5
 459  
 460  	// Load xk[28:35] and cipher
 461  	LXVD2X	(R0+R5), V1
 462  	LXVD2X	(R6+R5), V2
 463  	VNCIPHER	V0, V1, V0
 464  	VNCIPHER	V0, V2, V0
 465  
 466  	// Load xk[36:43] and cipher
 467  	LXVD2X	(R7+R5), V1
 468  	LXVD2X	(R8+R5), V2
 469  	BEQ	CR1, Ldec_tail // Key size 10?
 470  	VNCIPHER	V0, V1, V0
 471  	VNCIPHER	V0, V2, V0
 472  
 473  	// Load xk[44:51] and cipher
 474  	LXVD2X	(R9+R5), V1
 475  	LXVD2X	(R10+R5), V2
 476  	BEQ	CR2, Ldec_tail // Key size 12?
 477  	VNCIPHER	V0, V1, V0
 478  	VNCIPHER	V0, V2, V0
 479  
 480  	// Load xk[52:59] and cipher
 481  	LXVD2X	(R11+R5), V1
 482  	LXVD2X	(R12+R5), V2
 483  	BNE	CR3, Linvalid_key_len // Not key size 14?
 484  	// Fallthrough to final cipher
 485  
 486  Ldec_tail:
 487  	// Cipher last two keys such that key information is
 488  	// cleared from V1 and V2.
 489  	VNCIPHER	V0, V1, V1
 490  	VNCIPHERLAST	V1, V2, V2
 491  
 492  	// Store the result in BE order.
 493  	P8_STXVB16X(V2, R3, R0)
 494  	RET
 495  
 496  Linvalid_key_len:
 497  	// Segfault, this should never happen. Only 3 keys sizes are created/used.
 498  	MOVD	R0, 0(R0)
 499  	RET
 500  
 501  // Remove defines from above so they can be defined here
 502  #undef INP
 503  #undef OUTENC
 504  #undef ROUNDS
 505  #undef KEY
 506  #undef TMP
 507  
      // Register roles for the CBC routine (cryptBlocksChain) below.
 508  #define INP R3
 509  #define OUTP R4
 510  #define LEN R5
 511  #define KEYP R6
 512  #define ROUNDS R7
 513  #define IVP R8
 514  #define ENC R9
 515  
 516  #define INOUT V2
 517  #define TMP V3
 518  #define IVEC V4
 519  
 520  // Load the crypt key into VSRs.
 521  //
 522  // The expanded key is stored and loaded using
 523  // STXVD2X/LXVD2X. The in-memory byte ordering
 524  // depends on the endianness of the machine. The
 525  // expanded keys are generated by expandKeyAsm above.
 526  //
 527  // Rkeyp holds the key pointer. It is clobbered. Once
 528  // the expanded keys are loaded, it is not needed.
 529  //
 530  // R12,R14-R21 are scratch registers.
 531  // For keyp of 10, V6, V11-V20 hold the expanded key.
 532  // For keyp of 12, V6, V9-V20 hold the expanded key.
 533  // For keyp of 14, V6, V7-V20 hold the expanded key.
      // ("keyp of N" above means round count nr = N, as encoded in
      // CR1EQ (nr==10) / CR2EQ (nr==12) by the caller before this macro.)
 534  #define LOAD_KEY(Rkeyp) \
 535  	MOVD	$16, R12 \
 536  	MOVD	$32, R14 \
 537  	MOVD	$48, R15 \
 538  	MOVD	$64, R16 \
 539  	MOVD	$80, R17 \
 540  	MOVD	$96, R18 \
 541  	MOVD	$112, R19 \
 542  	MOVD	$128, R20 \
 543  	MOVD	$144, R21 \
 544  	LXVD2X	(R0+Rkeyp), V6 \
 545  	ADD	$16, Rkeyp \
 546  	BEQ	CR1, L_start10 \
 547  	BEQ	CR2, L_start12 \
 548  	LXVD2X	(R0+Rkeyp), V7 \
 549  	LXVD2X	(R12+Rkeyp), V8 \
 550  	ADD	$32, Rkeyp \
 551  	L_start12: \
 552  	LXVD2X	(R0+Rkeyp), V9 \
 553  	LXVD2X	(R12+Rkeyp), V10 \
 554  	ADD	$32, Rkeyp \
 555  	L_start10: \
 556  	LXVD2X	(R0+Rkeyp), V11 \
 557  	LXVD2X	(R12+Rkeyp), V12 \
 558  	LXVD2X	(R14+Rkeyp), V13 \
 559  	LXVD2X	(R15+Rkeyp), V14 \
 560  	LXVD2X	(R16+Rkeyp), V15 \
 561  	LXVD2X	(R17+Rkeyp), V16 \
 562  	LXVD2X	(R18+Rkeyp), V17 \
 563  	LXVD2X	(R19+Rkeyp), V18 \
 564  	LXVD2X	(R20+Rkeyp), V19 \
 565  	LXVD2X	(R21+Rkeyp), V20
 566  
 567  // Perform aes cipher operation for keysize 10/12/14 using the keys
 568  // loaded by LOAD_KEY, and key size information held in CR1EQ/CR2EQ.
 569  //
 570  // Vxor is ideally V6 (Key[0-3]), but for slightly improved encrypting
 571  // performance V6 and IVEC can be swapped (xor is both associative and
 572  // commutative) during encryption:
 573  //
 574  //	VXOR INOUT, IVEC, INOUT
 575  //	VXOR INOUT, V6, INOUT
 576  //
 577  //	into
 578  //
 579  //	VXOR INOUT, V6, INOUT
 580  //	VXOR INOUT, IVEC, INOUT
 581  //
      // label10/label12 must be unique per expansion site; they are jump
      // targets that skip the extra rounds for the smaller key sizes.
      // NOTE(review): the last macro line below ends in a stray '\' that
      // continues onto the following blank line — harmless, kept as-is.
 582  #define CIPHER_BLOCK(Vin, Vxor, Vout, vcipher, vciphel, label10, label12) \
 583  	VXOR	Vin, Vxor, Vout \
 584  	BEQ	CR1, label10 \
 585  	BEQ	CR2, label12 \
 586  	vcipher	Vout, V7, Vout \
 587  	vcipher	Vout, V8, Vout \
 588  	label12: \
 589  	vcipher	Vout, V9, Vout \
 590  	vcipher	Vout, V10, Vout \
 591  	label10: \
 592  	vcipher	Vout, V11, Vout \
 593  	vcipher	Vout, V12, Vout \
 594  	vcipher	Vout, V13, Vout \
 595  	vcipher	Vout, V14, Vout \
 596  	vcipher	Vout, V15, Vout \
 597  	vcipher	Vout, V16, Vout \
 598  	vcipher	Vout, V17, Vout \
 599  	vcipher	Vout, V18, Vout \
 600  	vcipher	Vout, V19, Vout \
 601  	vciphel	Vout, V20, Vout \
 602  
      // Zero V6-V20 so expanded key material does not linger in registers.
 603  #define CLEAR_KEYS() \
 604  	VXOR	V6, V6, V6 \
 605  	VXOR	V7, V7, V7 \
 606  	VXOR	V8, V8, V8 \
 607  	VXOR	V9, V9, V9 \
 608  	VXOR	V10, V10, V10 \
 609  	VXOR	V11, V11, V11 \
 610  	VXOR	V12, V12, V12 \
 611  	VXOR	V13, V13, V13 \
 612  	VXOR	V14, V14, V14 \
 613  	VXOR	V15, V15, V15 \
 614  	VXOR	V16, V16, V16 \
 615  	VXOR	V17, V17, V17 \
 616  	VXOR	V18, V18, V18 \
 617  	VXOR	V19, V19, V19 \
 618  	VXOR	V20, V20, V20
 619  
 620  //func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int, nr int)
      // CBC mode over length bytes (a multiple of 16): encrypts when enc != 0,
      // decrypts otherwise, and writes the final IV back through IVP.
 621  TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
 622  	MOVD	src+0(FP), INP
 623  	MOVD	dst+8(FP), OUTP
 624  	MOVD	length+16(FP), LEN
 625  	MOVD	key+24(FP), KEYP
 626  	MOVD	iv+32(FP), IVP
 627  	MOVD	enc+40(FP), ENC
 628  	MOVD	nr+48(FP), ROUNDS
 629  
 630  	SETUP_ESPERM(R11)
 631  
 632  	// Assume len > 0 && len % blockSize == 0.
 633  	CMPW	ENC, $0
 634  	P8_LXVB16X(IVP, R0, IVEC)
 635  	CMPU	ROUNDS, $10, CR1
 636  	CMPU	ROUNDS, $12, CR2 // Only sizes 10/12/14 are supported.
 637  
 638  	// Setup key in VSRs, and set loop count in CTR.
 639  	LOAD_KEY(KEYP)
 640  	SRD	$4, LEN
 641  	MOVD	LEN, CTR
 642  
 643  	BEQ	Lcbc_dec
 644  
 645  	PCALIGN $16
      // Encrypt: note V6 (key[0:3]) is xor-ed in before CIPHER_BLOCK and
      // IVEC is passed as Vxor — the swap described above CIPHER_BLOCK.
 646  Lcbc_enc:
 647  	P8_LXVB16X(INP, R0, INOUT)
 648  	ADD	$16, INP
 649  	VXOR	INOUT, V6, INOUT
 650  	CIPHER_BLOCK(INOUT, IVEC, INOUT, VCIPHER, VCIPHERLAST, Lcbc_enc10, Lcbc_enc12)
 651  	VOR	INOUT, INOUT, IVEC // ciphertext (INOUT) is IVEC for next block.
 652  	P8_STXVB16X(INOUT, OUTP, R0)
 653  	ADD	$16, OUTP
 654  	BDNZ	Lcbc_enc
 655  
 656  	P8_STXVB16X(INOUT, IVP, R0)
 657  	CLEAR_KEYS()
 658  	RET
 659  
 660  	PCALIGN $16
      // Decrypt: keep the input ciphertext in TMP so it can become the next
      // block's IV after the output xor.
 661  Lcbc_dec:
 662  	P8_LXVB16X(INP, R0, TMP)
 663  	ADD	$16, INP
 664  	CIPHER_BLOCK(TMP, V6, INOUT, VNCIPHER, VNCIPHERLAST, Lcbc_dec10, Lcbc_dec12)
 665  	VXOR	INOUT, IVEC, INOUT
 666  	VOR	TMP, TMP, IVEC // TMP is IVEC for next block.
 667  	P8_STXVB16X(INOUT, OUTP, R0)
 668  	ADD	$16, OUTP
 669  	BDNZ	Lcbc_dec
 670  
 671  	P8_STXVB16X(IVEC, IVP, R0)
 672  	CLEAR_KEYS()
 673  	RET
 674  
 675  
      // CTR helpers: DOn_CIPHER applies one cipher step (op) with the next
      // round key to n counter blocks at once. The round key is loaded into
      // keyv from (key), and key is advanced by 16 — so each expansion
      // consumes one round key.
 676  #define DO1_CIPHER(iv0, keyv, key, op) \
 677  	LXVD2X	(key), keyv   \
 678  	ADD	$16, key      \
 679  	op	iv0, keyv, iv0
 680  
 681  #define DO2_CIPHER(iv0, iv1, keyv, key, op) \
 682  	DO1_CIPHER(iv0, keyv, key, op) \
 683  	op	iv1, keyv, iv1
 684  
 685  #define DO4_CIPHER(iv0, iv1, iv2, iv3, keyv, key, op) \
 686  	DO2_CIPHER(iv0, iv1, keyv, key, op) \
 687  	op	iv2, keyv, iv2              \
 688  	op	iv3, keyv, iv3
 689  
      // Clobbers V8 as a scratch for the keystream xor.
 690  #define DO8_CIPHER(iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7, keyv, key, op) \
 691  	DO4_CIPHER(iv0, iv1, iv2, iv3, keyv, key, op) \
 692  	op	iv4, keyv, iv4                        \
 693  	op	iv5, keyv, iv5                        \
 694  	op	iv6, keyv, iv6                        \
 695  	op	iv7, keyv, iv7
 696  
 697  #define XOR_STORE(src, iv, dstp, dstpoff) \
 698  	XXLXOR    src, iv, V8 \
 699  	P8_STXVB16X(V8,dstp,dstpoff)
 700  
 701  //func ctrBlocks1Asm(nr int, xk *[60]uint32, dst, src *[1 * BlockSize]byte, ivlo, ivhi uint64)
      // CTR mode, one block: dst = src XOR AES-Encrypt(xk, {ivhi,ivlo}).
 702  TEXT ·ctrBlocks1Asm(SB), NOSPLIT|NOFRAME, $0
 703  
      // Shared prologue for the ctrBlocks*Asm routines: loads the arguments,
      // assembles the 128-bit counter {ivhi, ivlo} into V0 (MTVSRD places
      // each GPR in doubleword 0; XXPERMDI merges them), and compares nr
      // with 12 into CR1 (LT = 10 rounds, EQ = 12, GT = 14).
 704  #define CTRBLOCK_PROLOGUE \
 705  	MOVD	nr+0(FP), R3     \
 706  	MOVD	xk+8(FP), R4     \
 707  	MOVD	dst+16(FP), R5   \
 708  	MOVD	src+24(FP), R6   \
 709  	MOVD	ivlo+32(FP), R8  \
 710  	MOVD	ivhi+40(FP), R9  \
 711  	CMP	R3, $12, CR1     \
 712  	MTVSRD	R8, V0		 \
 713  	MTVSRD	R9, V1		 \
 714  	XXPERMDI V1, V0, $0, V0	 \
 715  	SETUP_ESPERM(R8)
 716  
 717  	CTRBLOCK_PROLOGUE
 718  
      // Round 0: xor in key[0:3]; R4 advances by 16 per DO1_CIPHER.
 719  	DO1_CIPHER(V0,V8,R4,VXOR)
 720  
 721  	BEQ	CR1, key_12
 722  	BLT	CR1, key_10
 723  key_14:
 724  	DO1_CIPHER(V0,V8,R4,VCIPHER)
 725  	DO1_CIPHER(V0,V8,R4,VCIPHER)
 726  key_12:
 727  	DO1_CIPHER(V0,V8,R4,VCIPHER)
 728  	DO1_CIPHER(V0,V8,R4,VCIPHER)
 729  key_10:
      // Load the source block (V9) while the cipher rounds proceed.
 730  	P8_LXVB16X(R6,R0,V9)
 731  	DO1_CIPHER(V0,V8,R4,VCIPHER)
 732  	DO1_CIPHER(V0,V8,R4,VCIPHER)
 733  	DO1_CIPHER(V0,V8,R4,VCIPHER)
 734  	DO1_CIPHER(V0,V8,R4,VCIPHER)
 735  
 736  	DO1_CIPHER(V0,V8,R4,VCIPHER)
 737  	DO1_CIPHER(V0,V8,R4,VCIPHER)
 738  	DO1_CIPHER(V0,V8,R4,VCIPHER)
 739  	DO1_CIPHER(V0,V8,R4,VCIPHER)
 740  
 741  	DO1_CIPHER(V0,V8,R4,VCIPHER)
 742  	DO1_CIPHER(V0,V8,R4,VCIPHERLAST)
 743  
 744  	XOR_STORE(V9,V0,R5,R0)
 745  	RET
 746  
 747  //func ctrBlocks2Asm(nr int, xk *[60]uint32, dst, src *[2 * BlockSize]byte, ivlo, ivhi uint64)
      // CTR mode, two blocks with counters IV and IV+1.
 748  TEXT ·ctrBlocks2Asm(SB), NOSPLIT|NOFRAME, $0
 749  	CTRBLOCK_PROLOGUE
 750  
 751  	XXLEQV  V8, V8, V8	// V8 is -1
 752  	VSUBUQM V0, V8, V1	// Vi = IV + i (as IV - (-1))
 753  
 754  	DO2_CIPHER(V0,V1,V8,R4,VXOR)
 755  
 756  	BEQ	CR1, key_12
 757  	BLT	CR1, key_10
 758  key_14:
 759  	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
 760  	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
 761  key_12:
 762  	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
 763  	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
 764  key_10:
      // Source block loads (V9, V10) are interleaved with the cipher rounds.
 765  	P8_LXVB16X(R6,R0,V9)
 766  	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
 767  	MOVD	$16, R8
 768  	P8_LXVB16X(R6,R8,V10)
 769  	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
 770  	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
 771  	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
 772  	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
 773  	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
 774  	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
 775  	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
 776  	DO2_CIPHER(V0,V1,V8,R4,VCIPHER)
 777  	DO2_CIPHER(V0,V1,V8,R4,VCIPHERLAST)
 778  
 779  	XOR_STORE(V9,V0,R5,R0)
 780  	XOR_STORE(V10,V1,R5,R8)
 781  
 782  	RET
 783  
 784  //func ctrBlocks4Asm(nr int, xk *[60]uint32, dst, src *[4 * BlockSize]byte, ivlo, ivhi uint64)
      // CTR mode, four blocks with counters IV .. IV+3.
 785  TEXT ·ctrBlocks4Asm(SB), NOSPLIT|NOFRAME, $0
 786  	CTRBLOCK_PROLOGUE
 787  
 788  	XXLEQV  V8, V8, V8	// V8 is -1
 789  	VSUBUQM V0, V8, V1	// Vi = IV + i (as IV - (-1))
 790  	VSUBUQM V1, V8, V2
 791  	VSUBUQM V2, V8, V3
 792  
 793  	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VXOR)
 794  
 795  	BEQ	CR1, key_12
 796  	BLT	CR1, key_10
 797  key_14:
 798  	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
 799  	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
 800  key_12:
 801  	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
 802  	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
 803  key_10:
      // Source block loads (V9-V12) are interleaved with the cipher rounds.
 804  	P8_LXVB16X(R6,R0,V9)
 805  	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
 806  	MOVD	$16, R8
 807  	P8_LXVB16X(R6,R8,V10)
 808  	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
 809  	MOVD	$32, R9
 810  	P8_LXVB16X(R6,R9,V11)
 811  	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
 812  	MOVD	$48, R10
 813  	P8_LXVB16X(R6,R10,V12)
 814  	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
 815  	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
 816  	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
 817  	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
 818  	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
 819  	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHER)
 820  	DO4_CIPHER(V0,V1,V2,V3,V8,R4,VCIPHERLAST)
 821  
 822  	XOR_STORE(V9,V0,R5,R0)
 823  	XOR_STORE(V10,V1,R5,R8)
 824  	XOR_STORE(V11,V2,R5,R9)
 825  	XOR_STORE(V12,V3,R5,R10)
 826  
 827  	RET
 828  
 829  //func ctrBlocks8Asm(nr int, xk *[60]uint32, dst, src *[8 * BlockSize]byte, ivlo, ivhi uint64)
      // CTR mode, eight blocks with counters IV .. IV+7 in V0-V7.
 830  TEXT ·ctrBlocks8Asm(SB), NOSPLIT|NOFRAME, $0
 831  	CTRBLOCK_PROLOGUE
 832  
 833  	XXLEQV  V8, V8, V8	// V8 is -1
 834  	VSUBUQM V0, V8, V1	// Vi = IV + i (as IV - (-1))
 835  	VADDUQM V8, V8, V9	// V9 is -2
 836  
      // Counters stride by 2 from V0/V1; the -2 constant in V9 is only
      // needed here and V9 is reused below for source data.
 837  	VSUBUQM V0, V9, V2
 838  	VSUBUQM V1, V9, V3
 839  	VSUBUQM V2, V9, V4
 840  	VSUBUQM V3, V9, V5
 841  	VSUBUQM V4, V9, V6
 842  	VSUBUQM V5, V9, V7
 843  
 844  	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VXOR)
 845  
 846  	BEQ	CR1, key_12
 847  	BLT	CR1, key_10
 848  key_14:
 849  	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
 850  	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
 851  key_12:
 852  	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
 853  	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
 854  key_10:
      // Source block loads (V9-V16) are interleaved with the cipher rounds.
 855  	P8_LXVB16X(R6,R0,V9)
 856  	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
 857  	MOVD	$16, R8
 858  	P8_LXVB16X(R6,R8,V10)
 859  	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
 860  	MOVD	$32, R9
 861  	P8_LXVB16X(R6,R9,V11)
 862  	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
 863  	MOVD	$48, R10
 864  	P8_LXVB16X(R6,R10,V12)
 865  	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
 866  	MOVD	$64, R11
 867  	P8_LXVB16X(R6,R11,V13)
 868  	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
 869  	MOVD	$80, R12
 870  	P8_LXVB16X(R6,R12,V14)
 871  	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
 872  	MOVD	$96, R14
 873  	P8_LXVB16X(R6,R14,V15)
 874  	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
 875  	MOVD	$112, R15
 876  	P8_LXVB16X(R6,R15,V16)
 877  	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
 878  	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHER)
 879  	DO8_CIPHER(V0,V1,V2,V3,V4,V5,V6,V7,V8,R4,VCIPHERLAST)
 880  
 881  	XOR_STORE(V9,V0,R5,R0)
 882  	XOR_STORE(V10,V1,R5,R8)
 883  	XOR_STORE(V11,V2,R5,R9)
 884  	XOR_STORE(V12,V3,R5,R10)
 885  	XOR_STORE(V13,V4,R5,R11)
 886  	XOR_STORE(V14,V5,R5,R12)
 887  	XOR_STORE(V15,V6,R5,R14)
 888  	XOR_STORE(V16,V7,R5,R15)
 889  
 890  	RET
 891  
 892