chacha_s390x.s raw

   1  // Copyright 2018 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  //go:build gc && !purego
   6  
   7  #include "go_asm.h"
   8  #include "textflag.h"
   9  
  10  // This is an implementation of the ChaCha20 encryption algorithm as
  11  // specified in RFC 7539 (since obsoleted by RFC 8439). It uses vector
  12  // instructions to compute 4 keystream blocks in parallel (256 bytes),
  13  // which are then XORed with the bytes in the input slice.
  14  
  15  GLOBL ·constants<>(SB), RODATA|NOPTR, $32 // 32 bytes of read-only data: BSWAP (16 bytes) followed by J0 (16 bytes)
  16  // BSWAP: VPERM selector (byte indices 3,2,1,0, 7,6,5,4, ...) that swaps bytes in each 4-byte element
  17  DATA ·constants<>+0x00(SB)/4, $0x03020100
  18  DATA ·constants<>+0x04(SB)/4, $0x07060504
  19  DATA ·constants<>+0x08(SB)/4, $0x0b0a0908
  20  DATA ·constants<>+0x0c(SB)/4, $0x0f0e0d0c
  21  // J0: [j0, j1, j2, j3], the ChaCha constant words ("expand 32-byte k" read as little-endian uint32s)
  22  DATA ·constants<>+0x10(SB)/4, $0x61707865
  23  DATA ·constants<>+0x14(SB)/4, $0x3320646e
  24  DATA ·constants<>+0x18(SB)/4, $0x79622d32
  25  DATA ·constants<>+0x1c(SB)/4, $0x6b206574
  26  
  27  #define BSWAP V5 // VPERM mask loaded from constants+0x00: byte-swaps each 32-bit element
  28  #define J0    V6 // ChaCha constant words loaded from constants+0x10 (state words 0-3)
  29  #define KEY0  V7 // first 16 bytes at key (state words 4-7)
  30  #define KEY1  V8 // second 16 bytes at key (state words 8-11)
  31  #define NONCE V9 // nonce words, read from elements 1-3 (state words 13-15)
  32  #define CTR   V10 // block counters, one per lane (state word 12 of each of the 4 blocks)
  33  #define M0    V11 // M0-M3: 64 bytes of src inside XORV; also scratch for SHUFFLE and setup
  34  #define M1    V12
  35  #define M2    V13
  36  #define M3    V14
  37  #define INC   V15 // counter increment: {0, 1, 2, 3} during setup, then {4, 4, 4, 4}
  38  #define X0    V16 // X0-X15: working state; during the rounds Xi holds state word i of 4 blocks
  39  #define X1    V17
  40  #define X2    V18
  41  #define X3    V19
  42  #define X4    V20
  43  #define X5    V21
  44  #define X6    V22
  45  #define X7    V23
  46  #define X8    V24
  47  #define X9    V25
  48  #define X10   V26
  49  #define X11   V27
  50  #define X12   V28
  51  #define X13   V29
  52  #define X14   V30
  53  #define X15   V31
  54  
  55  #define NUM_ROUNDS 20 // total rounds; the round loop runs NUM_ROUNDS/2 times, two ROUND4s per pass
  56  
  57  #define ROUND4(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3) \ // four interleaved ChaCha quarter-rounds; per group (p0, p1, p2, p3) = RFC (a, b, d, c)
  58  	VAF    a1, a0, a0  \ // a0 += a1 (each step is repeated for the b, c and d groups)
  59  	VAF    b1, b0, b0  \
  60  	VAF    c1, c0, c0  \
  61  	VAF    d1, d0, d0  \
  62  	VX     a0, a2, a2  \ // a2 ^= a0
  63  	VX     b0, b2, b2  \
  64  	VX     c0, c2, c2  \
  65  	VX     d0, d2, d2  \
  66  	VERLLF $16, a2, a2 \ // a2 = rotate-left(a2, 16) in each 32-bit element
  67  	VERLLF $16, b2, b2 \
  68  	VERLLF $16, c2, c2 \
  69  	VERLLF $16, d2, d2 \
  70  	VAF    a2, a3, a3  \ // a3 += a2
  71  	VAF    b2, b3, b3  \
  72  	VAF    c2, c3, c3  \
  73  	VAF    d2, d3, d3  \
  74  	VX     a3, a1, a1  \ // a1 ^= a3
  75  	VX     b3, b1, b1  \
  76  	VX     c3, c1, c1  \
  77  	VX     d3, d1, d1  \
  78  	VERLLF $12, a1, a1 \ // a1 = rotate-left(a1, 12)
  79  	VERLLF $12, b1, b1 \
  80  	VERLLF $12, c1, c1 \
  81  	VERLLF $12, d1, d1 \
  82  	VAF    a1, a0, a0  \ // a0 += a1
  83  	VAF    b1, b0, b0  \
  84  	VAF    c1, c0, c0  \
  85  	VAF    d1, d0, d0  \
  86  	VX     a0, a2, a2  \ // a2 ^= a0
  87  	VX     b0, b2, b2  \
  88  	VX     c0, c2, c2  \
  89  	VX     d0, d2, d2  \
  90  	VERLLF $8, a2, a2  \ // a2 = rotate-left(a2, 8)
  91  	VERLLF $8, b2, b2  \
  92  	VERLLF $8, c2, c2  \
  93  	VERLLF $8, d2, d2  \
  94  	VAF    a2, a3, a3  \ // a3 += a2
  95  	VAF    b2, b3, b3  \
  96  	VAF    c2, c3, c3  \
  97  	VAF    d2, d3, d3  \
  98  	VX     a3, a1, a1  \ // a1 ^= a3
  99  	VX     b3, b1, b1  \
 100  	VX     c3, c1, c1  \
 101  	VX     d3, d1, d1  \
 102  	VERLLF $7, a1, a1  \ // a1 = rotate-left(a1, 7)
 103  	VERLLF $7, b1, b1  \
 104  	VERLLF $7, c1, c1  \
 105  	VERLLF $7, d1, d1
 106  
 107  #define PERMUTE(sel, q0, q1, q2, q3) \ // rearrange the bytes of q0..q3 in place using VPERM selector sel
 108  	VPERM q0, q0, sel, q0 \ // both sources are the same register, so sel picks bytes from q0 itself
 109  	VPERM q1, q1, sel, q1 \
 110  	VPERM q2, q2, sel, q2 \
 111  	VPERM q3, q3, sel, q3
 112  
 113  #define ADDV(addend, s0, s1, s2, s3) \ // add addend element-wise (32-bit elements) to each of s0..s3 in place
 114  	VAF addend, s0, s0 \ // s0 += addend
 115  	VAF addend, s1, s1 \
 116  	VAF addend, s2, s2 \
 117  	VAF addend, s3, s3
 118  
 119  #define XORV(disp, out, in, k0, k1, k2, k3) \ // 64 bytes at disp(out) = 64 bytes at disp(in) XOR keystream k0..k3; clobbers M0-M3 and k0..k3
 120  	VLM  disp(in), M0, M3          \ // M0..M3 = 64 input bytes
 121  	PERMUTE(BSWAP, k0, k1, k2, k3) \ // byte-swap every 32-bit keystream word in place
 122  	VX   k0, M0, M0                \ // M0 ^= k0
 123  	VX   k1, M1, M1                \
 124  	VX   k2, M2, M2                \
 125  	VX   k3, M3, M3                \
 126  	VSTM M0, M3, disp(out) // store the 64 result bytes
 127  
 128  #define SHUFFLE(a, b, c, d, t, u, v, w) \ // transpose the 4x4 matrix of 32-bit elements held in a..d, in place; t..w are overwritten as scratch
 129  	VMRHF a, c, t \ // t = {a[0], c[0], a[1], c[1]}
 130  	VMRHF b, d, u \ // u = {b[0], d[0], b[1], d[1]}
 131  	VMRLF a, c, v \ // v = {a[2], c[2], a[3], c[3]}
 132  	VMRLF b, d, w \ // w = {b[2], d[2], b[3], d[3]}
 133  	VMRHF t, u, a \ // a = {a[0], b[0], c[0], d[0]}
 134  	VMRLF t, u, b \ // b = {a[1], b[1], c[1], d[1]}
 135  	VMRHF v, w, c \ // c = {a[2], b[2], c[2], d[2]}
 136  	VMRLF v, w, d // d = {a[3], b[3], c[3], d[3]}
 137  
 138  // func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
      //
      // xorKeyStreamVX XORs src with ChaCha20 keystream and stores the result in
      // dst, handling 256 bytes (four 64-byte blocks, one per vector lane) per
      // pass of the outer "chacha" loop.
      //
      // Requirements visible in the code below:
      //   - len(src) must be a non-zero multiple of 256: the length is decremented
      //     by 256 per pass and the loop only exits when it is exactly zero.
      //   - Only &dst[0] is loaded; len(dst) is never checked, so dst must have
      //     room for len(src) bytes.
      //
      // *counter supplies the block counter of the first block. It is advanced by
      // 4 per pass and the updated value is stored back to *counter on return.
      //
      // Writes R0-R7 and V5-V31; uses no stack frame ($0).
 139  TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
 140  	MOVD $·constants<>(SB), R1 // R1=&constants (R1 is reused as the round counter below)
 141  	MOVD dst+0(FP), R2         // R2=&dst[0]
 142  	LMG  src+24(FP), R3, R4    // R3=&src[0] R4=len(src)
 143  	MOVD key+48(FP), R5        // R5=key
 144  	MOVD nonce+56(FP), R6      // R6=nonce
 145  	MOVD counter+64(FP), R7    // R7=counter
 146  
 147  	// load BSWAP and J0 (the two 16-byte halves of constants)
 148  	VLM (R1), BSWAP, J0
 149  
 150  	// setup: load the key and position the nonce words in elements 1-3 of NONCE
      	// NOTE(review): R0 is the length operand of VLL. 95 is larger than 15, so
      	// this presumably loads a full 16 bytes from nonce, i.e. 4 bytes beyond the
      	// 12-byte array (which the shift below would then discard) — confirm.
 151  	MOVD  $95, R0
 152  	VLM   (R5), KEY0, KEY1 // KEY0=key[0:4] KEY1=key[4:8]
 153  	VLL   R0, (R6), NONCE
 154  	VZERO M0
 155  	VLEIB $7, $32, M0 // byte 7 of M0 = 32: the shift-amount operand for VSRLB
 156  	VSRLB M0, NONCE, NONCE // shift NONCE right by whole bytes (presumably 4, giving {0, n0, n1, n2} — confirm)
 157  
 158  	// initialize counter values: CTR = {c, c+1, c+2, c+3} with c = *counter,
      	// then INC = {4, 4, 4, 4} as the per-pass increment
 159  	VLREPF (R7), CTR // replicate *counter into all four elements
 160  	VZERO  INC
 161  	VLEIF  $1, $1, INC
 162  	VLEIF  $2, $2, INC
 163  	VLEIF  $3, $3, INC // INC = {0, 1, 2, 3}
 164  	VAF    INC, CTR, CTR
 165  	VREPIF $4, INC
 166  
 167  chacha:
      	// Build the initial state of four blocks: Xi = state word i replicated
      	// into every lane, except X12, which takes the four distinct counters.
 168  	VREPF $0, J0, X0
 169  	VREPF $1, J0, X1
 170  	VREPF $2, J0, X2
 171  	VREPF $3, J0, X3
 172  	VREPF $0, KEY0, X4
 173  	VREPF $1, KEY0, X5
 174  	VREPF $2, KEY0, X6
 175  	VREPF $3, KEY0, X7
 176  	VREPF $0, KEY1, X8
 177  	VREPF $1, KEY1, X9
 178  	VREPF $2, KEY1, X10
 179  	VREPF $3, KEY1, X11
 180  	VLR   CTR, X12
 181  	VREPF $1, NONCE, X13
 182  	VREPF $2, NONCE, X14
 183  	VREPF $3, NONCE, X15
 184  
 185  	MOVD $(NUM_ROUNDS/2), R1 // R1 = double rounds remaining
 186  
 187  loop:
      	// One double round: a column round (words 0,4,8,12 / 1,5,9,13 / ...)
      	// followed by a diagonal round (words 0,5,10,15 / 1,6,11,12 / ...).
 188  	ROUND4(X0, X4, X12,  X8, X1, X5, X13,  X9, X2, X6, X14, X10, X3, X7, X15, X11)
 189  	ROUND4(X0, X5, X15, X10, X1, X6, X12, X11, X2, X7, X13, X8,  X3, X4, X14, X9)
 190  
 191  	ADD $-1, R1
 192  	BNE loop // repeat until R1 reaches zero
 193  
 194  	// decrement length (R4 is tested at the bottom of the outer loop)
 195  	ADD $-256, R4
 196  
 197  	// rearrange vectors: transpose each group of four registers so that every
      	// register holds four consecutive words of one block, then add the
      	// initial state words back in
 198  	SHUFFLE(X0, X1, X2, X3, M0, M1, M2, M3)
 199  	ADDV(J0, X0, X1, X2, X3)
 200  	SHUFFLE(X4, X5, X6, X7, M0, M1, M2, M3)
 201  	ADDV(KEY0, X4, X5, X6, X7)
 202  	SHUFFLE(X8, X9, X10, X11, M0, M1, M2, M3)
 203  	ADDV(KEY1, X8, X9, X10, X11)
 204  	VAF CTR, X12, X12 // add the per-block counters while X12 still has one lane per block
 205  	SHUFFLE(X12, X13, X14, X15, M0, M1, M2, M3)
 206  	ADDV(NONCE, X12, X13, X14, X15) // adds the nonce to words 13-15; relies on NONCE element 0 being zero
 207  
 208  	// increment counters for the next pass
 209  	VAF INC, CTR, CTR
 210  
 211  	// xor keystream with plaintext: after the transposes, block n lives in
      	// X(n), X(4+n), X(8+n), X(12+n)
 212  	XORV(0*64, R2, R3, X0, X4,  X8, X12)
 213  	XORV(1*64, R2, R3, X1, X5,  X9, X13)
 214  	XORV(2*64, R2, R3, X2, X6, X10, X14)
 215  	XORV(3*64, R2, R3, X3, X7, X11, X15)
 216  
 217  	// increment pointers
 218  	MOVD $256(R2), R2 // dst += 256
 219  	MOVD $256(R3), R3 // src += 256
 220  
 221  	CMPBNE  R4, $0, chacha // process the next 256 bytes unless the remaining length is exactly zero
 222  
 223  	VSTEF $0, CTR, (R7) // *counter = element 0 of CTR (the next unused block counter)
 224  	RET
 225