// chacha_arm64.s

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build gc && !purego

#include "textflag.h"

#define NUM_ROUNDS 10
// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32)
//
// XORs src into dst with the ChaCha20 keystream. Each pass of the main
// loop produces 256 bytes of keystream: four 64-byte ChaCha blocks are
// computed in parallel, with register Vn (n = 0..15) holding state word
// n of all four blocks, one block per 32-bit lane.
//
// NOTE(review): only the 256-byte-aligned prefix of the input is
// processed here (R12 marks its end); the sub-256-byte remainder
// computed into R13 is never used below, so the tail is presumably
// handled by the Go caller — confirm.
TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0
	// R1 = dst, R2 = src, R3 = len(src), R4 = key, R6 = nonce, R7 = &counter
	MOVD	dst+0(FP), R1
	MOVD	src+24(FP), R2
	MOVD	src_len+32(FP), R3
	MOVD	key+48(FP), R4
	MOVD	nonce+56(FP), R6
	MOVD	counter+64(FP), R7

	MOVD	$·constants(SB), R10    // "expand 32-byte k" words
	MOVD	$·incRotMatrix(SB), R11 // counter increments + rotl-8 shuffle table

	MOVW	(R7), R20 // R20 = current block counter value

	AND	$~255, R3, R13 // R13 = length rounded down to a multiple of 256
	ADD	R2, R13, R12 // R12 for block end
	AND	$255, R3, R13 // R13 = leftover bytes (unused here; see note above)
loop:
	MOVD	$NUM_ROUNDS, R21 // 10 double rounds = 20 ChaCha rounds
	// V30 = {0, 1, 2, 3} per-lane counter increments,
	// V31 = byte-permute table implementing rotl-8 via VTBL.
	// Both are clobbered by the finalization below, so reload each pass.
	VLD1	(R11), [V30.S4, V31.S4]

	// The replicating VLD4R/VLD3R/VLD1R loads below are hand-encoded
	// with WORD; the commented mnemonic above each WORD is the intent.

	// load constants
	// VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4]
	WORD	$0x4D60E940

	// load keys
	// VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4]
	WORD	$0x4DFFE884
	// VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4]
	WORD	$0x4DFFE888
	SUB	$32, R4 // undo the two 16-byte post-increments; R4 points at key again

	// load counter + nonce
	// VLD1R (R7), [V12.S4]
	WORD	$0x4D40C8EC

	// VLD3R (R6), [V13.S4, V14.S4, V15.S4]
	WORD	$0x4D40E8CD

	// update counter: lane i of V12 becomes counter+i, so the four
	// parallel blocks use consecutive counter values
	VADD	V30.S4, V12.S4, V12.S4

chacha:
	// Column round. The rotations are built from standard NEON idioms:
	//   rotl16 = VREV32 on 16-bit elements
	//   rotl12 = VSHL $12 + VSRI $20
	//   rotl8  = VTBL with the V31 byte table
	//   rotl7  = VSHL $7 + VSRI $25
	// V0..V3 += V4..V7
	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 16)
	VADD	V0.S4, V4.S4, V0.S4
	VADD	V1.S4, V5.S4, V1.S4
	VADD	V2.S4, V6.S4, V2.S4
	VADD	V3.S4, V7.S4, V3.S4
	VEOR	V12.B16, V0.B16, V12.B16
	VEOR	V13.B16, V1.B16, V13.B16
	VEOR	V14.B16, V2.B16, V14.B16
	VEOR	V15.B16, V3.B16, V15.B16
	VREV32	V12.H8, V12.H8
	VREV32	V13.H8, V13.H8
	VREV32	V14.H8, V14.H8
	VREV32	V15.H8, V15.H8
	// V8..V11 += V12..V15
	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 12)
	VADD	V8.S4, V12.S4, V8.S4
	VADD	V9.S4, V13.S4, V9.S4
	VADD	V10.S4, V14.S4, V10.S4
	VADD	V11.S4, V15.S4, V11.S4
	VEOR	V8.B16, V4.B16, V16.B16
	VEOR	V9.B16, V5.B16, V17.B16
	VEOR	V10.B16, V6.B16, V18.B16
	VEOR	V11.B16, V7.B16, V19.B16
	VSHL	$12, V16.S4, V4.S4
	VSHL	$12, V17.S4, V5.S4
	VSHL	$12, V18.S4, V6.S4
	VSHL	$12, V19.S4, V7.S4
	VSRI	$20, V16.S4, V4.S4
	VSRI	$20, V17.S4, V5.S4
	VSRI	$20, V18.S4, V6.S4
	VSRI	$20, V19.S4, V7.S4

	// V0..V3 += V4..V7
	// V12..V15 <<<= ((V12..V15 XOR V0..V3), 8)
	VADD	V0.S4, V4.S4, V0.S4
	VADD	V1.S4, V5.S4, V1.S4
	VADD	V2.S4, V6.S4, V2.S4
	VADD	V3.S4, V7.S4, V3.S4
	VEOR	V12.B16, V0.B16, V12.B16
	VEOR	V13.B16, V1.B16, V13.B16
	VEOR	V14.B16, V2.B16, V14.B16
	VEOR	V15.B16, V3.B16, V15.B16
	VTBL	V31.B16, [V12.B16], V12.B16
	VTBL	V31.B16, [V13.B16], V13.B16
	VTBL	V31.B16, [V14.B16], V14.B16
	VTBL	V31.B16, [V15.B16], V15.B16

	// V8..V11 += V12..V15
	// V4..V7 <<<= ((V4..V7 XOR V8..V11), 7)
	VADD	V12.S4, V8.S4, V8.S4
	VADD	V13.S4, V9.S4, V9.S4
	VADD	V14.S4, V10.S4, V10.S4
	VADD	V15.S4, V11.S4, V11.S4
	VEOR	V8.B16, V4.B16, V16.B16
	VEOR	V9.B16, V5.B16, V17.B16
	VEOR	V10.B16, V6.B16, V18.B16
	VEOR	V11.B16, V7.B16, V19.B16
	VSHL	$7, V16.S4, V4.S4
	VSHL	$7, V17.S4, V5.S4
	VSHL	$7, V18.S4, V6.S4
	VSHL	$7, V19.S4, V7.S4
	VSRI	$25, V16.S4, V4.S4
	VSRI	$25, V17.S4, V5.S4
	VSRI	$25, V18.S4, V6.S4
	VSRI	$25, V19.S4, V7.S4

	// Diagonal round: the same four operations, with the V4..V7 and
	// V12..V15 operands rotated across the quarter-rounds.
	// V0..V3 += V5..V7, V4
	// V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16)
	VADD	V0.S4, V5.S4, V0.S4
	VADD	V1.S4, V6.S4, V1.S4
	VADD	V2.S4, V7.S4, V2.S4
	VADD	V3.S4, V4.S4, V3.S4
	VEOR	V15.B16, V0.B16, V15.B16
	VEOR	V12.B16, V1.B16, V12.B16
	VEOR	V13.B16, V2.B16, V13.B16
	VEOR	V14.B16, V3.B16, V14.B16
	VREV32	V12.H8, V12.H8
	VREV32	V13.H8, V13.H8
	VREV32	V14.H8, V14.H8
	VREV32	V15.H8, V15.H8

	// V10 += V15; V5 <<<= ((V10 XOR V5), 12)
	// ...
	VADD	V15.S4, V10.S4, V10.S4
	VADD	V12.S4, V11.S4, V11.S4
	VADD	V13.S4, V8.S4, V8.S4
	VADD	V14.S4, V9.S4, V9.S4
	VEOR	V10.B16, V5.B16, V16.B16
	VEOR	V11.B16, V6.B16, V17.B16
	VEOR	V8.B16, V7.B16, V18.B16
	VEOR	V9.B16, V4.B16, V19.B16
	VSHL	$12, V16.S4, V5.S4
	VSHL	$12, V17.S4, V6.S4
	VSHL	$12, V18.S4, V7.S4
	VSHL	$12, V19.S4, V4.S4
	VSRI	$20, V16.S4, V5.S4
	VSRI	$20, V17.S4, V6.S4
	VSRI	$20, V18.S4, V7.S4
	VSRI	$20, V19.S4, V4.S4

	// V0 += V5; V15 <<<= ((V0 XOR V15), 8)
	// ...
	VADD	V5.S4, V0.S4, V0.S4
	VADD	V6.S4, V1.S4, V1.S4
	VADD	V7.S4, V2.S4, V2.S4
	VADD	V4.S4, V3.S4, V3.S4
	VEOR	V0.B16, V15.B16, V15.B16
	VEOR	V1.B16, V12.B16, V12.B16
	VEOR	V2.B16, V13.B16, V13.B16
	VEOR	V3.B16, V14.B16, V14.B16
	VTBL	V31.B16, [V12.B16], V12.B16
	VTBL	V31.B16, [V13.B16], V13.B16
	VTBL	V31.B16, [V14.B16], V14.B16
	VTBL	V31.B16, [V15.B16], V15.B16

	// V10 += V15; V5 <<<= ((V10 XOR V5), 7)
	// ...
	VADD	V15.S4, V10.S4, V10.S4
	VADD	V12.S4, V11.S4, V11.S4
	VADD	V13.S4, V8.S4, V8.S4
	VADD	V14.S4, V9.S4, V9.S4
	VEOR	V10.B16, V5.B16, V16.B16
	VEOR	V11.B16, V6.B16, V17.B16
	VEOR	V8.B16, V7.B16, V18.B16
	VEOR	V9.B16, V4.B16, V19.B16
	VSHL	$7, V16.S4, V5.S4
	VSHL	$7, V17.S4, V6.S4
	VSHL	$7, V18.S4, V7.S4
	VSHL	$7, V19.S4, V4.S4
	VSRI	$25, V16.S4, V5.S4
	VSRI	$25, V17.S4, V6.S4
	VSRI	$25, V18.S4, V7.S4
	VSRI	$25, V19.S4, V4.S4

	SUB	$1, R21
	CBNZ	R21, chacha

	// Finalization: add the initial state words back into the working
	// state. The initial words are re-loaded (replicated) into V16..V29
	// and into V30/V31 — which is why V30/V31 are reloaded at loop top.

	// VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4]
	WORD	$0x4D60E950

	// VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4]
	WORD	$0x4DFFE894
	VADD	V30.S4, V12.S4, V12.S4 // re-apply the {0,1,2,3} lane increments
	VADD	V16.S4, V0.S4, V0.S4
	VADD	V17.S4, V1.S4, V1.S4
	VADD	V18.S4, V2.S4, V2.S4
	VADD	V19.S4, V3.S4, V3.S4
	// VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4]
	WORD	$0x4DFFE898
	// restore R4
	SUB	$32, R4

	// load counter + nonce
	// VLD1R (R7), [V28.S4]
	WORD	$0x4D40C8FC
	// VLD3R (R6), [V29.S4, V30.S4, V31.S4]
	WORD	$0x4D40E8DD

	VADD	V20.S4, V4.S4, V4.S4
	VADD	V21.S4, V5.S4, V5.S4
	VADD	V22.S4, V6.S4, V6.S4
	VADD	V23.S4, V7.S4, V7.S4
	VADD	V24.S4, V8.S4, V8.S4
	VADD	V25.S4, V9.S4, V9.S4
	VADD	V26.S4, V10.S4, V10.S4
	VADD	V27.S4, V11.S4, V11.S4
	VADD	V28.S4, V12.S4, V12.S4
	VADD	V29.S4, V13.S4, V13.S4
	VADD	V30.S4, V14.S4, V14.S4
	VADD	V31.S4, V15.S4, V15.S4

	// Transpose: the VZIP1/VZIP2 pairs (32-bit, then 64-bit) rearrange
	// the word-sliced state so V0..V15 each hold 16 consecutive bytes of
	// keystream, interleaved with loading the next 256 bytes of src.
	VZIP1	V1.S4, V0.S4, V16.S4
	VZIP2	V1.S4, V0.S4, V17.S4
	VZIP1	V3.S4, V2.S4, V18.S4
	VZIP2	V3.S4, V2.S4, V19.S4
	VZIP1	V5.S4, V4.S4, V20.S4
	VZIP2	V5.S4, V4.S4, V21.S4
	VZIP1	V7.S4, V6.S4, V22.S4
	VZIP2	V7.S4, V6.S4, V23.S4
	VZIP1	V9.S4, V8.S4, V24.S4
	VZIP2	V9.S4, V8.S4, V25.S4
	VZIP1	V11.S4, V10.S4, V26.S4
	VZIP2	V11.S4, V10.S4, V27.S4
	VZIP1	V13.S4, V12.S4, V28.S4
	VZIP2	V13.S4, V12.S4, V29.S4
	VZIP1	V15.S4, V14.S4, V30.S4
	VZIP2	V15.S4, V14.S4, V31.S4
	VZIP1	V18.D2, V16.D2, V0.D2
	VZIP2	V18.D2, V16.D2, V4.D2
	VZIP1	V19.D2, V17.D2, V8.D2
	VZIP2	V19.D2, V17.D2, V12.D2
	VLD1.P	64(R2), [V16.B16, V17.B16, V18.B16, V19.B16]

	VZIP1	V22.D2, V20.D2, V1.D2
	VZIP2	V22.D2, V20.D2, V5.D2
	VZIP1	V23.D2, V21.D2, V9.D2
	VZIP2	V23.D2, V21.D2, V13.D2
	VLD1.P	64(R2), [V20.B16, V21.B16, V22.B16, V23.B16]
	VZIP1	V26.D2, V24.D2, V2.D2
	VZIP2	V26.D2, V24.D2, V6.D2
	VZIP1	V27.D2, V25.D2, V10.D2
	VZIP2	V27.D2, V25.D2, V14.D2
	VLD1.P	64(R2), [V24.B16, V25.B16, V26.B16, V27.B16]
	VZIP1	V30.D2, V28.D2, V3.D2
	VZIP2	V30.D2, V28.D2, V7.D2
	VZIP1	V31.D2, V29.D2, V11.D2
	VZIP2	V31.D2, V29.D2, V15.D2
	VLD1.P	64(R2), [V28.B16, V29.B16, V30.B16, V31.B16]
	// XOR keystream into the loaded src bytes and store, 64 B at a time.
	VEOR	V0.B16, V16.B16, V16.B16
	VEOR	V1.B16, V17.B16, V17.B16
	VEOR	V2.B16, V18.B16, V18.B16
	VEOR	V3.B16, V19.B16, V19.B16
	VST1.P	[V16.B16, V17.B16, V18.B16, V19.B16], 64(R1)
	VEOR	V4.B16, V20.B16, V20.B16
	VEOR	V5.B16, V21.B16, V21.B16
	VEOR	V6.B16, V22.B16, V22.B16
	VEOR	V7.B16, V23.B16, V23.B16
	VST1.P	[V20.B16, V21.B16, V22.B16, V23.B16], 64(R1)
	VEOR	V8.B16, V24.B16, V24.B16
	VEOR	V9.B16, V25.B16, V25.B16
	VEOR	V10.B16, V26.B16, V26.B16
	VEOR	V11.B16, V27.B16, V27.B16
	VST1.P	[V24.B16, V25.B16, V26.B16, V27.B16], 64(R1)
	VEOR	V12.B16, V28.B16, V28.B16
	VEOR	V13.B16, V29.B16, V29.B16
	VEOR	V14.B16, V30.B16, V30.B16
	VEOR	V15.B16, V31.B16, V31.B16
	VST1.P	[V28.B16, V29.B16, V30.B16, V31.B16], 64(R1)

	// Four 64-byte blocks consumed; advance and persist the counter so
	// the VLD1R at loop top picks up the new value.
	ADD	$4, R20
	MOVW	R20, (R7) // update counter

	// Loop while src (R2) has not reached the end of the full-block
	// region (R12).
	CMP	R2, R12
	BGT	loop

	RET
// ·constants holds the ChaCha sigma ("expand 32-byte k") as four 32-bit
// little-endian words. The symbol is declared 32 bytes, but only the
// first 16 are initialized; the code above reads just those 16 via a
// replicating VLD4R (the trailing 16 zero bytes are presumably padding —
// confirm).
DATA	·constants+0x00(SB)/4, $0x61707865
DATA	·constants+0x04(SB)/4, $0x3320646e
DATA	·constants+0x08(SB)/4, $0x79622d32
DATA	·constants+0x0c(SB)/4, $0x6b206574
GLOBL	·constants(SB), NOPTR|RODATA, $32
// ·incRotMatrix holds two 16-byte vectors, loaded together into V30/V31:
//   +0x00: {0, 1, 2, 3} — per-lane counter increments, giving the four
//          parallel blocks counter values counter+0 .. counter+3.
//   +0x10: a VTBL byte-index table that rotates each 32-bit word left by
//          8 bits (permutes each word's little-endian bytes 3,0,1,2).
DATA	·incRotMatrix+0x00(SB)/4, $0x00000000
DATA	·incRotMatrix+0x04(SB)/4, $0x00000001
DATA	·incRotMatrix+0x08(SB)/4, $0x00000002
DATA	·incRotMatrix+0x0c(SB)/4, $0x00000003
DATA	·incRotMatrix+0x10(SB)/4, $0x02010003
DATA	·incRotMatrix+0x14(SB)/4, $0x06050407
DATA	·incRotMatrix+0x18(SB)/4, $0x0A09080B
DATA	·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F
GLOBL	·incRotMatrix(SB), NOPTR|RODATA, $32