// chacha_ppc64x.s

// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// Code for the perl script that generates the ppc64 assembler
// can be found in the cryptogams repository at the link below. It is based on
// the original from openssl.

// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91

// The differences in this and the original implementation are
// due to the calling conventions and initialization of constants.

//go:build gc && !purego && (ppc64 || ppc64le)

#include "textflag.h"
  25  
// Register aliases for the function arguments and scratch registers.
#define OUT  R3   // pointer to the output buffer
#define INP  R4   // pointer to the input buffer
#define LEN  R5   // remaining length in bytes
#define KEY  R6   // pointer to the 8-word (256-bit) key
#define CNT  R7   // pointer to the 32-bit counter, followed by the nonce
#define TMP  R15  // scratch register for the byte-copy tail loop

#define CONSTBASE  R16  // base address into the consts<> table
#define BLOCKS R17      // number of whole 64-byte blocks (LEN >> 6)

// for VPERMXOR
#define MASK  R18  // base address of the VPERMXOR rotate masks (consts<>+0xa0)
  38  
// Constant table. The first part is addressed through CONSTBASE, the
// VPERMXOR masks at offset 0xa0 through MASK.

// ChaCha sigma constant "expand 32-byte k", one 32-bit word per slot.
DATA consts<>+0x00(SB)/4, $0x61707865
DATA consts<>+0x04(SB)/4, $0x3320646e
DATA consts<>+0x08(SB)/4, $0x79622d32
DATA consts<>+0x0c(SB)/4, $0x6b206574
// {1, 0, 0, 0}: block-counter increment of one.
DATA consts<>+0x10(SB)/4, $0x00000001
DATA consts<>+0x14(SB)/4, $0x00000000
DATA consts<>+0x18(SB)/4, $0x00000000
DATA consts<>+0x1c(SB)/4, $0x00000000
// {4, 0, 0, 0}: block-counter increment of four (four blocks per iteration).
DATA consts<>+0x20(SB)/4, $0x00000004
DATA consts<>+0x24(SB)/4, $0x00000000
DATA consts<>+0x28(SB)/4, $0x00000000
DATA consts<>+0x2c(SB)/4, $0x00000000
// Byte-permutation patterns carried over from the CRYPTOGAMS original
// (rotate-by-16 and rotate-by-8 byte-shuffle tables).
// NOTE(review): the VSX path below appears to use the VPERMXOR masks at
// 0xa0 instead of these — confirm before removing.
DATA consts<>+0x30(SB)/4, $0x0e0f0c0d
DATA consts<>+0x34(SB)/4, $0x0a0b0809
DATA consts<>+0x38(SB)/4, $0x06070405
DATA consts<>+0x3c(SB)/4, $0x02030001
DATA consts<>+0x40(SB)/4, $0x0d0e0f0c
DATA consts<>+0x44(SB)/4, $0x090a0b08
DATA consts<>+0x48(SB)/4, $0x05060704
DATA consts<>+0x4c(SB)/4, $0x01020300
// Sigma words splatted across all four lanes; loaded into V0-V3 at the
// top of loop_outer_vsx for the four-blocks-at-a-time path.
DATA consts<>+0x50(SB)/4, $0x61707865
DATA consts<>+0x54(SB)/4, $0x61707865
DATA consts<>+0x58(SB)/4, $0x61707865
DATA consts<>+0x5c(SB)/4, $0x61707865
DATA consts<>+0x60(SB)/4, $0x3320646e
DATA consts<>+0x64(SB)/4, $0x3320646e
DATA consts<>+0x68(SB)/4, $0x3320646e
DATA consts<>+0x6c(SB)/4, $0x3320646e
DATA consts<>+0x70(SB)/4, $0x79622d32
DATA consts<>+0x74(SB)/4, $0x79622d32
DATA consts<>+0x78(SB)/4, $0x79622d32
DATA consts<>+0x7c(SB)/4, $0x79622d32
DATA consts<>+0x80(SB)/4, $0x6b206574
DATA consts<>+0x84(SB)/4, $0x6b206574
DATA consts<>+0x88(SB)/4, $0x6b206574
DATA consts<>+0x8c(SB)/4, $0x6b206574
// {0, 1, 2, 3}: per-lane block-counter offsets for the four parallel blocks.
DATA consts<>+0x90(SB)/4, $0x00000000
DATA consts<>+0x94(SB)/4, $0x00000001
DATA consts<>+0x98(SB)/4, $0x00000002
DATA consts<>+0x9c(SB)/4, $0x00000003
// VPERMXOR masks (loaded into V20/V21): combined xor-plus-byte-rotate
// permutations. Per their use in the round loop (V21 before the
// rotate-by-12 step, V20 before the rotate-by-7 step), the mask at 0xb0
// implements the rotate-by-16 and the one at 0xa0 the rotate-by-8.
DATA consts<>+0xa0(SB)/4, $0x11223300
DATA consts<>+0xa4(SB)/4, $0x55667744
DATA consts<>+0xa8(SB)/4, $0x99aabb88
DATA consts<>+0xac(SB)/4, $0xddeeffcc
DATA consts<>+0xb0(SB)/4, $0x22330011
DATA consts<>+0xb4(SB)/4, $0x66774455
DATA consts<>+0xb8(SB)/4, $0xaabb8899
DATA consts<>+0xbc(SB)/4, $0xeeffccdd
GLOBL consts<>(SB), RODATA, $0xc0
  88  
#ifdef GOARCH_ppc64
// Big-endian build: set up V24 with a permutation that byte-reverses each
// 32-bit word. LVSL with a zero address yields the identity byte sequence
// 0..15; XORing each byte with splat(3) swaps the byte order within every
// 4-byte word.
#define BE_XXBRW_INIT() \
		LVSL (R0)(R0), V24 \
		VSPLTISB $3, V25   \
		VXOR V24, V25, V24 \

// Byte-reverse each 32-bit word of vr in place using the V24 permutation.
#define BE_XXBRW(vr) VPERM vr, vr, V24, vr
#else
// Little-endian build: no byte reversal needed; the macros expand to nothing.
#define BE_XXBRW_INIT()
#define BE_XXBRW(vr)
#endif
 100  
 101  //func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
 102  TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
 103  	MOVD out+0(FP), OUT
 104  	MOVD inp+8(FP), INP
 105  	MOVD len+16(FP), LEN
 106  	MOVD key+24(FP), KEY
 107  	MOVD counter+32(FP), CNT
 108  
 109  	// Addressing for constants
 110  	MOVD $consts<>+0x00(SB), CONSTBASE
 111  	MOVD $16, R8
 112  	MOVD $32, R9
 113  	MOVD $48, R10
 114  	MOVD $64, R11
 115  	SRD $6, LEN, BLOCKS
 116  	// for VPERMXOR
 117  	MOVD $consts<>+0xa0(SB), MASK
 118  	MOVD $16, R20
 119  	// V16
 120  	LXVW4X (CONSTBASE)(R0), VS48
 121  	ADD $80,CONSTBASE
 122  
 123  	// Load key into V17,V18
 124  	LXVW4X (KEY)(R0), VS49
 125  	LXVW4X (KEY)(R8), VS50
 126  
 127  	// Load CNT, NONCE into V19
 128  	LXVW4X (CNT)(R0), VS51
 129  
 130  	// Clear V27
 131  	VXOR V27, V27, V27
 132  
 133  	BE_XXBRW_INIT()
 134  
 135  	// V28
 136  	LXVW4X (CONSTBASE)(R11), VS60
 137  
 138  	// Load mask constants for VPERMXOR
 139  	LXVW4X (MASK)(R0), V20
 140  	LXVW4X (MASK)(R20), V21
 141  
 142  	// splat slot from V19 -> V26
 143  	VSPLTW $0, V19, V26
 144  
 145  	VSLDOI $4, V19, V27, V19
 146  	VSLDOI $12, V27, V19, V19
 147  
 148  	VADDUWM V26, V28, V26
 149  
 150  	MOVD $10, R14
 151  	MOVD R14, CTR
 152  	PCALIGN $16
 153  loop_outer_vsx:
 154  	// V0, V1, V2, V3
 155  	LXVW4X (R0)(CONSTBASE), VS32
 156  	LXVW4X (R8)(CONSTBASE), VS33
 157  	LXVW4X (R9)(CONSTBASE), VS34
 158  	LXVW4X (R10)(CONSTBASE), VS35
 159  
 160  	// splat values from V17, V18 into V4-V11
 161  	VSPLTW $0, V17, V4
 162  	VSPLTW $1, V17, V5
 163  	VSPLTW $2, V17, V6
 164  	VSPLTW $3, V17, V7
 165  	VSPLTW $0, V18, V8
 166  	VSPLTW $1, V18, V9
 167  	VSPLTW $2, V18, V10
 168  	VSPLTW $3, V18, V11
 169  
 170  	// VOR
 171  	VOR V26, V26, V12
 172  
 173  	// splat values from V19 -> V13, V14, V15
 174  	VSPLTW $1, V19, V13
 175  	VSPLTW $2, V19, V14
 176  	VSPLTW $3, V19, V15
 177  
 178  	// splat   const values
 179  	VSPLTISW $-16, V27
 180  	VSPLTISW $12, V28
 181  	VSPLTISW $8, V29
 182  	VSPLTISW $7, V30
 183  	PCALIGN $16
 184  loop_vsx:
 185  	VADDUWM V0, V4, V0
 186  	VADDUWM V1, V5, V1
 187  	VADDUWM V2, V6, V2
 188  	VADDUWM V3, V7, V3
 189  
 190  	VPERMXOR V12, V0, V21, V12
 191  	VPERMXOR V13, V1, V21, V13
 192  	VPERMXOR V14, V2, V21, V14
 193  	VPERMXOR V15, V3, V21, V15
 194  
 195  	VADDUWM V8, V12, V8
 196  	VADDUWM V9, V13, V9
 197  	VADDUWM V10, V14, V10
 198  	VADDUWM V11, V15, V11
 199  
 200  	VXOR V4, V8, V4
 201  	VXOR V5, V9, V5
 202  	VXOR V6, V10, V6
 203  	VXOR V7, V11, V7
 204  
 205  	VRLW V4, V28, V4
 206  	VRLW V5, V28, V5
 207  	VRLW V6, V28, V6
 208  	VRLW V7, V28, V7
 209  
 210  	VADDUWM V0, V4, V0
 211  	VADDUWM V1, V5, V1
 212  	VADDUWM V2, V6, V2
 213  	VADDUWM V3, V7, V3
 214  
 215  	VPERMXOR V12, V0, V20, V12
 216  	VPERMXOR V13, V1, V20, V13
 217  	VPERMXOR V14, V2, V20, V14
 218  	VPERMXOR V15, V3, V20, V15
 219  
 220  	VADDUWM V8, V12, V8
 221  	VADDUWM V9, V13, V9
 222  	VADDUWM V10, V14, V10
 223  	VADDUWM V11, V15, V11
 224  
 225  	VXOR V4, V8, V4
 226  	VXOR V5, V9, V5
 227  	VXOR V6, V10, V6
 228  	VXOR V7, V11, V7
 229  
 230  	VRLW V4, V30, V4
 231  	VRLW V5, V30, V5
 232  	VRLW V6, V30, V6
 233  	VRLW V7, V30, V7
 234  
 235  	VADDUWM V0, V5, V0
 236  	VADDUWM V1, V6, V1
 237  	VADDUWM V2, V7, V2
 238  	VADDUWM V3, V4, V3
 239  
 240  	VPERMXOR V15, V0, V21, V15
 241  	VPERMXOR V12, V1, V21, V12
 242  	VPERMXOR V13, V2, V21, V13
 243  	VPERMXOR V14, V3, V21, V14
 244  
 245  	VADDUWM V10, V15, V10
 246  	VADDUWM V11, V12, V11
 247  	VADDUWM V8, V13, V8
 248  	VADDUWM V9, V14, V9
 249  
 250  	VXOR V5, V10, V5
 251  	VXOR V6, V11, V6
 252  	VXOR V7, V8, V7
 253  	VXOR V4, V9, V4
 254  
 255  	VRLW V5, V28, V5
 256  	VRLW V6, V28, V6
 257  	VRLW V7, V28, V7
 258  	VRLW V4, V28, V4
 259  
 260  	VADDUWM V0, V5, V0
 261  	VADDUWM V1, V6, V1
 262  	VADDUWM V2, V7, V2
 263  	VADDUWM V3, V4, V3
 264  
 265  	VPERMXOR V15, V0, V20, V15
 266  	VPERMXOR V12, V1, V20, V12
 267  	VPERMXOR V13, V2, V20, V13
 268  	VPERMXOR V14, V3, V20, V14
 269  
 270  	VADDUWM V10, V15, V10
 271  	VADDUWM V11, V12, V11
 272  	VADDUWM V8, V13, V8
 273  	VADDUWM V9, V14, V9
 274  
 275  	VXOR V5, V10, V5
 276  	VXOR V6, V11, V6
 277  	VXOR V7, V8, V7
 278  	VXOR V4, V9, V4
 279  
 280  	VRLW V5, V30, V5
 281  	VRLW V6, V30, V6
 282  	VRLW V7, V30, V7
 283  	VRLW V4, V30, V4
 284  	BDNZ   loop_vsx
 285  
 286  	VADDUWM V12, V26, V12
 287  
 288  	VMRGEW V0, V1, V27
 289  	VMRGEW V2, V3, V28
 290  
 291  	VMRGOW V0, V1, V0
 292  	VMRGOW V2, V3, V2
 293  
 294  	VMRGEW V4, V5, V29
 295  	VMRGEW V6, V7, V30
 296  
 297  	XXPERMDI VS32, VS34, $0, VS33
 298  	XXPERMDI VS32, VS34, $3, VS35
 299  	XXPERMDI VS59, VS60, $0, VS32
 300  	XXPERMDI VS59, VS60, $3, VS34
 301  
 302  	VMRGOW V4, V5, V4
 303  	VMRGOW V6, V7, V6
 304  
 305  	VMRGEW V8, V9, V27
 306  	VMRGEW V10, V11, V28
 307  
 308  	XXPERMDI VS36, VS38, $0, VS37
 309  	XXPERMDI VS36, VS38, $3, VS39
 310  	XXPERMDI VS61, VS62, $0, VS36
 311  	XXPERMDI VS61, VS62, $3, VS38
 312  
 313  	VMRGOW V8, V9, V8
 314  	VMRGOW V10, V11, V10
 315  
 316  	VMRGEW V12, V13, V29
 317  	VMRGEW V14, V15, V30
 318  
 319  	XXPERMDI VS40, VS42, $0, VS41
 320  	XXPERMDI VS40, VS42, $3, VS43
 321  	XXPERMDI VS59, VS60, $0, VS40
 322  	XXPERMDI VS59, VS60, $3, VS42
 323  
 324  	VMRGOW V12, V13, V12
 325  	VMRGOW V14, V15, V14
 326  
 327  	VSPLTISW $4, V27
 328  	VADDUWM V26, V27, V26
 329  
 330  	XXPERMDI VS44, VS46, $0, VS45
 331  	XXPERMDI VS44, VS46, $3, VS47
 332  	XXPERMDI VS61, VS62, $0, VS44
 333  	XXPERMDI VS61, VS62, $3, VS46
 334  
 335  	VADDUWM V0, V16, V0
 336  	VADDUWM V4, V17, V4
 337  	VADDUWM V8, V18, V8
 338  	VADDUWM V12, V19, V12
 339  
 340  	BE_XXBRW(V0)
 341  	BE_XXBRW(V4)
 342  	BE_XXBRW(V8)
 343  	BE_XXBRW(V12)
 344  
 345  	CMPU LEN, $64
 346  	BLT tail_vsx
 347  
 348  	// Bottom of loop
 349  	LXVW4X (INP)(R0), VS59
 350  	LXVW4X (INP)(R8), VS60
 351  	LXVW4X (INP)(R9), VS61
 352  	LXVW4X (INP)(R10), VS62
 353  
 354  	VXOR V27, V0, V27
 355  	VXOR V28, V4, V28
 356  	VXOR V29, V8, V29
 357  	VXOR V30, V12, V30
 358  
 359  	STXVW4X VS59, (OUT)(R0)
 360  	STXVW4X VS60, (OUT)(R8)
 361  	ADD     $64, INP
 362  	STXVW4X VS61, (OUT)(R9)
 363  	ADD     $-64, LEN
 364  	STXVW4X VS62, (OUT)(R10)
 365  	ADD     $64, OUT
 366  	BEQ     done_vsx
 367  
 368  	VADDUWM V1, V16, V0
 369  	VADDUWM V5, V17, V4
 370  	VADDUWM V9, V18, V8
 371  	VADDUWM V13, V19, V12
 372  
 373  	BE_XXBRW(V0)
 374  	BE_XXBRW(V4)
 375  	BE_XXBRW(V8)
 376  	BE_XXBRW(V12)
 377  
 378  	CMPU  LEN, $64
 379  	BLT   tail_vsx
 380  
 381  	LXVW4X (INP)(R0), VS59
 382  	LXVW4X (INP)(R8), VS60
 383  	LXVW4X (INP)(R9), VS61
 384  	LXVW4X (INP)(R10), VS62
 385  
 386  	VXOR V27, V0, V27
 387  	VXOR V28, V4, V28
 388  	VXOR V29, V8, V29
 389  	VXOR V30, V12, V30
 390  
 391  	STXVW4X VS59, (OUT)(R0)
 392  	STXVW4X VS60, (OUT)(R8)
 393  	ADD     $64, INP
 394  	STXVW4X VS61, (OUT)(R9)
 395  	ADD     $-64, LEN
 396  	STXVW4X VS62, (OUT)(V10)
 397  	ADD     $64, OUT
 398  	BEQ     done_vsx
 399  
 400  	VADDUWM V2, V16, V0
 401  	VADDUWM V6, V17, V4
 402  	VADDUWM V10, V18, V8
 403  	VADDUWM V14, V19, V12
 404  
 405  	BE_XXBRW(V0)
 406  	BE_XXBRW(V4)
 407  	BE_XXBRW(V8)
 408  	BE_XXBRW(V12)
 409  
 410  	CMPU LEN, $64
 411  	BLT  tail_vsx
 412  
 413  	LXVW4X (INP)(R0), VS59
 414  	LXVW4X (INP)(R8), VS60
 415  	LXVW4X (INP)(R9), VS61
 416  	LXVW4X (INP)(R10), VS62
 417  
 418  	VXOR V27, V0, V27
 419  	VXOR V28, V4, V28
 420  	VXOR V29, V8, V29
 421  	VXOR V30, V12, V30
 422  
 423  	STXVW4X VS59, (OUT)(R0)
 424  	STXVW4X VS60, (OUT)(R8)
 425  	ADD     $64, INP
 426  	STXVW4X VS61, (OUT)(R9)
 427  	ADD     $-64, LEN
 428  	STXVW4X VS62, (OUT)(R10)
 429  	ADD     $64, OUT
 430  	BEQ     done_vsx
 431  
 432  	VADDUWM V3, V16, V0
 433  	VADDUWM V7, V17, V4
 434  	VADDUWM V11, V18, V8
 435  	VADDUWM V15, V19, V12
 436  
 437  	BE_XXBRW(V0)
 438  	BE_XXBRW(V4)
 439  	BE_XXBRW(V8)
 440  	BE_XXBRW(V12)
 441  
 442  	CMPU  LEN, $64
 443  	BLT   tail_vsx
 444  
 445  	LXVW4X (INP)(R0), VS59
 446  	LXVW4X (INP)(R8), VS60
 447  	LXVW4X (INP)(R9), VS61
 448  	LXVW4X (INP)(R10), VS62
 449  
 450  	VXOR V27, V0, V27
 451  	VXOR V28, V4, V28
 452  	VXOR V29, V8, V29
 453  	VXOR V30, V12, V30
 454  
 455  	STXVW4X VS59, (OUT)(R0)
 456  	STXVW4X VS60, (OUT)(R8)
 457  	ADD     $64, INP
 458  	STXVW4X VS61, (OUT)(R9)
 459  	ADD     $-64, LEN
 460  	STXVW4X VS62, (OUT)(R10)
 461  	ADD     $64, OUT
 462  
 463  	MOVD $10, R14
 464  	MOVD R14, CTR
 465  	BNE  loop_outer_vsx
 466  
 467  done_vsx:
 468  	// Increment counter by number of 64 byte blocks
 469  	MOVWZ (CNT), R14
 470  	ADD  BLOCKS, R14
 471  	MOVWZ R14, (CNT)
 472  	RET
 473  
 474  tail_vsx:
 475  	ADD  $32, R1, R11
 476  	MOVD LEN, CTR
 477  
 478  	// Save values on stack to copy from
 479  	STXVW4X VS32, (R11)(R0)
 480  	STXVW4X VS36, (R11)(R8)
 481  	STXVW4X VS40, (R11)(R9)
 482  	STXVW4X VS44, (R11)(R10)
 483  	ADD $-1, R11, R12
 484  	ADD $-1, INP
 485  	ADD $-1, OUT
 486  	PCALIGN $16
 487  looptail_vsx:
 488  	// Copying the result to OUT
 489  	// in bytes.
 490  	MOVBZU 1(R12), KEY
 491  	MOVBZU 1(INP), TMP
 492  	XOR    KEY, TMP, KEY
 493  	MOVBU  KEY, 1(OUT)
 494  	BDNZ   looptail_vsx
 495  
 496  	// Clear the stack values
 497  	STXVW4X VS48, (R11)(R0)
 498  	STXVW4X VS48, (R11)(R8)
 499  	STXVW4X VS48, (R11)(R9)
 500  	STXVW4X VS48, (R11)(R10)
 501  	BR      done_vsx
 502