f1600x2_arm64.s raw

   1  // +build arm64,go1.16,!purego
   2  
   3  // Taken from https://github.com/bwesterb/armed-keccak
   4  
   5  #include "textflag.h"
   6  
   7  // func f1600x2ARM(state *uint64, rc *[24]uint64, turbo bool)
      //
      // Runs the Keccak-f[1600] round function in place on the 400 bytes at
      // state, using the ARMv8.2 SHA3 vector instructions (VEOR3, VRAX1, VXAR,
      // VBCAX).
      //
      // Arguments:
      //   state  400 bytes, loaded into V0..V24 (16 bytes each) on entry and
      //          written back from the same registers on exit. Rotations act on
      //          the two 64-bit halves of every register independently and the
      //          round constant is replicated into both halves, so presumably
      //          register Vi holds lane i of two interleaved states -- confirm
      //          the layout against the caller.
      //   rc     round constants; one 8-byte constant is consumed per round.
      //   turbo  when non-zero, only 12 rounds are run and they use the
      //          constants starting at byte offset 96 of rc (i.e. rc[12..23]).
      //
      // Register use:
      //   R0       cursor for the vector loads/stores (post-incremented)
      //   R1       cursor into rc (post-incremented by 8 each round)
      //   R2       saved copy of the original state pointer
      //   R3       number of rounds still to run
      //   R4       turbo flag
      //   V0-V24   state
      //   V25-V31  temporaries
      //
      // The frame is $0-17: no locals, 17 bytes of arguments (8 + 8 + 1).
      //
      // NOTE(review): nothing in this routine checks that the CPU implements
      // the SHA3 extension; presumably the caller does -- confirm.
   8  TEXT ·f1600x2ARM(SB), NOSPLIT, $0-17
   9      MOVD state+0(FP), R0
  10      MOVD rc+8(FP), R1
          // Keep the base address: R0 is advanced by the post-increment loads
          // below and has to be reset before the state is stored back.
  11      MOVD R0, R2
          // Default to the full 24 rounds.
  12      MOVD $24, R3
  13  
          // Load the state: 6 * 64 + 16 = 400 bytes into V0..V24.
  14      VLD1.P 64(R0), [ V0.B16,  V1.B16,  V2.B16,  V3.B16]
  15      VLD1.P 64(R0), [ V4.B16,  V5.B16,  V6.B16,  V7.B16]
  16      VLD1.P 64(R0), [ V8.B16,  V9.B16, V10.B16, V11.B16]
  17      VLD1.P 64(R0), [V12.B16, V13.B16, V14.B16, V15.B16]
  18      VLD1.P 64(R0), [V16.B16, V17.B16, V18.B16, V19.B16]
  19      VLD1.P 64(R0), [V20.B16, V21.B16, V22.B16, V23.B16]
  20      VLD1.P (R0),   [V24.B16]
  21  
          // Without turbo, go straight to the round loop with R3 = 24 and R1 at
          // the first round constant.
  22      MOVBU turbo+16(FP), R4
  23      CBZ R4, loop
  24  
          // Turbo: run 12 rounds instead of 24 and skip the first 12 round
          // constants (12 * 8 = 96 bytes).
  25      SUB  $12, R3, R3
  26      ADD  $96, R1, R1
  27  
  28  loop:
  29      // Execute theta but without xorring into the state yet.
          // First half of the column parities: V25+x = Vx ^ V(x+5) ^ V(x+10).
  30      VEOR3 V10.B16, V5.B16, V0.B16, V25.B16
  31      VEOR3 V11.B16, V6.B16, V1.B16, V26.B16
  32      VEOR3 V12.B16, V7.B16, V2.B16, V27.B16
  33      VEOR3 V13.B16, V8.B16, V3.B16, V28.B16
  34      VEOR3 V14.B16, V9.B16, V4.B16, V29.B16
  35  
          // Fold in the remaining two rows, V(x+15) and V(x+20), so that
          // V25..V29 hold the full parities C0..C4 of the five columns.
  36      VEOR3 V20.B16, V15.B16, V25.B16, V25.B16
  37      VEOR3 V21.B16, V16.B16, V26.B16, V26.B16
  38      VEOR3 V22.B16, V17.B16, V27.B16, V27.B16
  39      VEOR3 V23.B16, V18.B16, V28.B16, V28.B16
  40      VEOR3 V24.B16, V19.B16, V29.B16, V29.B16
  41  
  42      // Xor parities from step theta into the state at the same time as
  43      // executing rho and pi.
          //
          // VRAX1 a, b, d computes d = b ^ rol64(a, 1) on each 64-bit half, which
          // yields the theta terms D[x] = C[x-1] ^ rol64(C[x+1], 1). After the
          // five instructions below:
          //   V30 = D0, V27 = D1, V28 = D2, V29 = D3, V25 = D4.
          // Four of the results overwrite a parity register in place; the order
          // is chosen so that a parity is only overwritten once every
          // instruction that needs it has read it.
  44      VRAX1 V26.D2, V29.D2, V30.D2
  45      VRAX1 V29.D2, V27.D2, V29.D2
  46      VRAX1 V27.D2, V25.D2, V27.D2
  47      VRAX1 V25.D2, V28.D2, V25.D2
  48      VRAX1 V28.D2, V26.D2, V28.D2
  49  
          // V0 is neither rotated nor moved here, so it only gets its theta
          // term. V1 is the first register overwritten by the chain below and
          // the last one read from, so keep a copy of it in V31.
  50      VEOR V30.B16, V0.B16, V0.B16
  51      VMOV V1.B16, V31.B16
  52  
          // Each VXAR $r, d, src, dst computes dst = ror64(src ^ d, r) on each
          // 64-bit half: d is the theta term of src's column, r is the right
          // rotation applied to that lane, and dst is the register the lane
          // moves to. Every destination is the source of the preceding
          // instruction, which is what allows the permutation to be done in
          // place; do not reorder these lines.
  53      VXAR $20, V27.D2,  V6.D2,  V1.D2   
  54      VXAR $44, V25.D2,  V9.D2,  V6.D2   
  55      VXAR $3 , V28.D2, V22.D2,  V9.D2   
  56      VXAR $25, V25.D2, V14.D2, V22.D2  
  57      VXAR $46, V30.D2, V20.D2, V14.D2  
  58      VXAR $2 , V28.D2,  V2.D2, V20.D2  
  59      VXAR $21, V28.D2, V12.D2,  V2.D2  
  60      VXAR $39, V29.D2, V13.D2, V12.D2  
  61      VXAR $56, V25.D2, V19.D2, V13.D2  
  62      VXAR $8 , V29.D2, V23.D2, V19.D2  
  63      VXAR $23, V30.D2, V15.D2, V23.D2  
  64      VXAR $37, V25.D2,  V4.D2, V15.D2  
  65      VXAR $50, V25.D2, V24.D2,  V4.D2   
  66      VXAR $62, V27.D2, V21.D2, V24.D2  
  67      VXAR $9 , V29.D2,  V8.D2, V21.D2  
  68      VXAR $19, V27.D2, V16.D2,  V8.D2   
  69      VXAR $28, V30.D2,  V5.D2, V16.D2  
  70      VXAR $36, V29.D2,  V3.D2,  V5.D2   
  71      VXAR $43, V29.D2, V18.D2,  V3.D2   
  72      VXAR $49, V28.D2, V17.D2, V18.D2  
  73      VXAR $54, V27.D2, V11.D2, V17.D2  
  74      VXAR $58, V28.D2,  V7.D2, V11.D2  
  75      VXAR $61, V30.D2, V10.D2,  V7.D2   
  76      VXAR $63, V27.D2, V31.D2, V10.D2  
  77  
  78      // Chi
          // VBCAX a, b, c, d computes d = c ^ (b AND NOT a), so within each group
          // of five registers every register becomes
          //   V[i] ^ (V[i+2] AND NOT V[i+1])   (indices wrap inside the group).
          // The first two results of a group go to V25/V26 because the old
          // values of the group's first two registers are still inputs to the
          // last two VBCAX; they are copied into place afterwards.
          // Row V0..V4.
  79      VBCAX V1.B16, V2.B16, V0.B16, V25.B16
  80      VBCAX V2.B16, V3.B16, V1.B16, V26.B16
  81      VBCAX V3.B16, V4.B16, V2.B16,  V2.B16
  82      VBCAX V4.B16, V0.B16, V3.B16,  V3.B16
  83      VBCAX V0.B16, V1.B16, V4.B16,  V4.B16
  84      VMOV V25.B16, V0.B16
  85      VMOV V26.B16, V1.B16
  86  
          // Row V5..V9.
  87      VBCAX V6.B16, V7.B16, V5.B16, V25.B16
  88      VBCAX V7.B16, V8.B16, V6.B16, V26.B16
  89      VBCAX V8.B16, V9.B16, V7.B16,  V7.B16
  90      VBCAX V9.B16, V5.B16, V8.B16,  V8.B16
  91      VBCAX V5.B16, V6.B16, V9.B16,  V9.B16
  92      VMOV V25.B16, V5.B16
  93      VMOV V26.B16, V6.B16
  94  
          // Row V10..V14.
  95      VBCAX V11.B16, V12.B16, V10.B16, V25.B16
  96      VBCAX V12.B16, V13.B16, V11.B16, V26.B16
  97      VBCAX V13.B16, V14.B16, V12.B16, V12.B16
  98      VBCAX V14.B16, V10.B16, V13.B16, V13.B16
  99      VBCAX V10.B16, V11.B16, V14.B16, V14.B16
 100      VMOV V25.B16, V10.B16
 101      VMOV V26.B16, V11.B16
 102  
          // Row V15..V19.
 103      VBCAX V16.B16, V17.B16, V15.B16, V25.B16
 104      VBCAX V17.B16, V18.B16, V16.B16, V26.B16
 105      VBCAX V18.B16, V19.B16, V17.B16, V17.B16
 106      VBCAX V19.B16, V15.B16, V18.B16, V18.B16
 107      VBCAX V15.B16, V16.B16, V19.B16, V19.B16
 108      VMOV V25.B16, V15.B16
 109      VMOV V26.B16, V16.B16
 110  
          // Row V20..V24.
 111      VBCAX V21.B16, V22.B16, V20.B16, V25.B16
 112      VBCAX V22.B16, V23.B16, V21.B16, V26.B16
 113      VBCAX V23.B16, V24.B16, V22.B16, V22.B16
 114      VBCAX V24.B16, V20.B16, V23.B16, V23.B16
 115      VBCAX V20.B16, V21.B16, V24.B16, V24.B16
 116      VMOV V25.B16, V20.B16
 117      VMOV V26.B16, V21.B16
 118  
 119      // Iota
          // Load the next 8-byte round constant replicated into both halves of
          // V25, advance R1 by 8, and XOR the constant into V0.
 120      VLD1R.P 8(R1), [V25.D2]
 121      VEOR V25.B16, V0.B16, V0.B16
 122  
          // One round done; repeat until the round counter reaches zero.
 123      SUBS $1, R3, R3
 124      CBNZ R3, loop
 125  
          // Rewind R0 to the start of the state and write V0..V24 back over the
          // same 400 bytes that were loaded on entry.
 126      MOVD R2, R0
 127  
 128      VST1.P [ V0.B16,  V1.B16,  V2.B16,  V3.B16], 64(R0) 
 129      VST1.P [ V4.B16,  V5.B16,  V6.B16,  V7.B16], 64(R0)
 130      VST1.P [ V8.B16,  V9.B16, V10.B16, V11.B16], 64(R0)
 131      VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R0)
 132      VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R0)
 133      VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R0)
 134      VST1.P [V24.B16], (R0)
 135  
 136      RET
 137