sha256block_amd64_shani.mx raw

   1  // Copyright 2024 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package main
   6  
   7  import (
   8  	. "github.com/mmcloughlin/avo/build"
   9  	. "github.com/mmcloughlin/avo/operand"
  10  	. "github.com/mmcloughlin/avo/reg"
  11  )
  12  
  13  // The sha-ni implementation uses Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1, SHA256MSG2
  14  // It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version
  15  //
  16  // Reference
  17  // S. Gulley, et al, "New Instructions Supporting the Secure Hash
// Algorithm on Intel® Architecture Processors", July 2013
  19  // https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
  20  
  21  func blockSHANI() {
  22  	Implement("blockSHANI")
  23  	Load(Param("dig"), digestPtr)    //                   init digest hash vector H0, H1,..., H7 pointer
  24  	Load(Param("p").Base(), dataPtr) //                   init input data base pointer
  25  	Load(Param("p").Len(), numBytes) //                   get number of input bytes to hash
  26  	SHRQ(Imm(6), numBytes)           //                   force modulo 64 input buffer length
  27  	SHLQ(Imm(6), numBytes)
  28  	CMPQ(numBytes, Imm(0)) //                             exit early for zero-length input buffer
  29  	JEQ(LabelRef("done"))
  30  	ADDQ(dataPtr, numBytes)                            // point numBytes to end of input buffer
  31  	VMOVDQU(Mem{Base: digestPtr}.Offset(0*16), state0) // load initial hash values and reorder
  32  	VMOVDQU(Mem{Base: digestPtr}.Offset(1*16), state1) // DCBA, HGFE -> ABEF, CDGH
  33  	PSHUFD(Imm(0xb1), state0, state0)                  // CDAB
  34  	PSHUFD(Imm(0x1b), state1, state1)                  // EFGH
  35  	VMOVDQA(state0, m4)
  36  	PALIGNR(Imm(8), state1, state0) //                    ABEF
  37  	PBLENDW(Imm(0xf0), m4, state1)  //                    CDGH
  38  	flip_mask := flip_mask_DATA()
  39  	VMOVDQA(flip_mask, shufMask)
  40  	LEAQ(K256_DATA(), sha256Constants)
  41  
  42  	roundLoop()
  43  	done()
  44  }
  45  
  46  func roundLoop() {
  47  	Label("roundLoop")
  48  	Comment("save hash values for addition after rounds")
  49  	VMOVDQA(state0, abefSave)
  50  	VMOVDQA(state1, cdghSave)
  51  
  52  	Comment("do rounds 0-59")
  53  	rounds0to11(m0, nil, 0, nop)       //                 0-3
  54  	rounds0to11(m1, m0, 1, sha256msg1) //                 4-7
  55  	rounds0to11(m2, m1, 2, sha256msg1) //                8-11
  56  	VMOVDQU(Mem{Base: dataPtr}.Offset(3*16), msg)
  57  	PSHUFB(shufMask, msg)
  58  	rounds12to59(m3, 3, m2, m0, sha256msg1, vmovrev) // 12-15
  59  	rounds12to59(m0, 4, m3, m1, sha256msg1, vmov)    // 16-19
  60  	rounds12to59(m1, 5, m0, m2, sha256msg1, vmov)    // 20-23
  61  	rounds12to59(m2, 6, m1, m3, sha256msg1, vmov)    // 24-27
  62  	rounds12to59(m3, 7, m2, m0, sha256msg1, vmov)    // 28-31
  63  	rounds12to59(m0, 8, m3, m1, sha256msg1, vmov)    // 32-35
  64  	rounds12to59(m1, 9, m0, m2, sha256msg1, vmov)    // 36-39
  65  	rounds12to59(m2, 10, m1, m3, sha256msg1, vmov)   // 40-43
  66  	rounds12to59(m3, 11, m2, m0, sha256msg1, vmov)   // 44-47
  67  	rounds12to59(m0, 12, m3, m1, sha256msg1, vmov)   // 48-51
  68  	rounds12to59(m1, 13, m0, m2, nop, vmov)          // 52-55
  69  	rounds12to59(m2, 14, m1, m3, nop, vmov)          // 56-59
  70  
  71  	Comment("do rounds 60-63")
  72  	VMOVDQA(m3, msg)
  73  	PADDD(Mem{Base: sha256Constants}.Offset(15*32), msg)
  74  	SHA256RNDS2(msg, state0, state1)
  75  	PSHUFD(Imm(0x0e), msg, msg)
  76  	SHA256RNDS2(msg, state1, state0)
  77  
  78  	Comment("add current hash values with previously saved")
  79  	PADDD(abefSave, state0)
  80  	PADDD(cdghSave, state1)
  81  
  82  	Comment("advance data pointer; loop until buffer empty")
  83  	ADDQ(Imm(64), dataPtr)
  84  	CMPQ(numBytes, dataPtr)
  85  	JNE(LabelRef("roundLoop"))
  86  
  87  	Comment("write hash values back in the correct order")
  88  	PSHUFD(Imm(0x1b), state0, state0)
  89  	PSHUFD(Imm(0xb1), state1, state1)
  90  	VMOVDQA(state0, m4)
  91  	PBLENDW(Imm(0xf0), state1, state0)
  92  	PALIGNR(Imm(8), m4, state1)
  93  	VMOVDQU(state0, Mem{Base: digestPtr}.Offset(0*16))
  94  	VMOVDQU(state1, Mem{Base: digestPtr}.Offset(1*16))
  95  }
  96  
// done emits the "done" label followed by RET. The zero-length early exit in
// blockSHANI jumps to this label.
func done() {
	Label("done")
	RET()
}
 101  
 102  var (
 103  	digestPtr       GPPhysical  = RDI // input/output, base pointer to digest hash vector H0, H1, ..., H7
 104  	dataPtr                     = RSI // input, base pointer to first input data block
 105  	numBytes                    = RDX // input, number of input bytes to be processed
 106  	sha256Constants             = RAX // round contents from K256 table, indexed by round number x 32
 107  	msg             VecPhysical = X0  // input data
 108  	state0                      = X1  // round intermediates and outputs
 109  	state1                      = X2
 110  	m0                          = X3 //  m0, m1,... m4 -- round message temps
 111  	m1                          = X4
 112  	m2                          = X5
 113  	m3                          = X6
 114  	m4                          = X7
 115  	shufMask                    = X8  // input data endian conversion control mask
 116  	abefSave                    = X9  // digest hash vector inter-block buffer abef
 117  	cdghSave                    = X10 // digest hash vector inter-block buffer cdgh
 118  )
 119  
 120  // nop instead of final SHA256MSG1 for first and last few rounds
 121  func nop(m, a VecPhysical) {
 122  }
 123  
 124  // final SHA256MSG1 for middle rounds that require it
 125  func sha256msg1(m, a VecPhysical) {
 126  	SHA256MSG1(m, a)
 127  }
 128  
 129  // msg copy for all but rounds 12-15
 130  func vmov(a, b VecPhysical) {
 131  	VMOVDQA(a, b)
 132  }
 133  
 134  // reverse copy for rounds 12-15
 135  func vmovrev(a, b VecPhysical) {
 136  	VMOVDQA(b, a)
 137  }
 138  
// VecFunc is the signature shared by the pluggable per-round helpers (nop,
// sha256msg1, vmov, vmovrev): a function taking two vector registers.
type VecFunc func(a, b VecPhysical)
 140  
 141  // sha rounds 0 to 11
 142  //
 143  // identical with the exception of the final msg op
 144  // which is replaced with a nop for rounds where it is not needed
 145  // refer to Gulley, et al for more information
 146  func rounds0to11(m, a VecPhysical, c int, sha256msg1 VecFunc) {
 147  	VMOVDQU(Mem{Base: dataPtr}.Offset(c*16), msg)
 148  	PSHUFB(shufMask, msg)
 149  	VMOVDQA(msg, m)
 150  	PADDD(Mem{Base: sha256Constants}.Offset(c*32), msg)
 151  	SHA256RNDS2(msg, state0, state1)
 152  	PSHUFD(U8(0x0e), msg, msg)
 153  	SHA256RNDS2(msg, state1, state0)
 154  	sha256msg1(m, a)
 155  }
 156  
 157  // sha rounds 12 to 59
 158  //
 159  // identical with the exception of the final msg op
 160  // and the reverse copy(m,msg) in round 12 which is required
 161  // after the last data load
 162  // refer to Gulley, et al for more information
 163  func rounds12to59(m VecPhysical, c int, a, t VecPhysical, sha256msg1, movop VecFunc) {
 164  	movop(m, msg)
 165  	PADDD(Mem{Base: sha256Constants}.Offset(c*32), msg)
 166  	SHA256RNDS2(msg, state0, state1)
 167  	VMOVDQA(m, m4)
 168  	PALIGNR(Imm(4), a, m4)
 169  	PADDD(m4, t)
 170  	SHA256MSG2(m, t)
 171  	PSHUFD(Imm(0x0e), msg, msg)
 172  	SHA256RNDS2(msg, state1, state0)
 173  	sha256msg1(m, a)
 174  }
 175