sha256block_amd64_shani.mx raw
1 // Copyright 2024 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package main
6
7 import (
8 . "github.com/mmcloughlin/avo/build"
9 . "github.com/mmcloughlin/avo/operand"
10 . "github.com/mmcloughlin/avo/reg"
11 )
12
13 // The sha-ni implementation uses Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1, SHA256MSG2
14 // It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version
15 //
16 // Reference
17 // S. Gulley, et al, "New Instructions Supporting the Secure Hash
// Algorithm on Intel® Architecture Processors", July 2013
19 // https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
20
21 func blockSHANI() {
22 Implement("blockSHANI")
23 Load(Param("dig"), digestPtr) // init digest hash vector H0, H1,..., H7 pointer
24 Load(Param("p").Base(), dataPtr) // init input data base pointer
25 Load(Param("p").Len(), numBytes) // get number of input bytes to hash
26 SHRQ(Imm(6), numBytes) // force modulo 64 input buffer length
27 SHLQ(Imm(6), numBytes)
28 CMPQ(numBytes, Imm(0)) // exit early for zero-length input buffer
29 JEQ(LabelRef("done"))
30 ADDQ(dataPtr, numBytes) // point numBytes to end of input buffer
31 VMOVDQU(Mem{Base: digestPtr}.Offset(0*16), state0) // load initial hash values and reorder
32 VMOVDQU(Mem{Base: digestPtr}.Offset(1*16), state1) // DCBA, HGFE -> ABEF, CDGH
33 PSHUFD(Imm(0xb1), state0, state0) // CDAB
34 PSHUFD(Imm(0x1b), state1, state1) // EFGH
35 VMOVDQA(state0, m4)
36 PALIGNR(Imm(8), state1, state0) // ABEF
37 PBLENDW(Imm(0xf0), m4, state1) // CDGH
38 flip_mask := flip_mask_DATA()
39 VMOVDQA(flip_mask, shufMask)
40 LEAQ(K256_DATA(), sha256Constants)
41
42 roundLoop()
43 done()
44 }
45
46 func roundLoop() {
47 Label("roundLoop")
48 Comment("save hash values for addition after rounds")
49 VMOVDQA(state0, abefSave)
50 VMOVDQA(state1, cdghSave)
51
52 Comment("do rounds 0-59")
53 rounds0to11(m0, nil, 0, nop) // 0-3
54 rounds0to11(m1, m0, 1, sha256msg1) // 4-7
55 rounds0to11(m2, m1, 2, sha256msg1) // 8-11
56 VMOVDQU(Mem{Base: dataPtr}.Offset(3*16), msg)
57 PSHUFB(shufMask, msg)
58 rounds12to59(m3, 3, m2, m0, sha256msg1, vmovrev) // 12-15
59 rounds12to59(m0, 4, m3, m1, sha256msg1, vmov) // 16-19
60 rounds12to59(m1, 5, m0, m2, sha256msg1, vmov) // 20-23
61 rounds12to59(m2, 6, m1, m3, sha256msg1, vmov) // 24-27
62 rounds12to59(m3, 7, m2, m0, sha256msg1, vmov) // 28-31
63 rounds12to59(m0, 8, m3, m1, sha256msg1, vmov) // 32-35
64 rounds12to59(m1, 9, m0, m2, sha256msg1, vmov) // 36-39
65 rounds12to59(m2, 10, m1, m3, sha256msg1, vmov) // 40-43
66 rounds12to59(m3, 11, m2, m0, sha256msg1, vmov) // 44-47
67 rounds12to59(m0, 12, m3, m1, sha256msg1, vmov) // 48-51
68 rounds12to59(m1, 13, m0, m2, nop, vmov) // 52-55
69 rounds12to59(m2, 14, m1, m3, nop, vmov) // 56-59
70
71 Comment("do rounds 60-63")
72 VMOVDQA(m3, msg)
73 PADDD(Mem{Base: sha256Constants}.Offset(15*32), msg)
74 SHA256RNDS2(msg, state0, state1)
75 PSHUFD(Imm(0x0e), msg, msg)
76 SHA256RNDS2(msg, state1, state0)
77
78 Comment("add current hash values with previously saved")
79 PADDD(abefSave, state0)
80 PADDD(cdghSave, state1)
81
82 Comment("advance data pointer; loop until buffer empty")
83 ADDQ(Imm(64), dataPtr)
84 CMPQ(numBytes, dataPtr)
85 JNE(LabelRef("roundLoop"))
86
87 Comment("write hash values back in the correct order")
88 PSHUFD(Imm(0x1b), state0, state0)
89 PSHUFD(Imm(0xb1), state1, state1)
90 VMOVDQA(state0, m4)
91 PBLENDW(Imm(0xf0), state1, state0)
92 PALIGNR(Imm(8), m4, state1)
93 VMOVDQU(state0, Mem{Base: digestPtr}.Offset(0*16))
94 VMOVDQU(state1, Mem{Base: digestPtr}.Offset(1*16))
95 }
96
// done emits the "done" label followed by RET. blockSHANI jumps to this label
// when there is no whole block to process, and the code emitted by roundLoop
// falls through to it after the digest has been written back.
func done() {
	Label("done")
	RET()
}
101
102 var (
103 digestPtr GPPhysical = RDI // input/output, base pointer to digest hash vector H0, H1, ..., H7
104 dataPtr = RSI // input, base pointer to first input data block
105 numBytes = RDX // input, number of input bytes to be processed
106 sha256Constants = RAX // round contents from K256 table, indexed by round number x 32
107 msg VecPhysical = X0 // input data
108 state0 = X1 // round intermediates and outputs
109 state1 = X2
110 m0 = X3 // m0, m1,... m4 -- round message temps
111 m1 = X4
112 m2 = X5
113 m3 = X6
114 m4 = X7
115 shufMask = X8 // input data endian conversion control mask
116 abefSave = X9 // digest hash vector inter-block buffer abef
117 cdghSave = X10 // digest hash vector inter-block buffer cdgh
118 )
119
120 // nop instead of final SHA256MSG1 for first and last few rounds
121 func nop(m, a VecPhysical) {
122 }
123
124 // final SHA256MSG1 for middle rounds that require it
125 func sha256msg1(m, a VecPhysical) {
126 SHA256MSG1(m, a)
127 }
128
129 // msg copy for all but rounds 12-15
130 func vmov(a, b VecPhysical) {
131 VMOVDQA(a, b)
132 }
133
134 // reverse copy for rounds 12-15
135 func vmovrev(a, b VecPhysical) {
136 VMOVDQA(b, a)
137 }
138
// VecFunc is the signature shared by the small emitter helpers in this file
// (nop, sha256msg1, vmov, vmovrev). rounds0to11 and rounds12to59 take values
// of this type so callers can vary one step of an otherwise fixed sequence.
type VecFunc func(a, b VecPhysical)
140
141 // sha rounds 0 to 11
142 //
143 // identical with the exception of the final msg op
144 // which is replaced with a nop for rounds where it is not needed
145 // refer to Gulley, et al for more information
146 func rounds0to11(m, a VecPhysical, c int, sha256msg1 VecFunc) {
147 VMOVDQU(Mem{Base: dataPtr}.Offset(c*16), msg)
148 PSHUFB(shufMask, msg)
149 VMOVDQA(msg, m)
150 PADDD(Mem{Base: sha256Constants}.Offset(c*32), msg)
151 SHA256RNDS2(msg, state0, state1)
152 PSHUFD(U8(0x0e), msg, msg)
153 SHA256RNDS2(msg, state1, state0)
154 sha256msg1(m, a)
155 }
156
157 // sha rounds 12 to 59
158 //
159 // identical with the exception of the final msg op
160 // and the reverse copy(m,msg) in round 12 which is required
161 // after the last data load
162 // refer to Gulley, et al for more information
163 func rounds12to59(m VecPhysical, c int, a, t VecPhysical, sha256msg1, movop VecFunc) {
164 movop(m, msg)
165 PADDD(Mem{Base: sha256Constants}.Offset(c*32), msg)
166 SHA256RNDS2(msg, state0, state1)
167 VMOVDQA(m, m4)
168 PALIGNR(Imm(4), a, m4)
169 PADDD(m4, t)
170 SHA256MSG2(m, t)
171 PSHUFD(Imm(0x0e), msg, msg)
172 SHA256RNDS2(msg, state1, state0)
173 sha256msg1(m, a)
174 }
175