blocks_generic.go raw
1 // Copyright (c) 2024 The Decred developers
2 // Use of this source code is governed by an ISC
3 // license that can be found in the LICENSE file.
4 //
5 // Main Go code originally written and optimized by Dave Collins May 2020.
6 // Additional cleanup and comments added July 2024.
7
8 package compress
9
10 import (
11 "math/bits"
12 )
13
14 const (
15 // blockSize is the block size of the hash algorithm in bytes.
16 blockSize = 64
17
18 // blockSizeLog2 is the base-2 log of the block size. It is used to
19 // efficiently perform integer division by the block size.
20 blockSizeLog2 = 6
21 )
22
23 // State houses the current chain value and salt used during block compression.
24 // It is housed in a separate struct in order to reduce the number of parameters
25 // to help prevent register spillage on platforms with a limited number of
26 // registers for better performance.
27 type State struct {
28 CV [8]uint32 // the current chain value
29 S [4]uint32 // salt (zero by default)
30 }
31
32 // g is the quarter round function that each round applies to the 4x4 internal
33 // state in the compression function.
34 func g(a, b, c, d, mx, my, cx, cy uint32) (uint32, uint32, uint32, uint32) {
35 a += b + (mx ^ cx)
36 d = bits.RotateLeft32(d^a, -16)
37 c += d
38 b = bits.RotateLeft32(b^c, -12)
39 a += b + (my ^ cy)
40 d = bits.RotateLeft32(d^a, -8)
41 c += d
42 b = bits.RotateLeft32(b^c, -7)
43 return a, b, c, d
44 }
45
46 // blocksGeneric performs BLAKE-224 and BLAKE-256 block compression using pure
47 // Go. It will compress as many full blocks as are available in the provided
48 // message.
49 //
50 // The parameters are:
51 //
52 // state: the block compression state with the chain value and salt
53 // msg: the padded message to compress (must be at least 64 bytes)
54 // counter: the total number of message bits hashed so far
55 //
56 // It will panic if the provided message block does not have at least 64 bytes.
57 //
58 // The chain value in the provided state is updated in place.
59 func blocksGeneric(state *State, msg []byte, counter uint64) {
60 // The compression func initializes the 16-word state matrix as follows:
61 //
62 // h0..h7 is the input chaining value.
63 //
64 // s0..s3 is the salt.
65 //
66 // c0..c7 are the first 8 words of the BLAKE constants defined by the
67 // specification (the first digits of π).
68 //
69 // t0 and t1 are the lower and higher order words of the 64-bit counter.
70 //
71 // |v0 v1 v2 v3| | h0 h1 h2 h3 |
72 // |v4 v5 v6 v7| | h4 h5 h6 h7 |
73 // |v8 v9 va vb| = |s0^c0 s1^c1 s2^c2 s3^c3|
74 // |vc vd ve vf| |t0^c4 t0^c5 t1^c6 t1^c7|
75 //
76 // Each round consists of 8 applications of the G function as follows:
77 // G0(v0,v4,v8,vc) G1(v1,v5,v9,vd) G2(v2,v6,va,ve) G3(v3,v7,vb,vf)
78 // G4(v0,v5,va,vf) G5(v1,v6,vb,vc) G6(v2,v7,v8,vd) G7(v3,v4,v9,ve)
79 //
80 // In other words, the G function is applied to each column of the 4x4 state
81 // and then to each of the diagonals.
82 //
83 // In addition, at each round, the message words are permuted according to
84 // the following schedule where each round is mod 10. In other words round
85 // 11 uses the same permutation as round 1, round 12 the same as round 2,
86 // etc:
87 //
88 // round 1: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
89 // round 2: 14 10 4 8 9 15 13 6 1 12 0 2 11 7 5 3
90 // round 3: 11 8 12 0 5 2 15 13 10 14 3 6 7 1 9 4
91 // round 4: 7 9 3 1 13 12 11 14 2 6 5 10 4 0 15 8
92 // round 5: 9 0 5 7 2 4 10 15 14 1 11 12 6 8 3 13
93 // round 6: 2 12 6 10 0 11 8 3 4 13 7 5 15 14 1 9
94 // round 7: 12 5 1 15 14 13 4 10 0 7 6 3 9 2 8 11
95 // round 8: 13 11 7 14 12 1 3 9 5 0 15 4 8 6 2 10
96 // round 9: 6 15 14 9 11 3 0 8 12 2 13 7 1 4 10 5
97 // round 10: 10 2 8 4 7 6 1 5 15 11 9 14 3 12 13 0
98
99 const (
100 c0, c1, c2, c3 = 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344
101 c4, c5, c6, c7 = 0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89
102 c8, c9, ca, cb = 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c
103 cc, cd, ce, cf = 0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917
104 )
105
106 // Ideally these hints wouldn't be necessary, but they do make quite a big
107 // difference in avoiding a bunch of additional bounds checks as determined
108 // by both reviewing the resulting compiled asm as well as benchmarks.
109 h := &state.CV
110 s := &state.S
111 _ = h[7] // Bounds check hint to compiler.
112 _ = s[3] // Bounds check hint to compiler.
113 for numBlocks := len(msg) >> blockSizeLog2; numBlocks > 0; numBlocks-- {
114 _ = msg[63] // Bounds check hint to compiler.
115
116 // Convert the provided message of at least 64 bytes to an array of 16
117 // 32-bit unsigned big-endian words.
118 //
119 // Note that this conversion is intentionally not in a separate function
120 // because the Go compiler unfortunately sees a function that does this
121 // as too complex to inline.
122 //
123 // It also avoids using binary.BigEndian.Uint32 for these even though it
124 // is more readable for increased performance in the critical path.
125 //
126 // Finally, this is optimized to favor arm since there currently is only
127 // a pure Go implementation for that arch while amd64 will be using the
128 // vector accelerated versions in practice. It's only slightly slower
129 // on amd64 versus various rearrangements that favor it instead.
130 m0 := uint32(msg[3]) | uint32(msg[2])<<8 | uint32(msg[1])<<16 | uint32(msg[0])<<24
131 m1 := uint32(msg[7]) | uint32(msg[6])<<8 | uint32(msg[5])<<16 | uint32(msg[4])<<24
132 m2 := uint32(msg[11]) | uint32(msg[10])<<8 | uint32(msg[9])<<16 | uint32(msg[8])<<24
133 m3 := uint32(msg[15]) | uint32(msg[14])<<8 | uint32(msg[13])<<16 | uint32(msg[12])<<24
134 m4 := uint32(msg[19]) | uint32(msg[18])<<8 | uint32(msg[17])<<16 | uint32(msg[16])<<24
135 m5 := uint32(msg[23]) | uint32(msg[22])<<8 | uint32(msg[21])<<16 | uint32(msg[20])<<24
136 m6 := uint32(msg[27]) | uint32(msg[26])<<8 | uint32(msg[25])<<16 | uint32(msg[24])<<24
137 m7 := uint32(msg[31]) | uint32(msg[30])<<8 | uint32(msg[29])<<16 | uint32(msg[28])<<24
138 m8 := uint32(msg[35]) | uint32(msg[34])<<8 | uint32(msg[33])<<16 | uint32(msg[32])<<24
139 m9 := uint32(msg[39]) | uint32(msg[38])<<8 | uint32(msg[37])<<16 | uint32(msg[36])<<24
140 m10 := uint32(msg[43]) | uint32(msg[42])<<8 | uint32(msg[41])<<16 | uint32(msg[40])<<24
141 m11 := uint32(msg[47]) | uint32(msg[46])<<8 | uint32(msg[45])<<16 | uint32(msg[44])<<24
142 m12 := uint32(msg[51]) | uint32(msg[50])<<8 | uint32(msg[49])<<16 | uint32(msg[48])<<24
143 m13 := uint32(msg[55]) | uint32(msg[54])<<8 | uint32(msg[53])<<16 | uint32(msg[52])<<24
144 m14 := uint32(msg[59]) | uint32(msg[58])<<8 | uint32(msg[57])<<16 | uint32(msg[56])<<24
145 m15 := uint32(msg[63]) | uint32(msg[62])<<8 | uint32(msg[61])<<16 | uint32(msg[60])<<24
146
147 // Round 1 plus state matrix initialization.
148 t0, t1 := uint32(counter), uint32(counter>>32)
149 v0, v4, v8, vc := g(h[0], h[4], s[0]^c0, t0^c4, m0, m1, c1, c0)
150 v1, v5, v9, vd := g(h[1], h[5], s[1]^c1, t0^c5, m2, m3, c3, c2)
151 v2, v6, va, ve := g(h[2], h[6], s[2]^c2, t1^c6, m4, m5, c5, c4)
152 v3, v7, vb, vf := g(h[3], h[7], s[3]^c3, t1^c7, m6, m7, c7, c6)
153 v0, v5, va, vf = g(v0, v5, va, vf, m8, m9, c9, c8)
154 v1, v6, vb, vc = g(v1, v6, vb, vc, m10, m11, cb, ca)
155 v2, v7, v8, vd = g(v2, v7, v8, vd, m12, m13, cd, cc)
156 v3, v4, v9, ve = g(v3, v4, v9, ve, m14, m15, cf, ce)
157
158 // Round 2 with message word and constants permutation.
159 v0, v4, v8, vc = g(v0, v4, v8, vc, m14, m10, ca, ce)
160 v1, v5, v9, vd = g(v1, v5, v9, vd, m4, m8, c8, c4)
161 v2, v6, va, ve = g(v2, v6, va, ve, m9, m15, cf, c9)
162 v3, v7, vb, vf = g(v3, v7, vb, vf, m13, m6, c6, cd)
163 v0, v5, va, vf = g(v0, v5, va, vf, m1, m12, cc, c1)
164 v1, v6, vb, vc = g(v1, v6, vb, vc, m0, m2, c2, c0)
165 v2, v7, v8, vd = g(v2, v7, v8, vd, m11, m7, c7, cb)
166 v3, v4, v9, ve = g(v3, v4, v9, ve, m5, m3, c3, c5)
167
168 // Round 3 with message word and constants permutation.
169 v0, v4, v8, vc = g(v0, v4, v8, vc, m11, m8, c8, cb)
170 v1, v5, v9, vd = g(v1, v5, v9, vd, m12, m0, c0, cc)
171 v2, v6, va, ve = g(v2, v6, va, ve, m5, m2, c2, c5)
172 v3, v7, vb, vf = g(v3, v7, vb, vf, m15, m13, cd, cf)
173 v0, v5, va, vf = g(v0, v5, va, vf, m10, m14, ce, ca)
174 v1, v6, vb, vc = g(v1, v6, vb, vc, m3, m6, c6, c3)
175 v2, v7, v8, vd = g(v2, v7, v8, vd, m7, m1, c1, c7)
176 v3, v4, v9, ve = g(v3, v4, v9, ve, m9, m4, c4, c9)
177
178 // Round 4 with message word and constants permutation.
179 v0, v4, v8, vc = g(v0, v4, v8, vc, m7, m9, c9, c7)
180 v1, v5, v9, vd = g(v1, v5, v9, vd, m3, m1, c1, c3)
181 v2, v6, va, ve = g(v2, v6, va, ve, m13, m12, cc, cd)
182 v3, v7, vb, vf = g(v3, v7, vb, vf, m11, m14, ce, cb)
183 v0, v5, va, vf = g(v0, v5, va, vf, m2, m6, c6, c2)
184 v1, v6, vb, vc = g(v1, v6, vb, vc, m5, m10, ca, c5)
185 v2, v7, v8, vd = g(v2, v7, v8, vd, m4, m0, c0, c4)
186 v3, v4, v9, ve = g(v3, v4, v9, ve, m15, m8, c8, cf)
187
188 // Round 5 with message word and constants permutation.
189 v0, v4, v8, vc = g(v0, v4, v8, vc, m9, m0, c0, c9)
190 v1, v5, v9, vd = g(v1, v5, v9, vd, m5, m7, c7, c5)
191 v2, v6, va, ve = g(v2, v6, va, ve, m2, m4, c4, c2)
192 v3, v7, vb, vf = g(v3, v7, vb, vf, m10, m15, cf, ca)
193 v0, v5, va, vf = g(v0, v5, va, vf, m14, m1, c1, ce)
194 v1, v6, vb, vc = g(v1, v6, vb, vc, m11, m12, cc, cb)
195 v2, v7, v8, vd = g(v2, v7, v8, vd, m6, m8, c8, c6)
196 v3, v4, v9, ve = g(v3, v4, v9, ve, m3, m13, cd, c3)
197
198 // Round 6 with message word and constants permutation.
199 v0, v4, v8, vc = g(v0, v4, v8, vc, m2, m12, cc, c2)
200 v1, v5, v9, vd = g(v1, v5, v9, vd, m6, m10, ca, c6)
201 v2, v6, va, ve = g(v2, v6, va, ve, m0, m11, cb, c0)
202 v3, v7, vb, vf = g(v3, v7, vb, vf, m8, m3, c3, c8)
203 v0, v5, va, vf = g(v0, v5, va, vf, m4, m13, cd, c4)
204 v1, v6, vb, vc = g(v1, v6, vb, vc, m7, m5, c5, c7)
205 v2, v7, v8, vd = g(v2, v7, v8, vd, m15, m14, ce, cf)
206 v3, v4, v9, ve = g(v3, v4, v9, ve, m1, m9, c9, c1)
207
208 // Round 7 with message word and constants permutation.
209 v0, v4, v8, vc = g(v0, v4, v8, vc, m12, m5, c5, cc)
210 v1, v5, v9, vd = g(v1, v5, v9, vd, m1, m15, cf, c1)
211 v2, v6, va, ve = g(v2, v6, va, ve, m14, m13, cd, ce)
212 v3, v7, vb, vf = g(v3, v7, vb, vf, m4, m10, ca, c4)
213 v0, v5, va, vf = g(v0, v5, va, vf, m0, m7, c7, c0)
214 v1, v6, vb, vc = g(v1, v6, vb, vc, m6, m3, c3, c6)
215 v2, v7, v8, vd = g(v2, v7, v8, vd, m9, m2, c2, c9)
216 v3, v4, v9, ve = g(v3, v4, v9, ve, m8, m11, cb, c8)
217
218 // Round 8 with message word and constants permutation.
219 v0, v4, v8, vc = g(v0, v4, v8, vc, m13, m11, cb, cd)
220 v1, v5, v9, vd = g(v1, v5, v9, vd, m7, m14, ce, c7)
221 v2, v6, va, ve = g(v2, v6, va, ve, m12, m1, c1, cc)
222 v3, v7, vb, vf = g(v3, v7, vb, vf, m3, m9, c9, c3)
223 v0, v5, va, vf = g(v0, v5, va, vf, m5, m0, c0, c5)
224 v1, v6, vb, vc = g(v1, v6, vb, vc, m15, m4, c4, cf)
225 v2, v7, v8, vd = g(v2, v7, v8, vd, m8, m6, c6, c8)
226 v3, v4, v9, ve = g(v3, v4, v9, ve, m2, m10, ca, c2)
227
228 // Round 9 with message word and constants permutation.
229 v0, v4, v8, vc = g(v0, v4, v8, vc, m6, m15, cf, c6)
230 v1, v5, v9, vd = g(v1, v5, v9, vd, m14, m9, c9, ce)
231 v2, v6, va, ve = g(v2, v6, va, ve, m11, m3, c3, cb)
232 v3, v7, vb, vf = g(v3, v7, vb, vf, m0, m8, c8, c0)
233 v0, v5, va, vf = g(v0, v5, va, vf, m12, m2, c2, cc)
234 v1, v6, vb, vc = g(v1, v6, vb, vc, m13, m7, c7, cd)
235 v2, v7, v8, vd = g(v2, v7, v8, vd, m1, m4, c4, c1)
236 v3, v4, v9, ve = g(v3, v4, v9, ve, m10, m5, c5, ca)
237
238 // Round 10 with message word and constants permutation.
239 v0, v4, v8, vc = g(v0, v4, v8, vc, m10, m2, c2, ca)
240 v1, v5, v9, vd = g(v1, v5, v9, vd, m8, m4, c4, c8)
241 v2, v6, va, ve = g(v2, v6, va, ve, m7, m6, c6, c7)
242 v3, v7, vb, vf = g(v3, v7, vb, vf, m1, m5, c5, c1)
243 v0, v5, va, vf = g(v0, v5, va, vf, m15, m11, cb, cf)
244 v1, v6, vb, vc = g(v1, v6, vb, vc, m9, m14, ce, c9)
245 v2, v7, v8, vd = g(v2, v7, v8, vd, m3, m12, cc, c3)
246 v3, v4, v9, ve = g(v3, v4, v9, ve, m13, m0, c0, cd)
247
248 // Round 11 with message word and constants permutation.
249 v0, v4, v8, vc = g(v0, v4, v8, vc, m0, m1, c1, c0)
250 v1, v5, v9, vd = g(v1, v5, v9, vd, m2, m3, c3, c2)
251 v2, v6, va, ve = g(v2, v6, va, ve, m4, m5, c5, c4)
252 v3, v7, vb, vf = g(v3, v7, vb, vf, m6, m7, c7, c6)
253 v0, v5, va, vf = g(v0, v5, va, vf, m8, m9, c9, c8)
254 v1, v6, vb, vc = g(v1, v6, vb, vc, m10, m11, cb, ca)
255 v2, v7, v8, vd = g(v2, v7, v8, vd, m12, m13, cd, cc)
256 v3, v4, v9, ve = g(v3, v4, v9, ve, m14, m15, cf, ce)
257
258 // Round 12 with message word and constants permutation.
259 v0, v4, v8, vc = g(v0, v4, v8, vc, m14, m10, ca, ce)
260 v1, v5, v9, vd = g(v1, v5, v9, vd, m4, m8, c8, c4)
261 v2, v6, va, ve = g(v2, v6, va, ve, m9, m15, cf, c9)
262 v3, v7, vb, vf = g(v3, v7, vb, vf, m13, m6, c6, cd)
263 v0, v5, va, vf = g(v0, v5, va, vf, m1, m12, cc, c1)
264 v1, v6, vb, vc = g(v1, v6, vb, vc, m0, m2, c2, c0)
265 v2, v7, v8, vd = g(v2, v7, v8, vd, m11, m7, c7, cb)
266 v3, v4, v9, ve = g(v3, v4, v9, ve, m5, m3, c3, c5)
267
268 // Round 13 with message word and constants permutation.
269 v0, v4, v8, vc = g(v0, v4, v8, vc, m11, m8, c8, cb)
270 v1, v5, v9, vd = g(v1, v5, v9, vd, m12, m0, c0, cc)
271 v2, v6, va, ve = g(v2, v6, va, ve, m5, m2, c2, c5)
272 v3, v7, vb, vf = g(v3, v7, vb, vf, m15, m13, cd, cf)
273 v0, v5, va, vf = g(v0, v5, va, vf, m10, m14, ce, ca)
274 v1, v6, vb, vc = g(v1, v6, vb, vc, m3, m6, c6, c3)
275 v2, v7, v8, vd = g(v2, v7, v8, vd, m7, m1, c1, c7)
276 v3, v4, v9, ve = g(v3, v4, v9, ve, m9, m4, c4, c9)
277
278 // Round 14 with message word and constants permutation.
279 v0, v4, v8, vc = g(v0, v4, v8, vc, m7, m9, c9, c7)
280 v1, v5, v9, vd = g(v1, v5, v9, vd, m3, m1, c1, c3)
281 v2, v6, va, ve = g(v2, v6, va, ve, m13, m12, cc, cd)
282 v3, v7, vb, vf = g(v3, v7, vb, vf, m11, m14, ce, cb)
283 v0, v5, va, vf = g(v0, v5, va, vf, m2, m6, c6, c2)
284 v1, v6, vb, vc = g(v1, v6, vb, vc, m5, m10, ca, c5)
285 v2, v7, v8, vd = g(v2, v7, v8, vd, m4, m0, c0, c4)
286 v3, v4, v9, ve = g(v3, v4, v9, ve, m15, m8, c8, cf)
287
288 // Finally the output is defined as:
289 //
290 // h'0 = h0^s0^v0^v8
291 // h'1 = h1^s1^v1^v9
292 // h'2 = h2^s2^v2^va
293 // h'3 = h3^s3^v3^vb
294 // h'4 = h4^s0^v4^vc
295 // h'5 = h5^s1^v5^vd
296 // h'6 = h6^s2^v6^ve
297 // h'7 = h7^s3^v7^vf
298 h[0] ^= s[0] ^ v0 ^ v8
299 h[1] ^= s[1] ^ v1 ^ v9
300 h[2] ^= s[2] ^ v2 ^ va
301 h[3] ^= s[3] ^ v3 ^ vb
302 h[4] ^= s[0] ^ v4 ^ vc
303 h[5] ^= s[1] ^ v5 ^ vd
304 h[6] ^= s[2] ^ v6 ^ ve
305 h[7] ^= s[3] ^ v7 ^ vf
306
307 // Move to the next message and increase the message bits counter
308 // accordingly.
309 msg = msg[blockSize:]
310 counter += blockSize << 3
311 }
312 }
313