1 // Copyright 2023 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 5 // ChaCha8 is ChaCha with 8 rounds.
6 // See https://cr.yp.to/chacha/chacha-20080128.pdf.
7 //
8 // ChaCha8 operates on a 4x4 matrix of uint32 values, initially set to:
9 //
10 // const1 const2 const3 const4
11 // seed seed seed seed
12 // seed seed seed seed
13 // counter64 0 0
14 //
15 // We use the same constants as ChaCha20 does, a random seed,
16 // and a counter. Running ChaCha8 on this input produces
17 // a 4x4 matrix of pseudo-random values with as much entropy
18 // as the seed.
19 //
20 // Given SIMD registers that can hold N uint32s, it is possible
21 // to run N ChaCha8 block transformations in parallel by filling
22 // the first register with the N copies of const1, the second
23 // with N copies of const2, and so on, and then running the operations.
24 //
25 // Each iteration of ChaCha8Rand operates over 32 bytes of input and
26 // produces 992 bytes of RNG output, plus 32 bytes of input for the next
27 // iteration.
28 //
29 // The 32 bytes of input are used as a ChaCha8 key, with a zero nonce, to
30 // produce 1024 bytes of output (16 blocks, with counters 0 to 15).
31 // First, for each block, the values 0x61707865, 0x3320646e, 0x79622d32,
32 // 0x6b206574 are subtracted from the 32-bit little-endian words at
33 // position 0, 1, 2, and 3 respectively, and an increasing counter
34 // starting at zero is subtracted from each word at position 12. Then,
35 // this stream is permuted such that for each sequence of four blocks,
36 // first we output the first four bytes of each block, then the next four
37 // bytes of each block, and so on. Finally, the last 32 bytes of output
38 // are used as the input of the next iteration, and the remaining 992
39 // bytes are the RNG output.
40 //
41 // See https://c2sp.org/chacha8rand for additional details.
42 //
43 // Normal ChaCha20 implementations for encryption use this same
44 // parallelism but then have to deinterlace the results so that
45 // it appears the blocks were generated separately. For the purposes
46 // of generating random numbers, the interlacing is fine.
47 // We are simply locked in to preserving the 4-way interlacing
48 // in any future optimizations.
49 package chacha8rand
50 51 import (
52 "internal/goarch"
53 "unsafe"
54 )
55 56 // setup sets up 4 ChaCha8 blocks in b32 with the counter and seed.
57 // Note that b32 is [16][4]uint32 not [4][16]uint32: the blocks are interlaced
58 // the same way they would be in a 4-way SIMD implementations.
59 func setup(seed *[4]uint64, b32 *[16][4]uint32, counter uint32) {
60 // Convert to uint64 to do half as many stores to memory.
61 b := (*[16][2]uint64)(unsafe.Pointer(b32))
62 63 // Constants; same as in ChaCha20: "expand 32-byte k"
64 b[0][0] = 0x61707865_61707865
65 b[0][1] = 0x61707865_61707865
66 67 b[1][0] = 0x3320646e_3320646e
68 b[1][1] = 0x3320646e_3320646e
69 70 b[2][0] = 0x79622d32_79622d32
71 b[2][1] = 0x79622d32_79622d32
72 73 b[3][0] = 0x6b206574_6b206574
74 b[3][1] = 0x6b206574_6b206574
75 76 // Seed values.
77 var x64 uint64
78 var x uint32
79 80 x = uint32(seed[0])
81 x64 = uint64(x)<<32 | uint64(x)
82 b[4][0] = x64
83 b[4][1] = x64
84 85 x = uint32(seed[0] >> 32)
86 x64 = uint64(x)<<32 | uint64(x)
87 b[5][0] = x64
88 b[5][1] = x64
89 90 x = uint32(seed[1])
91 x64 = uint64(x)<<32 | uint64(x)
92 b[6][0] = x64
93 b[6][1] = x64
94 95 x = uint32(seed[1] >> 32)
96 x64 = uint64(x)<<32 | uint64(x)
97 b[7][0] = x64
98 b[7][1] = x64
99 100 x = uint32(seed[2])
101 x64 = uint64(x)<<32 | uint64(x)
102 b[8][0] = x64
103 b[8][1] = x64
104 105 x = uint32(seed[2] >> 32)
106 x64 = uint64(x)<<32 | uint64(x)
107 b[9][0] = x64
108 b[9][1] = x64
109 110 x = uint32(seed[3])
111 x64 = uint64(x)<<32 | uint64(x)
112 b[10][0] = x64
113 b[10][1] = x64
114 115 x = uint32(seed[3] >> 32)
116 x64 = uint64(x)<<32 | uint64(x)
117 b[11][0] = x64
118 b[11][1] = x64
119 120 // Counters.
121 if goarch.BigEndian {
122 b[12][0] = uint64(counter+0)<<32 | uint64(counter+1)
123 b[12][1] = uint64(counter+2)<<32 | uint64(counter+3)
124 } else {
125 b[12][0] = uint64(counter+0) | uint64(counter+1)<<32
126 b[12][1] = uint64(counter+2) | uint64(counter+3)<<32
127 }
128 129 // Zeros.
130 b[13][0] = 0
131 b[13][1] = 0
132 b[14][0] = 0
133 b[14][1] = 0
134 135 b[15][0] = 0
136 b[15][1] = 0
137 }
138 139 func _() {
140 // block and block_generic must have same type
141 x := block
142 x = block_generic
143 _ = x
144 }
145 146 // block_generic is the non-assembly block implementation,
147 // for use on systems without special assembly.
148 // Even on such systems, it is quite fast: on GOOS=386,
149 // ChaCha8 using this code generates random values faster than PCG-DXSM.
150 func block_generic(seed *[4]uint64, buf *[32]uint64, counter uint32) {
151 b := (*[16][4]uint32)(unsafe.Pointer(buf))
152 153 setup(seed, b, counter)
154 155 for i := range b[0] {
156 // Load block i from b[*][i] into local variables.
157 b0 := b[0][i]
158 b1 := b[1][i]
159 b2 := b[2][i]
160 b3 := b[3][i]
161 b4 := b[4][i]
162 b5 := b[5][i]
163 b6 := b[6][i]
164 b7 := b[7][i]
165 b8 := b[8][i]
166 b9 := b[9][i]
167 b10 := b[10][i]
168 b11 := b[11][i]
169 b12 := b[12][i]
170 b13 := b[13][i]
171 b14 := b[14][i]
172 b15 := b[15][i]
173 174 // 4 iterations of eight quarter-rounds each is 8 rounds
175 for round := 0; round < 4; round++ {
176 b0, b4, b8, b12 = qr(b0, b4, b8, b12)
177 b1, b5, b9, b13 = qr(b1, b5, b9, b13)
178 b2, b6, b10, b14 = qr(b2, b6, b10, b14)
179 b3, b7, b11, b15 = qr(b3, b7, b11, b15)
180 181 b0, b5, b10, b15 = qr(b0, b5, b10, b15)
182 b1, b6, b11, b12 = qr(b1, b6, b11, b12)
183 b2, b7, b8, b13 = qr(b2, b7, b8, b13)
184 b3, b4, b9, b14 = qr(b3, b4, b9, b14)
185 }
186 187 // Store block i back into b[*][i].
188 // Add b4..b11 back to the original key material,
189 // like in ChaCha20, to avoid trivial invertibility.
190 // There is no entropy in b0..b3 and b12..b15
191 // so we can skip the additions and save some time.
192 b[0][i] = b0
193 b[1][i] = b1
194 b[2][i] = b2
195 b[3][i] = b3
196 b[4][i] += b4
197 b[5][i] += b5
198 b[6][i] += b6
199 b[7][i] += b7
200 b[8][i] += b8
201 b[9][i] += b9
202 b[10][i] += b10
203 b[11][i] += b11
204 b[12][i] = b12
205 b[13][i] = b13
206 b[14][i] = b14
207 b[15][i] = b15
208 }
209 210 if goarch.BigEndian {
211 // On a big-endian system, reading the uint32 pairs as uint64s
212 // will word-swap them compared to little-endian, so we word-swap
213 // them here first to make the next swap get the right answer.
214 for i, x := range buf {
215 buf[i] = x>>32 | x<<32
216 }
217 }
218 }
219 220 // qr is the (inlinable) ChaCha8 quarter round.
221 func qr(a, b, c, d uint32) (_a, _b, _c, _d uint32) {
222 a += b
223 d ^= a
224 d = d<<16 | d>>16
225 c += d
226 b ^= c
227 b = b<<12 | b>>20
228 a += b
229 d ^= a
230 d = d<<8 | d>>24
231 c += d
232 b ^= c
233 b = b<<7 | b>>25
234 return a, b, c, d
235 }
236