1 // Copyright 2024 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 5 package mlkem
6 7 import (
8 "crypto/internal/fips140/sha3"
9 "crypto/internal/fips140deps/byteorder"
10 "errors"
11 )
12 13 // fieldElement is an integer modulo q, an element of ℤ_q. It is always reduced.
14 type fieldElement uint16
15 16 // fieldCheckReduced checks that a value a is < q.
17 func fieldCheckReduced(a uint16) (fieldElement, error) {
18 if a >= q {
19 return 0, errors.New("unreduced field element")
20 }
21 return fieldElement(a), nil
22 }
23 24 // fieldReduceOnce reduces a value a < 2q.
25 func fieldReduceOnce(a uint16) fieldElement {
26 x := a - q
27 // If x underflowed, then x >= 2¹⁶ - q > 2¹⁵, so the top bit is set.
28 x += (x >> 15) * q
29 return fieldElement(x)
30 }
31 32 func fieldAdd(a, b fieldElement) fieldElement {
33 x := uint16(a + b)
34 return fieldReduceOnce(x)
35 }
36 37 func fieldSub(a, b fieldElement) fieldElement {
38 x := uint16(a - b + q)
39 return fieldReduceOnce(x)
40 }
41 42 const (
43 barrettMultiplier = 5039 // 2¹² * 2¹² / q
44 barrettShift = 24 // log₂(2¹² * 2¹²)
45 )
46 47 // fieldReduce reduces a value a < 2q² using Barrett reduction, to avoid
48 // potentially variable-time division.
49 func fieldReduce(a uint32) fieldElement {
50 quotient := uint32((uint64(a) * barrettMultiplier) >> barrettShift)
51 return fieldReduceOnce(uint16(a - quotient*q))
52 }
53 54 func fieldMul(a, b fieldElement) fieldElement {
55 x := uint32(a) * uint32(b)
56 return fieldReduce(x)
57 }
58 59 // fieldMulSub returns a * (b - c). This operation is fused to save a
60 // fieldReduceOnce after the subtraction.
61 func fieldMulSub(a, b, c fieldElement) fieldElement {
62 x := uint32(a) * uint32(b-c+q)
63 return fieldReduce(x)
64 }
65 66 // fieldAddMul returns a * b + c * d. This operation is fused to save a
67 // fieldReduceOnce and a fieldReduce.
68 func fieldAddMul(a, b, c, d fieldElement) fieldElement {
69 x := uint32(a) * uint32(b)
70 x += uint32(c) * uint32(d)
71 return fieldReduce(x)
72 }
73 74 // compress maps a field element uniformly to the range 0 to 2ᵈ-1, according to
75 // FIPS 203, Definition 4.7.
76 func compress(x fieldElement, d uint8) uint16 {
77 // We want to compute (x * 2ᵈ) / q, rounded to nearest integer, with 1/2
78 // rounding up (see FIPS 203, Section 2.3).
79 80 // Barrett reduction produces a quotient and a remainder in the range [0, 2q),
81 // such that dividend = quotient * q + remainder.
82 dividend := uint32(x) << d // x * 2ᵈ
83 quotient := uint32(uint64(dividend) * barrettMultiplier >> barrettShift)
84 remainder := dividend - quotient*q
85 86 // Since the remainder is in the range [0, 2q), not [0, q), we need to
87 // portion it into three spans for rounding.
88 //
89 // [ 0, q/2 ) -> round to 0
90 // [ q/2, q + q/2 ) -> round to 1
91 // [ q + q/2, 2q ) -> round to 2
92 //
93 // We can convert that to the following logic: add 1 if remainder > q/2,
94 // then add 1 again if remainder > q + q/2.
95 //
96 // Note that if remainder > x, then ⌊x⌋ - remainder underflows, and the top
97 // bit of the difference will be set.
98 quotient += (q/2 - remainder) >> 31 & 1
99 quotient += (q + q/2 - remainder) >> 31 & 1
100 101 // quotient might have overflowed at this point, so reduce it by masking.
102 var mask uint32 = (1 << d) - 1
103 return uint16(quotient & mask)
104 }
105 106 // decompress maps a number x between 0 and 2ᵈ-1 uniformly to the full range of
107 // field elements, according to FIPS 203, Definition 4.8.
108 func decompress(y uint16, d uint8) fieldElement {
109 // We want to compute (y * q) / 2ᵈ, rounded to nearest integer, with 1/2
110 // rounding up (see FIPS 203, Section 2.3).
111 112 dividend := uint32(y) * q
113 quotient := dividend >> d // (y * q) / 2ᵈ
114 115 // The d'th least-significant bit of the dividend (the most significant bit
116 // of the remainder) is 1 for the top half of the values that divide to the
117 // same quotient, which are the ones that round up.
118 quotient += dividend >> (d - 1) & 1
119 120 // quotient is at most (2¹¹-1) * q / 2¹¹ + 1 = 3328, so it didn't overflow.
121 return fieldElement(quotient)
122 }
123 124 // ringElement is a polynomial, an element of R_q, represented as an array
125 // according to FIPS 203, Section 2.4.4.
126 type ringElement [n]fieldElement
127 128 // polyAdd adds two ringElements or nttElements.
129 func polyAdd[T ~[n]fieldElement](a, b T) (s T) {
130 for i := range s {
131 s[i] = fieldAdd(a[i], b[i])
132 }
133 return s
134 }
135 136 // polySub subtracts two ringElements or nttElements.
137 func polySub[T ~[n]fieldElement](a, b T) (s T) {
138 for i := range s {
139 s[i] = fieldSub(a[i], b[i])
140 }
141 return s
142 }
143 144 // polyByteEncode appends the 384-byte encoding of f to b.
145 //
146 // It implements ByteEncode₁₂, according to FIPS 203, Algorithm 5.
147 func polyByteEncode[T ~[n]fieldElement](b []byte, f T) []byte {
148 out, B := sliceForAppend(b, encodingSize12)
149 for i := 0; i < n; i += 2 {
150 x := uint32(f[i]) | uint32(f[i+1])<<12
151 B[0] = uint8(x)
152 B[1] = uint8(x >> 8)
153 B[2] = uint8(x >> 16)
154 B = B[3:]
155 }
156 return out
157 }
158 159 // polyByteDecode decodes the 384-byte encoding of a polynomial, checking that
160 // all the coefficients are properly reduced. This fulfills the "Modulus check"
161 // step of ML-KEM Encapsulation.
162 //
163 // It implements ByteDecode₁₂, according to FIPS 203, Algorithm 6.
164 func polyByteDecode[T ~[n]fieldElement](b []byte) (T, error) {
165 if len(b) != encodingSize12 {
166 return T{}, errors.New("mlkem: invalid encoding length")
167 }
168 var f T
169 for i := 0; i < n; i += 2 {
170 d := uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16
171 const mask12 = 0b1111_1111_1111
172 var err error
173 if f[i], err = fieldCheckReduced(uint16(d & mask12)); err != nil {
174 return T{}, errors.New("mlkem: invalid polynomial encoding")
175 }
176 if f[i+1], err = fieldCheckReduced(uint16(d >> 12)); err != nil {
177 return T{}, errors.New("mlkem: invalid polynomial encoding")
178 }
179 b = b[3:]
180 }
181 return f, nil
182 }
183 184 // sliceForAppend takes a slice and a requested number of bytes. It returns a
185 // slice with the contents of the given slice followed by that many bytes and a
186 // second slice that aliases into it and contains only the extra bytes. If the
187 // original slice has sufficient capacity then no allocation is performed.
188 func sliceForAppend(in []byte, n int) (head, tail []byte) {
189 if total := len(in) + n; cap(in) >= total {
190 head = in[:total]
191 } else {
192 head = []byte{:total}
193 copy(head, in)
194 }
195 tail = head[len(in):]
196 return
197 }
198 199 // ringCompressAndEncode1 appends a 32-byte encoding of a ring element to s,
200 // compressing one coefficients per bit.
201 //
202 // It implements Compress₁, according to FIPS 203, Definition 4.7,
203 // followed by ByteEncode₁, according to FIPS 203, Algorithm 5.
204 func ringCompressAndEncode1(s []byte, f ringElement) []byte {
205 s, b := sliceForAppend(s, encodingSize1)
206 for i := range b {
207 b[i] = 0
208 }
209 for i := range f {
210 b[i/8] |= uint8(compress(f[i], 1) << (i % 8))
211 }
212 return s
213 }
214 215 // ringDecodeAndDecompress1 decodes a 32-byte slice to a ring element where each
216 // bit is mapped to 0 or ⌈q/2⌋.
217 //
218 // It implements ByteDecode₁, according to FIPS 203, Algorithm 6,
219 // followed by Decompress₁, according to FIPS 203, Definition 4.8.
220 func ringDecodeAndDecompress1(b *[encodingSize1]byte) ringElement {
221 var f ringElement
222 for i := range f {
223 b_i := b[i/8] >> (i % 8) & 1
224 const halfQ = (q + 1) / 2 // ⌈q/2⌋, rounded up per FIPS 203, Section 2.3
225 f[i] = fieldElement(b_i) * halfQ // 0 decompresses to 0, and 1 to ⌈q/2⌋
226 }
227 return f
228 }
229 230 // ringCompressAndEncode4 appends a 128-byte encoding of a ring element to s,
231 // compressing two coefficients per byte.
232 //
233 // It implements Compress₄, according to FIPS 203, Definition 4.7,
234 // followed by ByteEncode₄, according to FIPS 203, Algorithm 5.
235 func ringCompressAndEncode4(s []byte, f ringElement) []byte {
236 s, b := sliceForAppend(s, encodingSize4)
237 for i := 0; i < n; i += 2 {
238 b[i/2] = uint8(compress(f[i], 4) | compress(f[i+1], 4)<<4)
239 }
240 return s
241 }
242 243 // ringDecodeAndDecompress4 decodes a 128-byte encoding of a ring element where
244 // each four bits are mapped to an equidistant distribution.
245 //
246 // It implements ByteDecode₄, according to FIPS 203, Algorithm 6,
247 // followed by Decompress₄, according to FIPS 203, Definition 4.8.
248 func ringDecodeAndDecompress4(b *[encodingSize4]byte) ringElement {
249 var f ringElement
250 for i := 0; i < n; i += 2 {
251 f[i] = fieldElement(decompress(uint16(b[i/2]&0b1111), 4))
252 f[i+1] = fieldElement(decompress(uint16(b[i/2]>>4), 4))
253 }
254 return f
255 }
256 257 // ringCompressAndEncode10 appends a 320-byte encoding of a ring element to s,
258 // compressing four coefficients per five bytes.
259 //
260 // It implements Compress₁₀, according to FIPS 203, Definition 4.7,
261 // followed by ByteEncode₁₀, according to FIPS 203, Algorithm 5.
262 func ringCompressAndEncode10(s []byte, f ringElement) []byte {
263 s, b := sliceForAppend(s, encodingSize10)
264 for i := 0; i < n; i += 4 {
265 var x uint64
266 x |= uint64(compress(f[i], 10))
267 x |= uint64(compress(f[i+1], 10)) << 10
268 x |= uint64(compress(f[i+2], 10)) << 20
269 x |= uint64(compress(f[i+3], 10)) << 30
270 b[0] = uint8(x)
271 b[1] = uint8(x >> 8)
272 b[2] = uint8(x >> 16)
273 b[3] = uint8(x >> 24)
274 b[4] = uint8(x >> 32)
275 b = b[5:]
276 }
277 return s
278 }
279 280 // ringDecodeAndDecompress10 decodes a 320-byte encoding of a ring element where
281 // each ten bits are mapped to an equidistant distribution.
282 //
283 // It implements ByteDecode₁₀, according to FIPS 203, Algorithm 6,
284 // followed by Decompress₁₀, according to FIPS 203, Definition 4.8.
285 func ringDecodeAndDecompress10(bb *[encodingSize10]byte) ringElement {
286 b := bb[:]
287 var f ringElement
288 for i := 0; i < n; i += 4 {
289 x := uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32
290 b = b[5:]
291 f[i] = fieldElement(decompress(uint16(x>>0&0b11_1111_1111), 10))
292 f[i+1] = fieldElement(decompress(uint16(x>>10&0b11_1111_1111), 10))
293 f[i+2] = fieldElement(decompress(uint16(x>>20&0b11_1111_1111), 10))
294 f[i+3] = fieldElement(decompress(uint16(x>>30&0b11_1111_1111), 10))
295 }
296 return f
297 }
298 299 // ringCompressAndEncode appends an encoding of a ring element to s,
300 // compressing each coefficient to d bits.
301 //
302 // It implements Compress, according to FIPS 203, Definition 4.7,
303 // followed by ByteEncode, according to FIPS 203, Algorithm 5.
304 func ringCompressAndEncode(s []byte, f ringElement, d uint8) []byte {
305 var b byte
306 var bIdx uint8
307 for i := 0; i < n; i++ {
308 c := compress(f[i], d)
309 var cIdx uint8
310 for cIdx < d {
311 b |= byte(c>>cIdx) << bIdx
312 bits := min(8-bIdx, d-cIdx)
313 bIdx += bits
314 cIdx += bits
315 if bIdx == 8 {
316 s = append(s, b)
317 b = 0
318 bIdx = 0
319 }
320 }
321 }
322 if bIdx != 0 {
323 panic("mlkem: internal error: bitsFilled != 0")
324 }
325 return s
326 }
327 328 // ringDecodeAndDecompress decodes an encoding of a ring element where
329 // each d bits are mapped to an equidistant distribution.
330 //
331 // It implements ByteDecode, according to FIPS 203, Algorithm 6,
332 // followed by Decompress, according to FIPS 203, Definition 4.8.
333 func ringDecodeAndDecompress(b []byte, d uint8) ringElement {
334 var f ringElement
335 var bIdx uint8
336 for i := 0; i < n; i++ {
337 var c uint16
338 var cIdx uint8
339 for cIdx < d {
340 c |= uint16(b[0]>>bIdx) << cIdx
341 c &= (1 << d) - 1
342 bits := min(8-bIdx, d-cIdx)
343 bIdx += bits
344 cIdx += bits
345 if bIdx == 8 {
346 b = b[1:]
347 bIdx = 0
348 }
349 }
350 f[i] = fieldElement(decompress(c, d))
351 }
352 if len(b) != 0 {
353 panic("mlkem: internal error: leftover bytes")
354 }
355 return f
356 }
357 358 // ringCompressAndEncode5 appends a 160-byte encoding of a ring element to s,
359 // compressing eight coefficients per five bytes.
360 //
361 // It implements Compress₅, according to FIPS 203, Definition 4.7,
362 // followed by ByteEncode₅, according to FIPS 203, Algorithm 5.
363 func ringCompressAndEncode5(s []byte, f ringElement) []byte {
364 return ringCompressAndEncode(s, f, 5)
365 }
366 367 // ringDecodeAndDecompress5 decodes a 160-byte encoding of a ring element where
368 // each five bits are mapped to an equidistant distribution.
369 //
370 // It implements ByteDecode₅, according to FIPS 203, Algorithm 6,
371 // followed by Decompress₅, according to FIPS 203, Definition 4.8.
372 func ringDecodeAndDecompress5(bb *[encodingSize5]byte) ringElement {
373 return ringDecodeAndDecompress(bb[:], 5)
374 }
375 376 // ringCompressAndEncode11 appends a 352-byte encoding of a ring element to s,
377 // compressing eight coefficients per eleven bytes.
378 //
379 // It implements Compress₁₁, according to FIPS 203, Definition 4.7,
380 // followed by ByteEncode₁₁, according to FIPS 203, Algorithm 5.
381 func ringCompressAndEncode11(s []byte, f ringElement) []byte {
382 return ringCompressAndEncode(s, f, 11)
383 }
384 385 // ringDecodeAndDecompress11 decodes a 352-byte encoding of a ring element where
386 // each eleven bits are mapped to an equidistant distribution.
387 //
388 // It implements ByteDecode₁₁, according to FIPS 203, Algorithm 6,
389 // followed by Decompress₁₁, according to FIPS 203, Definition 4.8.
390 func ringDecodeAndDecompress11(bb *[encodingSize11]byte) ringElement {
391 return ringDecodeAndDecompress(bb[:], 11)
392 }
393 394 // samplePolyCBD draws a ringElement from the special Dη distribution given a
395 // stream of random bytes generated by the PRF function, according to FIPS 203,
396 // Algorithm 8 and Definition 4.3.
397 func samplePolyCBD(s []byte, b byte) ringElement {
398 prf := sha3.NewShake256()
399 prf.Write(s)
400 prf.Write([]byte{b})
401 B := []byte{:64*2} // η = 2
402 prf.Read(B)
403 404 // SamplePolyCBD simply draws four (2η) bits for each coefficient, and adds
405 // the first two and subtracts the last two.
406 407 var f ringElement
408 for i := 0; i < n; i += 2 {
409 b := B[i/2]
410 b_7, b_6, b_5, b_4 := b>>7, b>>6&1, b>>5&1, b>>4&1
411 b_3, b_2, b_1, b_0 := b>>3&1, b>>2&1, b>>1&1, b&1
412 f[i] = fieldSub(fieldElement(b_0+b_1), fieldElement(b_2+b_3))
413 f[i+1] = fieldSub(fieldElement(b_4+b_5), fieldElement(b_6+b_7))
414 }
415 return f
416 }
417 418 // nttElement is an NTT representation, an element of T_q, represented as an
419 // array according to FIPS 203, Section 2.4.4.
420 type nttElement [n]fieldElement
421 422 // gammas are the values ζ^2BitRev7(i)+1 mod q for each index i, according to
423 // FIPS 203, Appendix A (with negative values reduced to positive).
424 var gammas = [128]fieldElement{17, 3312, 2761, 568, 583, 2746, 2649, 680, 1637, 1692, 723, 2606, 2288, 1041, 1100, 2229, 1409, 1920, 2662, 667, 3281, 48, 233, 3096, 756, 2573, 2156, 1173, 3015, 314, 3050, 279, 1703, 1626, 1651, 1678, 2789, 540, 1789, 1540, 1847, 1482, 952, 2377, 1461, 1868, 2687, 642, 939, 2390, 2308, 1021, 2437, 892, 2388, 941, 733, 2596, 2337, 992, 268, 3061, 641, 2688, 1584, 1745, 2298, 1031, 2037, 1292, 3220, 109, 375, 2954, 2549, 780, 2090, 1239, 1645, 1684, 1063, 2266, 319, 3010, 2773, 556, 757, 2572, 2099, 1230, 561, 2768, 2466, 863, 2594, 735, 2804, 525, 1092, 2237, 403, 2926, 1026, 2303, 1143, 2186, 2150, 1179, 2775, 554, 886, 2443, 1722, 1607, 1212, 2117, 1874, 1455, 1029, 2300, 2110, 1219, 2935, 394, 885, 2444, 2154, 1175}
425 426 // nttMul multiplies two nttElements.
427 //
428 // It implements MultiplyNTTs, according to FIPS 203, Algorithm 11.
429 func nttMul(f, g nttElement) nttElement {
430 var h nttElement
431 // We use i += 2 for bounds check elimination. See https://go.dev/issue/66826.
432 for i := 0; i < 256; i += 2 {
433 a0, a1 := f[i], f[i+1]
434 b0, b1 := g[i], g[i+1]
435 h[i] = fieldAddMul(a0, b0, fieldMul(a1, b1), gammas[i/2])
436 h[i+1] = fieldAddMul(a0, b1, a1, b0)
437 }
438 return h
439 }
440 441 // zetas are the values ζ^BitRev7(k) mod q for each index k, according to FIPS
442 // 203, Appendix A.
443 var zetas = [128]fieldElement{1, 1729, 2580, 3289, 2642, 630, 1897, 848, 1062, 1919, 193, 797, 2786, 3260, 569, 1746, 296, 2447, 1339, 1476, 3046, 56, 2240, 1333, 1426, 2094, 535, 2882, 2393, 2879, 1974, 821, 289, 331, 3253, 1756, 1197, 2304, 2277, 2055, 650, 1977, 2513, 632, 2865, 33, 1320, 1915, 2319, 1435, 807, 452, 1438, 2868, 1534, 2402, 2647, 2617, 1481, 648, 2474, 3110, 1227, 910, 17, 2761, 583, 2649, 1637, 723, 2288, 1100, 1409, 2662, 3281, 233, 756, 2156, 3015, 3050, 1703, 1651, 2789, 1789, 1847, 952, 1461, 2687, 939, 2308, 2437, 2388, 733, 2337, 268, 641, 1584, 2298, 2037, 3220, 375, 2549, 2090, 1645, 1063, 319, 2773, 757, 2099, 561, 2466, 2594, 2804, 1092, 403, 1026, 1143, 2150, 2775, 886, 1722, 1212, 1874, 1029, 2110, 2935, 885, 2154}
444 445 // ntt maps a ringElement to its nttElement representation.
446 //
447 // It implements NTT, according to FIPS 203, Algorithm 9.
448 func ntt(f ringElement) nttElement {
449 k := 1
450 for len := 128; len >= 2; len /= 2 {
451 for start := 0; start < 256; start += 2 * len {
452 zeta := zetas[k]
453 k++
454 // Bounds check elimination hint.
455 f, flen := f[start:start+len], f[start+len:start+len+len]
456 for j := 0; j < len; j++ {
457 t := fieldMul(zeta, flen[j])
458 flen[j] = fieldSub(f[j], t)
459 f[j] = fieldAdd(f[j], t)
460 }
461 }
462 }
463 return nttElement(f)
464 }
465 466 // inverseNTT maps a nttElement back to the ringElement it represents.
467 //
468 // It implements NTT⁻¹, according to FIPS 203, Algorithm 10.
469 func inverseNTT(f nttElement) ringElement {
470 k := 127
471 for len := 2; len <= 128; len *= 2 {
472 for start := 0; start < 256; start += 2 * len {
473 zeta := zetas[k]
474 k--
475 // Bounds check elimination hint.
476 f, flen := f[start:start+len], f[start+len:start+len+len]
477 for j := 0; j < len; j++ {
478 t := f[j]
479 f[j] = fieldAdd(t, flen[j])
480 flen[j] = fieldMulSub(zeta, flen[j], t)
481 }
482 }
483 }
484 for i := range f {
485 f[i] = fieldMul(f[i], 3303) // 3303 = 128⁻¹ mod q
486 }
487 return ringElement(f)
488 }
489 490 // sampleNTT draws a uniformly random nttElement from a stream of uniformly
491 // random bytes generated by the XOF function, according to FIPS 203,
492 // Algorithm 7.
493 func sampleNTT(rho []byte, ii, jj byte) nttElement {
494 B := sha3.NewShake128()
495 B.Write(rho)
496 B.Write([]byte{ii, jj})
497 498 // SampleNTT essentially draws 12 bits at a time from r, interprets them in
499 // little-endian, and rejects values higher than q, until it drew 256
500 // values. (The rejection rate is approximately 19%.)
501 //
502 // To do this from a bytes stream, it draws three bytes at a time, and
503 // splits them into two uint16 appropriately masked.
504 //
505 // r₀ r₁ r₂
506 // |- - - - - - - -|- - - - - - - -|- - - - - - - -|
507 //
508 // Uint16(r₀ || r₁)
509 // |- - - - - - - - - - - - - - - -|
510 // |- - - - - - - - - - - -|
511 // d₁
512 //
513 // Uint16(r₁ || r₂)
514 // |- - - - - - - - - - - - - - - -|
515 // |- - - - - - - - - - - -|
516 // d₂
517 //
518 // Note that in little-endian, the rightmost bits are the most significant
519 // bits (dropped with a mask) and the leftmost bits are the least
520 // significant bits (dropped with a right shift).
521 522 var a nttElement
523 var j int // index into a
524 var buf [24]byte // buffered reads from B
525 off := len(buf) // index into buf, starts in a "buffer fully consumed" state
526 for {
527 if off >= len(buf) {
528 B.Read(buf[:])
529 off = 0
530 }
531 d1 := byteorder.LEUint16(buf[off:]) & 0b1111_1111_1111
532 d2 := byteorder.LEUint16(buf[off+1:]) >> 4
533 off += 3
534 if d1 < q {
535 a[j] = fieldElement(d1)
536 j++
537 }
538 if j >= len(a) {
539 break
540 }
541 if d2 < q {
542 a[j] = fieldElement(d2)
543 j++
544 }
545 if j >= len(a) {
546 break
547 }
548 }
549 return a
550 }
551