amd64.go raw
1 //go:build amd64 && !purego
2 // +build amd64,!purego
3
4 package common
5
6 import (
7 "golang.org/x/sys/cpu"
8 )
9
10 // ZetasAVX2 contains all ζ used in the NTT (like the Zetas array), but also
11 // the values int16(zeta * 62209) for each zeta, which are used in
12 // Montgomery reduction. There is some duplication and reordering as
13 // compared to Zetas to make it more convenient for use with AVX2.
//
// Offsets mentioned in the comments below are counted in int16 elements.
//
// NOTE(review): 62209 looks like q⁻¹ mod 2¹⁶ for q = 3329 — confirm against
// the package's modulus constant.
// NOTE(review): presumably the AVX2 assembly reads this table at fixed
// offsets, so entries must not be reordered or removed — confirm against the
// assembly sources.
14 var ZetasAVX2 = [...]int16{
15 // layer 1: int16(Zetas[1]*62209) and Zetas[1]
16 31499, 2571,
17
18 // layer 2
19 //
20 // int16(Zetas[2]*62209), Zetas[2], int16(Zetas[3]*62209), Zetas[3]
21 14746, 2970, 788, 1812,
22
23 // layer 3, like layer 2: four (int16(zeta*62209), zeta) pairs.
24 13525, 1493, -12402, 1422, 28191, 287, -16694, 202,
25
26 0, 0, // padding: rounds the 14 values above up to 16, so layer 4 starts at 1*16
27
28 // layer 4. offset: 1*16
29 //
30 // The precomputed multiplication and zetas are grouped by 16 at a
31 // time as used in the set of butterflies, etc.
//
// Every value is repeated 8 times (one row). Each run of four rows holds
// int16(zeta*62209) for two zetas, followed by those two zetas themselves.
32 -20906, -20906, -20906, -20906, -20906, -20906, -20906, -20906,
33 27758, 27758, 27758, 27758, 27758, 27758, 27758, 27758,
34 3158, 3158, 3158, 3158, 3158, 3158, 3158, 3158,
35 622, 622, 622, 622, 622, 622, 622, 622,
36 -3799, -3799, -3799, -3799, -3799, -3799, -3799, -3799,
37 -15690, -15690, -15690, -15690, -15690, -15690, -15690, -15690,
38 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577,
39 182, 182, 182, 182, 182, 182, 182, 182,
40 10690, 10690, 10690, 10690, 10690, 10690, 10690, 10690,
41 1359, 1359, 1359, 1359, 1359, 1359, 1359, 1359,
42 962, 962, 962, 962, 962, 962, 962, 962,
43 2127, 2127, 2127, 2127, 2127, 2127, 2127, 2127,
44 -11201, -11201, -11201, -11201, -11201, -11201, -11201, -11201,
45 31164, 31164, 31164, 31164, 31164, 31164, 31164, 31164,
46 1855, 1855, 1855, 1855, 1855, 1855, 1855, 1855,
47 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468,
48
49 // layer 5. offset: 9*16. Same row layout as layer 4, each value repeated 4 times.
50 -5827, -5827, -5827, -5827, 17364, 17364, 17364, 17364,
51 -26360, -26360, -26360, -26360, -29057, -29057, -29057, -29057,
52 573, 573, 573, 573, 2004, 2004, 2004, 2004,
53 264, 264, 264, 264, 383, 383, 383, 383,
54 5572, 5572, 5572, 5572, -1102, -1102, -1102, -1102,
55 21439, 21439, 21439, 21439, -26241, -26241, -26241, -26241,
56 2500, 2500, 2500, 2500, 1458, 1458, 1458, 1458,
57 1727, 1727, 1727, 1727, 3199, 3199, 3199, 3199,
58 -28072, -28072, -28072, -28072, 24313, 24313, 24313, 24313,
59 -10532, -10532, -10532, -10532, 8800, 8800, 8800, 8800,
60 2648, 2648, 2648, 2648, 1017, 1017, 1017, 1017,
61 732, 732, 732, 732, 608, 608, 608, 608,
62 18427, 18427, 18427, 18427, 8859, 8859, 8859, 8859,
63 26676, 26676, 26676, 26676, -16162, -16162, -16162, -16162,
64 1787, 1787, 1787, 1787, 411, 411, 411, 411,
65 3124, 3124, 3124, 3124, 1758, 1758, 1758, 1758,
66
67 // layer 6. offset: 17*16. Same row layout as layer 4, each value repeated twice.
68 -5689, -5689, -6516, -6516, 1497, 1497, 30967, 30967,
69 -23564, -23564, 20179, 20179, 20711, 20711, 25081, 25081,
70 1223, 1223, 652, 652, 2777, 2777, 1015, 1015,
71 2036, 2036, 1491, 1491, 3047, 3047, 1785, 1785,
72 -12796, -12796, 26617, 26617, 16065, 16065, -12441, -12441,
73 9135, 9135, -649, -649, -25986, -25986, 27837, 27837,
74 516, 516, 3321, 3321, 3009, 3009, 2663, 2663,
75 1711, 1711, 2167, 2167, 126, 126, 1469, 1469,
76 19884, 19884, -28249, -28249, -15886, -15886, -8898, -8898,
77 -28309, -28309, 9076, 9076, -30198, -30198, 18250, 18250,
78 2476, 2476, 3239, 3239, 3058, 3058, 830, 830,
79 107, 107, 1908, 1908, 3082, 3082, 2378, 2378,
80 13427, 13427, 14017, 14017, -29155, -29155, -12756, -12756,
81 16832, 16832, 4312, 4312, -24155, -24155, -17914, -17914,
82 2931, 2931, 961, 961, 1821, 1821, 2604, 2604,
83 448, 448, 2264, 2264, 677, 677, 2054, 2054,
84
85 // layer 7. offset: 25*16. Same row layout as layer 4, each value appears once.
86 -334, 11182, -11477, 13387, -32226, -14233, 20494, -21655,
87 -27738, 13131, 945, -4586, -14882, 23093, 6182, 5493,
88 2226, 430, 555, 843, 2078, 871, 1550, 105,
89 422, 587, 177, 3094, 3038, 2869, 1574, 1653,
90 32011, -32502, 10631, 30318, 29176, -18741, -28761, 12639,
91 -18485, 20100, 17561, 18525, -14430, 19529, -5275, -12618,
92 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119,
93 1739, 644, 2457, 349, 418, 329, 3173, 3254,
94 -31183, 20297, 25435, 2146, -7382, 15356, 24392, -32384,
95 -20926, -6279, 10946, -14902, 24215, -11044, 16990, 14470,
96 817, 1097, 603, 610, 1322, 2044, 1864, 384,
97 2114, 3193, 1218, 1994, 2455, 220, 2142, 1670,
98 10336, -21497, -7933, -20198, -22501, 23211, 10907, -17442,
99 31637, -23859, 28644, -20257, 23998, 7757, -17422, 23132,
100 2144, 1799, 2051, 794, 1819, 2475, 2459, 478,
101 3221, 3021, 996, 991, 958, 1869, 1522, 1628,
102
103 // layer 1 inverse. offset: 33*16. Mirrors the layer 7 rows above with the zetas in reverse order.
104 23132, -17422, 7757, 23998, -20257, 28644, -23859, 31637,
105 -17442, 10907, 23211, -22501, -20198, -7933, -21497, 10336,
106 1628, 1522, 1869, 958, 991, 996, 3021, 3221,
107 478, 2459, 2475, 1819, 794, 2051, 1799, 2144,
108 14470, 16990, -11044, 24215, -14902, 10946, -6279, -20926,
109 -32384, 24392, 15356, -7382, 2146, 25435, 20297, -31183,
110 1670, 2142, 220, 2455, 1994, 1218, 3193, 2114,
111 384, 1864, 2044, 1322, 610, 603, 1097, 817,
112 -12618, -5275, 19529, -14430, 18525, 17561, 20100, -18485,
113 12639, -28761, -18741, 29176, 30318, 10631, -32502, 32011,
114 3254, 3173, 329, 418, 349, 2457, 644, 1739,
115 1119, 2727, 1483, 2552, 3182, 1159, 778, 3083,
116 5493, 6182, 23093, -14882, -4586, 945, 13131, -27738,
117 -21655, 20494, -14233, -32226, 13387, -11477, 11182, -334,
118 1653, 1574, 2869, 3038, 3094, 177, 587, 422,
119 105, 1550, 871, 2078, 843, 555, 430, 2226,
120
121 // layer 2 inverse. offset: 41*16. Mirrors layer 6 in reverse order.
122 -17914, -17914, -24155, -24155, 4312, 4312, 16832, 16832,
123 -12756, -12756, -29155, -29155, 14017, 14017, 13427, 13427,
124 2054, 2054, 677, 677, 2264, 2264, 448, 448,
125 2604, 2604, 1821, 1821, 961, 961, 2931, 2931,
126 18250, 18250, -30198, -30198, 9076, 9076, -28309, -28309,
127 -8898, -8898, -15886, -15886, -28249, -28249, 19884, 19884,
128 2378, 2378, 3082, 3082, 1908, 1908, 107, 107,
129 830, 830, 3058, 3058, 3239, 3239, 2476, 2476,
130 27837, 27837, -25986, -25986, -649, -649, 9135, 9135,
131 -12441, -12441, 16065, 16065, 26617, 26617, -12796, -12796,
132 1469, 1469, 126, 126, 2167, 2167, 1711, 1711,
133 2663, 2663, 3009, 3009, 3321, 3321, 516, 516,
134 25081, 25081, 20711, 20711, 20179, 20179, -23564, -23564,
135 30967, 30967, 1497, 1497, -6516, -6516, -5689, -5689,
136 1785, 1785, 3047, 3047, 1491, 1491, 2036, 2036,
137 1015, 1015, 2777, 2777, 652, 652, 1223, 1223,
138
139 // layer 3 inverse. offset: 49*16. Mirrors layer 5 in reverse order.
140 -16162, -16162, -16162, -16162, 26676, 26676, 26676, 26676,
141 8859, 8859, 8859, 8859, 18427, 18427, 18427, 18427,
142 1758, 1758, 1758, 1758, 3124, 3124, 3124, 3124,
143 411, 411, 411, 411, 1787, 1787, 1787, 1787,
144 8800, 8800, 8800, 8800, -10532, -10532, -10532, -10532,
145 24313, 24313, 24313, 24313, -28072, -28072, -28072, -28072,
146 608, 608, 608, 608, 732, 732, 732, 732,
147 1017, 1017, 1017, 1017, 2648, 2648, 2648, 2648,
148 -26241, -26241, -26241, -26241, 21439, 21439, 21439, 21439,
149 -1102, -1102, -1102, -1102, 5572, 5572, 5572, 5572,
150 3199, 3199, 3199, 3199, 1727, 1727, 1727, 1727,
151 1458, 1458, 1458, 1458, 2500, 2500, 2500, 2500,
152 -29057, -29057, -29057, -29057, -26360, -26360, -26360, -26360,
153 17364, 17364, 17364, 17364, -5827, -5827, -5827, -5827,
154 383, 383, 383, 383, 264, 264, 264, 264,
155 2004, 2004, 2004, 2004, 573, 573, 573, 573,
156
157 // layer 4 inverse. offset: 57*16. Mirrors layer 4 in reverse order.
158 31164, 31164, 31164, 31164, 31164, 31164, 31164, 31164,
159 -11201, -11201, -11201, -11201, -11201, -11201, -11201, -11201,
160 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468,
161 1855, 1855, 1855, 1855, 1855, 1855, 1855, 1855,
162 1359, 1359, 1359, 1359, 1359, 1359, 1359, 1359,
163 10690, 10690, 10690, 10690, 10690, 10690, 10690, 10690,
164 2127, 2127, 2127, 2127, 2127, 2127, 2127, 2127,
165 962, 962, 962, 962, 962, 962, 962, 962,
166 -15690, -15690, -15690, -15690, -15690, -15690, -15690, -15690,
167 -3799, -3799, -3799, -3799, -3799, -3799, -3799, -3799,
168 182, 182, 182, 182, 182, 182, 182, 182,
169 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577,
170 27758, 27758, 27758, 27758, 27758, 27758, 27758, 27758,
171 -20906, -20906, -20906, -20906, -20906, -20906, -20906, -20906,
172 622, 622, 622, 622, 622, 622, 622, 622,
173 3158, 3158, 3158, 3158, 3158, 3158, 3158, 3158,
174
175 // layer 5 inverse. offset: 65*16. The layer 3 (int16(zeta*62209), zeta) pairs, last pair first.
176 -16694, 202, 28191, 287, -12402, 1422, 13525, 1493,
177
178 // layer 6 inverse: the layer 2 pairs, last pair first.
179 788, 1812, 14746, 2970,
180
181 // layer 7 inverse: the layer 1 pair.
182 31499, 2571,
183 }
184
185 // Sets p to a + b. Does not normalize coefficients.
186 func (p *Poly) Add(a, b *Poly) {
187 if cpu.X86.HasAVX2 {
188 addAVX2(
189 (*[N]int16)(p),
190 (*[N]int16)(a),
191 (*[N]int16)(b),
192 )
193 } else {
194 p.addGeneric(a, b)
195 }
196 }
197
198 // Sets p to a - b. Does not normalize coefficients.
199 func (p *Poly) Sub(a, b *Poly) {
200 if cpu.X86.HasAVX2 {
201 subAVX2(
202 (*[N]int16)(p),
203 (*[N]int16)(a),
204 (*[N]int16)(b),
205 )
206 } else {
207 p.subGeneric(a, b)
208 }
209 }
210
211 // Executes an in-place forward "NTT" on p.
212 //
213 // Assumes the coefficients are in absolute value ≤q. The resulting
214 // coefficients are in absolute value ≤7q. If the input is in Montgomery
215 // form, then the result is in Montgomery form and so (by linearity of the NTT)
216 // if the input is in regular form, then the result is also in regular form.
217 // The order of coefficients will be "tangled". These can be put back into
218 // their proper order by calling Detangle().
219 func (p *Poly) NTT() {
220 if cpu.X86.HasAVX2 {
221 nttAVX2((*[N]int16)(p))
222 } else {
223 p.nttGeneric()
224 }
225 }
226
227 // Executes an in-place inverse "NTT" on p and multiply by the Montgomery
228 // factor R.
229 //
230 // Requires coefficients to be in "tangled" order, see Tangle().
231 // Assumes the coefficients are in absolute value ≤q. The resulting
232 // coefficients are in absolute value ≤q. If the input is in Montgomery
233 // form, then the result is in Montgomery form and so (by linearity)
234 // if the input is in regular form, then the result is also in regular form.
235 func (p *Poly) InvNTT() {
236 if cpu.X86.HasAVX2 {
237 invNttAVX2((*[N]int16)(p))
238 } else {
239 p.invNTTGeneric()
240 }
241 }
242
243 // Sets p to the "pointwise" multiplication of a and b.
244 //
245 // That is: InvNTT(p) = InvNTT(a) * InvNTT(b). Assumes a and b are in
246 // Montgomery form. Products between coefficients of a and b must be strictly
247 // bounded in absolute value by 2¹⁵q. p will be in Montgomery form and
248 // bounded in absolute value by 2q.
249 //
250 // Requires a and b to be in "tangled" order, see Tangle(). p will be in
251 // tangled order as well.
252 func (p *Poly) MulHat(a, b *Poly) {
253 if cpu.X86.HasAVX2 {
254 mulHatAVX2(
255 (*[N]int16)(p),
256 (*[N]int16)(a),
257 (*[N]int16)(b),
258 )
259 } else {
260 p.mulHatGeneric(a, b)
261 }
262 }
263
264 // Puts p into the right form to be used with (among others) InvNTT().
265 func (p *Poly) Tangle() {
266 if cpu.X86.HasAVX2 {
267 tangleAVX2((*[N]int16)(p))
268 }
269
270 // When AVX2 is not available, we use the standard order.
271 }
272
273 // Puts p back into standard form.
274 func (p *Poly) Detangle() {
275 if cpu.X86.HasAVX2 {
276 detangleAVX2((*[N]int16)(p))
277 }
278
279 // When AVX2 is not available, we use the standard order.
280 }
281
282 // Almost normalizes coefficients.
283 //
284 // Ensures each coefficient is in {0, …, q}.
285 func (p *Poly) BarrettReduce() {
286 if cpu.X86.HasAVX2 {
287 barrettReduceAVX2((*[N]int16)(p))
288 } else {
289 p.barrettReduceGeneric()
290 }
291 }
292
293 // Normalizes coefficients.
294 //
295 // Ensures each coefficient is in {0, …, q-1}.
296 func (p *Poly) Normalize() {
297 if cpu.X86.HasAVX2 {
298 normalizeAVX2((*[N]int16)(p))
299 } else {
300 p.normalizeGeneric()
301 }
302 }
303