1 // Package keccakf1600 provides a two and four-way Keccak-f[1600] permutation in parallel.
2 //
3 // Keccak-f[1600] is the permutation underlying several algorithms such as
4 // Keccak, SHA3 and SHAKE. Running two or four permutations in parallel is
5 // useful in some scenarios like in hash-based signatures.
6 //
7 // # Limitations
8 //
9 // Note that not all the architectures support SIMD instructions. This package
10 // uses AVX2 instructions that are available in some AMD64 architectures
11 // and NEON instructions that are available in some ARM64 architectures.
12 //
13 // For those systems not supporting these, the package still provides the
14 // expected functionality by means of a generic and slow implementation.
// It is recommended to check IsEnabledX4() and IsEnabledX2() beforehand
// to determine whether the current system supports the SIMD implementation.
17 package keccakf1600
18 19 import (
20 "runtime"
21 "unsafe"
22 23 "github.com/cloudflare/circl/internal/sha3"
24 "golang.org/x/sys/cpu"
25 )
// StateX4 holds the working memory of the four-way permutation: four
// interleaved [25]uint64 states plus the bookkeeping needed to align them.
// Call Initialize() before use; it sets the state up and returns the
// interleaved buffer to operate on.
type StateX4 struct {
	// a backs the interleaved states. Go only guarantees 8-byte alignment
	// for it, whereas a 32-byte aligned start is wanted for best
	// performance. The array therefore has room for the 4 x 25 state words
	// plus three spare uint64s, so the start of the state can be moved
	// forward to an aligned position.
	a [4*25 + 3]uint64

	// offset is the index into a at which the 32-byte aligned state begins.
	offset int

	// turbo, if true, makes Permute use 12-round Keccak instead of the
	// usual 24-round Keccak.
	turbo bool
}
// StateX2 holds the working memory of the two-way permutation: two
// interleaved [25]uint64 states plus the bookkeeping needed to align them.
// Call Initialize() before use; it sets the state up and returns the
// interleaved buffer to operate on.
type StateX2 struct {
	// a backs the interleaved states. Go only guarantees 8-byte alignment
	// for it, whereas a 32-byte aligned start is wanted for best
	// performance. The array therefore has room for the 2 x 25 state words
	// plus three spare uint64s, so the start of the state can be moved
	// forward to an aligned position.
	a [2*25 + 3]uint64

	// offset is the index into a at which the 32-byte aligned state begins.
	offset int

	// turbo, if true, makes Permute use 12-round Keccak instead of the
	// usual 24-round Keccak.
	turbo bool
}
64 65 // IsEnabledX4 returns true if the architecture supports a four-way SIMD
66 // implementation provided in this package.
67 func IsEnabledX4() bool { return cpu.X86.HasAVX2 }
68 69 // IsEnabledX2 returns true if the architecture supports a two-way SIMD
70 // implementation provided in this package.
71 func IsEnabledX2() bool { return enabledX2 }
72 73 // Initialize the state and returns the buffer on which the four permutations
74 // will act: a uint64 slice of length 100. The first permutation will act
75 // on {a[0], a[4], ..., a[96]}, the second on {a[1], a[5], ..., a[97]}, etc.
76 // If turbo is true, applies 12-round variant instead of the usual 24.
77 func (s *StateX4) Initialize(turbo bool) []uint64 {
78 s.turbo = turbo
79 rp := unsafe.Pointer(&s.a[0]) //nolint:gosec
80 81 // uint64s are always aligned by a multiple of 8. Compute the remainder
82 // of the address modulo 32 divided by 8.
83 rem := (int(uintptr(rp)&31) >> 3)
84 85 if rem != 0 {
86 s.offset = 4 - rem
87 }
88 89 // The slice we return will be aligned on 32 byte boundary.
90 return s.a[s.offset : s.offset+100]
91 }
92 93 // Initialize the state and returns the buffer on which the two permutations
94 // will act: a uint64 slice of length 50. The first permutation will act
95 // on {a[0], a[2], ..., a[48]} and the second on {a[1], a[3], ..., a[49]}.
96 // If turbo is true, applies 12-round variant instead of the usual 24.
97 func (s *StateX2) Initialize(turbo bool) []uint64 {
98 s.turbo = turbo
99 rp := unsafe.Pointer(&s.a[0]) //nolint:gosec
100 101 // uint64s are always aligned by a multiple of 8. Compute the remainder
102 // of the address modulo 32 divided by 8.
103 rem := (int(uintptr(rp)&31) >> 3)
104 105 if rem != 0 {
106 s.offset = 4 - rem
107 }
108 109 // The slice we return will be aligned on 32 byte boundary.
110 return s.a[s.offset : s.offset+50]
111 }
112 113 // Permute performs the four parallel Keccak-f[1600]s interleaved on the slice
114 // returned from Initialize().
115 func (s *StateX4) Permute() {
116 if IsEnabledX4() {
117 permuteSIMDx4(s.a[s.offset:], s.turbo)
118 } else {
119 permuteScalarX4(s.a[s.offset:], s.turbo) // A slower generic implementation.
120 }
121 }
122 123 // Permute performs the two parallel Keccak-f[1600]s interleaved on the slice
124 // returned from Initialize().
125 func (s *StateX2) Permute() {
126 if IsEnabledX2() {
127 permuteSIMDx2(s.a[s.offset:], s.turbo)
128 } else {
129 permuteScalarX2(s.a[s.offset:], s.turbo) // A slower generic implementation.
130 }
131 }
132 133 func permuteScalarX4(a []uint64, turbo bool) {
134 var buf [25]uint64
135 for i := 0; i < 4; i++ {
136 for j := 0; j < 25; j++ {
137 buf[j] = a[4*j+i]
138 }
139 sha3.KeccakF1600(&buf, turbo)
140 for j := 0; j < 25; j++ {
141 a[4*j+i] = buf[j]
142 }
143 }
144 }
145 146 func permuteScalarX2(a []uint64, turbo bool) {
147 var buf [25]uint64
148 for i := 0; i < 2; i++ {
149 for j := 0; j < 25; j++ {
150 buf[j] = a[2*j+i]
151 }
152 sha3.KeccakF1600(&buf, turbo)
153 for j := 0; j < 25; j++ {
154 a[2*j+i] = buf[j]
155 }
156 }
157 }
// enabledX2 records whether the two-way SIMD implementation may be used. It
// is set once by init.
var enabledX2 bool

// init enables the two-way SIMD implementation only when the program was
// built for arm64 and darwin.
func init() {
	onARM64 := runtime.GOARCH == "arm64"
	onDarwin := runtime.GOOS == "darwin"
	enabledX2 = onARM64 && onDarwin
}
164