f1600x.go raw

   1  // Package keccakf1600 provides a two and four-way Keccak-f[1600] permutation in parallel.
   2  //
   3  // Keccak-f[1600] is the permutation underlying several algorithms such as
   4  // Keccak, SHA3 and SHAKE. Running two or four permutations in parallel is
   5  // useful in some scenarios like in hash-based signatures.
   6  //
   7  // # Limitations
   8  //
   9  // Note that not all the architectures support SIMD instructions. This package
  10  // uses AVX2 instructions that are available in some AMD64 architectures
// and NEON instructions that are available in some ARM64 architectures.
  12  //
  13  // For those systems not supporting these, the package still provides the
  14  // expected functionality by means of a generic and slow implementation.
// It is recommended to check IsEnabledX4() and IsEnabledX2() beforehand
// to determine whether the current system supports the SIMD implementation.
  17  package keccakf1600
  18  
  19  import (
  20  	"runtime"
  21  	"unsafe"
  22  
  23  	"github.com/cloudflare/circl/internal/sha3"
  24  	"golang.org/x/sys/cpu"
  25  )
  26  
  27  // StateX4 contains state for the four-way permutation including the four
  28  // interleaved [25]uint64 buffers. Call Initialize() before use to initialize
  29  // and get a pointer to the interleaved buffer.
  30  type StateX4 struct {
  31  	// Go guarantees a to be aligned on 8 bytes, whereas we need it to be
  32  	// aligned on 32 bytes for bet performance.  Thus we leave some headroom
  33  	// to be able to move the start of the state.
  34  
  35  	// 4 x 25 uint64s for the interleaved states and three uint64s headroom
  36  	// to fix alignment.
  37  	a [103]uint64
  38  
  39  	// Offset into a that is 32 byte aligned.
  40  	offset int
  41  
  42  	// If true, permute will use 12-round keccak instead of 24-round keccak
  43  	turbo bool
  44  }
  45  
  46  // StateX2 contains state for the two-way permutation including the two
  47  // interleaved [25]uint64 buffers. Call Initialize() before use to initialize
  48  // and get a pointer to the interleaved buffer.
  49  type StateX2 struct {
  50  	// Go guarantees a to be aligned on 8 bytes, whereas we need it to be
  51  	// aligned on 32 bytes for bet performance.  Thus we leave some headroom
  52  	// to be able to move the start of the state.
  53  
  54  	// 2 x 25 uint64s for the interleaved states and three uint64s headroom
  55  	// to fix alignment.
  56  	a [53]uint64
  57  
  58  	// Offset into a that is 32 byte aligned.
  59  	offset int
  60  
  61  	// If true, permute will use 12-round keccak instead of 24-round keccak
  62  	turbo bool
  63  }
  64  
  65  // IsEnabledX4 returns true if the architecture supports a four-way SIMD
  66  // implementation provided in this package.
  67  func IsEnabledX4() bool { return cpu.X86.HasAVX2 }
  68  
  69  // IsEnabledX2 returns true if the architecture supports a two-way SIMD
  70  // implementation provided in this package.
  71  func IsEnabledX2() bool { return enabledX2 }
  72  
  73  // Initialize the state and returns the buffer on which the four permutations
  74  // will act: a uint64 slice of length 100.  The first permutation will act
  75  // on {a[0], a[4], ..., a[96]}, the second on {a[1], a[5], ..., a[97]}, etc.
  76  // If turbo is true, applies 12-round variant instead of the usual 24.
  77  func (s *StateX4) Initialize(turbo bool) []uint64 {
  78  	s.turbo = turbo
  79  	rp := unsafe.Pointer(&s.a[0]) //nolint:gosec
  80  
  81  	// uint64s are always aligned by a multiple of 8.  Compute the remainder
  82  	// of the address modulo 32 divided by 8.
  83  	rem := (int(uintptr(rp)&31) >> 3)
  84  
  85  	if rem != 0 {
  86  		s.offset = 4 - rem
  87  	}
  88  
  89  	// The slice we return will be aligned on 32 byte boundary.
  90  	return s.a[s.offset : s.offset+100]
  91  }
  92  
  93  // Initialize the state and returns the buffer on which the two permutations
  94  // will act: a uint64 slice of length 50.  The first permutation will act
  95  // on {a[0], a[2], ..., a[48]} and the second on {a[1], a[3], ..., a[49]}.
  96  // If turbo is true, applies 12-round variant instead of the usual 24.
  97  func (s *StateX2) Initialize(turbo bool) []uint64 {
  98  	s.turbo = turbo
  99  	rp := unsafe.Pointer(&s.a[0]) //nolint:gosec
 100  
 101  	// uint64s are always aligned by a multiple of 8.  Compute the remainder
 102  	// of the address modulo 32 divided by 8.
 103  	rem := (int(uintptr(rp)&31) >> 3)
 104  
 105  	if rem != 0 {
 106  		s.offset = 4 - rem
 107  	}
 108  
 109  	// The slice we return will be aligned on 32 byte boundary.
 110  	return s.a[s.offset : s.offset+50]
 111  }
 112  
 113  // Permute performs the four parallel Keccak-f[1600]s interleaved on the slice
 114  // returned from Initialize().
 115  func (s *StateX4) Permute() {
 116  	if IsEnabledX4() {
 117  		permuteSIMDx4(s.a[s.offset:], s.turbo)
 118  	} else {
 119  		permuteScalarX4(s.a[s.offset:], s.turbo) // A slower generic implementation.
 120  	}
 121  }
 122  
 123  // Permute performs the two parallel Keccak-f[1600]s interleaved on the slice
 124  // returned from Initialize().
 125  func (s *StateX2) Permute() {
 126  	if IsEnabledX2() {
 127  		permuteSIMDx2(s.a[s.offset:], s.turbo)
 128  	} else {
 129  		permuteScalarX2(s.a[s.offset:], s.turbo) // A slower generic implementation.
 130  	}
 131  }
 132  
 133  func permuteScalarX4(a []uint64, turbo bool) {
 134  	var buf [25]uint64
 135  	for i := 0; i < 4; i++ {
 136  		for j := 0; j < 25; j++ {
 137  			buf[j] = a[4*j+i]
 138  		}
 139  		sha3.KeccakF1600(&buf, turbo)
 140  		for j := 0; j < 25; j++ {
 141  			a[4*j+i] = buf[j]
 142  		}
 143  	}
 144  }
 145  
 146  func permuteScalarX2(a []uint64, turbo bool) {
 147  	var buf [25]uint64
 148  	for i := 0; i < 2; i++ {
 149  		for j := 0; j < 25; j++ {
 150  			buf[j] = a[2*j+i]
 151  		}
 152  		sha3.KeccakF1600(&buf, turbo)
 153  		for j := 0; j < 25; j++ {
 154  			a[2*j+i] = buf[j]
 155  		}
 156  	}
 157  }
 158  
 159  var enabledX2 bool
 160  
 161  func init() {
 162  	enabledX2 = runtime.GOARCH == "arm64" && runtime.GOOS == "darwin"
 163  }
 164