chacha8_generic.mx raw

   1  // Copyright 2023 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  // ChaCha8 is ChaCha with 8 rounds.
   6  // See https://cr.yp.to/chacha/chacha-20080128.pdf.
   7  //
   8  // ChaCha8 operates on a 4x4 matrix of uint32 values, initially set to:
   9  //
  10  //	const1 const2 const3 const4
  11  //	seed   seed   seed   seed
  12  //	seed   seed   seed   seed
  13  //	counter64     0      0
  14  //
  15  // We use the same constants as ChaCha20 does, a random seed,
  16  // and a counter. Running ChaCha8 on this input produces
  17  // a 4x4 matrix of pseudo-random values with as much entropy
  18  // as the seed.
  19  //
  20  // Given SIMD registers that can hold N uint32s, it is possible
  21  // to run N ChaCha8 block transformations in parallel by filling
  22  // the first register with the N copies of const1, the second
  23  // with N copies of const2, and so on, and then running the operations.
  24  //
  25  // Each iteration of ChaCha8Rand operates over 32 bytes of input and
  26  // produces 992 bytes of RNG output, plus 32 bytes of input for the next
  27  // iteration.
  28  //
  29  // The 32 bytes of input are used as a ChaCha8 key, with a zero nonce, to
  30  // produce 1024 bytes of output (16 blocks, with counters 0 to 15).
  31  // First, for each block, the values 0x61707865, 0x3320646e, 0x79622d32,
  32  // 0x6b206574 are subtracted from the 32-bit little-endian words at
  33  // position 0, 1, 2, and 3 respectively, and an increasing counter
  34  // starting at zero is subtracted from each word at position 12. Then,
  35  // this stream is permuted such that for each sequence of four blocks,
  36  // first we output the first four bytes of each block, then the next four
  37  // bytes of each block, and so on. Finally, the last 32 bytes of output
  38  // are used as the input of the next iteration, and the remaining 992
  39  // bytes are the RNG output.
  40  //
  41  // See https://c2sp.org/chacha8rand for additional details.
  42  //
  43  // Normal ChaCha20 implementations for encryption use this same
  44  // parallelism but then have to deinterlace the results so that
  45  // it appears the blocks were generated separately. For the purposes
  46  // of generating random numbers, the interlacing is fine.
  47  // We are simply locked in to preserving the 4-way interlacing
  48  // in any future optimizations.
  49  package chacha8rand
  50  
  51  import (
  52  	"internal/goarch"
  53  	"unsafe"
  54  )
  55  
  56  // setup sets up 4 ChaCha8 blocks in b32 with the counter and seed.
  57  // Note that b32 is [16][4]uint32 not [4][16]uint32: the blocks are interlaced
  58  // the same way they would be in a 4-way SIMD implementations.
  59  func setup(seed *[4]uint64, b32 *[16][4]uint32, counter uint32) {
  60  	// Convert to uint64 to do half as many stores to memory.
  61  	b := (*[16][2]uint64)(unsafe.Pointer(b32))
  62  
  63  	// Constants; same as in ChaCha20: "expand 32-byte k"
  64  	b[0][0] = 0x61707865_61707865
  65  	b[0][1] = 0x61707865_61707865
  66  
  67  	b[1][0] = 0x3320646e_3320646e
  68  	b[1][1] = 0x3320646e_3320646e
  69  
  70  	b[2][0] = 0x79622d32_79622d32
  71  	b[2][1] = 0x79622d32_79622d32
  72  
  73  	b[3][0] = 0x6b206574_6b206574
  74  	b[3][1] = 0x6b206574_6b206574
  75  
  76  	// Seed values.
  77  	var x64 uint64
  78  	var x uint32
  79  
  80  	x = uint32(seed[0])
  81  	x64 = uint64(x)<<32 | uint64(x)
  82  	b[4][0] = x64
  83  	b[4][1] = x64
  84  
  85  	x = uint32(seed[0] >> 32)
  86  	x64 = uint64(x)<<32 | uint64(x)
  87  	b[5][0] = x64
  88  	b[5][1] = x64
  89  
  90  	x = uint32(seed[1])
  91  	x64 = uint64(x)<<32 | uint64(x)
  92  	b[6][0] = x64
  93  	b[6][1] = x64
  94  
  95  	x = uint32(seed[1] >> 32)
  96  	x64 = uint64(x)<<32 | uint64(x)
  97  	b[7][0] = x64
  98  	b[7][1] = x64
  99  
 100  	x = uint32(seed[2])
 101  	x64 = uint64(x)<<32 | uint64(x)
 102  	b[8][0] = x64
 103  	b[8][1] = x64
 104  
 105  	x = uint32(seed[2] >> 32)
 106  	x64 = uint64(x)<<32 | uint64(x)
 107  	b[9][0] = x64
 108  	b[9][1] = x64
 109  
 110  	x = uint32(seed[3])
 111  	x64 = uint64(x)<<32 | uint64(x)
 112  	b[10][0] = x64
 113  	b[10][1] = x64
 114  
 115  	x = uint32(seed[3] >> 32)
 116  	x64 = uint64(x)<<32 | uint64(x)
 117  	b[11][0] = x64
 118  	b[11][1] = x64
 119  
 120  	// Counters.
 121  	if goarch.BigEndian {
 122  		b[12][0] = uint64(counter+0)<<32 | uint64(counter+1)
 123  		b[12][1] = uint64(counter+2)<<32 | uint64(counter+3)
 124  	} else {
 125  		b[12][0] = uint64(counter+0) | uint64(counter+1)<<32
 126  		b[12][1] = uint64(counter+2) | uint64(counter+3)<<32
 127  	}
 128  
 129  	// Zeros.
 130  	b[13][0] = 0
 131  	b[13][1] = 0
 132  	b[14][0] = 0
 133  	b[14][1] = 0
 134  
 135  	b[15][0] = 0
 136  	b[15][1] = 0
 137  }
 138  
 139  func _() {
 140  	// block and block_generic must have same type
 141  	x := block
 142  	x = block_generic
 143  	_ = x
 144  }
 145  
 146  // block_generic is the non-assembly block implementation,
 147  // for use on systems without special assembly.
 148  // Even on such systems, it is quite fast: on GOOS=386,
 149  // ChaCha8 using this code generates random values faster than PCG-DXSM.
 150  func block_generic(seed *[4]uint64, buf *[32]uint64, counter uint32) {
 151  	b := (*[16][4]uint32)(unsafe.Pointer(buf))
 152  
 153  	setup(seed, b, counter)
 154  
 155  	for i := range b[0] {
 156  		// Load block i from b[*][i] into local variables.
 157  		b0 := b[0][i]
 158  		b1 := b[1][i]
 159  		b2 := b[2][i]
 160  		b3 := b[3][i]
 161  		b4 := b[4][i]
 162  		b5 := b[5][i]
 163  		b6 := b[6][i]
 164  		b7 := b[7][i]
 165  		b8 := b[8][i]
 166  		b9 := b[9][i]
 167  		b10 := b[10][i]
 168  		b11 := b[11][i]
 169  		b12 := b[12][i]
 170  		b13 := b[13][i]
 171  		b14 := b[14][i]
 172  		b15 := b[15][i]
 173  
 174  		// 4 iterations of eight quarter-rounds each is 8 rounds
 175  		for round := 0; round < 4; round++ {
 176  			b0, b4, b8, b12 = qr(b0, b4, b8, b12)
 177  			b1, b5, b9, b13 = qr(b1, b5, b9, b13)
 178  			b2, b6, b10, b14 = qr(b2, b6, b10, b14)
 179  			b3, b7, b11, b15 = qr(b3, b7, b11, b15)
 180  
 181  			b0, b5, b10, b15 = qr(b0, b5, b10, b15)
 182  			b1, b6, b11, b12 = qr(b1, b6, b11, b12)
 183  			b2, b7, b8, b13 = qr(b2, b7, b8, b13)
 184  			b3, b4, b9, b14 = qr(b3, b4, b9, b14)
 185  		}
 186  
 187  		// Store block i back into b[*][i].
 188  		// Add b4..b11 back to the original key material,
 189  		// like in ChaCha20, to avoid trivial invertibility.
 190  		// There is no entropy in b0..b3 and b12..b15
 191  		// so we can skip the additions and save some time.
 192  		b[0][i] = b0
 193  		b[1][i] = b1
 194  		b[2][i] = b2
 195  		b[3][i] = b3
 196  		b[4][i] += b4
 197  		b[5][i] += b5
 198  		b[6][i] += b6
 199  		b[7][i] += b7
 200  		b[8][i] += b8
 201  		b[9][i] += b9
 202  		b[10][i] += b10
 203  		b[11][i] += b11
 204  		b[12][i] = b12
 205  		b[13][i] = b13
 206  		b[14][i] = b14
 207  		b[15][i] = b15
 208  	}
 209  
 210  	if goarch.BigEndian {
 211  		// On a big-endian system, reading the uint32 pairs as uint64s
 212  		// will word-swap them compared to little-endian, so we word-swap
 213  		// them here first to make the next swap get the right answer.
 214  		for i, x := range buf {
 215  			buf[i] = x>>32 | x<<32
 216  		}
 217  	}
 218  }
 219  
 220  // qr is the (inlinable) ChaCha8 quarter round.
 221  func qr(a, b, c, d uint32) (_a, _b, _c, _d uint32) {
 222  	a += b
 223  	d ^= a
 224  	d = d<<16 | d>>16
 225  	c += d
 226  	b ^= c
 227  	b = b<<12 | b>>20
 228  	a += b
 229  	d ^= a
 230  	d = d<<8 | d>>24
 231  	c += d
 232  	b ^= c
 233  	b = b<<7 | b>>25
 234  	return a, b, c, d
 235  }
 236