blocks_generic.go raw

   1  // Copyright (c) 2024 The Decred developers
   2  // Use of this source code is governed by an ISC
   3  // license that can be found in the LICENSE file.
   4  //
   5  // Main Go code originally written and optimized by Dave Collins May 2020.
   6  // Additional cleanup and comments added July 2024.
   7  
   8  package compress
   9  
  10  import (
  11  	"math/bits"
  12  )
  13  
  14  const (
  15  	// blockSize is the block size of the hash algorithm in bytes.
  16  	blockSize = 64
  17  
  18  	// blockSizeLog2 is the base-2 log of the block size.  It is used to
  19  	// efficiently perform integer division by the block size.
  20  	blockSizeLog2 = 6
  21  )
  22  
  23  // State houses the current chain value and salt used during block compression.
  24  // It is housed in a separate struct in order to reduce the number of parameters
  25  // to help prevent register spillage on platforms with a limited number of
  26  // registers for better performance.
  27  type State struct {
  28  	CV [8]uint32 // the current chain value
  29  	S  [4]uint32 // salt (zero by default)
  30  }
  31  
  32  // g is the quarter round function that each round applies to the 4x4 internal
  33  // state in the compression function.
  34  func g(a, b, c, d, mx, my, cx, cy uint32) (uint32, uint32, uint32, uint32) {
  35  	a += b + (mx ^ cx)
  36  	d = bits.RotateLeft32(d^a, -16)
  37  	c += d
  38  	b = bits.RotateLeft32(b^c, -12)
  39  	a += b + (my ^ cy)
  40  	d = bits.RotateLeft32(d^a, -8)
  41  	c += d
  42  	b = bits.RotateLeft32(b^c, -7)
  43  	return a, b, c, d
  44  }
  45  
  46  // blocksGeneric performs BLAKE-224 and BLAKE-256 block compression using pure
  47  // Go.  It will compress as many full blocks as are available in the provided
  48  // message.
  49  //
  50  // The parameters are:
  51  //
  52  //	state: the block compression state with the chain value and salt
  53  //	msg: the padded message to compress (must be at least 64 bytes)
  54  //	counter: the total number of message bits hashed so far
  55  //
  56  // It will panic if the provided message block does not have at least 64 bytes.
  57  //
  58  // The chain value in the provided state is updated in place.
  59  func blocksGeneric(state *State, msg []byte, counter uint64) {
  60  	// The compression func initializes the 16-word state matrix as follows:
  61  	//
  62  	// h0..h7 is the input chaining value.
  63  	//
  64  	// s0..s3 is the salt.
  65  	//
  66  	// c0..c7 are the first 8 words of the BLAKE constants defined by the
  67  	// specification (the first digits of π).
  68  	//
  69  	// t0 and t1 are the lower and higher order words of the 64-bit counter.
  70  	//
  71  	// |v0  v1  v2  v3|   |  h0     h1     h2     h3 |
  72  	// |v4  v5  v6  v7|   |  h4     h5     h6     h7 |
  73  	// |v8  v9  va  vb| = |s0^c0  s1^c1  s2^c2  s3^c3|
  74  	// |vc  vd  ve  vf|   |t0^c4  t0^c5  t1^c6  t1^c7|
  75  	//
  76  	// Each round consists of 8 applications of the G function as follows:
  77  	// G0(v0,v4,v8,vc)  G1(v1,v5,v9,vd)  G2(v2,v6,va,ve)  G3(v3,v7,vb,vf)
  78  	// G4(v0,v5,va,vf)  G5(v1,v6,vb,vc)  G6(v2,v7,v8,vd)  G7(v3,v4,v9,ve)
  79  	//
  80  	// In other words, the G function is applied to each column of the 4x4 state
  81  	// and then to each of the diagonals.
  82  	//
  83  	// In addition, at each round, the message words are permuted according to
  84  	// the following schedule where each round is mod 10.  In other words round
  85  	// 11 uses the same permutation as round 1, round 12 the same as round 2,
  86  	// etc:
  87  	//
  88  	// round 1:  0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
  89  	// round 2:  14 10 4  8  9  15 13 6  1  12 0  2  11 7  5  3
  90  	// round 3:  11 8  12 0  5  2  15 13 10 14 3  6  7  1  9  4
  91  	// round 4:  7  9  3  1  13 12 11 14 2  6  5  10 4  0  15 8
  92  	// round 5:  9  0  5  7  2  4  10 15 14 1  11 12 6  8  3  13
  93  	// round 6:  2  12 6  10 0  11 8  3  4  13 7  5  15 14 1  9
  94  	// round 7:  12 5  1  15 14 13 4  10 0  7  6  3  9  2  8  11
  95  	// round 8:  13 11 7  14 12 1  3  9  5  0  15 4  8  6  2  10
  96  	// round 9:  6  15 14 9  11 3  0  8  12 2  13 7  1  4  10 5
  97  	// round 10: 10 2  8  4  7  6  1  5  15 11 9  14 3  12 13 0
  98  
  99  	const (
 100  		c0, c1, c2, c3 = 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344
 101  		c4, c5, c6, c7 = 0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89
 102  		c8, c9, ca, cb = 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c
 103  		cc, cd, ce, cf = 0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917
 104  	)
 105  
 106  	// Ideally these hints wouldn't be necessary, but they do make quite a big
 107  	// difference in avoiding a bunch of additional bounds checks as determined
 108  	// by both reviewing the resulting compiled asm as well as benchmarks.
 109  	h := &state.CV
 110  	s := &state.S
 111  	_ = h[7] // Bounds check hint to compiler.
 112  	_ = s[3] // Bounds check hint to compiler.
 113  	for numBlocks := len(msg) >> blockSizeLog2; numBlocks > 0; numBlocks-- {
 114  		_ = msg[63] // Bounds check hint to compiler.
 115  
 116  		// Convert the provided message of at least 64 bytes to an array of 16
 117  		// 32-bit unsigned big-endian words.
 118  		//
 119  		// Note that this conversion is intentionally not in a separate function
 120  		// because the Go compiler unfortunately sees a function that does this
 121  		// as too complex to inline.
 122  		//
 123  		// It also avoids using binary.BigEndian.Uint32 for these even though it
 124  		// is more readable for increased performance in the critical path.
 125  		//
 126  		// Finally, this is optimized to favor arm since there currently is only
 127  		// a pure Go implementation for that arch while amd64 will be using the
 128  		// vector accelerated versions in practice.  It's only slightly slower
 129  		// on amd64 versus various rearrangements that favor it instead.
 130  		m0 := uint32(msg[3]) | uint32(msg[2])<<8 | uint32(msg[1])<<16 | uint32(msg[0])<<24
 131  		m1 := uint32(msg[7]) | uint32(msg[6])<<8 | uint32(msg[5])<<16 | uint32(msg[4])<<24
 132  		m2 := uint32(msg[11]) | uint32(msg[10])<<8 | uint32(msg[9])<<16 | uint32(msg[8])<<24
 133  		m3 := uint32(msg[15]) | uint32(msg[14])<<8 | uint32(msg[13])<<16 | uint32(msg[12])<<24
 134  		m4 := uint32(msg[19]) | uint32(msg[18])<<8 | uint32(msg[17])<<16 | uint32(msg[16])<<24
 135  		m5 := uint32(msg[23]) | uint32(msg[22])<<8 | uint32(msg[21])<<16 | uint32(msg[20])<<24
 136  		m6 := uint32(msg[27]) | uint32(msg[26])<<8 | uint32(msg[25])<<16 | uint32(msg[24])<<24
 137  		m7 := uint32(msg[31]) | uint32(msg[30])<<8 | uint32(msg[29])<<16 | uint32(msg[28])<<24
 138  		m8 := uint32(msg[35]) | uint32(msg[34])<<8 | uint32(msg[33])<<16 | uint32(msg[32])<<24
 139  		m9 := uint32(msg[39]) | uint32(msg[38])<<8 | uint32(msg[37])<<16 | uint32(msg[36])<<24
 140  		m10 := uint32(msg[43]) | uint32(msg[42])<<8 | uint32(msg[41])<<16 | uint32(msg[40])<<24
 141  		m11 := uint32(msg[47]) | uint32(msg[46])<<8 | uint32(msg[45])<<16 | uint32(msg[44])<<24
 142  		m12 := uint32(msg[51]) | uint32(msg[50])<<8 | uint32(msg[49])<<16 | uint32(msg[48])<<24
 143  		m13 := uint32(msg[55]) | uint32(msg[54])<<8 | uint32(msg[53])<<16 | uint32(msg[52])<<24
 144  		m14 := uint32(msg[59]) | uint32(msg[58])<<8 | uint32(msg[57])<<16 | uint32(msg[56])<<24
 145  		m15 := uint32(msg[63]) | uint32(msg[62])<<8 | uint32(msg[61])<<16 | uint32(msg[60])<<24
 146  
 147  		// Round 1 plus state matrix initialization.
 148  		t0, t1 := uint32(counter), uint32(counter>>32)
 149  		v0, v4, v8, vc := g(h[0], h[4], s[0]^c0, t0^c4, m0, m1, c1, c0)
 150  		v1, v5, v9, vd := g(h[1], h[5], s[1]^c1, t0^c5, m2, m3, c3, c2)
 151  		v2, v6, va, ve := g(h[2], h[6], s[2]^c2, t1^c6, m4, m5, c5, c4)
 152  		v3, v7, vb, vf := g(h[3], h[7], s[3]^c3, t1^c7, m6, m7, c7, c6)
 153  		v0, v5, va, vf = g(v0, v5, va, vf, m8, m9, c9, c8)
 154  		v1, v6, vb, vc = g(v1, v6, vb, vc, m10, m11, cb, ca)
 155  		v2, v7, v8, vd = g(v2, v7, v8, vd, m12, m13, cd, cc)
 156  		v3, v4, v9, ve = g(v3, v4, v9, ve, m14, m15, cf, ce)
 157  
 158  		// Round 2 with message word and constants permutation.
 159  		v0, v4, v8, vc = g(v0, v4, v8, vc, m14, m10, ca, ce)
 160  		v1, v5, v9, vd = g(v1, v5, v9, vd, m4, m8, c8, c4)
 161  		v2, v6, va, ve = g(v2, v6, va, ve, m9, m15, cf, c9)
 162  		v3, v7, vb, vf = g(v3, v7, vb, vf, m13, m6, c6, cd)
 163  		v0, v5, va, vf = g(v0, v5, va, vf, m1, m12, cc, c1)
 164  		v1, v6, vb, vc = g(v1, v6, vb, vc, m0, m2, c2, c0)
 165  		v2, v7, v8, vd = g(v2, v7, v8, vd, m11, m7, c7, cb)
 166  		v3, v4, v9, ve = g(v3, v4, v9, ve, m5, m3, c3, c5)
 167  
 168  		// Round 3 with message word and constants permutation.
 169  		v0, v4, v8, vc = g(v0, v4, v8, vc, m11, m8, c8, cb)
 170  		v1, v5, v9, vd = g(v1, v5, v9, vd, m12, m0, c0, cc)
 171  		v2, v6, va, ve = g(v2, v6, va, ve, m5, m2, c2, c5)
 172  		v3, v7, vb, vf = g(v3, v7, vb, vf, m15, m13, cd, cf)
 173  		v0, v5, va, vf = g(v0, v5, va, vf, m10, m14, ce, ca)
 174  		v1, v6, vb, vc = g(v1, v6, vb, vc, m3, m6, c6, c3)
 175  		v2, v7, v8, vd = g(v2, v7, v8, vd, m7, m1, c1, c7)
 176  		v3, v4, v9, ve = g(v3, v4, v9, ve, m9, m4, c4, c9)
 177  
 178  		// Round 4 with message word and constants permutation.
 179  		v0, v4, v8, vc = g(v0, v4, v8, vc, m7, m9, c9, c7)
 180  		v1, v5, v9, vd = g(v1, v5, v9, vd, m3, m1, c1, c3)
 181  		v2, v6, va, ve = g(v2, v6, va, ve, m13, m12, cc, cd)
 182  		v3, v7, vb, vf = g(v3, v7, vb, vf, m11, m14, ce, cb)
 183  		v0, v5, va, vf = g(v0, v5, va, vf, m2, m6, c6, c2)
 184  		v1, v6, vb, vc = g(v1, v6, vb, vc, m5, m10, ca, c5)
 185  		v2, v7, v8, vd = g(v2, v7, v8, vd, m4, m0, c0, c4)
 186  		v3, v4, v9, ve = g(v3, v4, v9, ve, m15, m8, c8, cf)
 187  
 188  		// Round 5 with message word and constants permutation.
 189  		v0, v4, v8, vc = g(v0, v4, v8, vc, m9, m0, c0, c9)
 190  		v1, v5, v9, vd = g(v1, v5, v9, vd, m5, m7, c7, c5)
 191  		v2, v6, va, ve = g(v2, v6, va, ve, m2, m4, c4, c2)
 192  		v3, v7, vb, vf = g(v3, v7, vb, vf, m10, m15, cf, ca)
 193  		v0, v5, va, vf = g(v0, v5, va, vf, m14, m1, c1, ce)
 194  		v1, v6, vb, vc = g(v1, v6, vb, vc, m11, m12, cc, cb)
 195  		v2, v7, v8, vd = g(v2, v7, v8, vd, m6, m8, c8, c6)
 196  		v3, v4, v9, ve = g(v3, v4, v9, ve, m3, m13, cd, c3)
 197  
 198  		// Round 6 with message word and constants permutation.
 199  		v0, v4, v8, vc = g(v0, v4, v8, vc, m2, m12, cc, c2)
 200  		v1, v5, v9, vd = g(v1, v5, v9, vd, m6, m10, ca, c6)
 201  		v2, v6, va, ve = g(v2, v6, va, ve, m0, m11, cb, c0)
 202  		v3, v7, vb, vf = g(v3, v7, vb, vf, m8, m3, c3, c8)
 203  		v0, v5, va, vf = g(v0, v5, va, vf, m4, m13, cd, c4)
 204  		v1, v6, vb, vc = g(v1, v6, vb, vc, m7, m5, c5, c7)
 205  		v2, v7, v8, vd = g(v2, v7, v8, vd, m15, m14, ce, cf)
 206  		v3, v4, v9, ve = g(v3, v4, v9, ve, m1, m9, c9, c1)
 207  
 208  		// Round 7 with message word and constants permutation.
 209  		v0, v4, v8, vc = g(v0, v4, v8, vc, m12, m5, c5, cc)
 210  		v1, v5, v9, vd = g(v1, v5, v9, vd, m1, m15, cf, c1)
 211  		v2, v6, va, ve = g(v2, v6, va, ve, m14, m13, cd, ce)
 212  		v3, v7, vb, vf = g(v3, v7, vb, vf, m4, m10, ca, c4)
 213  		v0, v5, va, vf = g(v0, v5, va, vf, m0, m7, c7, c0)
 214  		v1, v6, vb, vc = g(v1, v6, vb, vc, m6, m3, c3, c6)
 215  		v2, v7, v8, vd = g(v2, v7, v8, vd, m9, m2, c2, c9)
 216  		v3, v4, v9, ve = g(v3, v4, v9, ve, m8, m11, cb, c8)
 217  
 218  		// Round 8 with message word and constants permutation.
 219  		v0, v4, v8, vc = g(v0, v4, v8, vc, m13, m11, cb, cd)
 220  		v1, v5, v9, vd = g(v1, v5, v9, vd, m7, m14, ce, c7)
 221  		v2, v6, va, ve = g(v2, v6, va, ve, m12, m1, c1, cc)
 222  		v3, v7, vb, vf = g(v3, v7, vb, vf, m3, m9, c9, c3)
 223  		v0, v5, va, vf = g(v0, v5, va, vf, m5, m0, c0, c5)
 224  		v1, v6, vb, vc = g(v1, v6, vb, vc, m15, m4, c4, cf)
 225  		v2, v7, v8, vd = g(v2, v7, v8, vd, m8, m6, c6, c8)
 226  		v3, v4, v9, ve = g(v3, v4, v9, ve, m2, m10, ca, c2)
 227  
 228  		// Round 9 with message word and constants permutation.
 229  		v0, v4, v8, vc = g(v0, v4, v8, vc, m6, m15, cf, c6)
 230  		v1, v5, v9, vd = g(v1, v5, v9, vd, m14, m9, c9, ce)
 231  		v2, v6, va, ve = g(v2, v6, va, ve, m11, m3, c3, cb)
 232  		v3, v7, vb, vf = g(v3, v7, vb, vf, m0, m8, c8, c0)
 233  		v0, v5, va, vf = g(v0, v5, va, vf, m12, m2, c2, cc)
 234  		v1, v6, vb, vc = g(v1, v6, vb, vc, m13, m7, c7, cd)
 235  		v2, v7, v8, vd = g(v2, v7, v8, vd, m1, m4, c4, c1)
 236  		v3, v4, v9, ve = g(v3, v4, v9, ve, m10, m5, c5, ca)
 237  
 238  		// Round 10 with message word and constants permutation.
 239  		v0, v4, v8, vc = g(v0, v4, v8, vc, m10, m2, c2, ca)
 240  		v1, v5, v9, vd = g(v1, v5, v9, vd, m8, m4, c4, c8)
 241  		v2, v6, va, ve = g(v2, v6, va, ve, m7, m6, c6, c7)
 242  		v3, v7, vb, vf = g(v3, v7, vb, vf, m1, m5, c5, c1)
 243  		v0, v5, va, vf = g(v0, v5, va, vf, m15, m11, cb, cf)
 244  		v1, v6, vb, vc = g(v1, v6, vb, vc, m9, m14, ce, c9)
 245  		v2, v7, v8, vd = g(v2, v7, v8, vd, m3, m12, cc, c3)
 246  		v3, v4, v9, ve = g(v3, v4, v9, ve, m13, m0, c0, cd)
 247  
 248  		// Round 11 with message word and constants permutation.
 249  		v0, v4, v8, vc = g(v0, v4, v8, vc, m0, m1, c1, c0)
 250  		v1, v5, v9, vd = g(v1, v5, v9, vd, m2, m3, c3, c2)
 251  		v2, v6, va, ve = g(v2, v6, va, ve, m4, m5, c5, c4)
 252  		v3, v7, vb, vf = g(v3, v7, vb, vf, m6, m7, c7, c6)
 253  		v0, v5, va, vf = g(v0, v5, va, vf, m8, m9, c9, c8)
 254  		v1, v6, vb, vc = g(v1, v6, vb, vc, m10, m11, cb, ca)
 255  		v2, v7, v8, vd = g(v2, v7, v8, vd, m12, m13, cd, cc)
 256  		v3, v4, v9, ve = g(v3, v4, v9, ve, m14, m15, cf, ce)
 257  
 258  		// Round 12 with message word and constants permutation.
 259  		v0, v4, v8, vc = g(v0, v4, v8, vc, m14, m10, ca, ce)
 260  		v1, v5, v9, vd = g(v1, v5, v9, vd, m4, m8, c8, c4)
 261  		v2, v6, va, ve = g(v2, v6, va, ve, m9, m15, cf, c9)
 262  		v3, v7, vb, vf = g(v3, v7, vb, vf, m13, m6, c6, cd)
 263  		v0, v5, va, vf = g(v0, v5, va, vf, m1, m12, cc, c1)
 264  		v1, v6, vb, vc = g(v1, v6, vb, vc, m0, m2, c2, c0)
 265  		v2, v7, v8, vd = g(v2, v7, v8, vd, m11, m7, c7, cb)
 266  		v3, v4, v9, ve = g(v3, v4, v9, ve, m5, m3, c3, c5)
 267  
 268  		// Round 13 with message word and constants permutation.
 269  		v0, v4, v8, vc = g(v0, v4, v8, vc, m11, m8, c8, cb)
 270  		v1, v5, v9, vd = g(v1, v5, v9, vd, m12, m0, c0, cc)
 271  		v2, v6, va, ve = g(v2, v6, va, ve, m5, m2, c2, c5)
 272  		v3, v7, vb, vf = g(v3, v7, vb, vf, m15, m13, cd, cf)
 273  		v0, v5, va, vf = g(v0, v5, va, vf, m10, m14, ce, ca)
 274  		v1, v6, vb, vc = g(v1, v6, vb, vc, m3, m6, c6, c3)
 275  		v2, v7, v8, vd = g(v2, v7, v8, vd, m7, m1, c1, c7)
 276  		v3, v4, v9, ve = g(v3, v4, v9, ve, m9, m4, c4, c9)
 277  
 278  		// Round 14 with message word and constants permutation.
 279  		v0, v4, v8, vc = g(v0, v4, v8, vc, m7, m9, c9, c7)
 280  		v1, v5, v9, vd = g(v1, v5, v9, vd, m3, m1, c1, c3)
 281  		v2, v6, va, ve = g(v2, v6, va, ve, m13, m12, cc, cd)
 282  		v3, v7, vb, vf = g(v3, v7, vb, vf, m11, m14, ce, cb)
 283  		v0, v5, va, vf = g(v0, v5, va, vf, m2, m6, c6, c2)
 284  		v1, v6, vb, vc = g(v1, v6, vb, vc, m5, m10, ca, c5)
 285  		v2, v7, v8, vd = g(v2, v7, v8, vd, m4, m0, c0, c4)
 286  		v3, v4, v9, ve = g(v3, v4, v9, ve, m15, m8, c8, cf)
 287  
 288  		// Finally the output is defined as:
 289  		//
 290  		// h'0 = h0^s0^v0^v8
 291  		// h'1 = h1^s1^v1^v9
 292  		// h'2 = h2^s2^v2^va
 293  		// h'3 = h3^s3^v3^vb
 294  		// h'4 = h4^s0^v4^vc
 295  		// h'5 = h5^s1^v5^vd
 296  		// h'6 = h6^s2^v6^ve
 297  		// h'7 = h7^s3^v7^vf
 298  		h[0] ^= s[0] ^ v0 ^ v8
 299  		h[1] ^= s[1] ^ v1 ^ v9
 300  		h[2] ^= s[2] ^ v2 ^ va
 301  		h[3] ^= s[3] ^ v3 ^ vb
 302  		h[4] ^= s[0] ^ v4 ^ vc
 303  		h[5] ^= s[1] ^ v5 ^ vd
 304  		h[6] ^= s[2] ^ v6 ^ ve
 305  		h[7] ^= s[3] ^ v7 ^ vf
 306  
 307  		// Move to the next message and increase the message bits counter
 308  		// accordingly.
 309  		msg = msg[blockSize:]
 310  		counter += blockSize << 3
 311  	}
 312  }
 313