// Copyright (c) 2024 The Decred developers // Use of this source code is governed by an ISC // license that can be found in the LICENSE file. // // Main Go code originally written and optimized by Dave Collins May 2020. // Additional cleanup and comments added July 2024. package compress import ( "math/bits" ) const ( // blockSize is the block size of the hash algorithm in bytes. blockSize = 64 // blockSizeLog2 is the base-2 log of the block size. It is used to // efficiently perform integer division by the block size. blockSizeLog2 = 6 ) // State houses the current chain value and salt used during block compression. // It is housed in a separate struct in order to reduce the number of parameters // to help prevent register spillage on platforms with a limited number of // registers for better performance. type State struct { CV [8]uint32 // the current chain value S [4]uint32 // salt (zero by default) } // g is the quarter round function that each round applies to the 4x4 internal // state in the compression function. func g(a, b, c, d, mx, my, cx, cy uint32) (uint32, uint32, uint32, uint32) { a += b + (mx ^ cx) d = bits.RotateLeft32(d^a, -16) c += d b = bits.RotateLeft32(b^c, -12) a += b + (my ^ cy) d = bits.RotateLeft32(d^a, -8) c += d b = bits.RotateLeft32(b^c, -7) return a, b, c, d } // blocksGeneric performs BLAKE-224 and BLAKE-256 block compression using pure // Go. It will compress as many full blocks as are available in the provided // message. // // The parameters are: // // state: the block compression state with the chain value and salt // msg: the padded message to compress (must be at least 64 bytes) // counter: the total number of message bits hashed so far // // It will panic if the provided message block does not have at least 64 bytes. // // The chain value in the provided state is updated in place. func blocksGeneric(state *State, msg []byte, counter uint64) { // The compression func initializes the 16-word state matrix as follows: // // h0..h7 is the input chaining value. // // s0..s3 is the salt. // // c0..c7 are the first 8 words of the BLAKE constants defined by the // specification (the first digits of π). // // t0 and t1 are the lower and higher order words of the 64-bit counter. // // |v0 v1 v2 v3| | h0 h1 h2 h3 | // |v4 v5 v6 v7| | h4 h5 h6 h7 | // |v8 v9 va vb| = |s0^c0 s1^c1 s2^c2 s3^c3| // |vc vd ve vf| |t0^c4 t0^c5 t1^c6 t1^c7| // // Each round consists of 8 applications of the G function as follows: // G0(v0,v4,v8,vc) G1(v1,v5,v9,vd) G2(v2,v6,va,ve) G3(v3,v7,vb,vf) // G4(v0,v5,va,vf) G5(v1,v6,vb,vc) G6(v2,v7,v8,vd) G7(v3,v4,v9,ve) // // In other words, the G function is applied to each column of the 4x4 state // and then to each of the diagonals. // // In addition, at each round, the message words are permuted according to // the following schedule where each round is mod 10. In other words round // 11 uses the same permutation as round 1, round 12 the same as round 2, // etc: // // round 1: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 // round 2: 14 10 4 8 9 15 13 6 1 12 0 2 11 7 5 3 // round 3: 11 8 12 0 5 2 15 13 10 14 3 6 7 1 9 4 // round 4: 7 9 3 1 13 12 11 14 2 6 5 10 4 0 15 8 // round 5: 9 0 5 7 2 4 10 15 14 1 11 12 6 8 3 13 // round 6: 2 12 6 10 0 11 8 3 4 13 7 5 15 14 1 9 // round 7: 12 5 1 15 14 13 4 10 0 7 6 3 9 2 8 11 // round 8: 13 11 7 14 12 1 3 9 5 0 15 4 8 6 2 10 // round 9: 6 15 14 9 11 3 0 8 12 2 13 7 1 4 10 5 // round 10: 10 2 8 4 7 6 1 5 15 11 9 14 3 12 13 0 const ( c0, c1, c2, c3 = 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344 c4, c5, c6, c7 = 0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89 c8, c9, ca, cb = 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c cc, cd, ce, cf = 0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917 ) // Ideally these hints wouldn't be necessary, but they do make quite a big // difference in avoiding a bunch of additional bounds checks as determined // by both reviewing the resulting compiled asm as well as benchmarks. h := &state.CV s := &state.S _ = h[7] // Bounds check hint to compiler. _ = s[3] // Bounds check hint to compiler. for numBlocks := len(msg) >> blockSizeLog2; numBlocks > 0; numBlocks-- { _ = msg[63] // Bounds check hint to compiler. // Convert the provided message of at least 64 bytes to an array of 16 // 32-bit unsigned big-endian words. // // Note that this conversion is intentionally not in a separate function // because the Go compiler unfortunately sees a function that does this // as too complex to inline. // // It also avoids using binary.BigEndian.Uint32 for these even though it // is more readable for increased performance in the critical path. // // Finally, this is optimized to favor arm since there currently is only // a pure Go implementation for that arch while amd64 will be using the // vector accelerated versions in practice. It's only slightly slower // on amd64 versus various rearrangements that favor it instead. m0 := uint32(msg[3]) | uint32(msg[2])<<8 | uint32(msg[1])<<16 | uint32(msg[0])<<24 m1 := uint32(msg[7]) | uint32(msg[6])<<8 | uint32(msg[5])<<16 | uint32(msg[4])<<24 m2 := uint32(msg[11]) | uint32(msg[10])<<8 | uint32(msg[9])<<16 | uint32(msg[8])<<24 m3 := uint32(msg[15]) | uint32(msg[14])<<8 | uint32(msg[13])<<16 | uint32(msg[12])<<24 m4 := uint32(msg[19]) | uint32(msg[18])<<8 | uint32(msg[17])<<16 | uint32(msg[16])<<24 m5 := uint32(msg[23]) | uint32(msg[22])<<8 | uint32(msg[21])<<16 | uint32(msg[20])<<24 m6 := uint32(msg[27]) | uint32(msg[26])<<8 | uint32(msg[25])<<16 | uint32(msg[24])<<24 m7 := uint32(msg[31]) | uint32(msg[30])<<8 | uint32(msg[29])<<16 | uint32(msg[28])<<24 m8 := uint32(msg[35]) | uint32(msg[34])<<8 | uint32(msg[33])<<16 | uint32(msg[32])<<24 m9 := uint32(msg[39]) | uint32(msg[38])<<8 | uint32(msg[37])<<16 | uint32(msg[36])<<24 m10 := uint32(msg[43]) | uint32(msg[42])<<8 | uint32(msg[41])<<16 | uint32(msg[40])<<24 m11 := uint32(msg[47]) | uint32(msg[46])<<8 | uint32(msg[45])<<16 | uint32(msg[44])<<24 m12 := uint32(msg[51]) | uint32(msg[50])<<8 | uint32(msg[49])<<16 | uint32(msg[48])<<24 m13 := uint32(msg[55]) | uint32(msg[54])<<8 | uint32(msg[53])<<16 | uint32(msg[52])<<24 m14 := uint32(msg[59]) | uint32(msg[58])<<8 | uint32(msg[57])<<16 | uint32(msg[56])<<24 m15 := uint32(msg[63]) | uint32(msg[62])<<8 | uint32(msg[61])<<16 | uint32(msg[60])<<24 // Round 1 plus state matrix initialization. t0, t1 := uint32(counter), uint32(counter>>32) v0, v4, v8, vc := g(h[0], h[4], s[0]^c0, t0^c4, m0, m1, c1, c0) v1, v5, v9, vd := g(h[1], h[5], s[1]^c1, t0^c5, m2, m3, c3, c2) v2, v6, va, ve := g(h[2], h[6], s[2]^c2, t1^c6, m4, m5, c5, c4) v3, v7, vb, vf := g(h[3], h[7], s[3]^c3, t1^c7, m6, m7, c7, c6) v0, v5, va, vf = g(v0, v5, va, vf, m8, m9, c9, c8) v1, v6, vb, vc = g(v1, v6, vb, vc, m10, m11, cb, ca) v2, v7, v8, vd = g(v2, v7, v8, vd, m12, m13, cd, cc) v3, v4, v9, ve = g(v3, v4, v9, ve, m14, m15, cf, ce) // Round 2 with message word and constants permutation. v0, v4, v8, vc = g(v0, v4, v8, vc, m14, m10, ca, ce) v1, v5, v9, vd = g(v1, v5, v9, vd, m4, m8, c8, c4) v2, v6, va, ve = g(v2, v6, va, ve, m9, m15, cf, c9) v3, v7, vb, vf = g(v3, v7, vb, vf, m13, m6, c6, cd) v0, v5, va, vf = g(v0, v5, va, vf, m1, m12, cc, c1) v1, v6, vb, vc = g(v1, v6, vb, vc, m0, m2, c2, c0) v2, v7, v8, vd = g(v2, v7, v8, vd, m11, m7, c7, cb) v3, v4, v9, ve = g(v3, v4, v9, ve, m5, m3, c3, c5) // Round 3 with message word and constants permutation. v0, v4, v8, vc = g(v0, v4, v8, vc, m11, m8, c8, cb) v1, v5, v9, vd = g(v1, v5, v9, vd, m12, m0, c0, cc) v2, v6, va, ve = g(v2, v6, va, ve, m5, m2, c2, c5) v3, v7, vb, vf = g(v3, v7, vb, vf, m15, m13, cd, cf) v0, v5, va, vf = g(v0, v5, va, vf, m10, m14, ce, ca) v1, v6, vb, vc = g(v1, v6, vb, vc, m3, m6, c6, c3) v2, v7, v8, vd = g(v2, v7, v8, vd, m7, m1, c1, c7) v3, v4, v9, ve = g(v3, v4, v9, ve, m9, m4, c4, c9) // Round 4 with message word and constants permutation. v0, v4, v8, vc = g(v0, v4, v8, vc, m7, m9, c9, c7) v1, v5, v9, vd = g(v1, v5, v9, vd, m3, m1, c1, c3) v2, v6, va, ve = g(v2, v6, va, ve, m13, m12, cc, cd) v3, v7, vb, vf = g(v3, v7, vb, vf, m11, m14, ce, cb) v0, v5, va, vf = g(v0, v5, va, vf, m2, m6, c6, c2) v1, v6, vb, vc = g(v1, v6, vb, vc, m5, m10, ca, c5) v2, v7, v8, vd = g(v2, v7, v8, vd, m4, m0, c0, c4) v3, v4, v9, ve = g(v3, v4, v9, ve, m15, m8, c8, cf) // Round 5 with message word and constants permutation. v0, v4, v8, vc = g(v0, v4, v8, vc, m9, m0, c0, c9) v1, v5, v9, vd = g(v1, v5, v9, vd, m5, m7, c7, c5) v2, v6, va, ve = g(v2, v6, va, ve, m2, m4, c4, c2) v3, v7, vb, vf = g(v3, v7, vb, vf, m10, m15, cf, ca) v0, v5, va, vf = g(v0, v5, va, vf, m14, m1, c1, ce) v1, v6, vb, vc = g(v1, v6, vb, vc, m11, m12, cc, cb) v2, v7, v8, vd = g(v2, v7, v8, vd, m6, m8, c8, c6) v3, v4, v9, ve = g(v3, v4, v9, ve, m3, m13, cd, c3) // Round 6 with message word and constants permutation. v0, v4, v8, vc = g(v0, v4, v8, vc, m2, m12, cc, c2) v1, v5, v9, vd = g(v1, v5, v9, vd, m6, m10, ca, c6) v2, v6, va, ve = g(v2, v6, va, ve, m0, m11, cb, c0) v3, v7, vb, vf = g(v3, v7, vb, vf, m8, m3, c3, c8) v0, v5, va, vf = g(v0, v5, va, vf, m4, m13, cd, c4) v1, v6, vb, vc = g(v1, v6, vb, vc, m7, m5, c5, c7) v2, v7, v8, vd = g(v2, v7, v8, vd, m15, m14, ce, cf) v3, v4, v9, ve = g(v3, v4, v9, ve, m1, m9, c9, c1) // Round 7 with message word and constants permutation. v0, v4, v8, vc = g(v0, v4, v8, vc, m12, m5, c5, cc) v1, v5, v9, vd = g(v1, v5, v9, vd, m1, m15, cf, c1) v2, v6, va, ve = g(v2, v6, va, ve, m14, m13, cd, ce) v3, v7, vb, vf = g(v3, v7, vb, vf, m4, m10, ca, c4) v0, v5, va, vf = g(v0, v5, va, vf, m0, m7, c7, c0) v1, v6, vb, vc = g(v1, v6, vb, vc, m6, m3, c3, c6) v2, v7, v8, vd = g(v2, v7, v8, vd, m9, m2, c2, c9) v3, v4, v9, ve = g(v3, v4, v9, ve, m8, m11, cb, c8) // Round 8 with message word and constants permutation. v0, v4, v8, vc = g(v0, v4, v8, vc, m13, m11, cb, cd) v1, v5, v9, vd = g(v1, v5, v9, vd, m7, m14, ce, c7) v2, v6, va, ve = g(v2, v6, va, ve, m12, m1, c1, cc) v3, v7, vb, vf = g(v3, v7, vb, vf, m3, m9, c9, c3) v0, v5, va, vf = g(v0, v5, va, vf, m5, m0, c0, c5) v1, v6, vb, vc = g(v1, v6, vb, vc, m15, m4, c4, cf) v2, v7, v8, vd = g(v2, v7, v8, vd, m8, m6, c6, c8) v3, v4, v9, ve = g(v3, v4, v9, ve, m2, m10, ca, c2) // Round 9 with message word and constants permutation. v0, v4, v8, vc = g(v0, v4, v8, vc, m6, m15, cf, c6) v1, v5, v9, vd = g(v1, v5, v9, vd, m14, m9, c9, ce) v2, v6, va, ve = g(v2, v6, va, ve, m11, m3, c3, cb) v3, v7, vb, vf = g(v3, v7, vb, vf, m0, m8, c8, c0) v0, v5, va, vf = g(v0, v5, va, vf, m12, m2, c2, cc) v1, v6, vb, vc = g(v1, v6, vb, vc, m13, m7, c7, cd) v2, v7, v8, vd = g(v2, v7, v8, vd, m1, m4, c4, c1) v3, v4, v9, ve = g(v3, v4, v9, ve, m10, m5, c5, ca) // Round 10 with message word and constants permutation. v0, v4, v8, vc = g(v0, v4, v8, vc, m10, m2, c2, ca) v1, v5, v9, vd = g(v1, v5, v9, vd, m8, m4, c4, c8) v2, v6, va, ve = g(v2, v6, va, ve, m7, m6, c6, c7) v3, v7, vb, vf = g(v3, v7, vb, vf, m1, m5, c5, c1) v0, v5, va, vf = g(v0, v5, va, vf, m15, m11, cb, cf) v1, v6, vb, vc = g(v1, v6, vb, vc, m9, m14, ce, c9) v2, v7, v8, vd = g(v2, v7, v8, vd, m3, m12, cc, c3) v3, v4, v9, ve = g(v3, v4, v9, ve, m13, m0, c0, cd) // Round 11 with message word and constants permutation. v0, v4, v8, vc = g(v0, v4, v8, vc, m0, m1, c1, c0) v1, v5, v9, vd = g(v1, v5, v9, vd, m2, m3, c3, c2) v2, v6, va, ve = g(v2, v6, va, ve, m4, m5, c5, c4) v3, v7, vb, vf = g(v3, v7, vb, vf, m6, m7, c7, c6) v0, v5, va, vf = g(v0, v5, va, vf, m8, m9, c9, c8) v1, v6, vb, vc = g(v1, v6, vb, vc, m10, m11, cb, ca) v2, v7, v8, vd = g(v2, v7, v8, vd, m12, m13, cd, cc) v3, v4, v9, ve = g(v3, v4, v9, ve, m14, m15, cf, ce) // Round 12 with message word and constants permutation. v0, v4, v8, vc = g(v0, v4, v8, vc, m14, m10, ca, ce) v1, v5, v9, vd = g(v1, v5, v9, vd, m4, m8, c8, c4) v2, v6, va, ve = g(v2, v6, va, ve, m9, m15, cf, c9) v3, v7, vb, vf = g(v3, v7, vb, vf, m13, m6, c6, cd) v0, v5, va, vf = g(v0, v5, va, vf, m1, m12, cc, c1) v1, v6, vb, vc = g(v1, v6, vb, vc, m0, m2, c2, c0) v2, v7, v8, vd = g(v2, v7, v8, vd, m11, m7, c7, cb) v3, v4, v9, ve = g(v3, v4, v9, ve, m5, m3, c3, c5) // Round 13 with message word and constants permutation. v0, v4, v8, vc = g(v0, v4, v8, vc, m11, m8, c8, cb) v1, v5, v9, vd = g(v1, v5, v9, vd, m12, m0, c0, cc) v2, v6, va, ve = g(v2, v6, va, ve, m5, m2, c2, c5) v3, v7, vb, vf = g(v3, v7, vb, vf, m15, m13, cd, cf) v0, v5, va, vf = g(v0, v5, va, vf, m10, m14, ce, ca) v1, v6, vb, vc = g(v1, v6, vb, vc, m3, m6, c6, c3) v2, v7, v8, vd = g(v2, v7, v8, vd, m7, m1, c1, c7) v3, v4, v9, ve = g(v3, v4, v9, ve, m9, m4, c4, c9) // Round 14 with message word and constants permutation. v0, v4, v8, vc = g(v0, v4, v8, vc, m7, m9, c9, c7) v1, v5, v9, vd = g(v1, v5, v9, vd, m3, m1, c1, c3) v2, v6, va, ve = g(v2, v6, va, ve, m13, m12, cc, cd) v3, v7, vb, vf = g(v3, v7, vb, vf, m11, m14, ce, cb) v0, v5, va, vf = g(v0, v5, va, vf, m2, m6, c6, c2) v1, v6, vb, vc = g(v1, v6, vb, vc, m5, m10, ca, c5) v2, v7, v8, vd = g(v2, v7, v8, vd, m4, m0, c0, c4) v3, v4, v9, ve = g(v3, v4, v9, ve, m15, m8, c8, cf) // Finally the output is defined as: // // h'0 = h0^s0^v0^v8 // h'1 = h1^s1^v1^v9 // h'2 = h2^s2^v2^va // h'3 = h3^s3^v3^vb // h'4 = h4^s0^v4^vc // h'5 = h5^s1^v5^vd // h'6 = h6^s2^v6^ve // h'7 = h7^s3^v7^vf h[0] ^= s[0] ^ v0 ^ v8 h[1] ^= s[1] ^ v1 ^ v9 h[2] ^= s[2] ^ v2 ^ va h[3] ^= s[3] ^ v3 ^ vb h[4] ^= s[0] ^ v4 ^ vc h[5] ^= s[1] ^ v5 ^ vd h[6] ^= s[2] ^ v6 ^ ve h[7] ^= s[3] ^ v7 ^ vf // Move to the next message and increase the message bits counter // accordingly. msg = msg[blockSize:] counter += blockSize << 3 } }