sha256blockAvx512_amd64.go raw

   1  //go:build !noasm && !appengine && gc
   2  // +build !noasm,!appengine,gc
   3  
   4  /*
   5   * Minio Cloud Storage, (C) 2017 Minio, Inc.
   6   *
   7   * Licensed under the Apache License, Version 2.0 (the "License");
   8   * you may not use this file except in compliance with the License.
   9   * You may obtain a copy of the License at
  10   *
  11   *     http://www.apache.org/licenses/LICENSE-2.0
  12   *
  13   * Unless required by applicable law or agreed to in writing, software
  14   * distributed under the License is distributed on an "AS IS" BASIS,
  15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16   * See the License for the specific language governing permissions and
  17   * limitations under the License.
  18   */
  19  
  20  package sha256
  21  
  22  import (
  23  	"encoding/binary"
  24  	"errors"
  25  	"hash"
  26  	"sort"
  27  	"sync/atomic"
  28  	"time"
  29  )
  30  
      // sha256X16Avx512 is declared without a body; it is the assembly routine that
      // blockAvx512 calls to process the 16 lanes in one invocation.
      //
      //   - digests: interleaved state of all 16 lanes as built by getDigests;
      //     blockAvx512 reads the results back from this buffer after the call.
      //   - scratch: 512-byte buffer supplied zeroed by the caller (presumably
      //     working storage for the routine; its exact use is not visible here).
      //   - table: the package-level constant table.
      //   - mask: the per-round lane masks produced by expandMask(genMask(inputs)).
      //   - inputs: the message data queued for each of the 16 lanes.
      //
      // The noescape directive tells the compiler that none of the pointer
      // arguments escape through this call.
      //
  31  //go:noescape
  32  func sha256X16Avx512(digests *[512]byte, scratch *[512]byte, table *[512]uint64, mask []uint64, inputs [16][]byte)
  33  
  34  // Avx512ServerUID - Do not start at 0 but at the next multiple of 16, so as to be
  35  // able to differentiate from the default initialization value of 0.
  36  const Avx512ServerUID = 16
  37  
      // uidCounter is incremented atomically by NewAvx512 so that every digest it
      // creates receives its own uid.
  38  var uidCounter uint64
  39  
  40  // NewAvx512 - initialize sha256 Avx512 implementation.
  41  func NewAvx512(a512srv *Avx512Server) hash.Hash {
  42  	uid := atomic.AddUint64(&uidCounter, 1)
  43  	return &Avx512Digest{uid: uid, a512srv: a512srv}
  44  }
  45  
  46  // Avx512Digest - Type for computing SHA256 using Avx512. Input is buffered
      // locally until a full chunk is available; complete chunks are sent to the
      // Avx512Server, which does the actual block processing.
  47  type Avx512Digest struct {
  48  	uid     uint64        // identifies this computation towards the server
  49  	a512srv *Avx512Server // server the blocks are sent to
  50  	x       [chunk]byte   // buffered input that does not yet fill a chunk
  51  	nx      int           // number of valid bytes in x
  52  	len     uint64        // total number of bytes written so far
  53  	final   bool          // set by Sum; Write is rejected until Reset
  54  	result  [Size]byte    // checksum cached by Sum
  55  }
  56  
  57  // Size - Return size of checksum
  58  func (d *Avx512Digest) Size() int { return Size }
  59  
  60  // BlockSize - Return blocksize of checksum
  61  func (d Avx512Digest) BlockSize() int { return BlockSize }
  62  
  63  // Reset - reset sha digest to its initial values
  64  func (d *Avx512Digest) Reset() {
  65  	d.a512srv.blocksCh <- blockInput{uid: d.uid, reset: true}
  66  	d.nx = 0
  67  	d.len = 0
  68  	d.final = false
  69  }
  70  
  71  // Write to digest
  72  func (d *Avx512Digest) Write(p []byte) (nn int, err error) {
  73  
  74  	if d.final {
  75  		return 0, errors.New("Avx512Digest already finalized. Reset first before writing again")
  76  	}
  77  
  78  	nn = len(p)
  79  	d.len += uint64(nn)
  80  	if d.nx > 0 {
  81  		n := copy(d.x[d.nx:], p)
  82  		d.nx += n
  83  		if d.nx == chunk {
  84  			d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: d.x[:]}
  85  			d.nx = 0
  86  		}
  87  		p = p[n:]
  88  	}
  89  	if len(p) >= chunk {
  90  		n := len(p) &^ (chunk - 1)
  91  		d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: p[:n]}
  92  		p = p[n:]
  93  	}
  94  	if len(p) > 0 {
  95  		d.nx = copy(d.x[:], p)
  96  	}
  97  	return
  98  }
  99  
 100  // Sum - Return sha256 sum in bytes
 101  func (d *Avx512Digest) Sum(in []byte) (result []byte) {
 102  
 103  	if d.final {
 104  		return append(in, d.result[:]...)
 105  	}
 106  
 107  	trail := make([]byte, 0, 128)
 108  	trail = append(trail, d.x[:d.nx]...)
 109  
 110  	len := d.len
 111  	// Padding.  Add a 1 bit and 0 bits until 56 bytes mod 64.
 112  	var tmp [64]byte
 113  	tmp[0] = 0x80
 114  	if len%64 < 56 {
 115  		trail = append(trail, tmp[0:56-len%64]...)
 116  	} else {
 117  		trail = append(trail, tmp[0:64+56-len%64]...)
 118  	}
 119  	d.nx = 0
 120  
 121  	// Length in bits.
 122  	len <<= 3
 123  	for i := uint(0); i < 8; i++ {
 124  		tmp[i] = byte(len >> (56 - 8*i))
 125  	}
 126  	trail = append(trail, tmp[0:8]...)
 127  
 128  	sumCh := make(chan [Size]byte)
 129  	d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: trail, final: true, sumCh: sumCh}
 130  	d.result = <-sumCh
 131  	d.final = true
 132  	return append(in, d.result[:]...)
 133  }
 134  
 135  var table = [512]uint64{
 136  	0x428a2f98428a2f98, 0x428a2f98428a2f98, 0x428a2f98428a2f98, 0x428a2f98428a2f98,
 137  	0x428a2f98428a2f98, 0x428a2f98428a2f98, 0x428a2f98428a2f98, 0x428a2f98428a2f98,
 138  	0x7137449171374491, 0x7137449171374491, 0x7137449171374491, 0x7137449171374491,
 139  	0x7137449171374491, 0x7137449171374491, 0x7137449171374491, 0x7137449171374491,
 140  	0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf,
 141  	0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf,
 142  	0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5,
 143  	0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5,
 144  	0x3956c25b3956c25b, 0x3956c25b3956c25b, 0x3956c25b3956c25b, 0x3956c25b3956c25b,
 145  	0x3956c25b3956c25b, 0x3956c25b3956c25b, 0x3956c25b3956c25b, 0x3956c25b3956c25b,
 146  	0x59f111f159f111f1, 0x59f111f159f111f1, 0x59f111f159f111f1, 0x59f111f159f111f1,
 147  	0x59f111f159f111f1, 0x59f111f159f111f1, 0x59f111f159f111f1, 0x59f111f159f111f1,
 148  	0x923f82a4923f82a4, 0x923f82a4923f82a4, 0x923f82a4923f82a4, 0x923f82a4923f82a4,
 149  	0x923f82a4923f82a4, 0x923f82a4923f82a4, 0x923f82a4923f82a4, 0x923f82a4923f82a4,
 150  	0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5,
 151  	0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5,
 152  	0xd807aa98d807aa98, 0xd807aa98d807aa98, 0xd807aa98d807aa98, 0xd807aa98d807aa98,
 153  	0xd807aa98d807aa98, 0xd807aa98d807aa98, 0xd807aa98d807aa98, 0xd807aa98d807aa98,
 154  	0x12835b0112835b01, 0x12835b0112835b01, 0x12835b0112835b01, 0x12835b0112835b01,
 155  	0x12835b0112835b01, 0x12835b0112835b01, 0x12835b0112835b01, 0x12835b0112835b01,
 156  	0x243185be243185be, 0x243185be243185be, 0x243185be243185be, 0x243185be243185be,
 157  	0x243185be243185be, 0x243185be243185be, 0x243185be243185be, 0x243185be243185be,
 158  	0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3,
 159  	0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3,
 160  	0x72be5d7472be5d74, 0x72be5d7472be5d74, 0x72be5d7472be5d74, 0x72be5d7472be5d74,
 161  	0x72be5d7472be5d74, 0x72be5d7472be5d74, 0x72be5d7472be5d74, 0x72be5d7472be5d74,
 162  	0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe,
 163  	0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe,
 164  	0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7,
 165  	0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7,
 166  	0xc19bf174c19bf174, 0xc19bf174c19bf174, 0xc19bf174c19bf174, 0xc19bf174c19bf174,
 167  	0xc19bf174c19bf174, 0xc19bf174c19bf174, 0xc19bf174c19bf174, 0xc19bf174c19bf174,
 168  	0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1,
 169  	0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1,
 170  	0xefbe4786efbe4786, 0xefbe4786efbe4786, 0xefbe4786efbe4786, 0xefbe4786efbe4786,
 171  	0xefbe4786efbe4786, 0xefbe4786efbe4786, 0xefbe4786efbe4786, 0xefbe4786efbe4786,
 172  	0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6,
 173  	0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6,
 174  	0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc,
 175  	0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc,
 176  	0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f,
 177  	0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f,
 178  	0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa,
 179  	0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa,
 180  	0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc,
 181  	0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc,
 182  	0x76f988da76f988da, 0x76f988da76f988da, 0x76f988da76f988da, 0x76f988da76f988da,
 183  	0x76f988da76f988da, 0x76f988da76f988da, 0x76f988da76f988da, 0x76f988da76f988da,
 184  	0x983e5152983e5152, 0x983e5152983e5152, 0x983e5152983e5152, 0x983e5152983e5152,
 185  	0x983e5152983e5152, 0x983e5152983e5152, 0x983e5152983e5152, 0x983e5152983e5152,
 186  	0xa831c66da831c66d, 0xa831c66da831c66d, 0xa831c66da831c66d, 0xa831c66da831c66d,
 187  	0xa831c66da831c66d, 0xa831c66da831c66d, 0xa831c66da831c66d, 0xa831c66da831c66d,
 188  	0xb00327c8b00327c8, 0xb00327c8b00327c8, 0xb00327c8b00327c8, 0xb00327c8b00327c8,
 189  	0xb00327c8b00327c8, 0xb00327c8b00327c8, 0xb00327c8b00327c8, 0xb00327c8b00327c8,
 190  	0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7,
 191  	0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7,
 192  	0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3,
 193  	0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3,
 194  	0xd5a79147d5a79147, 0xd5a79147d5a79147, 0xd5a79147d5a79147, 0xd5a79147d5a79147,
 195  	0xd5a79147d5a79147, 0xd5a79147d5a79147, 0xd5a79147d5a79147, 0xd5a79147d5a79147,
 196  	0x06ca635106ca6351, 0x06ca635106ca6351, 0x06ca635106ca6351, 0x06ca635106ca6351,
 197  	0x06ca635106ca6351, 0x06ca635106ca6351, 0x06ca635106ca6351, 0x06ca635106ca6351,
 198  	0x1429296714292967, 0x1429296714292967, 0x1429296714292967, 0x1429296714292967,
 199  	0x1429296714292967, 0x1429296714292967, 0x1429296714292967, 0x1429296714292967,
 200  	0x27b70a8527b70a85, 0x27b70a8527b70a85, 0x27b70a8527b70a85, 0x27b70a8527b70a85,
 201  	0x27b70a8527b70a85, 0x27b70a8527b70a85, 0x27b70a8527b70a85, 0x27b70a8527b70a85,
 202  	0x2e1b21382e1b2138, 0x2e1b21382e1b2138, 0x2e1b21382e1b2138, 0x2e1b21382e1b2138,
 203  	0x2e1b21382e1b2138, 0x2e1b21382e1b2138, 0x2e1b21382e1b2138, 0x2e1b21382e1b2138,
 204  	0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc,
 205  	0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc,
 206  	0x53380d1353380d13, 0x53380d1353380d13, 0x53380d1353380d13, 0x53380d1353380d13,
 207  	0x53380d1353380d13, 0x53380d1353380d13, 0x53380d1353380d13, 0x53380d1353380d13,
 208  	0x650a7354650a7354, 0x650a7354650a7354, 0x650a7354650a7354, 0x650a7354650a7354,
 209  	0x650a7354650a7354, 0x650a7354650a7354, 0x650a7354650a7354, 0x650a7354650a7354,
 210  	0x766a0abb766a0abb, 0x766a0abb766a0abb, 0x766a0abb766a0abb, 0x766a0abb766a0abb,
 211  	0x766a0abb766a0abb, 0x766a0abb766a0abb, 0x766a0abb766a0abb, 0x766a0abb766a0abb,
 212  	0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e,
 213  	0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e,
 214  	0x92722c8592722c85, 0x92722c8592722c85, 0x92722c8592722c85, 0x92722c8592722c85,
 215  	0x92722c8592722c85, 0x92722c8592722c85, 0x92722c8592722c85, 0x92722c8592722c85,
 216  	0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1,
 217  	0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1,
 218  	0xa81a664ba81a664b, 0xa81a664ba81a664b, 0xa81a664ba81a664b, 0xa81a664ba81a664b,
 219  	0xa81a664ba81a664b, 0xa81a664ba81a664b, 0xa81a664ba81a664b, 0xa81a664ba81a664b,
 220  	0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70,
 221  	0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70,
 222  	0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3,
 223  	0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3,
 224  	0xd192e819d192e819, 0xd192e819d192e819, 0xd192e819d192e819, 0xd192e819d192e819,
 225  	0xd192e819d192e819, 0xd192e819d192e819, 0xd192e819d192e819, 0xd192e819d192e819,
 226  	0xd6990624d6990624, 0xd6990624d6990624, 0xd6990624d6990624, 0xd6990624d6990624,
 227  	0xd6990624d6990624, 0xd6990624d6990624, 0xd6990624d6990624, 0xd6990624d6990624,
 228  	0xf40e3585f40e3585, 0xf40e3585f40e3585, 0xf40e3585f40e3585, 0xf40e3585f40e3585,
 229  	0xf40e3585f40e3585, 0xf40e3585f40e3585, 0xf40e3585f40e3585, 0xf40e3585f40e3585,
 230  	0x106aa070106aa070, 0x106aa070106aa070, 0x106aa070106aa070, 0x106aa070106aa070,
 231  	0x106aa070106aa070, 0x106aa070106aa070, 0x106aa070106aa070, 0x106aa070106aa070,
 232  	0x19a4c11619a4c116, 0x19a4c11619a4c116, 0x19a4c11619a4c116, 0x19a4c11619a4c116,
 233  	0x19a4c11619a4c116, 0x19a4c11619a4c116, 0x19a4c11619a4c116, 0x19a4c11619a4c116,
 234  	0x1e376c081e376c08, 0x1e376c081e376c08, 0x1e376c081e376c08, 0x1e376c081e376c08,
 235  	0x1e376c081e376c08, 0x1e376c081e376c08, 0x1e376c081e376c08, 0x1e376c081e376c08,
 236  	0x2748774c2748774c, 0x2748774c2748774c, 0x2748774c2748774c, 0x2748774c2748774c,
 237  	0x2748774c2748774c, 0x2748774c2748774c, 0x2748774c2748774c, 0x2748774c2748774c,
 238  	0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5,
 239  	0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5,
 240  	0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3,
 241  	0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3,
 242  	0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a,
 243  	0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a,
 244  	0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f,
 245  	0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f,
 246  	0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3,
 247  	0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3,
 248  	0x748f82ee748f82ee, 0x748f82ee748f82ee, 0x748f82ee748f82ee, 0x748f82ee748f82ee,
 249  	0x748f82ee748f82ee, 0x748f82ee748f82ee, 0x748f82ee748f82ee, 0x748f82ee748f82ee,
 250  	0x78a5636f78a5636f, 0x78a5636f78a5636f, 0x78a5636f78a5636f, 0x78a5636f78a5636f,
 251  	0x78a5636f78a5636f, 0x78a5636f78a5636f, 0x78a5636f78a5636f, 0x78a5636f78a5636f,
 252  	0x84c8781484c87814, 0x84c8781484c87814, 0x84c8781484c87814, 0x84c8781484c87814,
 253  	0x84c8781484c87814, 0x84c8781484c87814, 0x84c8781484c87814, 0x84c8781484c87814,
 254  	0x8cc702088cc70208, 0x8cc702088cc70208, 0x8cc702088cc70208, 0x8cc702088cc70208,
 255  	0x8cc702088cc70208, 0x8cc702088cc70208, 0x8cc702088cc70208, 0x8cc702088cc70208,
 256  	0x90befffa90befffa, 0x90befffa90befffa, 0x90befffa90befffa, 0x90befffa90befffa,
 257  	0x90befffa90befffa, 0x90befffa90befffa, 0x90befffa90befffa, 0x90befffa90befffa,
 258  	0xa4506ceba4506ceb, 0xa4506ceba4506ceb, 0xa4506ceba4506ceb, 0xa4506ceba4506ceb,
 259  	0xa4506ceba4506ceb, 0xa4506ceba4506ceb, 0xa4506ceba4506ceb, 0xa4506ceba4506ceb,
 260  	0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7,
 261  	0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7,
 262  	0xc67178f2c67178f2, 0xc67178f2c67178f2, 0xc67178f2c67178f2, 0xc67178f2c67178f2,
 263  	0xc67178f2c67178f2, 0xc67178f2c67178f2, 0xc67178f2c67178f2, 0xc67178f2c67178f2}
 264  
 265  // Interface function to assembly ode
 266  func blockAvx512(digests *[512]byte, input [16][]byte, mask []uint64) [16][Size]byte {
 267  
 268  	scratch := [512]byte{}
 269  	sha256X16Avx512(digests, &scratch, &table, mask, input)
 270  
 271  	output := [16][Size]byte{}
 272  	for i := 0; i < 16; i++ {
 273  		output[i] = getDigest(i, digests[:])
 274  	}
 275  
 276  	return output
 277  }
 278  
 279  func getDigest(index int, state []byte) (sum [Size]byte) {
 280  	for j := 0; j < 16; j += 2 {
 281  		for i := index*4 + j*Size; i < index*4+(j+1)*Size; i += Size {
 282  			binary.BigEndian.PutUint32(sum[j*2:], binary.LittleEndian.Uint32(state[i:i+4]))
 283  		}
 284  	}
 285  	return
 286  }
 287  
 288  // Message to send across input channel
 289  type blockInput struct {
 290  	uid   uint64
 291  	msg   []byte
 292  	reset bool
 293  	final bool
 294  	sumCh chan [Size]byte
 295  }
 296  
 297  // Avx512Server - Type to implement 16x parallel handling of SHA256 invocations.
      // Requests arrive over blocksCh and are handled by the Process goroutine,
      // which queues one block per lane and runs all lanes through the assembly
      // routine together.
 298  type Avx512Server struct {
 299  	blocksCh chan blockInput       // Input channel
 300  	totalIn  int                   // Total number of inputs waiting to be processed
 301  	lanes    [16]Avx512LaneInfo    // Array with info per lane (out of 16)
 302  	digests  map[uint64][Size]byte // Map of uids to (interim) digest results
 303  }
 304  
 305  // Avx512LaneInfo - Info for each lane. A lane with a nil block has nothing
      // queued; lanes are reset to the zero value after every processing run.
 306  type Avx512LaneInfo struct {
 307  	uid      uint64          // unique identification for this SHA processing
 308  	block    []byte          // input block to be processed
 309  	outputCh chan [Size]byte // channel for output result; only set for a final block
 310  }
 311  
 312  // NewAvx512Server - Create new object for parallel processing handling
 313  func NewAvx512Server() *Avx512Server {
 314  	a512srv := &Avx512Server{}
 315  	a512srv.digests = make(map[uint64][Size]byte)
 316  	a512srv.blocksCh = make(chan blockInput)
 317  
 318  	// Start a single thread for reading from the input channel
 319  	go a512srv.Process()
 320  	return a512srv
 321  }
 322  
 323  // Process - Sole handler for reading from the input channel
 324  func (a512srv *Avx512Server) Process() {
 325  	for {
 326  		select {
 327  		case block := <-a512srv.blocksCh:
 328  			if block.reset {
 329  				a512srv.reset(block.uid)
 330  				continue
 331  			}
 332  			index := block.uid & 0xf
 333  			// fmt.Println("Adding message:", block.uid, index)
 334  
 335  			if a512srv.lanes[index].block != nil { // If slot is already filled, process all inputs
 336  				//fmt.Println("Invoking Blocks()")
 337  				a512srv.blocks()
 338  			}
 339  			a512srv.totalIn++
 340  			a512srv.lanes[index] = Avx512LaneInfo{uid: block.uid, block: block.msg}
 341  			if block.final {
 342  				a512srv.lanes[index].outputCh = block.sumCh
 343  			}
 344  			if a512srv.totalIn == len(a512srv.lanes) {
 345  				// fmt.Println("Invoking Blocks() while FULL: ")
 346  				a512srv.blocks()
 347  			}
 348  
 349  			// TODO: test with larger timeout
 350  		case <-time.After(1 * time.Microsecond):
 351  			for _, lane := range a512srv.lanes {
 352  				if lane.block != nil { // check if there is any input to process
 353  					// fmt.Println("Invoking Blocks() on TIMEOUT: ")
 354  					a512srv.blocks()
 355  					break // we are done
 356  				}
 357  			}
 358  		}
 359  	}
 360  }
 361  
 362  // Do a reset for this calculation
 363  func (a512srv *Avx512Server) reset(uid uint64) {
 364  
 365  	// Check if there is a message still waiting to be processed (and remove if so)
 366  	for i, lane := range a512srv.lanes {
 367  		if lane.uid == uid {
 368  			if lane.block != nil {
 369  				a512srv.lanes[i] = Avx512LaneInfo{} // clear message
 370  				a512srv.totalIn--
 371  			}
 372  		}
 373  	}
 374  
 375  	// Delete entry from hash map
 376  	delete(a512srv.digests, uid)
 377  }
 378  
 379  // Invoke assembly and send results back
 380  func (a512srv *Avx512Server) blocks() {
 381  
 382  	inputs := [16][]byte{}
 383  	for i := range inputs {
 384  		inputs[i] = a512srv.lanes[i].block
 385  	}
 386  
 387  	mask := expandMask(genMask(inputs))
 388  	outputs := blockAvx512(a512srv.getDigests(), inputs, mask)
 389  
 390  	a512srv.totalIn = 0
 391  	for i := 0; i < len(outputs); i++ {
 392  		uid, outputCh := a512srv.lanes[i].uid, a512srv.lanes[i].outputCh
 393  		a512srv.digests[uid] = outputs[i]
 394  		a512srv.lanes[i] = Avx512LaneInfo{}
 395  
 396  		if outputCh != nil {
 397  			// Send back result
 398  			outputCh <- outputs[i]
 399  			delete(a512srv.digests, uid) // Delete entry from hashmap
 400  		}
 401  	}
 402  }
 403  
 404  func (a512srv *Avx512Server) Write(uid uint64, p []byte) (nn int, err error) {
 405  	a512srv.blocksCh <- blockInput{uid: uid, msg: p}
 406  	return len(p), nil
 407  }
 408  
 409  // Sum - return sha256 sum in bytes for a given sum id.
 410  func (a512srv *Avx512Server) Sum(uid uint64, p []byte) [32]byte {
 411  	sumCh := make(chan [32]byte)
 412  	a512srv.blocksCh <- blockInput{uid: uid, msg: p, final: true, sumCh: sumCh}
 413  	return <-sumCh
 414  }
 415  
 416  func (a512srv *Avx512Server) getDigests() *[512]byte {
 417  	digests := [512]byte{}
 418  	for i, lane := range a512srv.lanes {
 419  		a, ok := a512srv.digests[lane.uid]
 420  		if ok {
 421  			binary.BigEndian.PutUint32(digests[(i+0*16)*4:], binary.LittleEndian.Uint32(a[0:4]))
 422  			binary.BigEndian.PutUint32(digests[(i+1*16)*4:], binary.LittleEndian.Uint32(a[4:8]))
 423  			binary.BigEndian.PutUint32(digests[(i+2*16)*4:], binary.LittleEndian.Uint32(a[8:12]))
 424  			binary.BigEndian.PutUint32(digests[(i+3*16)*4:], binary.LittleEndian.Uint32(a[12:16]))
 425  			binary.BigEndian.PutUint32(digests[(i+4*16)*4:], binary.LittleEndian.Uint32(a[16:20]))
 426  			binary.BigEndian.PutUint32(digests[(i+5*16)*4:], binary.LittleEndian.Uint32(a[20:24]))
 427  			binary.BigEndian.PutUint32(digests[(i+6*16)*4:], binary.LittleEndian.Uint32(a[24:28]))
 428  			binary.BigEndian.PutUint32(digests[(i+7*16)*4:], binary.LittleEndian.Uint32(a[28:32]))
 429  		} else {
 430  			binary.LittleEndian.PutUint32(digests[(i+0*16)*4:], init0)
 431  			binary.LittleEndian.PutUint32(digests[(i+1*16)*4:], init1)
 432  			binary.LittleEndian.PutUint32(digests[(i+2*16)*4:], init2)
 433  			binary.LittleEndian.PutUint32(digests[(i+3*16)*4:], init3)
 434  			binary.LittleEndian.PutUint32(digests[(i+4*16)*4:], init4)
 435  			binary.LittleEndian.PutUint32(digests[(i+5*16)*4:], init5)
 436  			binary.LittleEndian.PutUint32(digests[(i+6*16)*4:], init6)
 437  			binary.LittleEndian.PutUint32(digests[(i+7*16)*4:], init7)
 438  		}
 439  	}
 440  	return &digests
 441  }
 442  
 443  // Helper struct for sorting blocks based on length
 444  type lane struct {
 445  	len uint
 446  	pos uint
 447  }
 448  
 449  type lanes []lane
 450  
 451  func (lns lanes) Len() int           { return len(lns) }
 452  func (lns lanes) Swap(i, j int)      { lns[i], lns[j] = lns[j], lns[i] }
 453  func (lns lanes) Less(i, j int) bool { return lns[i].len < lns[j].len }
 454  
 455  // Helper struct for
 456  type maskRounds struct {
 457  	mask   uint64
 458  	rounds uint64
 459  }
 460  
 461  func genMask(input [16][]byte) [16]maskRounds {
 462  
 463  	// Sort on blocks length small to large
 464  	var sorted [16]lane
 465  	for c, inpt := range input {
 466  		sorted[c] = lane{uint(len(inpt)), uint(c)}
 467  	}
 468  	sort.Sort(lanes(sorted[:]))
 469  
 470  	// Create mask array including 'rounds' between masks
 471  	m, round, index := uint64(0xffff), uint64(0), 0
 472  	var mr [16]maskRounds
 473  	for _, s := range sorted {
 474  		if s.len > 0 {
 475  			if uint64(s.len)>>6 > round {
 476  				mr[index] = maskRounds{m, (uint64(s.len) >> 6) - round}
 477  				index++
 478  			}
 479  			round = uint64(s.len) >> 6
 480  		}
 481  		m = m & ^(1 << uint(s.pos))
 482  	}
 483  
 484  	return mr
 485  }
 486  
 487  // TODO: remove function
 488  func expandMask(mr [16]maskRounds) []uint64 {
 489  	size := uint64(0)
 490  	for _, r := range mr {
 491  		size += r.rounds
 492  	}
 493  	result, index := make([]uint64, size), 0
 494  	for _, r := range mr {
 495  		for j := uint64(0); j < r.rounds; j++ {
 496  			result[index] = r.mask
 497  			index++
 498  		}
 499  	}
 500  	return result
 501  }
 502