shift.mx raw

   1  // Copyright 2025 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package asmgen
   6  
   7  // shiftVU generates lshVU and rshVU, which do
   8  // z, c = x << s and z, c = x >> s, for 0 < s < _W.
   9  func shiftVU(a *Asm, name []byte) {
  10  	// Because these routines can be called for z.Lsh(z, N) and z.Rsh(z, N),
  11  	// the input and output slices may be aliased at different offsets.
  12  	// For example (on 64-bit systems), during z.Lsh(z, 65), &z[0] == &x[1],
  13  	// and during z.Rsh(z, 65), &z[1] == &x[0].
  14  	// For left shift, we must process the slices from len(z)-1 down to 0,
  15  	// so that we don't overwrite a word before we need to read it.
  16  	// For right shift, we must process the slices from 0 up to len(z)-1.
  17  	// The different traversals at least make the two cases more consistent,
  18  	// since we're always delaying the output by one word compared
  19  	// to the input.
  20  
  21  	f := a.Func("func " + name + "(z, x []Word, s uint) (c Word)")
  22  
  23  	// Check for no input early, since we need to start by reading 1 word.
  24  	n := f.Arg("z_len")
  25  	a.JmpZero(n, "ret0")
  26  
  27  	// Start loop by reading first input word.
  28  	s := f.ArgHint("s", HintShiftCount)
  29  	p := f.Pipe()
  30  	if name == "lshVU" {
  31  		p.SetBackward()
  32  	}
  33  	unroll := []int{1, 4}
  34  	if a.Arch == Arch386 {
  35  		unroll = []int{1} // too few registers for more
  36  		p.SetUseIndexCounter()
  37  	}
  38  	p.LoadPtrs(n)
  39  	a.Comment("shift first word into carry")
  40  	prev := p.LoadN(1)[0][0]
  41  
  42  	// Decide how to shift. On systems with a wide shift (x86), use that.
  43  	// Otherwise, we need shift by s and negative (reverse) shift by 64-s or 32-s.
  44  	shift := a.Lsh
  45  	shiftWide := a.LshWide
  46  	negShift := a.Rsh
  47  	negShiftReg := a.RshReg
  48  	if name == "rshVU" {
  49  		shift = a.Rsh
  50  		shiftWide = a.RshWide
  51  		negShift = a.Lsh
  52  		negShiftReg = a.LshReg
  53  	}
  54  	if a.Arch.HasShiftWide() {
  55  		// Use wide shift to avoid needing negative shifts.
  56  		// The invariant is that prev holds the previous word (not shifted at all),
  57  		// to be used as input into the wide shift.
  58  		// After the loop finishes, prev holds the final output word to be written.
  59  		c := a.Reg()
  60  		shiftWide(s, prev, a.Imm(0), c)
  61  		f.StoreArg(c, "c")
  62  		a.Free(c)
  63  		a.Comment("shift remaining words")
  64  		p.Start(n, unroll...)
  65  		p.Loop(func(in [][]Reg, out [][]Reg) {
  66  			// We reuse the input registers as output, delayed one cycle; prev is the first output.
  67  			// After writing the outputs to memory, we can copy the final x value into prev
  68  			// for the next iteration.
  69  			old := prev
  70  			for i, x := range in[0] {
  71  				shiftWide(s, x, old, old)
  72  				out[0][i] = old
  73  				old = x
  74  			}
  75  			p.StoreN(out)
  76  			a.Mov(old, prev)
  77  		})
  78  		a.Comment("store final shifted bits")
  79  		shift(s, prev, prev)
  80  	} else {
  81  		// Construct values from x << s and x >> (64-s).
  82  		// After the first word has been processed, the invariant is that
  83  		// prev holds x << s, to be used as the high bits of the next output word,
  84  		// once we find the low bits after reading the next input word.
  85  		// After the loop finishes, prev holds the final output word to be written.
  86  		sNeg := a.Reg()
  87  		a.Mov(a.Imm(a.Arch.WordBits), sNeg)
  88  		a.Sub(s, sNeg, sNeg, SmashCarry)
  89  		c := a.Reg()
  90  		negShift(sNeg, prev, c)
  91  		shift(s, prev, prev)
  92  		f.StoreArg(c, "c")
  93  		a.Free(c)
  94  		a.Comment("shift remaining words")
  95  		p.Start(n, unroll...)
  96  		p.Loop(func(in, out [][]Reg) {
  97  			if a.HasRegShift() {
  98  				// ARM (32-bit) allows shifts in most arithmetic expressions,
  99  				// including OR, letting us combine the negShift and a.Or.
 100  				// The simplest way to manage the registers is to do StoreN for
 101  				// one output at a time, and since we don't use multi-register
 102  				// stores on ARM, that doesn't hurt us.
 103  				out[0] = out[0][:1]
 104  				for _, x := range in[0] {
 105  					a.Or(negShiftReg(sNeg, x), prev, prev)
 106  					out[0][0] = prev
 107  					p.StoreN(out)
 108  					shift(s, x, prev)
 109  				}
 110  				return
 111  			}
 112  			// We reuse the input registers as output, delayed one cycle; z0 is the first output.
 113  			z0 := a.Reg()
 114  			z := z0
 115  			for i, x := range in[0] {
 116  				negShift(sNeg, x, z)
 117  				a.Or(prev, z, z)
 118  				shift(s, x, prev)
 119  				out[0][i] = z
 120  				z = x
 121  			}
 122  			p.StoreN(out)
 123  		})
 124  		a.Comment("store final shifted bits")
 125  	}
 126  	p.StoreN([][]Reg{{prev}})
 127  	p.Done()
 128  	a.Free(s)
 129  	a.Ret()
 130  
 131  	// Return 0, used from above.
 132  	a.Label("ret0")
 133  	f.StoreArg(a.Imm(0), "c")
 134  	a.Ret()
 135  }
 136