asm6.go raw

   1  // Inferno utils/6l/span.c
   2  // https://bitbucket.org/inferno-os/inferno-os/src/master/utils/6l/span.c
   3  //
   4  //	Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
   5  //	Portions Copyright © 1995-1997 C H Forsyth (forsyth@terzarima.net)
   6  //	Portions Copyright © 1997-1999 Vita Nuova Limited
   7  //	Portions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com)
   8  //	Portions Copyright © 2004,2006 Bruce Ellis
   9  //	Portions Copyright © 2005-2007 C H Forsyth (forsyth@terzarima.net)
  10  //	Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
  11  //	Portions Copyright © 2009 The Go Authors. All rights reserved.
  12  //
  13  // Permission is hereby granted, free of charge, to any person obtaining a copy
  14  // of this software and associated documentation files (the "Software"), to deal
  15  // in the Software without restriction, including without limitation the rights
  16  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  17  // copies of the Software, and to permit persons to whom the Software is
  18  // furnished to do so, subject to the following conditions:
  19  //
  20  // The above copyright notice and this permission notice shall be included in
  21  // all copies or substantial portions of the Software.
  22  //
  23  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  24  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  25  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
  26  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  27  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  28  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  29  // THE SOFTWARE.
  30  
  31  package x86
  32  
  33  import (
  34  	"github.com/twitchyliquid64/golang-asm/obj"
  35  	"github.com/twitchyliquid64/golang-asm/objabi"
  36  	"github.com/twitchyliquid64/golang-asm/sys"
  37  	"encoding/binary"
  38  	"fmt"
  39  	"log"
  40  	"strings"
  41  )
  42  
  43  var (
  44  	plan9privates *obj.LSym
  45  	deferreturn   *obj.LSym
  46  )
  47  
  48  // Instruction layout.
  49  
  50  // Loop alignment constants:
  51  // we want to align each loop entry to a loopAlign-byte boundary,
  52  // and are willing to insert at most maxLoopPad bytes of NOP to do so.
  53  // We define a loop entry as the target of a backward jump.
  54  //
  55  // gcc uses maxLoopPad = 10 for its 'generic x86-64' config,
  56  // and it aligns all jump targets, not just backward jump targets.
  57  //
  58  // As of 6/1/2012, the effect of setting maxLoopPad = 10 here
  59  // is very slight but negative, so the alignment is disabled by
  60  // setting maxLoopPad = 0. The code is here for reference and
  61  // for future experiments.
  62  //
  63  const (
  64  	loopAlign  = 16 // alignment boundary, in bytes, for loop entries
  65  	maxLoopPad = 0  // maximum bytes of NOP padding per loop entry; 0 disables alignment
  66  )
  67  
  68  // Bit flags that are used to express jump target properties.
      // The values are successive powers of two (1, 2, 4), so several
      // flags can be combined in one value.
  69  const (
  70  	// branchBackwards marks targets that are located behind.
  71  	// Used to express jumps to loop headers.
  72  	branchBackwards = (1 << iota)
  73  	// branchShort marks branches whose target is close,
  74  	// with an offset in the -128..127 range.
  75  	branchShort
  76  	// branchLoopHead marks loop entry.
  77  	// Used to insert padding for misaligned loops.
  78  	branchLoopHead
  79  )
  80  
  81  // opBytes holds optab encoding bytes.
  82  // Each ytab reserves a fixed number of bytes in this array.
  83  //
  84  // The size should be the minimal number of bytes that
  85  // is enough to hold the biggest optab op lines.
  86  type opBytes [31]uint8
  87  
      // Optab is one row of the optab instruction table. The long comment
      // above optab describes how a row is selected by p.As and how its
      // ytab forms and opcode bytes are walked in parallel.
  88  type Optab struct {
  89  	as     obj.As  // instruction this row encodes; matched against p.As
  90  	ytab   []ytab  // operand forms accepted by the instruction
  91  	prefix uint8   // one of the P* constants; controls the prefix bytes to emit
  92  	op     opBytes // opcode bytes; each ytab form consumes its share of them in order
  93  }
  94  
      // movtab is a table row keyed by instruction, three uint8 operand
      // descriptors, a code byte and up to four op bytes.
      // NOTE(review): the table that uses this type is outside this chunk;
      // presumably ft/f3t/tt are the Ytypes of the from, third and to
      // operands and code selects the encoding strategy — confirm there.
  95  type movtab struct {
  96  	as   obj.As
  97  	ft   uint8
  98  	f3t  uint8
  99  	tt   uint8
 100  	code uint8
 101  	op   [4]uint8
 102  }
 103  
      // Operand classes ("Ytypes"). The ytab tables list, for each instruction
      // form, the class every operand must satisfy, and ycover is indexed by
      // pairs of these values. Ymax counts the classes (it sizes ycover), so it
      // must remain the final entry.
 104  const (
 105  	Yxxx = iota
 106  	Ynone
 107  	Yi0 // $0
 108  	Yi1 // $1
 109  	Yu2 // $x, x fits in uint2
 110  	Yi8 // $x, x fits in int8
 111  	Yu8 // $x, x fits in uint8
 112  	Yu7 // $x, x in 0..127 (fits in both int8 and uint8)
 113  	Ys32
 114  	Yi32
 115  	Yi64
 116  	Yiauto
 117  	Yal
 118  	Ycl
 119  	Yax
 120  	Ycx
 121  	Yrb
 122  	Yrl
 123  	Yrl32 // Yrl on 32-bit system
 124  	Yrf
 125  	Yf0
 126  	Yrx
 127  	Ymb
 128  	Yml
 129  	Ym
 130  	Ybr
 131  	Ycs
 132  	Yss
 133  	Yds
 134  	Yes
 135  	Yfs
 136  	Ygs
 137  	Ygdtr
 138  	Yidtr
 139  	Yldtr
 140  	Ymsw
 141  	Ytask
 142  	Ycr0
 143  	Ycr1
 144  	Ycr2
 145  	Ycr3
 146  	Ycr4
 147  	Ycr5
 148  	Ycr6
 149  	Ycr7
 150  	Ycr8
 151  	Ydr0
 152  	Ydr1
 153  	Ydr2
 154  	Ydr3
 155  	Ydr4
 156  	Ydr5
 157  	Ydr6
 158  	Ydr7
 159  	Ytr0
 160  	Ytr1
 161  	Ytr2
 162  	Ytr3
 163  	Ytr4
 164  	Ytr5
 165  	Ytr6
 166  	Ytr7
 167  	Ymr
 168  	Ymm
 169  	Yxr0          // X0 only. "<XMM0>" notation in Intel manual.
 170  	YxrEvexMulti4 // [ X<n> - X<n+3> ]; multisource YxrEvex
 171  	Yxr           // X0..X15
 172  	YxrEvex       // X0..X31
 173  	Yxm
 174  	YxmEvex       // YxrEvex+Ym
 175  	Yxvm          // VSIB vector array; vm32x/vm64x
 176  	YxvmEvex      // Yxvm which permits High-16 X register as index.
 177  	YyrEvexMulti4 // [ Y<n> - Y<n+3> ]; multisource YyrEvex
 178  	Yyr           // Y0..Y15
 179  	YyrEvex       // Y0..Y31
 180  	Yym
 181  	YymEvex   // YyrEvex+Ym
 182  	Yyvm      // VSIB vector array; vm32y/vm64y
 183  	YyvmEvex  // Yyvm which permits High-16 Y register as index.
 184  	YzrMulti4 // [ Z<n> - Z<n+3> ]; multisource YzrEvex
 185  	Yzr       // Z0..Z31
 186  	Yzm       // Yzr+Ym
 187  	Yzvm      // VSIB vector array; vm32z/vm64z
 188  	Yk0       // K0
 189  	Yknot0    // K1..K7; write mask
 190  	Yk        // K0..K7; used for KOP
 191  	Ykm       // Yk+Ym; used for KOP
 192  	Ytls
 193  	Ytextsize
 194  	Yindir
 195  	Ymax
 196  )
 197  
      // Encoding forms ("Ztypes"). Every ytab entry names one of these as its
      // first field; per the comment on optab, doasm switches on it (yt.zcase)
      // to decide how the opcode bytes and operands are laid down.
      // NOTE(review): Zevex_first/Zevex_last look like range markers that
      // bracket the EVEX forms rather than real encodings — confirm at the
      // use site.
 198  const (
 199  	Zxxx = iota
 200  	Zlit
 201  	Zlitm_r
 202  	Zlitr_m
 203  	Zlit_m_r
 204  	Z_rp
 205  	Zbr
 206  	Zcall
 207  	Zcallcon
 208  	Zcallduff
 209  	Zcallind
 210  	Zcallindreg
 211  	Zib_
 212  	Zib_rp
 213  	Zibo_m
 214  	Zibo_m_xm
 215  	Zil_
 216  	Zil_rp
 217  	Ziq_rp
 218  	Zilo_m
 219  	Zjmp
 220  	Zjmpcon
 221  	Zloop
 222  	Zo_iw
 223  	Zm_o
 224  	Zm_r
 225  	Z_m_r
 226  	Zm2_r
 227  	Zm_r_xm
 228  	Zm_r_i_xm
 229  	Zm_r_xm_nr
 230  	Zr_m_xm_nr
 231  	Zibm_r // mmx1,mmx2/mem64,imm8
 232  	Zibr_m
 233  	Zmb_r
 234  	Zaut_r
 235  	Zo_m
 236  	Zo_m64
 237  	Zpseudo
 238  	Zr_m
 239  	Zr_m_xm
 240  	Zrp_
 241  	Z_ib
 242  	Z_il
 243  	Zm_ibo
 244  	Zm_ilo
 245  	Zib_rr
 246  	Zil_rr
 247  	Zbyte
 248  
 249  	Zvex_rm_v_r
 250  	Zvex_rm_v_ro
 251  	Zvex_r_v_rm
 252  	Zvex_i_rm_vo
 253  	Zvex_v_rm_r
 254  	Zvex_i_rm_r
 255  	Zvex_i_r_v
 256  	Zvex_i_rm_v_r
 257  	Zvex
 258  	Zvex_rm_r_vo
 259  	Zvex_i_r_rm
 260  	Zvex_hr_rm_v_r
 261  
 262  	Zevex_first
 263  	Zevex_i_r_k_rm
 264  	Zevex_i_r_rm
 265  	Zevex_i_rm_k_r
 266  	Zevex_i_rm_k_vo
 267  	Zevex_i_rm_r
 268  	Zevex_i_rm_v_k_r
 269  	Zevex_i_rm_v_r
 270  	Zevex_i_rm_vo
 271  	Zevex_k_rmo
 272  	Zevex_r_k_rm
 273  	Zevex_r_v_k_rm
 274  	Zevex_r_v_rm
 275  	Zevex_rm_k_r
 276  	Zevex_rm_v_k_r
 277  	Zevex_rm_v_r
 278  	Zevex_last
 279  
 280  	Zmax
 281  )
 282  
      // Prefix selectors (P*) and REX bit masks (Rx*).
      // The P* values fill the prefix column of optab rows and also appear
      // inside some opBytes lists (for example opBytes{Pe, 0xc2}). Several are
      // marked "symbolic": only their distinctness matters, not the value.
 283  const (
 284  	Px   = 0
 285  	Px1  = 1    // symbolic; exact value doesn't matter
 286  	P32  = 0x32 // 32-bit only
 287  	Pe   = 0x66 // operand escape
 288  	Pm   = 0x0f // 2byte opcode escape
 289  	Pq   = 0xff // both escapes: 66 0f
 290  	Pb   = 0xfe // byte operands
 291  	Pf2  = 0xf2 // xmm escape 1: f2 0f
 292  	Pf3  = 0xf3 // xmm escape 2: f3 0f
 293  	Pef3 = 0xf5 // xmm escape 2 with 16-bit prefix: 66 f3 0f
 294  	Pq3  = 0x67 // xmm escape 3: 66 48 0f
 295  	Pq4  = 0x68 // xmm escape 4: 66 0F 38
 296  	Pq4w = 0x69 // Pq4 with Rex.w 66 0F 38
 297  	Pq5  = 0x6a // xmm escape 5: F3 0F 38
 298  	Pq5w = 0x6b // Pq5 with Rex.w F3 0F 38
 299  	Pfw  = 0xf4 // Pf3 with Rex.w: f3 48 0f
 300  	Pw   = 0x48 // Rex.w
 301  	Pw8  = 0x90 // symbolic; exact value doesn't matter
 302  	Py   = 0x80 // defaults to 64-bit mode
 303  	Py1  = 0x81 // symbolic; exact value doesn't matter
 304  	Py3  = 0x83 // symbolic; exact value doesn't matter
 305  	Pavx = 0x84 // symbolic: exact value doesn't matter
 306  
 307  	RxrEvex = 1 << 4 // AVX512 extension to REX.R/VEX.R
 308  	Rxw     = 1 << 3 // =1, 64-bit operand size
 309  	Rxr     = 1 << 2 // extend modrm reg
 310  	Rxx     = 1 << 1 // extend sib index
 311  	Rxb     = 1 << 0 // extend modrm r/m, sib base, or opcode reg
 312  )
 313  
 314  const (
 315  	// Encoding for VEX prefix in tables.
 316  	// The P, L, and W fields are chosen to match
 317  	// their eventual locations in the VEX prefix bytes.
 318  
 323  	// Using spare bit to make leading [E]VEX encoding byte different from
 324  	// 0x0f even if all other VEX fields are 0.
 325  	avxEscape = 1 << 6
 326  
 327  	// P field - 2 bits
 328  	vex66 = 1 << 0
 329  	vexF3 = 2 << 0
 330  	vexF2 = 3 << 0
 331  	// L field - 1 bit (vexLZ, vexLIG and vex128 all encode L=0)
 332  	vexLZ  = 0 << 2
 333  	vexLIG = 0 << 2
 334  	vex128 = 0 << 2
 335  	vex256 = 1 << 2
 336  	// W field - 1 bit (vexWIG and vexW0 both encode W=0)
 337  	vexWIG = 0 << 7
 338  	vexW0  = 0 << 7
 339  	vexW1  = 1 << 7
 340  	// M field - 5 bits, but mostly reserved; we can store up to 3
 341  	vex0F   = 1 << 3
 342  	vex0F38 = 2 << 3
 343  	vex0F3A = 3 << 3
 344  )
 345  
      // ycover is a flattened Ymax-by-Ymax table of operand-class coverage.
      // Per the comment on optab, instinit sets ycover[a*Ymax+b] = 1 when an
      // operand of class a also counts as the more general class b.
 346  var ycover [Ymax * Ymax]uint8
 347  
      // reg holds one int per register number below MAXREG.
      // NOTE(review): it is filled in outside this chunk; presumably it maps
      // a register to its hardware encoding — confirm in instinit.
 348  var reg [MAXREG]int
 349  
      // regrex holds one int per register number, with one extra slot.
      // NOTE(review): presumably the REX bits a register requires — confirm
      // in instinit.
 350  var regrex [MAXREG + 1]int
 351  
      // The ytab tables that follow list the operand forms an instruction
      // accepts. Each entry is {Ztype, zoffset, argList{operand classes}}:
      // the Ztype picks the encoding form, and zoffset is how many of the
      // optab row's opcode bytes belong to that form (the z pointer advances
      // by it past every non-matching entry).

      // ynone: no operands, opcode emitted via Zlit.
      // Used by rows such as AAAA, ACLC and ACPUID.
 352  var ynone = []ytab{
 353  	{Zlit, 1, argList{}},
 354  }
 355  
 356  var ytext = []ytab{
 357  	{Zpseudo, 0, argList{Ymb, Ytextsize}},
 358  	{Zpseudo, 1, argList{Ymb, Yi32, Ytextsize}},
 359  }
 360  
      // ynop: Zpseudo forms taking no operand or a single operand.
      // NOTE(review): the Yiauto, Yml, Yrf and Yxr forms each appear twice;
      // the repeated rows look redundant — confirm before removing them.
 361  var ynop = []ytab{
 362  	{Zpseudo, 0, argList{}},
 363  	{Zpseudo, 0, argList{Yiauto}},
 364  	{Zpseudo, 0, argList{Yml}},
 365  	{Zpseudo, 0, argList{Yrf}},
 366  	{Zpseudo, 0, argList{Yxr}},
 367  	{Zpseudo, 0, argList{Yiauto}},
 368  	{Zpseudo, 0, argList{Yml}},
 369  	{Zpseudo, 0, argList{Yrf}},
 370  	{Zpseudo, 1, argList{Yxr}},
 371  }
 372  
 373  var yfuncdata = []ytab{
 374  	{Zpseudo, 0, argList{Yi32, Ym}},
 375  }
 376  
 377  var ypcdata = []ytab{
 378  	{Zpseudo, 0, argList{Yi32, Yi32}},
 379  }
 380  
      // yxorb: byte forms (immediate to AL, immediate to r/m, register to r/m,
      // r/m to register). Used by the AADCB, AADDB and AANDB rows.
 381  var yxorb = []ytab{
 382  	{Zib_, 1, argList{Yi32, Yal}},
 383  	{Zibo_m, 2, argList{Yi32, Ymb}},
 384  	{Zr_m, 1, argList{Yrb, Ymb}},
 385  	{Zm_r, 1, argList{Ymb, Yrb}},
 386  }
 387  
      // yaddl: the five forms walked through in the comment on optab.
      // Used by the ADC/ADD/AND L, Q and W rows.
 388  var yaddl = []ytab{
 389  	{Zibo_m, 2, argList{Yi8, Yml}},
 390  	{Zil_, 1, argList{Yi32, Yax}},
 391  	{Zilo_m, 2, argList{Yi32, Yml}},
 392  	{Zr_m, 1, argList{Yrl, Yml}},
 393  	{Zm_r, 1, argList{Yml, Yrl}},
 394  }
 395  
      // yincl: single-operand forms; used by the ADECL row.
 396  var yincl = []ytab{
 397  	{Z_rp, 1, argList{Yrl}},
 398  	{Zo_m, 2, argList{Yml}},
 399  }
 400  
      // yincq: yincl without the Z_rp register form; used by ADECQ and ADECW.
 401  var yincq = []ytab{
 402  	{Zo_m, 2, argList{Yml}},
 403  }
 404  
      // ycmpb: byte compare forms; used by the ACMPB row.
 405  var ycmpb = []ytab{
 406  	{Z_ib, 1, argList{Yal, Yi32}},
 407  	{Zm_ibo, 2, argList{Ymb, Yi32}},
 408  	{Zm_r, 1, argList{Ymb, Yrb}},
 409  	{Zr_m, 1, argList{Yrb, Ymb}},
 410  }
 411  
      // ycmpl: compare forms; used by the ACMPL, ACMPQ and ACMPW rows.
 412  var ycmpl = []ytab{
 413  	{Zm_ibo, 2, argList{Yml, Yi8}},
 414  	{Z_il, 1, argList{Yax, Yi32}},
 415  	{Zm_ilo, 2, argList{Yml, Yi32}},
 416  	{Zm_r, 1, argList{Yml, Yrl}},
 417  	{Zr_m, 1, argList{Yrl, Yml}},
 418  }
 419  
 420  var yshb = []ytab{
 421  	{Zo_m, 2, argList{Yi1, Ymb}},
 422  	{Zibo_m, 2, argList{Yu8, Ymb}},
 423  	{Zo_m, 2, argList{Ycx, Ymb}},
 424  }
 425  
 426  var yshl = []ytab{
 427  	{Zo_m, 2, argList{Yi1, Yml}},
 428  	{Zibo_m, 2, argList{Yu8, Yml}},
 429  	{Zo_m, 2, argList{Ycl, Yml}},
 430  	{Zo_m, 2, argList{Ycx, Yml}},
 431  }
 432  
 433  var ytestl = []ytab{
 434  	{Zil_, 1, argList{Yi32, Yax}},
 435  	{Zilo_m, 2, argList{Yi32, Yml}},
 436  	{Zr_m, 1, argList{Yrl, Yml}},
 437  	{Zm_r, 1, argList{Yml, Yrl}},
 438  }
 439  
 440  var ymovb = []ytab{
 441  	{Zr_m, 1, argList{Yrb, Ymb}},
 442  	{Zm_r, 1, argList{Ymb, Yrb}},
 443  	{Zib_rp, 1, argList{Yi32, Yrb}},
 444  	{Zibo_m, 2, argList{Yi32, Ymb}},
 445  }
 446  
      // ybtl: 8-bit immediate or register source against r/m.
      // Used by the ABT*, ABTC*, ABTR* and ABTS* rows.
 447  var ybtl = []ytab{
 448  	{Zibo_m, 2, argList{Yi8, Yml}},
 449  	{Zr_m, 1, argList{Yrl, Yml}},
 450  }
 451  
 452  var ymovw = []ytab{
 453  	{Zr_m, 1, argList{Yrl, Yml}},
 454  	{Zm_r, 1, argList{Yml, Yrl}},
 455  	{Zil_rp, 1, argList{Yi32, Yrl}},
 456  	{Zilo_m, 2, argList{Yi32, Yml}},
 457  	{Zaut_r, 2, argList{Yiauto, Yrl}},
 458  }
 459  
      // ymovl: the ymovw forms plus MMX and XMM MOVD forms.
 460  var ymovl = []ytab{
 461  	{Zr_m, 1, argList{Yrl, Yml}},
 462  	{Zm_r, 1, argList{Yml, Yrl}},
 463  	{Zil_rp, 1, argList{Yi32, Yrl}},
 464  	{Zilo_m, 2, argList{Yi32, Yml}},
 465  	{Zm_r_xm, 1, argList{Yml, Ymr}}, // MMX MOVD
 466  	{Zr_m_xm, 1, argList{Ymr, Yml}}, // MMX MOVD
 467  	{Zm_r_xm, 2, argList{Yml, Yxr}}, // XMM MOVD (32 bit)
 468  	{Zr_m_xm, 2, argList{Yxr, Yml}}, // XMM MOVD (32 bit)
 469  	{Zaut_r, 2, argList{Yiauto, Yrl}},
 470  }
 471  
      // yret: Zo_iw with no operand or with a Yi32 immediate.
 472  var yret = []ytab{
 473  	{Zo_iw, 1, argList{}},
 474  	{Zo_iw, 1, argList{Yi32}},
 475  }
 476  
      // ymovq: the trailing comment on each entry gives the prefix and/or
      // opcode byte that the entry corresponds to.
 477  var ymovq = []ytab{
 478  	// valid in 32-bit mode
 479  	{Zm_r_xm_nr, 1, argList{Ym, Ymr}},  // 0x6f MMX MOVQ (shorter encoding)
 480  	{Zr_m_xm_nr, 1, argList{Ymr, Ym}},  // 0x7f MMX MOVQ
 481  	{Zm_r_xm_nr, 2, argList{Yxr, Ymr}}, // Pf2, 0xd6 MOVDQ2Q
 482  	{Zm_r_xm_nr, 2, argList{Yxm, Yxr}}, // Pf3, 0x7e MOVQ xmm1/m64 -> xmm2
 483  	{Zr_m_xm_nr, 2, argList{Yxr, Yxm}}, // Pe, 0xd6 MOVQ xmm1 -> xmm2/m64
 484  
 485  	// valid only in 64-bit mode, usually with 64-bit prefix
 486  	{Zr_m, 1, argList{Yrl, Yml}},      // 0x89
 487  	{Zm_r, 1, argList{Yml, Yrl}},      // 0x8b
 488  	{Zilo_m, 2, argList{Ys32, Yrl}},   // 32 bit signed 0xc7,(0)
 489  	{Ziq_rp, 1, argList{Yi64, Yrl}},   // 0xb8 -- 32/64 bit immediate
 490  	{Zilo_m, 2, argList{Yi32, Yml}},   // 0xc7,(0)
 491  	{Zm_r_xm, 1, argList{Ymm, Ymr}},   // 0x6e MMX MOVD
 492  	{Zr_m_xm, 1, argList{Ymr, Ymm}},   // 0x7e MMX MOVD
 493  	{Zm_r_xm, 2, argList{Yml, Yxr}},   // Pe, 0x6e MOVD xmm load
 494  	{Zr_m_xm, 2, argList{Yxr, Yml}},   // Pe, 0x7e MOVD xmm store
 495  	{Zaut_r, 1, argList{Yiauto, Yrl}}, // 0 built-in LEAQ
 496  }
 497  
      // ymovbe: memory to register and register to memory, three opcode
      // bytes per form.
 498  var ymovbe = []ytab{
 499  	{Zlitm_r, 3, argList{Ym, Yrl}},
 500  	{Zlitr_m, 3, argList{Yrl, Ym}},
 501  }
 502  
      // ym_rl: memory source (Ym), register destination.
 503  var ym_rl = []ytab{
 504  	{Zm_r, 1, argList{Ym, Yrl}},
 505  }
 506  
      // yrl_m: register source, memory destination (Ym).
      // Used by the ABOUNDL and ABOUNDW rows.
 507  var yrl_m = []ytab{
 508  	{Zr_m, 1, argList{Yrl, Ym}},
 509  }
 510  
 511  var ymb_rl = []ytab{
 512  	{Zmb_r, 1, argList{Ymb, Yrl}},
 513  }
 514  
      // yml_rl: r/m source (Yml), register destination.
      // Used by the ABSF*, ABSR*, ACMOV*, AADCX* and AADOX* rows.
 515  var yml_rl = []ytab{
 516  	{Zm_r, 1, argList{Yml, Yrl}},
 517  }
 518  
      // yrl_ml: register source, r/m destination (Yml). Used by the AARPL row.
 519  var yrl_ml = []ytab{
 520  	{Zr_m, 1, argList{Yrl, Yml}},
 521  }
 522  
 523  var yml_mb = []ytab{
 524  	{Zr_m, 1, argList{Yrb, Ymb}},
 525  	{Zm_r, 1, argList{Ymb, Yrb}},
 526  }
 527  
 528  var yrb_mb = []ytab{
 529  	{Zr_m, 1, argList{Yrb, Ymb}},
 530  }
 531  
 532  var yxchg = []ytab{
 533  	{Z_rp, 1, argList{Yax, Yrl}},
 534  	{Zrp_, 1, argList{Yrl, Yax}},
 535  	{Zr_m, 1, argList{Yrl, Yml}},
 536  	{Zm_r, 1, argList{Yml, Yrl}},
 537  }
 538  
      // ydivl: single r/m operand; used by the ADIVL and ADIVQ rows.
 539  var ydivl = []ytab{
 540  	{Zm_o, 2, argList{Yml}},
 541  }
 542  
      // ydivb: byte variant of ydivl; used by the ADIVB row.
 543  var ydivb = []ytab{
 544  	{Zm_o, 2, argList{Ymb}},
 545  }
 546  
 547  var yimul = []ytab{
 548  	{Zm_o, 2, argList{Yml}},
 549  	{Zib_rr, 1, argList{Yi8, Yrl}},
 550  	{Zil_rr, 1, argList{Yi32, Yrl}},
 551  	{Zm_r, 2, argList{Yml, Yrl}},
 552  }
 553  
      // yimul3: three-operand forms (immediate, r/m, register).
 554  var yimul3 = []ytab{
 555  	{Zibm_r, 2, argList{Yi8, Yml, Yrl}},
 556  	{Zibm_r, 2, argList{Yi32, Yml, Yrl}},
 557  }
 558  
      // ybyte: a single Yi64 immediate emitted via Zbyte; used by the ABYTE row.
 559  var ybyte = []ytab{
 560  	{Zbyte, 1, argList{Yi64}},
 561  }
 562  
 563  var yin = []ytab{
 564  	{Zib_, 1, argList{Yi32}},
 565  	{Zlit, 1, argList{}},
 566  }
 567  
 568  var yint = []ytab{
 569  	{Zib_, 1, argList{Yi32}},
 570  }
 571  
 572  var ypushl = []ytab{
 573  	{Zrp_, 1, argList{Yrl}},
 574  	{Zm_o, 2, argList{Ym}},
 575  	{Zib_, 1, argList{Yi8}},
 576  	{Zil_, 1, argList{Yi32}},
 577  }
 578  
 579  var ypopl = []ytab{
 580  	{Z_rp, 1, argList{Yrl}},
 581  	{Zo_m, 2, argList{Ym}},
 582  }
 583  
 584  var ywrfsbase = []ytab{
 585  	{Zm_o, 2, argList{Yrl}},
 586  }
 587  
 588  var yrdrand = []ytab{
 589  	{Zo_m, 2, argList{Yrl}},
 590  }
 591  
      // yclflush: a single memory operand.
      // Used by the ACLDEMOTE, ACLFLUSH, ACLFLUSHOPT and ACLWB rows.
 592  var yclflush = []ytab{
 593  	{Zo_m, 2, argList{Ym}},
 594  }
 595  
      // ybswap: a single register operand; used by ABSWAPL and ABSWAPQ.
 596  var ybswap = []ytab{
 597  	{Z_rp, 2, argList{Yrl}},
 598  }
 599  
      // yscond: a single byte r/m operand; used here by the ADECB row.
 600  var yscond = []ytab{
 601  	{Zo_m, 2, argList{Ymb}},
 602  }
 603  
      // yjcond: a branch target, optionally preceded by a $0 or $1 constant.
 604  var yjcond = []ytab{
 605  	{Zbr, 0, argList{Ybr}},
 606  	{Zbr, 0, argList{Yi0, Ybr}},
 607  	{Zbr, 1, argList{Yi1, Ybr}},
 608  }
 609  
 610  var yloop = []ytab{
 611  	{Zloop, 1, argList{Ybr}},
 612  }
 613  
      // ycall: forms for the obj.ACALL row, whose opBytes are
      // {0xff, 02, 0xff, 0x15, 0xe8}; the zoffsets 0, 2, 2, 0, 1 below
      // determine which of those bytes each form starts at.
 614  var ycall = []ytab{
 615  	{Zcallindreg, 0, argList{Yml}},
 616  	{Zcallindreg, 2, argList{Yrx, Yrx}},
 617  	{Zcallind, 2, argList{Yindir}},
 618  	{Zcall, 0, argList{Ybr}},
 619  	{Zcallcon, 1, argList{Yi32}},
 620  }
 621  
 622  var yduff = []ytab{
 623  	{Zcallduff, 1, argList{Yi32}},
 624  }
 625  
 626  var yjmp = []ytab{
 627  	{Zo_m64, 2, argList{Yml}},
 628  	{Zjmp, 0, argList{Ybr}},
 629  	{Zjmpcon, 1, argList{Yi32}},
 630  }
 631  
      // The tables from yfmvd through ycompp pair a memory (Ym) or Yrf operand
      // with Yf0. Every entry owns two opcode bytes; Zm_o is used when Yf0 is
      // the second operand and Zo_m when it is the first.
 632  var yfmvd = []ytab{
 633  	{Zm_o, 2, argList{Ym, Yf0}},
 634  	{Zo_m, 2, argList{Yf0, Ym}},
 635  	{Zm_o, 2, argList{Yrf, Yf0}},
 636  	{Zo_m, 2, argList{Yf0, Yrf}},
 637  }
 638  
 639  var yfmvdp = []ytab{
 640  	{Zo_m, 2, argList{Yf0, Ym}},
 641  	{Zo_m, 2, argList{Yf0, Yrf}},
 642  }
 643  
 644  var yfmvf = []ytab{
 645  	{Zm_o, 2, argList{Ym, Yf0}},
 646  	{Zo_m, 2, argList{Yf0, Ym}},
 647  }
 648  
 649  var yfmvx = []ytab{
 650  	{Zm_o, 2, argList{Ym, Yf0}},
 651  }
 652  
 653  var yfmvp = []ytab{
 654  	{Zo_m, 2, argList{Yf0, Ym}},
 655  }
 656  
 657  var yfcmv = []ytab{
 658  	{Zm_o, 2, argList{Yrf, Yf0}},
 659  }
 660  
 661  var yfadd = []ytab{
 662  	{Zm_o, 2, argList{Ym, Yf0}},
 663  	{Zm_o, 2, argList{Yrf, Yf0}},
 664  	{Zo_m, 2, argList{Yf0, Yrf}},
 665  }
 666  
 667  var yfxch = []ytab{
 668  	{Zo_m, 2, argList{Yf0, Yrf}},
 669  	{Zm_o, 2, argList{Yrf, Yf0}},
 670  }
 671  
 672  var ycompp = []ytab{
 673  	{Zo_m, 2, argList{Yf0, Yrf}}, // botch is really f0,f1
 674  }
 675  
      // ystsw: a memory operand via Zo_m, or Yax via a literal opcode (Zlit).
 676  var ystsw = []ytab{
 677  	{Zo_m, 2, argList{Ym}},
 678  	{Zlit, 1, argList{Yax}},
 679  }
 680  
      // ysvrs_mo: a single memory operand encoded with Zm_o.
 681  var ysvrs_mo = []ytab{
 682  	{Zm_o, 2, argList{Ym}},
 683  }
 684  
 685  // unaryDst version of "ysvrs_mo": same operand, encoded with Zo_m.
 686  var ysvrs_om = []ytab{
 687  	{Zo_m, 2, argList{Ym}},
 688  }
 689  
      // ymm: an MMX form (Ymm -> Ymr) followed by an XMM form (Yxm -> Yxr).
 690  var ymm = []ytab{
 691  	{Zm_r_xm, 1, argList{Ymm, Ymr}},
 692  	{Zm_r_xm, 2, argList{Yxm, Yxr}},
 693  }
 694  
      // yxm: XMM register-or-memory source, XMM register destination.
      // Used by rows such as AADDPD, AANDPS, ACOMISD and ADIVPS.
 695  var yxm = []ytab{
 696  	{Zm_r_xm, 1, argList{Yxm, Yxr}},
 697  }
 698  
      // yxm_q4: same operands as yxm, but encoded with Zm_r instead of Zm_r_xm.
 699  var yxm_q4 = []ytab{
 700  	{Zm_r, 1, argList{Yxm, Yxr}},
 701  }
 702  
      // yxcvm1: Yxm source with an XMM or MMX register destination.
      // Used by the ACVTPD2PL, ACVTPS2PL, ACVTTPD2PL and ACVTTPS2PL rows.
 703  var yxcvm1 = []ytab{
 704  	{Zm_r_xm, 2, argList{Yxm, Yxr}},
 705  	{Zm_r_xm, 2, argList{Yxm, Ymr}},
 706  }
 707  
      // yxcvm2: Yxm or Ymm source with an XMM register destination.
      // Used by the ACVTPL2PD and ACVTPL2PS rows.
 708  var yxcvm2 = []ytab{
 709  	{Zm_r_xm, 2, argList{Yxm, Yxr}},
 710  	{Zm_r_xm, 2, argList{Ymm, Yxr}},
 711  }
 712  
 713  var yxr = []ytab{
 714  	{Zm_r_xm, 1, argList{Yxr, Yxr}},
 715  }
 716  
 717  var yxr_ml = []ytab{
 718  	{Zr_m_xm, 1, argList{Yxr, Yml}},
 719  }
 720  
 721  var ymr = []ytab{
 722  	{Zm_r, 1, argList{Ymr, Ymr}},
 723  }
 724  
 725  var ymr_ml = []ytab{
 726  	{Zr_m_xm, 1, argList{Ymr, Yml}},
 727  }
 728  
      // yxcmpi: Yxm, Yxr and a trailing 8-bit immediate.
      // Used by the ACMPPD, ACMPPS, ACMPSD and ACMPSS rows.
 729  var yxcmpi = []ytab{
 730  	{Zm_r_i_xm, 2, argList{Yxm, Yxr, Yi8}},
 731  }
 732  
 733  var yxmov = []ytab{
 734  	{Zm_r_xm, 1, argList{Yxm, Yxr}},
 735  	{Zr_m_xm, 1, argList{Yxr, Yxm}},
 736  }
 737  
      // yxcvfl: Yxm source, Yrl destination, one opcode byte.
      // Used by rows such as ACVTSD2SL and ACVTTSS2SL.
 738  var yxcvfl = []ytab{
 739  	{Zm_r_xm, 1, argList{Yxm, Yrl}},
 740  }
 741  
      // yxcvlf: Yml source, Yxr destination, one opcode byte.
      // Used by the ACVTSL2SD and ACVTSL2SS rows.
 742  var yxcvlf = []ytab{
 743  	{Zm_r_xm, 1, argList{Yml, Yxr}},
 744  }
 745  
      // yxcvfq: like yxcvfl but two opcode bytes; its rows (for example
      // ACVTSD2SQ) use prefix Pw and carry Pf2/Pf3 as the first op byte.
 746  var yxcvfq = []ytab{
 747  	{Zm_r_xm, 2, argList{Yxm, Yrl}},
 748  }
 749  
      // yxcvqf: like yxcvlf but two opcode bytes; used by ACVTSQ2SD and ACVTSQ2SS.
 750  var yxcvqf = []ytab{
 751  	{Zm_r_xm, 2, argList{Yml, Yxr}},
 752  }
 753  
      // yps: MMX and XMM forms, each with a register/memory source variant
      // and an 8-bit immediate variant.
 754  var yps = []ytab{
 755  	{Zm_r_xm, 1, argList{Ymm, Ymr}},
 756  	{Zibo_m_xm, 2, argList{Yi8, Ymr}},
 757  	{Zm_r_xm, 2, argList{Yxm, Yxr}},
 758  	{Zibo_m_xm, 3, argList{Yi8, Yxr}},
 759  }
 760  
 761  var yxrrl = []ytab{
 762  	{Zm_r, 1, argList{Yxr, Yrl}},
 763  }
 764  
 765  var ymrxr = []ytab{
 766  	{Zm_r, 1, argList{Ymr, Yxr}},
 767  	{Zm_r_xm, 1, argList{Yxm, Yxr}},
 768  }
 769  
 770  var ymshuf = []ytab{
 771  	{Zibm_r, 2, argList{Yi8, Ymm, Ymr}},
 772  }
 773  
 774  var ymshufb = []ytab{
 775  	{Zm2_r, 2, argList{Yxm, Yxr}},
 776  }
 777  
 778  // It should never have more than 1 entry,
 779  // because some optab entries have opcode sequences that
 780  // are longer than 2 bytes (zoffset=2 here),
 781  // ROUNDPD and ROUNDPS and recently added BLENDPD,
 782  // to name a few.
 783  var yxshuf = []ytab{
 784  	{Zibm_r, 2, argList{Yu8, Yxm, Yxr}},
 785  }
 786  
      // yextrw: uint8 immediate and XMM register source, with a register
      // (Zibm_r) or r/m (Zibr_m) destination.
 787  var yextrw = []ytab{
 788  	{Zibm_r, 2, argList{Yu8, Yxr, Yrl}},
 789  	{Zibr_m, 2, argList{Yu8, Yxr, Yml}},
 790  }
 791  
 792  var yextr = []ytab{
 793  	{Zibr_m, 3, argList{Yu8, Yxr, Ymm}},
 794  }
 795  
 796  var yinsrw = []ytab{
 797  	{Zibm_r, 2, argList{Yu8, Yml, Yxr}},
 798  }
 799  
 800  var yinsr = []ytab{
 801  	{Zibm_r, 3, argList{Yu8, Ymm, Yxr}},
 802  }
 803  
 804  var ypsdq = []ytab{
 805  	{Zibo_m, 2, argList{Yi8, Yxr}},
 806  }
 807  
      // ymskb: XMM or MMX register source, general register destination.
 808  var ymskb = []ytab{
 809  	{Zm_r_xm, 2, argList{Yxr, Yrl}},
 810  	{Zm_r_xm, 1, argList{Ymr, Yrl}},
 811  }
 812  
 813  var ycrc32l = []ytab{
 814  	{Zlitm_r, 0, argList{Yml, Yrl}},
 815  }
 816  
 817  var ycrc32b = []ytab{
 818  	{Zlitm_r, 0, argList{Ymb, Yrl}},
 819  }
 820  
 821  var yprefetch = []ytab{
 822  	{Zm_o, 2, argList{Ym}},
 823  }
 824  
 825  var yaes = []ytab{
 826  	{Zlitm_r, 2, argList{Yxm, Yxr}},
 827  }
 828  
 829  var yxbegin = []ytab{
 830  	{Zjmp, 1, argList{Ybr}},
 831  }
 832  
 833  var yxabort = []ytab{
 834  	{Zib_, 1, argList{Yu8}},
 835  }
 836  
 837  var ylddqu = []ytab{
 838  	{Zm_r, 1, argList{Ym, Yxr}},
 839  }
 840  
 841  var ypalignr = []ytab{
 842  	{Zibm_r, 2, argList{Yu8, Yxm, Yxr}},
 843  }
 844  
      // ysha256rnds2: first operand must be X0 (Yxr0).
 845  var ysha256rnds2 = []ytab{
 846  	{Zlit_m_r, 0, argList{Yxr0, Yxm, Yxr}},
 847  }
 848  
      // yblendvpd: first operand must be X0 (Yxr0).
 849  var yblendvpd = []ytab{
 850  	{Z_m_r, 1, argList{Yxr0, Yxm, Yxr}},
 851  }
 852  
      // ymmxmm0f38: an MMX form owning 3 opcode bytes, then an XMM form owning 5.
 853  var ymmxmm0f38 = []ytab{
 854  	{Zlitm_r, 3, argList{Ymm, Ymr}},
 855  	{Zlitm_r, 5, argList{Yxm, Yxr}},
 856  }
 857  
      // yextractps: the immediate must fit in two bits (Yu2).
 858  var yextractps = []ytab{
 859  	{Zibr_m, 2, argList{Yu2, Yxr, Yml}},
 860  }
 861  
      // ysha1rnds4: the immediate must fit in two bits (Yu2).
 862  var ysha1rnds4 = []ytab{
 863  	{Zibm_r, 2, argList{Yu2, Yxm, Yxr}},
 864  }
 865  
 866  // You are doasm, holding in your hand a *obj.Prog with p.As set to, say,
 867  // ACRC32, and p.From and p.To as operands (obj.Addr).  The linker scans optab
 868  // to find the entry with the given p.As and then looks through the ytable for
 869  // that instruction (the second field in the optab struct) for a line whose
 870  // argList values match the Ytypes of the instruction's operands.  The
 871  // function oclass computes the specific Ytype of an operand and then the set
 872  // of more general Ytypes that it satisfies is implied by the ycover table, set
 873  // up in instinit.  For example, oclass distinguishes the constants 0 and 1
 874  // from the more general 8-bit constants, but instinit says
 875  //
 876  //        ycover[Yi0*Ymax+Ys32] = 1
 877  //        ycover[Yi1*Ymax+Ys32] = 1
 878  //        ycover[Yi8*Ymax+Ys32] = 1
 879  //
 880  // which means that Yi0, Yi1, and Yi8 all count as Ys32 (signed 32)
 881  // if that's what an instruction can handle.
 882  //
 883  // In parallel with the scan through the ytable for the appropriate line, there
 884  // is a z pointer that starts out pointing at the strange magic byte list in
 885  // the Optab struct.  With each step past a non-matching ytable line, z
 886  // advances by the 2nd entry in the line (zoffset).  When a matching line is found, that
 887  // z pointer has the extra data to use in laying down the instruction bytes.
 888  // The actual bytes laid down are a function of the 1st entry in the line (that
 889  // is, the Ztype) and the z bytes.
 890  //
 891  // For example, let's look at AADDL.  The optab line says:
 892  //        {AADDL, yaddl, Px, opBytes{0x83, 00, 0x05, 0x81, 00, 0x01, 0x03}},
 893  //
 894  // and yaddl says
 895  //        var yaddl = []ytab{
 896  //                {Zibo_m, 2, argList{Yi8, Yml}},
 897  //                {Zil_, 1, argList{Yi32, Yax}},
 898  //                {Zilo_m, 2, argList{Yi32, Yml}},
 899  //                {Zr_m, 1, argList{Yrl, Yml}},
 900  //                {Zm_r, 1, argList{Yml, Yrl}},
 901  //        }
 902  //
 903  // so there are 5 possible types of ADDL instruction that can be laid down, and
 904  // possible states used to lay them down (Ztype and z pointer, assuming z
 905  // points at opBytes{0x83, 00, 0x05,0x81, 00, 0x01, 0x03}) are:
 906  //
 907  //        Yi8, Yml -> Zibo_m, z (0x83, 00)
 908  //        Yi32, Yax -> Zil_, z+2 (0x05)
 909  //        Yi32, Yml -> Zilo_m, z+2+1 (0x81, 0x00)
 910  //        Yrl, Yml -> Zr_m, z+2+1+2 (0x01)
 911  //        Yml, Yrl -> Zm_r, z+2+1+2+1 (0x03)
 912  //
 913  // The Pconstant in the optab line controls the prefix bytes to emit.  That's
 914  // relatively straightforward as this program goes.
 915  //
 916  // The switch on yt.zcase in doasm implements the various Z cases.  Zibo_m, for
 917  // example, is an opcode byte (z[0]) then an asmando (which is some kind of
 918  // encoded addressing mode for the Yml arg), and then a single immediate byte.
 919  // Zilo_m is the same but a long (32-bit) immediate.
 920  var optab =
 921  //	as, ytab, andproto, opcode
 922  [...]Optab{
 923  	{obj.AXXX, nil, 0, opBytes{}},
 924  	{AAAA, ynone, P32, opBytes{0x37}},
 925  	{AAAD, ynone, P32, opBytes{0xd5, 0x0a}},
 926  	{AAAM, ynone, P32, opBytes{0xd4, 0x0a}},
 927  	{AAAS, ynone, P32, opBytes{0x3f}},
 928  	{AADCB, yxorb, Pb, opBytes{0x14, 0x80, 02, 0x10, 0x12}},
 929  	{AADCL, yaddl, Px, opBytes{0x83, 02, 0x15, 0x81, 02, 0x11, 0x13}},
 930  	{AADCQ, yaddl, Pw, opBytes{0x83, 02, 0x15, 0x81, 02, 0x11, 0x13}},
 931  	{AADCW, yaddl, Pe, opBytes{0x83, 02, 0x15, 0x81, 02, 0x11, 0x13}},
 932  	{AADCXL, yml_rl, Pq4, opBytes{0xf6}},
 933  	{AADCXQ, yml_rl, Pq4w, opBytes{0xf6}},
 934  	{AADDB, yxorb, Pb, opBytes{0x04, 0x80, 00, 0x00, 0x02}},
 935  	{AADDL, yaddl, Px, opBytes{0x83, 00, 0x05, 0x81, 00, 0x01, 0x03}},
 936  	{AADDPD, yxm, Pq, opBytes{0x58}},
 937  	{AADDPS, yxm, Pm, opBytes{0x58}},
 938  	{AADDQ, yaddl, Pw, opBytes{0x83, 00, 0x05, 0x81, 00, 0x01, 0x03}},
 939  	{AADDSD, yxm, Pf2, opBytes{0x58}},
 940  	{AADDSS, yxm, Pf3, opBytes{0x58}},
 941  	{AADDSUBPD, yxm, Pq, opBytes{0xd0}},
 942  	{AADDSUBPS, yxm, Pf2, opBytes{0xd0}},
 943  	{AADDW, yaddl, Pe, opBytes{0x83, 00, 0x05, 0x81, 00, 0x01, 0x03}},
 944  	{AADOXL, yml_rl, Pq5, opBytes{0xf6}},
 945  	{AADOXQ, yml_rl, Pq5w, opBytes{0xf6}},
 946  	{AADJSP, nil, 0, opBytes{}},
 947  	{AANDB, yxorb, Pb, opBytes{0x24, 0x80, 04, 0x20, 0x22}},
 948  	{AANDL, yaddl, Px, opBytes{0x83, 04, 0x25, 0x81, 04, 0x21, 0x23}},
 949  	{AANDNPD, yxm, Pq, opBytes{0x55}},
 950  	{AANDNPS, yxm, Pm, opBytes{0x55}},
 951  	{AANDPD, yxm, Pq, opBytes{0x54}},
 952  	{AANDPS, yxm, Pm, opBytes{0x54}},
 953  	{AANDQ, yaddl, Pw, opBytes{0x83, 04, 0x25, 0x81, 04, 0x21, 0x23}},
 954  	{AANDW, yaddl, Pe, opBytes{0x83, 04, 0x25, 0x81, 04, 0x21, 0x23}},
 955  	{AARPL, yrl_ml, P32, opBytes{0x63}},
 956  	{ABOUNDL, yrl_m, P32, opBytes{0x62}},
 957  	{ABOUNDW, yrl_m, Pe, opBytes{0x62}},
 958  	{ABSFL, yml_rl, Pm, opBytes{0xbc}},
 959  	{ABSFQ, yml_rl, Pw, opBytes{0x0f, 0xbc}},
 960  	{ABSFW, yml_rl, Pq, opBytes{0xbc}},
 961  	{ABSRL, yml_rl, Pm, opBytes{0xbd}},
 962  	{ABSRQ, yml_rl, Pw, opBytes{0x0f, 0xbd}},
 963  	{ABSRW, yml_rl, Pq, opBytes{0xbd}},
 964  	{ABSWAPL, ybswap, Px, opBytes{0x0f, 0xc8}},
 965  	{ABSWAPQ, ybswap, Pw, opBytes{0x0f, 0xc8}},
 966  	{ABTCL, ybtl, Pm, opBytes{0xba, 07, 0xbb}},
 967  	{ABTCQ, ybtl, Pw, opBytes{0x0f, 0xba, 07, 0x0f, 0xbb}},
 968  	{ABTCW, ybtl, Pq, opBytes{0xba, 07, 0xbb}},
 969  	{ABTL, ybtl, Pm, opBytes{0xba, 04, 0xa3}},
 970  	{ABTQ, ybtl, Pw, opBytes{0x0f, 0xba, 04, 0x0f, 0xa3}},
 971  	{ABTRL, ybtl, Pm, opBytes{0xba, 06, 0xb3}},
 972  	{ABTRQ, ybtl, Pw, opBytes{0x0f, 0xba, 06, 0x0f, 0xb3}},
 973  	{ABTRW, ybtl, Pq, opBytes{0xba, 06, 0xb3}},
 974  	{ABTSL, ybtl, Pm, opBytes{0xba, 05, 0xab}},
 975  	{ABTSQ, ybtl, Pw, opBytes{0x0f, 0xba, 05, 0x0f, 0xab}},
 976  	{ABTSW, ybtl, Pq, opBytes{0xba, 05, 0xab}},
 977  	{ABTW, ybtl, Pq, opBytes{0xba, 04, 0xa3}},
 978  	{ABYTE, ybyte, Px, opBytes{1}},
 979  	{obj.ACALL, ycall, Px, opBytes{0xff, 02, 0xff, 0x15, 0xe8}},
 980  	{ACBW, ynone, Pe, opBytes{0x98}},
 981  	{ACDQ, ynone, Px, opBytes{0x99}},
 982  	{ACDQE, ynone, Pw, opBytes{0x98}},
 983  	{ACLAC, ynone, Pm, opBytes{01, 0xca}},
 984  	{ACLC, ynone, Px, opBytes{0xf8}},
 985  	{ACLD, ynone, Px, opBytes{0xfc}},
 986  	{ACLDEMOTE, yclflush, Pm, opBytes{0x1c, 00}},
 987  	{ACLFLUSH, yclflush, Pm, opBytes{0xae, 07}},
 988  	{ACLFLUSHOPT, yclflush, Pq, opBytes{0xae, 07}},
 989  	{ACLI, ynone, Px, opBytes{0xfa}},
 990  	{ACLTS, ynone, Pm, opBytes{0x06}},
 991  	{ACLWB, yclflush, Pq, opBytes{0xae, 06}},
 992  	{ACMC, ynone, Px, opBytes{0xf5}},
 993  	{ACMOVLCC, yml_rl, Pm, opBytes{0x43}},
 994  	{ACMOVLCS, yml_rl, Pm, opBytes{0x42}},
 995  	{ACMOVLEQ, yml_rl, Pm, opBytes{0x44}},
 996  	{ACMOVLGE, yml_rl, Pm, opBytes{0x4d}},
 997  	{ACMOVLGT, yml_rl, Pm, opBytes{0x4f}},
 998  	{ACMOVLHI, yml_rl, Pm, opBytes{0x47}},
 999  	{ACMOVLLE, yml_rl, Pm, opBytes{0x4e}},
1000  	{ACMOVLLS, yml_rl, Pm, opBytes{0x46}},
1001  	{ACMOVLLT, yml_rl, Pm, opBytes{0x4c}},
1002  	{ACMOVLMI, yml_rl, Pm, opBytes{0x48}},
1003  	{ACMOVLNE, yml_rl, Pm, opBytes{0x45}},
1004  	{ACMOVLOC, yml_rl, Pm, opBytes{0x41}},
1005  	{ACMOVLOS, yml_rl, Pm, opBytes{0x40}},
1006  	{ACMOVLPC, yml_rl, Pm, opBytes{0x4b}},
1007  	{ACMOVLPL, yml_rl, Pm, opBytes{0x49}},
1008  	{ACMOVLPS, yml_rl, Pm, opBytes{0x4a}},
1009  	{ACMOVQCC, yml_rl, Pw, opBytes{0x0f, 0x43}},
1010  	{ACMOVQCS, yml_rl, Pw, opBytes{0x0f, 0x42}},
1011  	{ACMOVQEQ, yml_rl, Pw, opBytes{0x0f, 0x44}},
1012  	{ACMOVQGE, yml_rl, Pw, opBytes{0x0f, 0x4d}},
1013  	{ACMOVQGT, yml_rl, Pw, opBytes{0x0f, 0x4f}},
1014  	{ACMOVQHI, yml_rl, Pw, opBytes{0x0f, 0x47}},
1015  	{ACMOVQLE, yml_rl, Pw, opBytes{0x0f, 0x4e}},
1016  	{ACMOVQLS, yml_rl, Pw, opBytes{0x0f, 0x46}},
1017  	{ACMOVQLT, yml_rl, Pw, opBytes{0x0f, 0x4c}},
1018  	{ACMOVQMI, yml_rl, Pw, opBytes{0x0f, 0x48}},
1019  	{ACMOVQNE, yml_rl, Pw, opBytes{0x0f, 0x45}},
1020  	{ACMOVQOC, yml_rl, Pw, opBytes{0x0f, 0x41}},
1021  	{ACMOVQOS, yml_rl, Pw, opBytes{0x0f, 0x40}},
1022  	{ACMOVQPC, yml_rl, Pw, opBytes{0x0f, 0x4b}},
1023  	{ACMOVQPL, yml_rl, Pw, opBytes{0x0f, 0x49}},
1024  	{ACMOVQPS, yml_rl, Pw, opBytes{0x0f, 0x4a}},
1025  	{ACMOVWCC, yml_rl, Pq, opBytes{0x43}},
1026  	{ACMOVWCS, yml_rl, Pq, opBytes{0x42}},
1027  	{ACMOVWEQ, yml_rl, Pq, opBytes{0x44}},
1028  	{ACMOVWGE, yml_rl, Pq, opBytes{0x4d}},
1029  	{ACMOVWGT, yml_rl, Pq, opBytes{0x4f}},
1030  	{ACMOVWHI, yml_rl, Pq, opBytes{0x47}},
1031  	{ACMOVWLE, yml_rl, Pq, opBytes{0x4e}},
1032  	{ACMOVWLS, yml_rl, Pq, opBytes{0x46}},
1033  	{ACMOVWLT, yml_rl, Pq, opBytes{0x4c}},
1034  	{ACMOVWMI, yml_rl, Pq, opBytes{0x48}},
1035  	{ACMOVWNE, yml_rl, Pq, opBytes{0x45}},
1036  	{ACMOVWOC, yml_rl, Pq, opBytes{0x41}},
1037  	{ACMOVWOS, yml_rl, Pq, opBytes{0x40}},
1038  	{ACMOVWPC, yml_rl, Pq, opBytes{0x4b}},
1039  	{ACMOVWPL, yml_rl, Pq, opBytes{0x49}},
1040  	{ACMOVWPS, yml_rl, Pq, opBytes{0x4a}},
1041  	{ACMPB, ycmpb, Pb, opBytes{0x3c, 0x80, 07, 0x38, 0x3a}},
1042  	{ACMPL, ycmpl, Px, opBytes{0x83, 07, 0x3d, 0x81, 07, 0x39, 0x3b}},
1043  	{ACMPPD, yxcmpi, Px, opBytes{Pe, 0xc2}},
1044  	{ACMPPS, yxcmpi, Pm, opBytes{0xc2, 0}},
1045  	{ACMPQ, ycmpl, Pw, opBytes{0x83, 07, 0x3d, 0x81, 07, 0x39, 0x3b}},
1046  	{ACMPSB, ynone, Pb, opBytes{0xa6}},
1047  	{ACMPSD, yxcmpi, Px, opBytes{Pf2, 0xc2}},
1048  	{ACMPSL, ynone, Px, opBytes{0xa7}},
1049  	{ACMPSQ, ynone, Pw, opBytes{0xa7}},
1050  	{ACMPSS, yxcmpi, Px, opBytes{Pf3, 0xc2}},
1051  	{ACMPSW, ynone, Pe, opBytes{0xa7}},
1052  	{ACMPW, ycmpl, Pe, opBytes{0x83, 07, 0x3d, 0x81, 07, 0x39, 0x3b}},
1053  	{ACOMISD, yxm, Pe, opBytes{0x2f}},
1054  	{ACOMISS, yxm, Pm, opBytes{0x2f}},
1055  	{ACPUID, ynone, Pm, opBytes{0xa2}},
1056  	{ACVTPL2PD, yxcvm2, Px, opBytes{Pf3, 0xe6, Pe, 0x2a}},
1057  	{ACVTPL2PS, yxcvm2, Pm, opBytes{0x5b, 0, 0x2a, 0}},
1058  	{ACVTPD2PL, yxcvm1, Px, opBytes{Pf2, 0xe6, Pe, 0x2d}},
1059  	{ACVTPD2PS, yxm, Pe, opBytes{0x5a}},
1060  	{ACVTPS2PL, yxcvm1, Px, opBytes{Pe, 0x5b, Pm, 0x2d}},
1061  	{ACVTPS2PD, yxm, Pm, opBytes{0x5a}},
1062  	{ACVTSD2SL, yxcvfl, Pf2, opBytes{0x2d}},
1063  	{ACVTSD2SQ, yxcvfq, Pw, opBytes{Pf2, 0x2d}},
1064  	{ACVTSD2SS, yxm, Pf2, opBytes{0x5a}},
1065  	{ACVTSL2SD, yxcvlf, Pf2, opBytes{0x2a}},
1066  	{ACVTSQ2SD, yxcvqf, Pw, opBytes{Pf2, 0x2a}},
1067  	{ACVTSL2SS, yxcvlf, Pf3, opBytes{0x2a}},
1068  	{ACVTSQ2SS, yxcvqf, Pw, opBytes{Pf3, 0x2a}},
1069  	{ACVTSS2SD, yxm, Pf3, opBytes{0x5a}},
1070  	{ACVTSS2SL, yxcvfl, Pf3, opBytes{0x2d}},
1071  	{ACVTSS2SQ, yxcvfq, Pw, opBytes{Pf3, 0x2d}},
1072  	{ACVTTPD2PL, yxcvm1, Px, opBytes{Pe, 0xe6, Pe, 0x2c}},
1073  	{ACVTTPS2PL, yxcvm1, Px, opBytes{Pf3, 0x5b, Pm, 0x2c}},
1074  	{ACVTTSD2SL, yxcvfl, Pf2, opBytes{0x2c}},
1075  	{ACVTTSD2SQ, yxcvfq, Pw, opBytes{Pf2, 0x2c}},
1076  	{ACVTTSS2SL, yxcvfl, Pf3, opBytes{0x2c}},
1077  	{ACVTTSS2SQ, yxcvfq, Pw, opBytes{Pf3, 0x2c}},
1078  	{ACWD, ynone, Pe, opBytes{0x99}},
1079  	{ACWDE, ynone, Px, opBytes{0x98}},
1080  	{ACQO, ynone, Pw, opBytes{0x99}},
1081  	{ADAA, ynone, P32, opBytes{0x27}},
1082  	{ADAS, ynone, P32, opBytes{0x2f}},
1083  	{ADECB, yscond, Pb, opBytes{0xfe, 01}},
1084  	{ADECL, yincl, Px1, opBytes{0x48, 0xff, 01}},
1085  	{ADECQ, yincq, Pw, opBytes{0xff, 01}},
1086  	{ADECW, yincq, Pe, opBytes{0xff, 01}},
1087  	{ADIVB, ydivb, Pb, opBytes{0xf6, 06}},
1088  	{ADIVL, ydivl, Px, opBytes{0xf7, 06}},
1089  	{ADIVPD, yxm, Pe, opBytes{0x5e}},
1090  	{ADIVPS, yxm, Pm, opBytes{0x5e}},
1091  	{ADIVQ, ydivl, Pw, opBytes{0xf7, 06}},
1092  	{ADIVSD, yxm, Pf2, opBytes{0x5e}},
1093  	{ADIVSS, yxm, Pf3, opBytes{0x5e}},
1094  	{ADIVW, ydivl, Pe, opBytes{0xf7, 06}},
1095  	{ADPPD, yxshuf, Pq, opBytes{0x3a, 0x41, 0}},
1096  	{ADPPS, yxshuf, Pq, opBytes{0x3a, 0x40, 0}},
1097  	{AEMMS, ynone, Pm, opBytes{0x77}},
1098  	{AEXTRACTPS, yextractps, Pq, opBytes{0x3a, 0x17, 0}},
1099  	{AENTER, nil, 0, opBytes{}}, // botch
1100  	{AFXRSTOR, ysvrs_mo, Pm, opBytes{0xae, 01, 0xae, 01}},
1101  	{AFXSAVE, ysvrs_om, Pm, opBytes{0xae, 00, 0xae, 00}},
1102  	{AFXRSTOR64, ysvrs_mo, Pw, opBytes{0x0f, 0xae, 01, 0x0f, 0xae, 01}},
1103  	{AFXSAVE64, ysvrs_om, Pw, opBytes{0x0f, 0xae, 00, 0x0f, 0xae, 00}},
1104  	{AHLT, ynone, Px, opBytes{0xf4}},
1105  	{AIDIVB, ydivb, Pb, opBytes{0xf6, 07}},
1106  	{AIDIVL, ydivl, Px, opBytes{0xf7, 07}},
1107  	{AIDIVQ, ydivl, Pw, opBytes{0xf7, 07}},
1108  	{AIDIVW, ydivl, Pe, opBytes{0xf7, 07}},
1109  	{AIMULB, ydivb, Pb, opBytes{0xf6, 05}},
1110  	{AIMULL, yimul, Px, opBytes{0xf7, 05, 0x6b, 0x69, Pm, 0xaf}},
1111  	{AIMULQ, yimul, Pw, opBytes{0xf7, 05, 0x6b, 0x69, Pm, 0xaf}},
1112  	{AIMULW, yimul, Pe, opBytes{0xf7, 05, 0x6b, 0x69, Pm, 0xaf}},
1113  	{AIMUL3W, yimul3, Pe, opBytes{0x6b, 00, 0x69, 00}},
1114  	{AIMUL3L, yimul3, Px, opBytes{0x6b, 00, 0x69, 00}},
1115  	{AIMUL3Q, yimul3, Pw, opBytes{0x6b, 00, 0x69, 00}},
1116  	{AINB, yin, Pb, opBytes{0xe4, 0xec}},
1117  	{AINW, yin, Pe, opBytes{0xe5, 0xed}},
1118  	{AINL, yin, Px, opBytes{0xe5, 0xed}},
1119  	{AINCB, yscond, Pb, opBytes{0xfe, 00}},
1120  	{AINCL, yincl, Px1, opBytes{0x40, 0xff, 00}},
1121  	{AINCQ, yincq, Pw, opBytes{0xff, 00}},
1122  	{AINCW, yincq, Pe, opBytes{0xff, 00}},
1123  	{AINSB, ynone, Pb, opBytes{0x6c}},
1124  	{AINSL, ynone, Px, opBytes{0x6d}},
1125  	{AINSERTPS, yxshuf, Pq, opBytes{0x3a, 0x21, 0}},
1126  	{AINSW, ynone, Pe, opBytes{0x6d}},
1127  	{AICEBP, ynone, Px, opBytes{0xf1}},
1128  	{AINT, yint, Px, opBytes{0xcd}},
1129  	{AINTO, ynone, P32, opBytes{0xce}},
1130  	{AIRETL, ynone, Px, opBytes{0xcf}},
1131  	{AIRETQ, ynone, Pw, opBytes{0xcf}},
1132  	{AIRETW, ynone, Pe, opBytes{0xcf}},
1133  	{AJCC, yjcond, Px, opBytes{0x73, 0x83, 00}},
1134  	{AJCS, yjcond, Px, opBytes{0x72, 0x82}},
1135  	{AJCXZL, yloop, Px, opBytes{0xe3}},
1136  	{AJCXZW, yloop, Px, opBytes{0xe3}},
1137  	{AJCXZQ, yloop, Px, opBytes{0xe3}},
1138  	{AJEQ, yjcond, Px, opBytes{0x74, 0x84}},
1139  	{AJGE, yjcond, Px, opBytes{0x7d, 0x8d}},
1140  	{AJGT, yjcond, Px, opBytes{0x7f, 0x8f}},
1141  	{AJHI, yjcond, Px, opBytes{0x77, 0x87}},
1142  	{AJLE, yjcond, Px, opBytes{0x7e, 0x8e}},
1143  	{AJLS, yjcond, Px, opBytes{0x76, 0x86}},
1144  	{AJLT, yjcond, Px, opBytes{0x7c, 0x8c}},
1145  	{AJMI, yjcond, Px, opBytes{0x78, 0x88}},
1146  	{obj.AJMP, yjmp, Px, opBytes{0xff, 04, 0xeb, 0xe9}},
1147  	{AJNE, yjcond, Px, opBytes{0x75, 0x85}},
1148  	{AJOC, yjcond, Px, opBytes{0x71, 0x81, 00}},
1149  	{AJOS, yjcond, Px, opBytes{0x70, 0x80, 00}},
1150  	{AJPC, yjcond, Px, opBytes{0x7b, 0x8b}},
1151  	{AJPL, yjcond, Px, opBytes{0x79, 0x89}},
1152  	{AJPS, yjcond, Px, opBytes{0x7a, 0x8a}},
1153  	{AHADDPD, yxm, Pq, opBytes{0x7c}},
1154  	{AHADDPS, yxm, Pf2, opBytes{0x7c}},
1155  	{AHSUBPD, yxm, Pq, opBytes{0x7d}},
1156  	{AHSUBPS, yxm, Pf2, opBytes{0x7d}},
1157  	{ALAHF, ynone, Px, opBytes{0x9f}},
1158  	{ALARL, yml_rl, Pm, opBytes{0x02}},
1159  	{ALARQ, yml_rl, Pw, opBytes{0x0f, 0x02}},
1160  	{ALARW, yml_rl, Pq, opBytes{0x02}},
1161  	{ALDDQU, ylddqu, Pf2, opBytes{0xf0}},
1162  	{ALDMXCSR, ysvrs_mo, Pm, opBytes{0xae, 02, 0xae, 02}},
1163  	{ALEAL, ym_rl, Px, opBytes{0x8d}},
1164  	{ALEAQ, ym_rl, Pw, opBytes{0x8d}},
1165  	{ALEAVEL, ynone, P32, opBytes{0xc9}},
1166  	{ALEAVEQ, ynone, Py, opBytes{0xc9}},
1167  	{ALEAVEW, ynone, Pe, opBytes{0xc9}},
1168  	{ALEAW, ym_rl, Pe, opBytes{0x8d}},
1169  	{ALOCK, ynone, Px, opBytes{0xf0}},
1170  	{ALODSB, ynone, Pb, opBytes{0xac}},
1171  	{ALODSL, ynone, Px, opBytes{0xad}},
1172  	{ALODSQ, ynone, Pw, opBytes{0xad}},
1173  	{ALODSW, ynone, Pe, opBytes{0xad}},
1174  	{ALONG, ybyte, Px, opBytes{4}},
1175  	{ALOOP, yloop, Px, opBytes{0xe2}},
1176  	{ALOOPEQ, yloop, Px, opBytes{0xe1}},
1177  	{ALOOPNE, yloop, Px, opBytes{0xe0}},
1178  	{ALTR, ydivl, Pm, opBytes{0x00, 03}},
1179  	{ALZCNTL, yml_rl, Pf3, opBytes{0xbd}},
1180  	{ALZCNTQ, yml_rl, Pfw, opBytes{0xbd}},
1181  	{ALZCNTW, yml_rl, Pef3, opBytes{0xbd}},
1182  	{ALSLL, yml_rl, Pm, opBytes{0x03}},
1183  	{ALSLW, yml_rl, Pq, opBytes{0x03}},
1184  	{ALSLQ, yml_rl, Pw, opBytes{0x0f, 0x03}},
1185  	{AMASKMOVOU, yxr, Pe, opBytes{0xf7}},
1186  	{AMASKMOVQ, ymr, Pm, opBytes{0xf7}},
1187  	{AMAXPD, yxm, Pe, opBytes{0x5f}},
1188  	{AMAXPS, yxm, Pm, opBytes{0x5f}},
1189  	{AMAXSD, yxm, Pf2, opBytes{0x5f}},
1190  	{AMAXSS, yxm, Pf3, opBytes{0x5f}},
1191  	{AMINPD, yxm, Pe, opBytes{0x5d}},
1192  	{AMINPS, yxm, Pm, opBytes{0x5d}},
1193  	{AMINSD, yxm, Pf2, opBytes{0x5d}},
1194  	{AMINSS, yxm, Pf3, opBytes{0x5d}},
1195  	{AMONITOR, ynone, Px, opBytes{0x0f, 0x01, 0xc8, 0}},
1196  	{AMWAIT, ynone, Px, opBytes{0x0f, 0x01, 0xc9, 0}},
1197  	{AMOVAPD, yxmov, Pe, opBytes{0x28, 0x29}},
1198  	{AMOVAPS, yxmov, Pm, opBytes{0x28, 0x29}},
1199  	{AMOVB, ymovb, Pb, opBytes{0x88, 0x8a, 0xb0, 0xc6, 00}},
1200  	{AMOVBLSX, ymb_rl, Pm, opBytes{0xbe}},
1201  	{AMOVBLZX, ymb_rl, Pm, opBytes{0xb6}},
1202  	{AMOVBQSX, ymb_rl, Pw, opBytes{0x0f, 0xbe}},
1203  	{AMOVBQZX, ymb_rl, Pw, opBytes{0x0f, 0xb6}},
1204  	{AMOVBWSX, ymb_rl, Pq, opBytes{0xbe}},
1205  	{AMOVSWW, ymb_rl, Pe, opBytes{0x0f, 0xbf}},
1206  	{AMOVBWZX, ymb_rl, Pq, opBytes{0xb6}},
1207  	{AMOVZWW, ymb_rl, Pe, opBytes{0x0f, 0xb7}},
1208  	{AMOVO, yxmov, Pe, opBytes{0x6f, 0x7f}},
1209  	{AMOVOU, yxmov, Pf3, opBytes{0x6f, 0x7f}},
1210  	{AMOVHLPS, yxr, Pm, opBytes{0x12}},
1211  	{AMOVHPD, yxmov, Pe, opBytes{0x16, 0x17}},
1212  	{AMOVHPS, yxmov, Pm, opBytes{0x16, 0x17}},
1213  	{AMOVL, ymovl, Px, opBytes{0x89, 0x8b, 0xb8, 0xc7, 00, 0x6e, 0x7e, Pe, 0x6e, Pe, 0x7e, 0}},
1214  	{AMOVLHPS, yxr, Pm, opBytes{0x16}},
1215  	{AMOVLPD, yxmov, Pe, opBytes{0x12, 0x13}},
1216  	{AMOVLPS, yxmov, Pm, opBytes{0x12, 0x13}},
1217  	{AMOVLQSX, yml_rl, Pw, opBytes{0x63}},
1218  	{AMOVLQZX, yml_rl, Px, opBytes{0x8b}},
1219  	{AMOVMSKPD, yxrrl, Pq, opBytes{0x50}},
1220  	{AMOVMSKPS, yxrrl, Pm, opBytes{0x50}},
1221  	{AMOVNTO, yxr_ml, Pe, opBytes{0xe7}},
1222  	{AMOVNTDQA, ylddqu, Pq4, opBytes{0x2a}},
1223  	{AMOVNTPD, yxr_ml, Pe, opBytes{0x2b}},
1224  	{AMOVNTPS, yxr_ml, Pm, opBytes{0x2b}},
1225  	{AMOVNTQ, ymr_ml, Pm, opBytes{0xe7}},
1226  	{AMOVQ, ymovq, Pw8, opBytes{0x6f, 0x7f, Pf2, 0xd6, Pf3, 0x7e, Pe, 0xd6, 0x89, 0x8b, 0xc7, 00, 0xb8, 0xc7, 00, 0x6e, 0x7e, Pe, 0x6e, Pe, 0x7e, 0}},
1227  	{AMOVQOZX, ymrxr, Pf3, opBytes{0xd6, 0x7e}},
1228  	{AMOVSB, ynone, Pb, opBytes{0xa4}},
1229  	{AMOVSD, yxmov, Pf2, opBytes{0x10, 0x11}},
1230  	{AMOVSL, ynone, Px, opBytes{0xa5}},
1231  	{AMOVSQ, ynone, Pw, opBytes{0xa5}},
1232  	{AMOVSS, yxmov, Pf3, opBytes{0x10, 0x11}},
1233  	{AMOVSW, ynone, Pe, opBytes{0xa5}},
1234  	{AMOVUPD, yxmov, Pe, opBytes{0x10, 0x11}},
1235  	{AMOVUPS, yxmov, Pm, opBytes{0x10, 0x11}},
1236  	{AMOVW, ymovw, Pe, opBytes{0x89, 0x8b, 0xb8, 0xc7, 00, 0}},
1237  	{AMOVWLSX, yml_rl, Pm, opBytes{0xbf}},
1238  	{AMOVWLZX, yml_rl, Pm, opBytes{0xb7}},
1239  	{AMOVWQSX, yml_rl, Pw, opBytes{0x0f, 0xbf}},
1240  	{AMOVWQZX, yml_rl, Pw, opBytes{0x0f, 0xb7}},
1241  	{AMPSADBW, yxshuf, Pq, opBytes{0x3a, 0x42, 0}},
1242  	{AMULB, ydivb, Pb, opBytes{0xf6, 04}},
1243  	{AMULL, ydivl, Px, opBytes{0xf7, 04}},
1244  	{AMULPD, yxm, Pe, opBytes{0x59}},
1245  	{AMULPS, yxm, Ym, opBytes{0x59}},
1246  	{AMULQ, ydivl, Pw, opBytes{0xf7, 04}},
1247  	{AMULSD, yxm, Pf2, opBytes{0x59}},
1248  	{AMULSS, yxm, Pf3, opBytes{0x59}},
1249  	{AMULW, ydivl, Pe, opBytes{0xf7, 04}},
1250  	{ANEGB, yscond, Pb, opBytes{0xf6, 03}},
1251  	{ANEGL, yscond, Px, opBytes{0xf7, 03}},
1252  	{ANEGQ, yscond, Pw, opBytes{0xf7, 03}},
1253  	{ANEGW, yscond, Pe, opBytes{0xf7, 03}},
1254  	{obj.ANOP, ynop, Px, opBytes{0, 0}},
1255  	{ANOTB, yscond, Pb, opBytes{0xf6, 02}},
1256  	{ANOTL, yscond, Px, opBytes{0xf7, 02}}, // TODO(rsc): yscond is wrong here.
1257  	{ANOTQ, yscond, Pw, opBytes{0xf7, 02}},
1258  	{ANOTW, yscond, Pe, opBytes{0xf7, 02}},
1259  	{AORB, yxorb, Pb, opBytes{0x0c, 0x80, 01, 0x08, 0x0a}},
1260  	{AORL, yaddl, Px, opBytes{0x83, 01, 0x0d, 0x81, 01, 0x09, 0x0b}},
1261  	{AORPD, yxm, Pq, opBytes{0x56}},
1262  	{AORPS, yxm, Pm, opBytes{0x56}},
1263  	{AORQ, yaddl, Pw, opBytes{0x83, 01, 0x0d, 0x81, 01, 0x09, 0x0b}},
1264  	{AORW, yaddl, Pe, opBytes{0x83, 01, 0x0d, 0x81, 01, 0x09, 0x0b}},
1265  	{AOUTB, yin, Pb, opBytes{0xe6, 0xee}},
1266  	{AOUTL, yin, Px, opBytes{0xe7, 0xef}},
1267  	{AOUTW, yin, Pe, opBytes{0xe7, 0xef}},
1268  	{AOUTSB, ynone, Pb, opBytes{0x6e}},
1269  	{AOUTSL, ynone, Px, opBytes{0x6f}},
1270  	{AOUTSW, ynone, Pe, opBytes{0x6f}},
1271  	{APABSB, yxm_q4, Pq4, opBytes{0x1c}},
1272  	{APABSD, yxm_q4, Pq4, opBytes{0x1e}},
1273  	{APABSW, yxm_q4, Pq4, opBytes{0x1d}},
1274  	{APACKSSLW, ymm, Py1, opBytes{0x6b, Pe, 0x6b}},
1275  	{APACKSSWB, ymm, Py1, opBytes{0x63, Pe, 0x63}},
1276  	{APACKUSDW, yxm_q4, Pq4, opBytes{0x2b}},
1277  	{APACKUSWB, ymm, Py1, opBytes{0x67, Pe, 0x67}},
1278  	{APADDB, ymm, Py1, opBytes{0xfc, Pe, 0xfc}},
1279  	{APADDL, ymm, Py1, opBytes{0xfe, Pe, 0xfe}},
1280  	{APADDQ, yxm, Pe, opBytes{0xd4}},
1281  	{APADDSB, ymm, Py1, opBytes{0xec, Pe, 0xec}},
1282  	{APADDSW, ymm, Py1, opBytes{0xed, Pe, 0xed}},
1283  	{APADDUSB, ymm, Py1, opBytes{0xdc, Pe, 0xdc}},
1284  	{APADDUSW, ymm, Py1, opBytes{0xdd, Pe, 0xdd}},
1285  	{APADDW, ymm, Py1, opBytes{0xfd, Pe, 0xfd}},
1286  	{APALIGNR, ypalignr, Pq, opBytes{0x3a, 0x0f}},
1287  	{APAND, ymm, Py1, opBytes{0xdb, Pe, 0xdb}},
1288  	{APANDN, ymm, Py1, opBytes{0xdf, Pe, 0xdf}},
1289  	{APAUSE, ynone, Px, opBytes{0xf3, 0x90}},
1290  	{APAVGB, ymm, Py1, opBytes{0xe0, Pe, 0xe0}},
1291  	{APAVGW, ymm, Py1, opBytes{0xe3, Pe, 0xe3}},
1292  	{APBLENDW, yxshuf, Pq, opBytes{0x3a, 0x0e, 0}},
1293  	{APCMPEQB, ymm, Py1, opBytes{0x74, Pe, 0x74}},
1294  	{APCMPEQL, ymm, Py1, opBytes{0x76, Pe, 0x76}},
1295  	{APCMPEQQ, yxm_q4, Pq4, opBytes{0x29}},
1296  	{APCMPEQW, ymm, Py1, opBytes{0x75, Pe, 0x75}},
1297  	{APCMPGTB, ymm, Py1, opBytes{0x64, Pe, 0x64}},
1298  	{APCMPGTL, ymm, Py1, opBytes{0x66, Pe, 0x66}},
1299  	{APCMPGTQ, yxm_q4, Pq4, opBytes{0x37}},
1300  	{APCMPGTW, ymm, Py1, opBytes{0x65, Pe, 0x65}},
1301  	{APCMPISTRI, yxshuf, Pq, opBytes{0x3a, 0x63, 0}},
1302  	{APCMPISTRM, yxshuf, Pq, opBytes{0x3a, 0x62, 0}},
1303  	{APEXTRW, yextrw, Pq, opBytes{0xc5, 0, 0x3a, 0x15, 0}},
1304  	{APEXTRB, yextr, Pq, opBytes{0x3a, 0x14, 00}},
1305  	{APEXTRD, yextr, Pq, opBytes{0x3a, 0x16, 00}},
1306  	{APEXTRQ, yextr, Pq3, opBytes{0x3a, 0x16, 00}},
1307  	{APHADDD, ymmxmm0f38, Px, opBytes{0x0F, 0x38, 0x02, 0, 0x66, 0x0F, 0x38, 0x02, 0}},
1308  	{APHADDSW, yxm_q4, Pq4, opBytes{0x03}},
1309  	{APHADDW, yxm_q4, Pq4, opBytes{0x01}},
1310  	{APHMINPOSUW, yxm_q4, Pq4, opBytes{0x41}},
1311  	{APHSUBD, yxm_q4, Pq4, opBytes{0x06}},
1312  	{APHSUBSW, yxm_q4, Pq4, opBytes{0x07}},
1313  	{APHSUBW, yxm_q4, Pq4, opBytes{0x05}},
1314  	{APINSRW, yinsrw, Pq, opBytes{0xc4, 00}},
1315  	{APINSRB, yinsr, Pq, opBytes{0x3a, 0x20, 00}},
1316  	{APINSRD, yinsr, Pq, opBytes{0x3a, 0x22, 00}},
1317  	{APINSRQ, yinsr, Pq3, opBytes{0x3a, 0x22, 00}},
1318  	{APMADDUBSW, yxm_q4, Pq4, opBytes{0x04}},
1319  	{APMADDWL, ymm, Py1, opBytes{0xf5, Pe, 0xf5}},
1320  	{APMAXSB, yxm_q4, Pq4, opBytes{0x3c}},
1321  	{APMAXSD, yxm_q4, Pq4, opBytes{0x3d}},
1322  	{APMAXSW, yxm, Pe, opBytes{0xee}},
1323  	{APMAXUB, yxm, Pe, opBytes{0xde}},
1324  	{APMAXUD, yxm_q4, Pq4, opBytes{0x3f}},
1325  	{APMAXUW, yxm_q4, Pq4, opBytes{0x3e}},
1326  	{APMINSB, yxm_q4, Pq4, opBytes{0x38}},
1327  	{APMINSD, yxm_q4, Pq4, opBytes{0x39}},
1328  	{APMINSW, yxm, Pe, opBytes{0xea}},
1329  	{APMINUB, yxm, Pe, opBytes{0xda}},
1330  	{APMINUD, yxm_q4, Pq4, opBytes{0x3b}},
1331  	{APMINUW, yxm_q4, Pq4, opBytes{0x3a}},
1332  	{APMOVMSKB, ymskb, Px, opBytes{Pe, 0xd7, 0xd7}},
1333  	{APMOVSXBD, yxm_q4, Pq4, opBytes{0x21}},
1334  	{APMOVSXBQ, yxm_q4, Pq4, opBytes{0x22}},
1335  	{APMOVSXBW, yxm_q4, Pq4, opBytes{0x20}},
1336  	{APMOVSXDQ, yxm_q4, Pq4, opBytes{0x25}},
1337  	{APMOVSXWD, yxm_q4, Pq4, opBytes{0x23}},
1338  	{APMOVSXWQ, yxm_q4, Pq4, opBytes{0x24}},
1339  	{APMOVZXBD, yxm_q4, Pq4, opBytes{0x31}},
1340  	{APMOVZXBQ, yxm_q4, Pq4, opBytes{0x32}},
1341  	{APMOVZXBW, yxm_q4, Pq4, opBytes{0x30}},
1342  	{APMOVZXDQ, yxm_q4, Pq4, opBytes{0x35}},
1343  	{APMOVZXWD, yxm_q4, Pq4, opBytes{0x33}},
1344  	{APMOVZXWQ, yxm_q4, Pq4, opBytes{0x34}},
1345  	{APMULDQ, yxm_q4, Pq4, opBytes{0x28}},
1346  	{APMULHRSW, yxm_q4, Pq4, opBytes{0x0b}},
1347  	{APMULHUW, ymm, Py1, opBytes{0xe4, Pe, 0xe4}},
1348  	{APMULHW, ymm, Py1, opBytes{0xe5, Pe, 0xe5}},
1349  	{APMULLD, yxm_q4, Pq4, opBytes{0x40}},
1350  	{APMULLW, ymm, Py1, opBytes{0xd5, Pe, 0xd5}},
1351  	{APMULULQ, ymm, Py1, opBytes{0xf4, Pe, 0xf4}},
1352  	{APOPAL, ynone, P32, opBytes{0x61}},
1353  	{APOPAW, ynone, Pe, opBytes{0x61}},
1354  	{APOPCNTW, yml_rl, Pef3, opBytes{0xb8}},
1355  	{APOPCNTL, yml_rl, Pf3, opBytes{0xb8}},
1356  	{APOPCNTQ, yml_rl, Pfw, opBytes{0xb8}},
1357  	{APOPFL, ynone, P32, opBytes{0x9d}},
1358  	{APOPFQ, ynone, Py, opBytes{0x9d}},
1359  	{APOPFW, ynone, Pe, opBytes{0x9d}},
1360  	{APOPL, ypopl, P32, opBytes{0x58, 0x8f, 00}},
1361  	{APOPQ, ypopl, Py, opBytes{0x58, 0x8f, 00}},
1362  	{APOPW, ypopl, Pe, opBytes{0x58, 0x8f, 00}},
1363  	{APOR, ymm, Py1, opBytes{0xeb, Pe, 0xeb}},
1364  	{APSADBW, yxm, Pq, opBytes{0xf6}},
1365  	{APSHUFHW, yxshuf, Pf3, opBytes{0x70, 00}},
1366  	{APSHUFL, yxshuf, Pq, opBytes{0x70, 00}},
1367  	{APSHUFLW, yxshuf, Pf2, opBytes{0x70, 00}},
1368  	{APSHUFW, ymshuf, Pm, opBytes{0x70, 00}},
1369  	{APSHUFB, ymshufb, Pq, opBytes{0x38, 0x00}},
1370  	{APSIGNB, yxm_q4, Pq4, opBytes{0x08}},
1371  	{APSIGND, yxm_q4, Pq4, opBytes{0x0a}},
1372  	{APSIGNW, yxm_q4, Pq4, opBytes{0x09}},
1373  	{APSLLO, ypsdq, Pq, opBytes{0x73, 07}},
1374  	{APSLLL, yps, Py3, opBytes{0xf2, 0x72, 06, Pe, 0xf2, Pe, 0x72, 06}},
1375  	{APSLLQ, yps, Py3, opBytes{0xf3, 0x73, 06, Pe, 0xf3, Pe, 0x73, 06}},
1376  	{APSLLW, yps, Py3, opBytes{0xf1, 0x71, 06, Pe, 0xf1, Pe, 0x71, 06}},
1377  	{APSRAL, yps, Py3, opBytes{0xe2, 0x72, 04, Pe, 0xe2, Pe, 0x72, 04}},
1378  	{APSRAW, yps, Py3, opBytes{0xe1, 0x71, 04, Pe, 0xe1, Pe, 0x71, 04}},
1379  	{APSRLO, ypsdq, Pq, opBytes{0x73, 03}},
1380  	{APSRLL, yps, Py3, opBytes{0xd2, 0x72, 02, Pe, 0xd2, Pe, 0x72, 02}},
1381  	{APSRLQ, yps, Py3, opBytes{0xd3, 0x73, 02, Pe, 0xd3, Pe, 0x73, 02}},
1382  	{APSRLW, yps, Py3, opBytes{0xd1, 0x71, 02, Pe, 0xd1, Pe, 0x71, 02}},
1383  	{APSUBB, yxm, Pe, opBytes{0xf8}},
1384  	{APSUBL, yxm, Pe, opBytes{0xfa}},
1385  	{APSUBQ, yxm, Pe, opBytes{0xfb}},
1386  	{APSUBSB, yxm, Pe, opBytes{0xe8}},
1387  	{APSUBSW, yxm, Pe, opBytes{0xe9}},
1388  	{APSUBUSB, yxm, Pe, opBytes{0xd8}},
1389  	{APSUBUSW, yxm, Pe, opBytes{0xd9}},
1390  	{APSUBW, yxm, Pe, opBytes{0xf9}},
1391  	{APTEST, yxm_q4, Pq4, opBytes{0x17}},
1392  	{APUNPCKHBW, ymm, Py1, opBytes{0x68, Pe, 0x68}},
1393  	{APUNPCKHLQ, ymm, Py1, opBytes{0x6a, Pe, 0x6a}},
1394  	{APUNPCKHQDQ, yxm, Pe, opBytes{0x6d}},
1395  	{APUNPCKHWL, ymm, Py1, opBytes{0x69, Pe, 0x69}},
1396  	{APUNPCKLBW, ymm, Py1, opBytes{0x60, Pe, 0x60}},
1397  	{APUNPCKLLQ, ymm, Py1, opBytes{0x62, Pe, 0x62}},
1398  	{APUNPCKLQDQ, yxm, Pe, opBytes{0x6c}},
1399  	{APUNPCKLWL, ymm, Py1, opBytes{0x61, Pe, 0x61}},
1400  	{APUSHAL, ynone, P32, opBytes{0x60}},
1401  	{APUSHAW, ynone, Pe, opBytes{0x60}},
1402  	{APUSHFL, ynone, P32, opBytes{0x9c}},
1403  	{APUSHFQ, ynone, Py, opBytes{0x9c}},
1404  	{APUSHFW, ynone, Pe, opBytes{0x9c}},
1405  	{APUSHL, ypushl, P32, opBytes{0x50, 0xff, 06, 0x6a, 0x68}},
1406  	{APUSHQ, ypushl, Py, opBytes{0x50, 0xff, 06, 0x6a, 0x68}},
1407  	{APUSHW, ypushl, Pe, opBytes{0x50, 0xff, 06, 0x6a, 0x68}},
1408  	{APXOR, ymm, Py1, opBytes{0xef, Pe, 0xef}},
1409  	{AQUAD, ybyte, Px, opBytes{8}},
1410  	{ARCLB, yshb, Pb, opBytes{0xd0, 02, 0xc0, 02, 0xd2, 02}},
1411  	{ARCLL, yshl, Px, opBytes{0xd1, 02, 0xc1, 02, 0xd3, 02, 0xd3, 02}},
1412  	{ARCLQ, yshl, Pw, opBytes{0xd1, 02, 0xc1, 02, 0xd3, 02, 0xd3, 02}},
1413  	{ARCLW, yshl, Pe, opBytes{0xd1, 02, 0xc1, 02, 0xd3, 02, 0xd3, 02}},
1414  	{ARCPPS, yxm, Pm, opBytes{0x53}},
1415  	{ARCPSS, yxm, Pf3, opBytes{0x53}},
1416  	{ARCRB, yshb, Pb, opBytes{0xd0, 03, 0xc0, 03, 0xd2, 03}},
1417  	{ARCRL, yshl, Px, opBytes{0xd1, 03, 0xc1, 03, 0xd3, 03, 0xd3, 03}},
1418  	{ARCRQ, yshl, Pw, opBytes{0xd1, 03, 0xc1, 03, 0xd3, 03, 0xd3, 03}},
1419  	{ARCRW, yshl, Pe, opBytes{0xd1, 03, 0xc1, 03, 0xd3, 03, 0xd3, 03}},
1420  	{AREP, ynone, Px, opBytes{0xf3}},
1421  	{AREPN, ynone, Px, opBytes{0xf2}},
1422  	{obj.ARET, ynone, Px, opBytes{0xc3}},
1423  	{ARETFW, yret, Pe, opBytes{0xcb, 0xca}},
1424  	{ARETFL, yret, Px, opBytes{0xcb, 0xca}},
1425  	{ARETFQ, yret, Pw, opBytes{0xcb, 0xca}},
1426  	{AROLB, yshb, Pb, opBytes{0xd0, 00, 0xc0, 00, 0xd2, 00}},
1427  	{AROLL, yshl, Px, opBytes{0xd1, 00, 0xc1, 00, 0xd3, 00, 0xd3, 00}},
1428  	{AROLQ, yshl, Pw, opBytes{0xd1, 00, 0xc1, 00, 0xd3, 00, 0xd3, 00}},
1429  	{AROLW, yshl, Pe, opBytes{0xd1, 00, 0xc1, 00, 0xd3, 00, 0xd3, 00}},
1430  	{ARORB, yshb, Pb, opBytes{0xd0, 01, 0xc0, 01, 0xd2, 01}},
1431  	{ARORL, yshl, Px, opBytes{0xd1, 01, 0xc1, 01, 0xd3, 01, 0xd3, 01}},
1432  	{ARORQ, yshl, Pw, opBytes{0xd1, 01, 0xc1, 01, 0xd3, 01, 0xd3, 01}},
1433  	{ARORW, yshl, Pe, opBytes{0xd1, 01, 0xc1, 01, 0xd3, 01, 0xd3, 01}},
1434  	{ARSQRTPS, yxm, Pm, opBytes{0x52}},
1435  	{ARSQRTSS, yxm, Pf3, opBytes{0x52}},
1436  	{ASAHF, ynone, Px, opBytes{0x9e, 00, 0x86, 0xe0, 0x50, 0x9d}}, // XCHGB AH,AL; PUSH AX; POPFL
1437  	{ASALB, yshb, Pb, opBytes{0xd0, 04, 0xc0, 04, 0xd2, 04}},
1438  	{ASALL, yshl, Px, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
1439  	{ASALQ, yshl, Pw, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
1440  	{ASALW, yshl, Pe, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
1441  	{ASARB, yshb, Pb, opBytes{0xd0, 07, 0xc0, 07, 0xd2, 07}},
1442  	{ASARL, yshl, Px, opBytes{0xd1, 07, 0xc1, 07, 0xd3, 07, 0xd3, 07}},
1443  	{ASARQ, yshl, Pw, opBytes{0xd1, 07, 0xc1, 07, 0xd3, 07, 0xd3, 07}},
1444  	{ASARW, yshl, Pe, opBytes{0xd1, 07, 0xc1, 07, 0xd3, 07, 0xd3, 07}},
1445  	{ASBBB, yxorb, Pb, opBytes{0x1c, 0x80, 03, 0x18, 0x1a}},
1446  	{ASBBL, yaddl, Px, opBytes{0x83, 03, 0x1d, 0x81, 03, 0x19, 0x1b}},
1447  	{ASBBQ, yaddl, Pw, opBytes{0x83, 03, 0x1d, 0x81, 03, 0x19, 0x1b}},
1448  	{ASBBW, yaddl, Pe, opBytes{0x83, 03, 0x1d, 0x81, 03, 0x19, 0x1b}},
1449  	{ASCASB, ynone, Pb, opBytes{0xae}},
1450  	{ASCASL, ynone, Px, opBytes{0xaf}},
1451  	{ASCASQ, ynone, Pw, opBytes{0xaf}},
1452  	{ASCASW, ynone, Pe, opBytes{0xaf}},
1453  	{ASETCC, yscond, Pb, opBytes{0x0f, 0x93, 00}},
1454  	{ASETCS, yscond, Pb, opBytes{0x0f, 0x92, 00}},
1455  	{ASETEQ, yscond, Pb, opBytes{0x0f, 0x94, 00}},
1456  	{ASETGE, yscond, Pb, opBytes{0x0f, 0x9d, 00}},
1457  	{ASETGT, yscond, Pb, opBytes{0x0f, 0x9f, 00}},
1458  	{ASETHI, yscond, Pb, opBytes{0x0f, 0x97, 00}},
1459  	{ASETLE, yscond, Pb, opBytes{0x0f, 0x9e, 00}},
1460  	{ASETLS, yscond, Pb, opBytes{0x0f, 0x96, 00}},
1461  	{ASETLT, yscond, Pb, opBytes{0x0f, 0x9c, 00}},
1462  	{ASETMI, yscond, Pb, opBytes{0x0f, 0x98, 00}},
1463  	{ASETNE, yscond, Pb, opBytes{0x0f, 0x95, 00}},
1464  	{ASETOC, yscond, Pb, opBytes{0x0f, 0x91, 00}},
1465  	{ASETOS, yscond, Pb, opBytes{0x0f, 0x90, 00}},
1466  	{ASETPC, yscond, Pb, opBytes{0x0f, 0x9b, 00}},
1467  	{ASETPL, yscond, Pb, opBytes{0x0f, 0x99, 00}},
1468  	{ASETPS, yscond, Pb, opBytes{0x0f, 0x9a, 00}},
1469  	{ASHLB, yshb, Pb, opBytes{0xd0, 04, 0xc0, 04, 0xd2, 04}},
1470  	{ASHLL, yshl, Px, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
1471  	{ASHLQ, yshl, Pw, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
1472  	{ASHLW, yshl, Pe, opBytes{0xd1, 04, 0xc1, 04, 0xd3, 04, 0xd3, 04}},
1473  	{ASHRB, yshb, Pb, opBytes{0xd0, 05, 0xc0, 05, 0xd2, 05}},
1474  	{ASHRL, yshl, Px, opBytes{0xd1, 05, 0xc1, 05, 0xd3, 05, 0xd3, 05}},
1475  	{ASHRQ, yshl, Pw, opBytes{0xd1, 05, 0xc1, 05, 0xd3, 05, 0xd3, 05}},
1476  	{ASHRW, yshl, Pe, opBytes{0xd1, 05, 0xc1, 05, 0xd3, 05, 0xd3, 05}},
1477  	{ASHUFPD, yxshuf, Pq, opBytes{0xc6, 00}},
1478  	{ASHUFPS, yxshuf, Pm, opBytes{0xc6, 00}},
1479  	{ASQRTPD, yxm, Pe, opBytes{0x51}},
1480  	{ASQRTPS, yxm, Pm, opBytes{0x51}},
1481  	{ASQRTSD, yxm, Pf2, opBytes{0x51}},
1482  	{ASQRTSS, yxm, Pf3, opBytes{0x51}},
1483  	{ASTC, ynone, Px, opBytes{0xf9}},
1484  	{ASTD, ynone, Px, opBytes{0xfd}},
1485  	{ASTI, ynone, Px, opBytes{0xfb}},
1486  	{ASTMXCSR, ysvrs_om, Pm, opBytes{0xae, 03, 0xae, 03}},
1487  	{ASTOSB, ynone, Pb, opBytes{0xaa}},
1488  	{ASTOSL, ynone, Px, opBytes{0xab}},
1489  	{ASTOSQ, ynone, Pw, opBytes{0xab}},
1490  	{ASTOSW, ynone, Pe, opBytes{0xab}},
1491  	{ASUBB, yxorb, Pb, opBytes{0x2c, 0x80, 05, 0x28, 0x2a}},
1492  	{ASUBL, yaddl, Px, opBytes{0x83, 05, 0x2d, 0x81, 05, 0x29, 0x2b}},
1493  	{ASUBPD, yxm, Pe, opBytes{0x5c}},
1494  	{ASUBPS, yxm, Pm, opBytes{0x5c}},
1495  	{ASUBQ, yaddl, Pw, opBytes{0x83, 05, 0x2d, 0x81, 05, 0x29, 0x2b}},
1496  	{ASUBSD, yxm, Pf2, opBytes{0x5c}},
1497  	{ASUBSS, yxm, Pf3, opBytes{0x5c}},
1498  	{ASUBW, yaddl, Pe, opBytes{0x83, 05, 0x2d, 0x81, 05, 0x29, 0x2b}},
1499  	{ASWAPGS, ynone, Pm, opBytes{0x01, 0xf8}},
1500  	{ASYSCALL, ynone, Px, opBytes{0x0f, 0x05}}, // fast syscall
1501  	{ATESTB, yxorb, Pb, opBytes{0xa8, 0xf6, 00, 0x84, 0x84}},
1502  	{ATESTL, ytestl, Px, opBytes{0xa9, 0xf7, 00, 0x85, 0x85}},
1503  	{ATESTQ, ytestl, Pw, opBytes{0xa9, 0xf7, 00, 0x85, 0x85}},
1504  	{ATESTW, ytestl, Pe, opBytes{0xa9, 0xf7, 00, 0x85, 0x85}},
1505  	{ATPAUSE, ywrfsbase, Pq, opBytes{0xae, 06}},
1506  	{obj.ATEXT, ytext, Px, opBytes{}},
1507  	{AUCOMISD, yxm, Pe, opBytes{0x2e}},
1508  	{AUCOMISS, yxm, Pm, opBytes{0x2e}},
1509  	{AUNPCKHPD, yxm, Pe, opBytes{0x15}},
1510  	{AUNPCKHPS, yxm, Pm, opBytes{0x15}},
1511  	{AUNPCKLPD, yxm, Pe, opBytes{0x14}},
1512  	{AUNPCKLPS, yxm, Pm, opBytes{0x14}},
1513  	{AUMONITOR, ywrfsbase, Pf3, opBytes{0xae, 06}},
1514  	{AVERR, ydivl, Pm, opBytes{0x00, 04}},
1515  	{AVERW, ydivl, Pm, opBytes{0x00, 05}},
1516  	{AWAIT, ynone, Px, opBytes{0x9b}},
1517  	{AWORD, ybyte, Px, opBytes{2}},
1518  	{AXCHGB, yml_mb, Pb, opBytes{0x86, 0x86}},
1519  	{AXCHGL, yxchg, Px, opBytes{0x90, 0x90, 0x87, 0x87}},
1520  	{AXCHGQ, yxchg, Pw, opBytes{0x90, 0x90, 0x87, 0x87}},
1521  	{AXCHGW, yxchg, Pe, opBytes{0x90, 0x90, 0x87, 0x87}},
1522  	{AXLAT, ynone, Px, opBytes{0xd7}},
1523  	{AXORB, yxorb, Pb, opBytes{0x34, 0x80, 06, 0x30, 0x32}},
1524  	{AXORL, yaddl, Px, opBytes{0x83, 06, 0x35, 0x81, 06, 0x31, 0x33}},
1525  	{AXORPD, yxm, Pe, opBytes{0x57}},
1526  	{AXORPS, yxm, Pm, opBytes{0x57}},
1527  	{AXORQ, yaddl, Pw, opBytes{0x83, 06, 0x35, 0x81, 06, 0x31, 0x33}},
1528  	{AXORW, yaddl, Pe, opBytes{0x83, 06, 0x35, 0x81, 06, 0x31, 0x33}},
1529  	{AFMOVB, yfmvx, Px, opBytes{0xdf, 04}},
1530  	{AFMOVBP, yfmvp, Px, opBytes{0xdf, 06}},
1531  	{AFMOVD, yfmvd, Px, opBytes{0xdd, 00, 0xdd, 02, 0xd9, 00, 0xdd, 02}},
1532  	{AFMOVDP, yfmvdp, Px, opBytes{0xdd, 03, 0xdd, 03}},
1533  	{AFMOVF, yfmvf, Px, opBytes{0xd9, 00, 0xd9, 02}},
1534  	{AFMOVFP, yfmvp, Px, opBytes{0xd9, 03}},
1535  	{AFMOVL, yfmvf, Px, opBytes{0xdb, 00, 0xdb, 02}},
1536  	{AFMOVLP, yfmvp, Px, opBytes{0xdb, 03}},
1537  	{AFMOVV, yfmvx, Px, opBytes{0xdf, 05}},
1538  	{AFMOVVP, yfmvp, Px, opBytes{0xdf, 07}},
1539  	{AFMOVW, yfmvf, Px, opBytes{0xdf, 00, 0xdf, 02}},
1540  	{AFMOVWP, yfmvp, Px, opBytes{0xdf, 03}},
1541  	{AFMOVX, yfmvx, Px, opBytes{0xdb, 05}},
1542  	{AFMOVXP, yfmvp, Px, opBytes{0xdb, 07}},
1543  	{AFCMOVCC, yfcmv, Px, opBytes{0xdb, 00}},
1544  	{AFCMOVCS, yfcmv, Px, opBytes{0xda, 00}},
1545  	{AFCMOVEQ, yfcmv, Px, opBytes{0xda, 01}},
1546  	{AFCMOVHI, yfcmv, Px, opBytes{0xdb, 02}},
1547  	{AFCMOVLS, yfcmv, Px, opBytes{0xda, 02}},
1548  	{AFCMOVB, yfcmv, Px, opBytes{0xda, 00}},
1549  	{AFCMOVBE, yfcmv, Px, opBytes{0xda, 02}},
1550  	{AFCMOVNB, yfcmv, Px, opBytes{0xdb, 00}},
1551  	{AFCMOVNBE, yfcmv, Px, opBytes{0xdb, 02}},
1552  	{AFCMOVE, yfcmv, Px, opBytes{0xda, 01}},
1553  	{AFCMOVNE, yfcmv, Px, opBytes{0xdb, 01}},
1554  	{AFCMOVNU, yfcmv, Px, opBytes{0xdb, 03}},
1555  	{AFCMOVU, yfcmv, Px, opBytes{0xda, 03}},
1556  	{AFCMOVUN, yfcmv, Px, opBytes{0xda, 03}},
1557  	{AFCOMD, yfadd, Px, opBytes{0xdc, 02, 0xd8, 02, 0xdc, 02}},  // botch
1558  	{AFCOMDP, yfadd, Px, opBytes{0xdc, 03, 0xd8, 03, 0xdc, 03}}, // botch
1559  	{AFCOMDPP, ycompp, Px, opBytes{0xde, 03}},
1560  	{AFCOMF, yfmvx, Px, opBytes{0xd8, 02}},
1561  	{AFCOMFP, yfmvx, Px, opBytes{0xd8, 03}},
1562  	{AFCOMI, yfcmv, Px, opBytes{0xdb, 06}},
1563  	{AFCOMIP, yfcmv, Px, opBytes{0xdf, 06}},
1564  	{AFCOML, yfmvx, Px, opBytes{0xda, 02}},
1565  	{AFCOMLP, yfmvx, Px, opBytes{0xda, 03}},
1566  	{AFCOMW, yfmvx, Px, opBytes{0xde, 02}},
1567  	{AFCOMWP, yfmvx, Px, opBytes{0xde, 03}},
1568  	{AFUCOM, ycompp, Px, opBytes{0xdd, 04}},
1569  	{AFUCOMI, ycompp, Px, opBytes{0xdb, 05}},
1570  	{AFUCOMIP, ycompp, Px, opBytes{0xdf, 05}},
1571  	{AFUCOMP, ycompp, Px, opBytes{0xdd, 05}},
1572  	{AFUCOMPP, ycompp, Px, opBytes{0xda, 13}},
1573  	{AFADDDP, ycompp, Px, opBytes{0xde, 00}},
1574  	{AFADDW, yfmvx, Px, opBytes{0xde, 00}},
1575  	{AFADDL, yfmvx, Px, opBytes{0xda, 00}},
1576  	{AFADDF, yfmvx, Px, opBytes{0xd8, 00}},
1577  	{AFADDD, yfadd, Px, opBytes{0xdc, 00, 0xd8, 00, 0xdc, 00}},
1578  	{AFMULDP, ycompp, Px, opBytes{0xde, 01}},
1579  	{AFMULW, yfmvx, Px, opBytes{0xde, 01}},
1580  	{AFMULL, yfmvx, Px, opBytes{0xda, 01}},
1581  	{AFMULF, yfmvx, Px, opBytes{0xd8, 01}},
1582  	{AFMULD, yfadd, Px, opBytes{0xdc, 01, 0xd8, 01, 0xdc, 01}},
1583  	{AFSUBDP, ycompp, Px, opBytes{0xde, 05}},
1584  	{AFSUBW, yfmvx, Px, opBytes{0xde, 04}},
1585  	{AFSUBL, yfmvx, Px, opBytes{0xda, 04}},
1586  	{AFSUBF, yfmvx, Px, opBytes{0xd8, 04}},
1587  	{AFSUBD, yfadd, Px, opBytes{0xdc, 04, 0xd8, 04, 0xdc, 05}},
1588  	{AFSUBRDP, ycompp, Px, opBytes{0xde, 04}},
1589  	{AFSUBRW, yfmvx, Px, opBytes{0xde, 05}},
1590  	{AFSUBRL, yfmvx, Px, opBytes{0xda, 05}},
1591  	{AFSUBRF, yfmvx, Px, opBytes{0xd8, 05}},
1592  	{AFSUBRD, yfadd, Px, opBytes{0xdc, 05, 0xd8, 05, 0xdc, 04}},
1593  	{AFDIVDP, ycompp, Px, opBytes{0xde, 07}},
1594  	{AFDIVW, yfmvx, Px, opBytes{0xde, 06}},
1595  	{AFDIVL, yfmvx, Px, opBytes{0xda, 06}},
1596  	{AFDIVF, yfmvx, Px, opBytes{0xd8, 06}},
1597  	{AFDIVD, yfadd, Px, opBytes{0xdc, 06, 0xd8, 06, 0xdc, 07}},
1598  	{AFDIVRDP, ycompp, Px, opBytes{0xde, 06}},
1599  	{AFDIVRW, yfmvx, Px, opBytes{0xde, 07}},
1600  	{AFDIVRL, yfmvx, Px, opBytes{0xda, 07}},
1601  	{AFDIVRF, yfmvx, Px, opBytes{0xd8, 07}},
1602  	{AFDIVRD, yfadd, Px, opBytes{0xdc, 07, 0xd8, 07, 0xdc, 06}},
1603  	{AFXCHD, yfxch, Px, opBytes{0xd9, 01, 0xd9, 01}},
1604  	{AFFREE, nil, 0, opBytes{}},
1605  	{AFLDCW, ysvrs_mo, Px, opBytes{0xd9, 05, 0xd9, 05}},
1606  	{AFLDENV, ysvrs_mo, Px, opBytes{0xd9, 04, 0xd9, 04}},
1607  	{AFRSTOR, ysvrs_mo, Px, opBytes{0xdd, 04, 0xdd, 04}},
1608  	{AFSAVE, ysvrs_om, Px, opBytes{0xdd, 06, 0xdd, 06}},
1609  	{AFSTCW, ysvrs_om, Px, opBytes{0xd9, 07, 0xd9, 07}},
1610  	{AFSTENV, ysvrs_om, Px, opBytes{0xd9, 06, 0xd9, 06}},
1611  	{AFSTSW, ystsw, Px, opBytes{0xdd, 07, 0xdf, 0xe0}},
1612  	{AF2XM1, ynone, Px, opBytes{0xd9, 0xf0}},
1613  	{AFABS, ynone, Px, opBytes{0xd9, 0xe1}},
1614  	{AFBLD, ysvrs_mo, Px, opBytes{0xdf, 04}},
1615  	{AFBSTP, yclflush, Px, opBytes{0xdf, 06}},
1616  	{AFCHS, ynone, Px, opBytes{0xd9, 0xe0}},
1617  	{AFCLEX, ynone, Px, opBytes{0xdb, 0xe2}},
1618  	{AFCOS, ynone, Px, opBytes{0xd9, 0xff}},
1619  	{AFDECSTP, ynone, Px, opBytes{0xd9, 0xf6}},
1620  	{AFINCSTP, ynone, Px, opBytes{0xd9, 0xf7}},
1621  	{AFINIT, ynone, Px, opBytes{0xdb, 0xe3}},
1622  	{AFLD1, ynone, Px, opBytes{0xd9, 0xe8}},
1623  	{AFLDL2E, ynone, Px, opBytes{0xd9, 0xea}},
1624  	{AFLDL2T, ynone, Px, opBytes{0xd9, 0xe9}},
1625  	{AFLDLG2, ynone, Px, opBytes{0xd9, 0xec}},
1626  	{AFLDLN2, ynone, Px, opBytes{0xd9, 0xed}},
1627  	{AFLDPI, ynone, Px, opBytes{0xd9, 0xeb}},
1628  	{AFLDZ, ynone, Px, opBytes{0xd9, 0xee}},
1629  	{AFNOP, ynone, Px, opBytes{0xd9, 0xd0}},
1630  	{AFPATAN, ynone, Px, opBytes{0xd9, 0xf3}},
1631  	{AFPREM, ynone, Px, opBytes{0xd9, 0xf8}},
1632  	{AFPREM1, ynone, Px, opBytes{0xd9, 0xf5}},
1633  	{AFPTAN, ynone, Px, opBytes{0xd9, 0xf2}},
1634  	{AFRNDINT, ynone, Px, opBytes{0xd9, 0xfc}},
1635  	{AFSCALE, ynone, Px, opBytes{0xd9, 0xfd}},
1636  	{AFSIN, ynone, Px, opBytes{0xd9, 0xfe}},
1637  	{AFSINCOS, ynone, Px, opBytes{0xd9, 0xfb}},
1638  	{AFSQRT, ynone, Px, opBytes{0xd9, 0xfa}},
1639  	{AFTST, ynone, Px, opBytes{0xd9, 0xe4}},
1640  	{AFXAM, ynone, Px, opBytes{0xd9, 0xe5}},
1641  	{AFXTRACT, ynone, Px, opBytes{0xd9, 0xf4}},
1642  	{AFYL2X, ynone, Px, opBytes{0xd9, 0xf1}},
1643  	{AFYL2XP1, ynone, Px, opBytes{0xd9, 0xf9}},
1644  	{ACMPXCHGB, yrb_mb, Pb, opBytes{0x0f, 0xb0}},
1645  	{ACMPXCHGL, yrl_ml, Px, opBytes{0x0f, 0xb1}},
1646  	{ACMPXCHGW, yrl_ml, Pe, opBytes{0x0f, 0xb1}},
1647  	{ACMPXCHGQ, yrl_ml, Pw, opBytes{0x0f, 0xb1}},
1648  	{ACMPXCHG8B, yscond, Pm, opBytes{0xc7, 01}},
1649  	{ACMPXCHG16B, yscond, Pw, opBytes{0x0f, 0xc7, 01}},
1650  	{AINVD, ynone, Pm, opBytes{0x08}},
1651  	{AINVLPG, ydivb, Pm, opBytes{0x01, 07}},
1652  	{AINVPCID, ycrc32l, Pe, opBytes{0x0f, 0x38, 0x82, 0}},
1653  	{ALFENCE, ynone, Pm, opBytes{0xae, 0xe8}},
1654  	{AMFENCE, ynone, Pm, opBytes{0xae, 0xf0}},
1655  	{AMOVNTIL, yrl_ml, Pm, opBytes{0xc3}},
1656  	{AMOVNTIQ, yrl_ml, Pw, opBytes{0x0f, 0xc3}},
1657  	{ARDPKRU, ynone, Pm, opBytes{0x01, 0xee, 0}},
1658  	{ARDMSR, ynone, Pm, opBytes{0x32}},
1659  	{ARDPMC, ynone, Pm, opBytes{0x33}},
1660  	{ARDTSC, ynone, Pm, opBytes{0x31}},
1661  	{ARSM, ynone, Pm, opBytes{0xaa}},
1662  	{ASFENCE, ynone, Pm, opBytes{0xae, 0xf8}},
1663  	{ASYSRET, ynone, Pm, opBytes{0x07}},
1664  	{AWBINVD, ynone, Pm, opBytes{0x09}},
1665  	{AWRMSR, ynone, Pm, opBytes{0x30}},
1666  	{AWRPKRU, ynone, Pm, opBytes{0x01, 0xef, 0}},
1667  	{AXADDB, yrb_mb, Pb, opBytes{0x0f, 0xc0}},
1668  	{AXADDL, yrl_ml, Px, opBytes{0x0f, 0xc1}},
1669  	{AXADDQ, yrl_ml, Pw, opBytes{0x0f, 0xc1}},
1670  	{AXADDW, yrl_ml, Pe, opBytes{0x0f, 0xc1}},
1671  	{ACRC32B, ycrc32b, Px, opBytes{0xf2, 0x0f, 0x38, 0xf0, 0}},
1672  	{ACRC32L, ycrc32l, Px, opBytes{0xf2, 0x0f, 0x38, 0xf1, 0}},
1673  	{ACRC32Q, ycrc32l, Pw, opBytes{0xf2, 0x0f, 0x38, 0xf1, 0}},
1674  	{ACRC32W, ycrc32l, Pe, opBytes{0xf2, 0x0f, 0x38, 0xf1, 0}},
1675  	{APREFETCHT0, yprefetch, Pm, opBytes{0x18, 01}},
1676  	{APREFETCHT1, yprefetch, Pm, opBytes{0x18, 02}},
1677  	{APREFETCHT2, yprefetch, Pm, opBytes{0x18, 03}},
1678  	{APREFETCHNTA, yprefetch, Pm, opBytes{0x18, 00}},
1679  	{AMOVQL, yrl_ml, Px, opBytes{0x89}},
1680  	{obj.AUNDEF, ynone, Px, opBytes{0x0f, 0x0b}},
1681  	{AAESENC, yaes, Pq, opBytes{0x38, 0xdc, 0}},
1682  	{AAESENCLAST, yaes, Pq, opBytes{0x38, 0xdd, 0}},
1683  	{AAESDEC, yaes, Pq, opBytes{0x38, 0xde, 0}},
1684  	{AAESDECLAST, yaes, Pq, opBytes{0x38, 0xdf, 0}},
1685  	{AAESIMC, yaes, Pq, opBytes{0x38, 0xdb, 0}},
1686  	{AAESKEYGENASSIST, yxshuf, Pq, opBytes{0x3a, 0xdf, 0}},
1687  	{AROUNDPD, yxshuf, Pq, opBytes{0x3a, 0x09, 0}},
1688  	{AROUNDPS, yxshuf, Pq, opBytes{0x3a, 0x08, 0}},
1689  	{AROUNDSD, yxshuf, Pq, opBytes{0x3a, 0x0b, 0}},
1690  	{AROUNDSS, yxshuf, Pq, opBytes{0x3a, 0x0a, 0}},
1691  	{APSHUFD, yxshuf, Pq, opBytes{0x70, 0}},
1692  	{APCLMULQDQ, yxshuf, Pq, opBytes{0x3a, 0x44, 0}},
1693  	{APCMPESTRI, yxshuf, Pq, opBytes{0x3a, 0x61, 0}},
1694  	{APCMPESTRM, yxshuf, Pq, opBytes{0x3a, 0x60, 0}},
1695  	{AMOVDDUP, yxm, Pf2, opBytes{0x12}},
1696  	{AMOVSHDUP, yxm, Pf3, opBytes{0x16}},
1697  	{AMOVSLDUP, yxm, Pf3, opBytes{0x12}},
1698  	{ARDTSCP, ynone, Pm, opBytes{0x01, 0xf9, 0}},
1699  	{ASTAC, ynone, Pm, opBytes{0x01, 0xcb, 0}},
1700  	{AUD1, ynone, Pm, opBytes{0xb9, 0}},
1701  	{AUD2, ynone, Pm, opBytes{0x0b, 0}},
1702  	{AUMWAIT, ywrfsbase, Pf2, opBytes{0xae, 06}},
1703  	{ASYSENTER, ynone, Px, opBytes{0x0f, 0x34, 0}},
1704  	{ASYSENTER64, ynone, Pw, opBytes{0x0f, 0x34, 0}},
1705  	{ASYSEXIT, ynone, Px, opBytes{0x0f, 0x35, 0}},
1706  	{ASYSEXIT64, ynone, Pw, opBytes{0x0f, 0x35, 0}},
1707  	{ALMSW, ydivl, Pm, opBytes{0x01, 06}},
1708  	{ALLDT, ydivl, Pm, opBytes{0x00, 02}},
1709  	{ALIDT, ysvrs_mo, Pm, opBytes{0x01, 03}},
1710  	{ALGDT, ysvrs_mo, Pm, opBytes{0x01, 02}},
1711  	{ATZCNTW, ycrc32l, Pe, opBytes{0xf3, 0x0f, 0xbc, 0}},
1712  	{ATZCNTL, ycrc32l, Px, opBytes{0xf3, 0x0f, 0xbc, 0}},
1713  	{ATZCNTQ, ycrc32l, Pw, opBytes{0xf3, 0x0f, 0xbc, 0}},
1714  	{AXRSTOR, ydivl, Px, opBytes{0x0f, 0xae, 05}},
1715  	{AXRSTOR64, ydivl, Pw, opBytes{0x0f, 0xae, 05}},
1716  	{AXRSTORS, ydivl, Px, opBytes{0x0f, 0xc7, 03}},
1717  	{AXRSTORS64, ydivl, Pw, opBytes{0x0f, 0xc7, 03}},
1718  	{AXSAVE, yclflush, Px, opBytes{0x0f, 0xae, 04}},
1719  	{AXSAVE64, yclflush, Pw, opBytes{0x0f, 0xae, 04}},
1720  	{AXSAVEOPT, yclflush, Px, opBytes{0x0f, 0xae, 06}},
1721  	{AXSAVEOPT64, yclflush, Pw, opBytes{0x0f, 0xae, 06}},
1722  	{AXSAVEC, yclflush, Px, opBytes{0x0f, 0xc7, 04}},
1723  	{AXSAVEC64, yclflush, Pw, opBytes{0x0f, 0xc7, 04}},
1724  	{AXSAVES, yclflush, Px, opBytes{0x0f, 0xc7, 05}},
1725  	{AXSAVES64, yclflush, Pw, opBytes{0x0f, 0xc7, 05}},
1726  	{ASGDT, yclflush, Pm, opBytes{0x01, 00}},
1727  	{ASIDT, yclflush, Pm, opBytes{0x01, 01}},
1728  	{ARDRANDW, yrdrand, Pe, opBytes{0x0f, 0xc7, 06}},
1729  	{ARDRANDL, yrdrand, Px, opBytes{0x0f, 0xc7, 06}},
1730  	{ARDRANDQ, yrdrand, Pw, opBytes{0x0f, 0xc7, 06}},
1731  	{ARDSEEDW, yrdrand, Pe, opBytes{0x0f, 0xc7, 07}},
1732  	{ARDSEEDL, yrdrand, Px, opBytes{0x0f, 0xc7, 07}},
1733  	{ARDSEEDQ, yrdrand, Pw, opBytes{0x0f, 0xc7, 07}},
1734  	{ASTRW, yincq, Pe, opBytes{0x0f, 0x00, 01}},
1735  	{ASTRL, yincq, Px, opBytes{0x0f, 0x00, 01}},
1736  	{ASTRQ, yincq, Pw, opBytes{0x0f, 0x00, 01}},
1737  	{AXSETBV, ynone, Pm, opBytes{0x01, 0xd1, 0}},
1738  	{AMOVBEWW, ymovbe, Pq, opBytes{0x38, 0xf0, 0, 0x38, 0xf1, 0}},
1739  	{AMOVBELL, ymovbe, Pm, opBytes{0x38, 0xf0, 0, 0x38, 0xf1, 0}},
1740  	{AMOVBEQQ, ymovbe, Pw, opBytes{0x0f, 0x38, 0xf0, 0, 0x0f, 0x38, 0xf1, 0}},
1741  	{ANOPW, ydivl, Pe, opBytes{0x0f, 0x1f, 00}},
1742  	{ANOPL, ydivl, Px, opBytes{0x0f, 0x1f, 00}},
1743  	{ASLDTW, yincq, Pe, opBytes{0x0f, 0x00, 00}},
1744  	{ASLDTL, yincq, Px, opBytes{0x0f, 0x00, 00}},
1745  	{ASLDTQ, yincq, Pw, opBytes{0x0f, 0x00, 00}},
1746  	{ASMSWW, yincq, Pe, opBytes{0x0f, 0x01, 04}},
1747  	{ASMSWL, yincq, Px, opBytes{0x0f, 0x01, 04}},
1748  	{ASMSWQ, yincq, Pw, opBytes{0x0f, 0x01, 04}},
1749  	{ABLENDVPS, yblendvpd, Pq4, opBytes{0x14}},
1750  	{ABLENDVPD, yblendvpd, Pq4, opBytes{0x15}},
1751  	{APBLENDVB, yblendvpd, Pq4, opBytes{0x10}},
1752  	{ASHA1MSG1, yaes, Px, opBytes{0x0f, 0x38, 0xc9, 0}},
1753  	{ASHA1MSG2, yaes, Px, opBytes{0x0f, 0x38, 0xca, 0}},
1754  	{ASHA1NEXTE, yaes, Px, opBytes{0x0f, 0x38, 0xc8, 0}},
1755  	{ASHA256MSG1, yaes, Px, opBytes{0x0f, 0x38, 0xcc, 0}},
1756  	{ASHA256MSG2, yaes, Px, opBytes{0x0f, 0x38, 0xcd, 0}},
1757  	{ASHA1RNDS4, ysha1rnds4, Pm, opBytes{0x3a, 0xcc, 0}},
1758  	{ASHA256RNDS2, ysha256rnds2, Px, opBytes{0x0f, 0x38, 0xcb, 0}},
1759  	{ARDFSBASEL, yrdrand, Pf3, opBytes{0xae, 00}},
1760  	{ARDFSBASEQ, yrdrand, Pfw, opBytes{0xae, 00}},
1761  	{ARDGSBASEL, yrdrand, Pf3, opBytes{0xae, 01}},
1762  	{ARDGSBASEQ, yrdrand, Pfw, opBytes{0xae, 01}},
1763  	{AWRFSBASEL, ywrfsbase, Pf3, opBytes{0xae, 02}},
1764  	{AWRFSBASEQ, ywrfsbase, Pfw, opBytes{0xae, 02}},
1765  	{AWRGSBASEL, ywrfsbase, Pf3, opBytes{0xae, 03}},
1766  	{AWRGSBASEQ, ywrfsbase, Pfw, opBytes{0xae, 03}},
1767  	{ALFSW, ym_rl, Pe, opBytes{0x0f, 0xb4}},
1768  	{ALFSL, ym_rl, Px, opBytes{0x0f, 0xb4}},
1769  	{ALFSQ, ym_rl, Pw, opBytes{0x0f, 0xb4}},
1770  	{ALGSW, ym_rl, Pe, opBytes{0x0f, 0xb5}},
1771  	{ALGSL, ym_rl, Px, opBytes{0x0f, 0xb5}},
1772  	{ALGSQ, ym_rl, Pw, opBytes{0x0f, 0xb5}},
1773  	{ALSSW, ym_rl, Pe, opBytes{0x0f, 0xb2}},
1774  	{ALSSL, ym_rl, Px, opBytes{0x0f, 0xb2}},
1775  	{ALSSQ, ym_rl, Pw, opBytes{0x0f, 0xb2}},
1776  
1777  	{ABLENDPD, yxshuf, Pq, opBytes{0x3a, 0x0d, 0}},
1778  	{ABLENDPS, yxshuf, Pq, opBytes{0x3a, 0x0c, 0}},
1779  	{AXACQUIRE, ynone, Px, opBytes{0xf2}},
1780  	{AXRELEASE, ynone, Px, opBytes{0xf3}},
1781  	{AXBEGIN, yxbegin, Px, opBytes{0xc7, 0xf8}},
1782  	{AXABORT, yxabort, Px, opBytes{0xc6, 0xf8}},
1783  	{AXEND, ynone, Px, opBytes{0x0f, 01, 0xd5}},
1784  	{AXTEST, ynone, Px, opBytes{0x0f, 01, 0xd6}},
1785  	{AXGETBV, ynone, Pm, opBytes{01, 0xd0}},
1786  	{obj.AFUNCDATA, yfuncdata, Px, opBytes{0, 0}},
1787  	{obj.APCDATA, ypcdata, Px, opBytes{0, 0}},
1788  	{obj.ADUFFCOPY, yduff, Px, opBytes{0xe8}},
1789  	{obj.ADUFFZERO, yduff, Px, opBytes{0xe8}},
1790  
1791  	{obj.AEND, nil, 0, opBytes{}},
1792  	{0, nil, 0, opBytes{}},
1793  }
1794  
// opindex maps an instruction opcode, masked with obj.AMask, to its encoding
// table entry. It is filled in by instinit from avxOptab and optab; entries
// for opcodes that appear in neither table stay nil.
var opindex [(ALAST + 1) & obj.AMask]*Optab
1796  
1797  // useAbs reports whether s describes a symbol that must avoid pc-relative addressing.
1798  // This happens on systems like Solaris that call .so functions instead of system calls.
1799  // It does not seem to be necessary for any other systems. This is probably working
1800  // around a Solaris-specific bug that should be fixed differently, but we don't know
1801  // what that bug is. And this does fix it.
1802  func useAbs(ctxt *obj.Link, s *obj.LSym) bool {
1803  	if ctxt.Headtype == objabi.Hsolaris {
1804  		// All the Solaris dynamic imports from libc.so begin with "libc_".
1805  		return strings.HasPrefix(s.Name, "libc_")
1806  	}
1807  	return ctxt.Arch.Family == sys.I386 && !ctxt.Flag_shared
1808  }
1809  
// nop holds single-instruction no-ops of various lengths: entry i is an
// (i+1)-byte encoding stored in the leading bytes of its 16-byte array.
// They were constructed by hand and disassembled with gdb to verify.
// See http://www.agner.org/optimize/optimizing_assembly.pdf for discussion.
//
// There is no 10-byte entry because Native Client rejects the repeated
// 0x66 prefix it would need:
// {0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
var nop = [][16]uint8{
	{0x90},
	{0x66, 0x90},
	{0x0F, 0x1F, 0x00},
	{0x0F, 0x1F, 0x40, 0x00},
	{0x0F, 0x1F, 0x44, 0x00, 0x00},
	{0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00},
	{0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
	{0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
	{0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
}

// fillnop overwrites the first n bytes of p with no-op instructions taken
// from nop. It emits the longest table entry as many times as it fits and
// then one shorter entry for whatever remains. The caller must make p large
// enough to hold n bytes; a non-positive n writes nothing.
func fillnop(p []byte, n int) {
	longest := len(nop)
	for remaining := n; remaining > 0; {
		size := remaining
		if size > longest {
			size = longest
		}
		// Entry size-1 is the no-op that is exactly size bytes long.
		copy(p[:size], nop[size-1][:size])
		p = p[size:]
		remaining -= size
	}
}
1840  
1841  func noppad(ctxt *obj.Link, s *obj.LSym, c int32, pad int32) int32 {
1842  	s.Grow(int64(c) + int64(pad))
1843  	fillnop(s.P[c:], int(pad))
1844  	return c + pad
1845  }
1846  
1847  func spadjop(ctxt *obj.Link, l, q obj.As) obj.As {
1848  	if ctxt.Arch.Family != sys.AMD64 || ctxt.Arch.PtrSize == 4 {
1849  		return l
1850  	}
1851  	return q
1852  }
1853  
1854  // If the environment variable GOAMD64=alignedjumps the assembler will ensure that
1855  // no standalone or macro-fused jump will straddle or end on a 32 byte boundary
1856  // by inserting NOPs before the jumps
1857  func isJump(p *obj.Prog) bool {
1858  	return p.To.Target() != nil || p.As == obj.AJMP || p.As == obj.ACALL ||
1859  		p.As == obj.ARET || p.As == obj.ADUFFCOPY || p.As == obj.ADUFFZERO
1860  }
1861  
1862  // lookForJCC returns the first real instruction starting from p, if that instruction is a conditional
1863  // jump. Otherwise, nil is returned.
1864  func lookForJCC(p *obj.Prog) *obj.Prog {
1865  	// Skip any PCDATA, FUNCDATA or NOP instructions
1866  	var q *obj.Prog
1867  	for q = p.Link; q != nil && (q.As == obj.APCDATA || q.As == obj.AFUNCDATA || q.As == obj.ANOP); q = q.Link {
1868  	}
1869  
1870  	if q == nil || q.To.Target() == nil || p.As == obj.AJMP || p.As == obj.ACALL {
1871  		return nil
1872  	}
1873  
1874  	switch q.As {
1875  	case AJOS, AJOC, AJCS, AJCC, AJEQ, AJNE, AJLS, AJHI,
1876  		AJMI, AJPL, AJPS, AJPC, AJLT, AJGE, AJLE, AJGT:
1877  	default:
1878  		return nil
1879  	}
1880  
1881  	return q
1882  }
1883  
1884  // fusedJump determines whether p can be fused with a subsequent conditional jump instruction.
1885  // If it can, we return true followed by the total size of the fused jump. If it can't, we return false.
1886  // Macro fusion rules are derived from the Intel Optimization Manual (April 2019) section 3.4.2.2.
1887  func fusedJump(p *obj.Prog) (bool, uint8) {
1888  	var fusedSize uint8
1889  
1890  	// The first instruction in a macro fused pair may be preceeded by the LOCK prefix,
1891  	// or possibly an XACQUIRE/XRELEASE prefix followed by a LOCK prefix. If it is, we
1892  	// need to be careful to insert any padding before the locks rather than directly after them.
1893  
1894  	if p.As == AXRELEASE || p.As == AXACQUIRE {
1895  		fusedSize += p.Isize
1896  		for p = p.Link; p != nil && (p.As == obj.APCDATA || p.As == obj.AFUNCDATA); p = p.Link {
1897  		}
1898  		if p == nil {
1899  			return false, 0
1900  		}
1901  	}
1902  	if p.As == ALOCK {
1903  		fusedSize += p.Isize
1904  		for p = p.Link; p != nil && (p.As == obj.APCDATA || p.As == obj.AFUNCDATA); p = p.Link {
1905  		}
1906  		if p == nil {
1907  			return false, 0
1908  		}
1909  	}
1910  	cmp := p.As == ACMPB || p.As == ACMPL || p.As == ACMPQ || p.As == ACMPW
1911  
1912  	cmpAddSub := p.As == AADDB || p.As == AADDL || p.As == AADDW || p.As == AADDQ ||
1913  		p.As == ASUBB || p.As == ASUBL || p.As == ASUBW || p.As == ASUBQ || cmp
1914  
1915  	testAnd := p.As == ATESTB || p.As == ATESTL || p.As == ATESTQ || p.As == ATESTW ||
1916  		p.As == AANDB || p.As == AANDL || p.As == AANDQ || p.As == AANDW
1917  
1918  	incDec := p.As == AINCB || p.As == AINCL || p.As == AINCQ || p.As == AINCW ||
1919  		p.As == ADECB || p.As == ADECL || p.As == ADECQ || p.As == ADECW
1920  
1921  	if !cmpAddSub && !testAnd && !incDec {
1922  		return false, 0
1923  	}
1924  
1925  	if !incDec {
1926  		var argOne obj.AddrType
1927  		var argTwo obj.AddrType
1928  		if cmp {
1929  			argOne = p.From.Type
1930  			argTwo = p.To.Type
1931  		} else {
1932  			argOne = p.To.Type
1933  			argTwo = p.From.Type
1934  		}
1935  		if argOne == obj.TYPE_REG {
1936  			if argTwo != obj.TYPE_REG && argTwo != obj.TYPE_CONST && argTwo != obj.TYPE_MEM {
1937  				return false, 0
1938  			}
1939  		} else if argOne == obj.TYPE_MEM {
1940  			if argTwo != obj.TYPE_REG {
1941  				return false, 0
1942  			}
1943  		} else {
1944  			return false, 0
1945  		}
1946  	}
1947  
1948  	fusedSize += p.Isize
1949  	jmp := lookForJCC(p)
1950  	if jmp == nil {
1951  		return false, 0
1952  	}
1953  
1954  	fusedSize += jmp.Isize
1955  
1956  	if testAnd {
1957  		return true, fusedSize
1958  	}
1959  
1960  	if jmp.As == AJOC || jmp.As == AJOS || jmp.As == AJMI ||
1961  		jmp.As == AJPL || jmp.As == AJPS || jmp.As == AJPC {
1962  		return false, 0
1963  	}
1964  
1965  	if cmpAddSub {
1966  		return true, fusedSize
1967  	}
1968  
1969  	if jmp.As == AJCS || jmp.As == AJCC || jmp.As == AJHI || jmp.As == AJLS {
1970  		return false, 0
1971  	}
1972  
1973  	return true, fusedSize
1974  }
1975  
// padJumpsCtx is the byte boundary that jumps are padded against.
// makePjcCtx produces either 32 or 0; the zero value disables jump padding.
type padJumpsCtx int32
1977  
1978  func makePjcCtx(ctxt *obj.Link) padJumpsCtx {
1979  	// Disable jump padding on 32 bit builds by settting
1980  	// padJumps to 0.
1981  	if ctxt.Arch.Family == sys.I386 {
1982  		return padJumpsCtx(0)
1983  	}
1984  
1985  	// Disable jump padding for hand written assembly code.
1986  	if ctxt.IsAsm {
1987  		return padJumpsCtx(0)
1988  	}
1989  
1990  	if objabi.GOAMD64 != "alignedjumps" {
1991  		return padJumpsCtx(0)
1992  
1993  	}
1994  
1995  	return padJumpsCtx(32)
1996  }
1997  
1998  // padJump detects whether the instruction being assembled is a standalone or a macro-fused
1999  // jump that needs to be padded. If it is, NOPs are inserted to ensure that the jump does
2000  // not cross or end on a 32 byte boundary.
2001  func (pjc padJumpsCtx) padJump(ctxt *obj.Link, s *obj.LSym, p *obj.Prog, c int32) int32 {
2002  	if pjc == 0 {
2003  		return c
2004  	}
2005  
2006  	var toPad int32
2007  	fj, fjSize := fusedJump(p)
2008  	mask := int32(pjc - 1)
2009  	if fj {
2010  		if (c&mask)+int32(fjSize) >= int32(pjc) {
2011  			toPad = int32(pjc) - (c & mask)
2012  		}
2013  	} else if isJump(p) {
2014  		if (c&mask)+int32(p.Isize) >= int32(pjc) {
2015  			toPad = int32(pjc) - (c & mask)
2016  		}
2017  	}
2018  	if toPad <= 0 {
2019  		return c
2020  	}
2021  
2022  	return noppad(ctxt, s, c, toPad)
2023  }
2024  
2025  // reAssemble is called if an instruction's size changes during assembly. If
2026  // it does and the instruction is a standalone or a macro-fused jump we need to
2027  // reassemble.
2028  func (pjc padJumpsCtx) reAssemble(p *obj.Prog) bool {
2029  	if pjc == 0 {
2030  		return false
2031  	}
2032  
2033  	fj, _ := fusedJump(p)
2034  	return fj || isJump(p)
2035  }
2036  
// nopPad records padding that span6 emitted between two instructions, so
// that a matching NOP Prog can be spliced into the instruction list once
// assembly has settled.
type nopPad struct {
	p *obj.Prog // Instruction before the pad
	n int32     // Size of the pad
}
2041  
// span6 assembles the instruction list of s (s.Func.Text) into s.P, assigns
// each instruction its Pc, and sets s.Size. It returns immediately if s.P is
// already non-nil.
//
// The work happens in phases:
//  1. A pre-pass gives target-less branches a target, rewrites ADJSP into an
//     ADD/SUB on SP (or a NOP), and, when ctxt.Retpoline is set, redirects
//     register-indirect CALL/JMP to a runtime.retpoline* symbol.
//  2. Every instruction is marked as a short branch; backward branches and
//     the instructions they target are flagged.
//  3. The whole function is assembled repeatedly until a pass finishes
//     without requesting re-assembly (a short branch that does not reach, or
//     a size change affecting jump padding). More than 20 passes is fatal.
//  4. Padding inserted in the final pass is spliced into the Prog list as
//     NOP instructions, and the TLS access sequence is marked as an unsafe
//     point where needed.
//
// newprog is only passed through to obj.MarkUnsafePoints.
func span6(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
	pjc := makePjcCtx(ctxt)

	if s.P != nil {
		return
	}

	// ycover[0] is set by instinit; a zero here means the tables are empty.
	// Note that this only reports a diagnostic and then carries on.
	if ycover[0] == 0 {
		ctxt.Diag("x86 tables not initialized, call x86.instinit first")
	}

	for p := s.Func.Text; p != nil; p = p.Link {
		// A branch that has no target yet is made to target itself.
		if p.To.Type == obj.TYPE_BRANCH && p.To.Target() == nil {
			p.To.SetTarget(p)
		}
		if p.As == AADJSP {
			p.To.Type = obj.TYPE_REG
			p.To.Reg = REG_SP
			// Generate 'ADDQ $x, SP' or 'SUBQ $x, SP', with x positive.
			// One exception: It is smaller to encode $-0x80 than $0x80.
			// For that case, flip the sign and the op:
			// Instead of 'ADDQ $0x80, SP', generate 'SUBQ $-0x80, SP'.
			switch v := p.From.Offset; {
			case v == 0:
				p.As = obj.ANOP
			case v == 0x80 || (v < 0 && v != -0x80):
				p.As = spadjop(ctxt, AADDL, AADDQ)
				p.From.Offset *= -1
			default:
				p.As = spadjop(ctxt, ASUBL, ASUBQ)
			}
		}
		if ctxt.Retpoline && (p.As == obj.ACALL || p.As == obj.AJMP) && (p.To.Type == obj.TYPE_REG || p.To.Type == obj.TYPE_MEM) {
			// Only register-indirect calls and jumps can be redirected;
			// memory-indirect ones are reported and left alone.
			if p.To.Type != obj.TYPE_REG {
				ctxt.Diag("non-retpoline-compatible: %v", p)
				continue
			}
			// Turn the indirect transfer into a direct branch to the
			// runtime.retpoline symbol named after the register.
			p.To.Type = obj.TYPE_BRANCH
			p.To.Name = obj.NAME_EXTERN
			p.To.Sym = ctxt.Lookup("runtime.retpoline" + obj.Rconv(int(p.To.Reg)))
			p.To.Reg = 0
			p.To.Offset = 0
		}
	}

	var count int64 // rough count of number of instructions
	for p := s.Func.Text; p != nil; p = p.Link {
		count++
		p.Back = branchShort // use short branches first time through
		// A target that already carries branchShort was marked earlier in
		// this walk, so p branches backwards (or to itself) and the target
		// is treated as a loop head.
		if q := p.To.Target(); q != nil && (q.Back&branchShort != 0) {
			p.Back |= branchBackwards
			q.Back |= branchLoopHead
		}
	}
	s.GrowCap(count * 5) // preallocate roughly 5 bytes per instruction

	var ab AsmBuf
	var n int // number of assembly passes made so far
	var c int32 // current output offset within s.P
	errors := ctxt.Errors
	var nops []nopPad // Padding for a particular assembly (reuse slice storage if multiple assemblies)
	for {
		// This loop continues while there are reasons to re-assemble
		// whole block, like the presence of long forward jumps.
		reAssemble := false
		// Discard relocations and bytes from the previous pass, keeping
		// the slice storage for reuse.
		for i := range s.R {
			s.R[i] = obj.Reloc{}
		}
		s.R = s.R[:0]
		s.P = s.P[:0]
		c = 0
		var pPrev *obj.Prog
		nops = nops[:0]
		for p := s.Func.Text; p != nil; p = p.Link {
			c0 := c // offset before any padding is added for p
			c = pjc.padJump(ctxt, s, p, c)

			if maxLoopPad > 0 && p.Back&branchLoopHead != 0 && c&(loopAlign-1) != 0 {
				// pad with NOPs
				v := -c & (loopAlign - 1)

				if v <= maxLoopPad {
					s.Grow(int64(c) + int64(v))
					fillnop(s.P[c:], int(v))
					c += v
				}
			}

			p.Pc = int64(c)

			// process forward jumps to p
			for q := p.Rel; q != nil; q = q.Forwd {
				// v is the displacement from the end of q to p.
				v := int32(p.Pc - (q.Pc + int64(q.Isize)))
				if q.Back&branchShort != 0 {
					if v > 127 {
						// Does not fit a one-byte displacement: drop the
						// short flag (it is known to be set, so XOR clears
						// it) and assemble again.
						reAssemble = true
						q.Back ^= branchShort
					}

					// Patch the one-byte displacement. For JCXZL and XBEGIN
					// it sits one byte further into the instruction.
					if q.As == AJCXZL || q.As == AXBEGIN {
						s.P[q.Pc+2] = byte(v)
					} else {
						s.P[q.Pc+1] = byte(v)
					}
				} else {
					// Long form: the displacement is the last four bytes
					// of the instruction, little-endian.
					binary.LittleEndian.PutUint32(s.P[q.Pc+int64(q.Isize)-4:], uint32(v))
				}
			}

			p.Rel = nil

			p.Pc = int64(c)
			ab.asmins(ctxt, s, p)
			m := ab.Len()
			if int(p.Isize) != m {
				p.Isize = uint8(m)
				if pjc.reAssemble(p) {
					// We need to re-assemble here to check for jumps and fused jumps
					// that span or end on 32 byte boundaries.
					reAssemble = true
				}
			}

			s.Grow(p.Pc + int64(m))
			copy(s.P[p.Pc:], ab.Bytes())
			// If there was padding, remember it.
			if pPrev != nil && !ctxt.IsAsm && c > c0 {
				nops = append(nops, nopPad{p: pPrev, n: c - c0})
			}
			c += int32(m)
			pPrev = p
		}

		n++
		if n > 20 {
			ctxt.Diag("span must be looping")
			log.Fatalf("loop")
		}
		if !reAssemble {
			break
		}
		// Give up on further passes once this function has reported errors.
		if ctxt.Errors > errors {
			return
		}
	}
	// splice padding nops into Progs
	for _, n := range nops {
		pp := n.p
		// The NOP sits directly after pp, covering the n.n bytes of padding.
		np := &obj.Prog{Link: pp.Link, Ctxt: pp.Ctxt, As: obj.ANOP, Pos: pp.Pos.WithNotStmt(), Pc: pp.Pc + int64(pp.Isize), Isize: uint8(n.n)}
		pp.Link = np
	}

	s.Size = int64(c)

	if false { /* debug['a'] > 1 */
		fmt.Printf("span1 %s %d (%d tries)\n %.6x", s.Name, s.Size, n, 0)
		var i int
		for i = 0; i < len(s.P); i++ {
			fmt.Printf(" %.2x", s.P[i])
			if i%16 == 15 {
				fmt.Printf("\n  %.6x", uint(i+1))
			}
		}

		if i%16 != 0 {
			fmt.Printf("\n")
		}

		for i := 0; i < len(s.R); i++ {
			r := &s.R[i]
			fmt.Printf(" rel %#.4x/%d %s%+d\n", uint32(r.Off), r.Siz, r.Sym.Name, r.Add)
		}
	}

	// Mark nonpreemptible instruction sequences.
	// The 2-instruction TLS access sequence
	//	MOVQ TLS, BX
	//	MOVQ 0(BX)(TLS*1), BX
	// is not async preemptible, as if it is preempted and resumed on
	// a different thread, the TLS address may become invalid.
	if !CanUse1InsnTLS(ctxt) {
		useTLS := func(p *obj.Prog) bool {
			// Only need to mark the second instruction, which has
			// REG_TLS as Index. (It is okay to interrupt and restart
			// the first instruction.)
			return p.From.Index == REG_TLS
		}
		obj.MarkUnsafePoints(ctxt, s.Func.Text, newprog, useTLS, nil)
	}
}
2232  
// instinit fills in the package-level tables the assembler relies on:
//   - opindex, mapping each opcode to its avxOptab or optab entry;
//   - ycover, a Ymax-by-Ymax matrix of operand classes indexed as
//     ycover[a*Ymax+b];
//   - reg and regrex, giving per-register encoding information.
//
// It also looks up the "_privates" symbol when the head type is Plan 9.
// Calling it again after the tables are populated is a no-op.
func instinit(ctxt *obj.Link) {
	if ycover[0] != 0 {
		// Already initialized; stop now.
		// This happens in the cmd/asm tests,
		// each of which re-initializes the arch.
		return
	}

	switch ctxt.Headtype {
	case objabi.Hplan9:
		plan9privates = ctxt.Lookup("_privates")
	}

	// Index both instruction tables by opcode. An opcode that is already
	// present in opindex is reported as a phase error and then overwritten.
	for i := range avxOptab {
		c := avxOptab[i].as
		if opindex[c&obj.AMask] != nil {
			ctxt.Diag("phase error in avxOptab: %d (%v)", i, c)
		}
		opindex[c&obj.AMask] = &avxOptab[i]
	}
	// optab is walked from index 1 up to its zero-opcode terminator entry.
	for i := 1; optab[i].as != 0; i++ {
		c := optab[i].as
		if opindex[c&obj.AMask] != nil {
			ctxt.Diag("phase error in optab: %d (%v)", i, c)
		}
		opindex[c&obj.AMask] = &optab[i]
	}

	// Every operand class covers itself. This also sets ycover[0], which is
	// what the "already initialized" check above looks at.
	for i := 0; i < Ymax; i++ {
		ycover[i*Ymax+i] = 1
	}

	// The remaining entries are set as ycover[a*Ymax+b] = 1.
	// NOTE(review): presumably this means an operand of class a is accepted
	// where class b is required; confirm against the operand matcher.
	ycover[Yi0*Ymax+Yu2] = 1
	ycover[Yi1*Ymax+Yu2] = 1

	ycover[Yi0*Ymax+Yi8] = 1
	ycover[Yi1*Ymax+Yi8] = 1
	ycover[Yu2*Ymax+Yi8] = 1
	ycover[Yu7*Ymax+Yi8] = 1

	ycover[Yi0*Ymax+Yu7] = 1
	ycover[Yi1*Ymax+Yu7] = 1
	ycover[Yu2*Ymax+Yu7] = 1

	ycover[Yi0*Ymax+Yu8] = 1
	ycover[Yi1*Ymax+Yu8] = 1
	ycover[Yu2*Ymax+Yu8] = 1
	ycover[Yu7*Ymax+Yu8] = 1

	ycover[Yi0*Ymax+Ys32] = 1
	ycover[Yi1*Ymax+Ys32] = 1
	ycover[Yu2*Ymax+Ys32] = 1
	ycover[Yu7*Ymax+Ys32] = 1
	ycover[Yu8*Ymax+Ys32] = 1
	ycover[Yi8*Ymax+Ys32] = 1

	ycover[Yi0*Ymax+Yi32] = 1
	ycover[Yi1*Ymax+Yi32] = 1
	ycover[Yu2*Ymax+Yi32] = 1
	ycover[Yu7*Ymax+Yi32] = 1
	ycover[Yu8*Ymax+Yi32] = 1
	ycover[Yi8*Ymax+Yi32] = 1
	ycover[Ys32*Ymax+Yi32] = 1

	ycover[Yi0*Ymax+Yi64] = 1
	ycover[Yi1*Ymax+Yi64] = 1
	ycover[Yu7*Ymax+Yi64] = 1
	ycover[Yu2*Ymax+Yi64] = 1
	ycover[Yu8*Ymax+Yi64] = 1
	ycover[Yi8*Ymax+Yi64] = 1
	ycover[Ys32*Ymax+Yi64] = 1
	ycover[Yi32*Ymax+Yi64] = 1

	ycover[Yal*Ymax+Yrb] = 1
	ycover[Ycl*Ymax+Yrb] = 1
	ycover[Yax*Ymax+Yrb] = 1
	ycover[Ycx*Ymax+Yrb] = 1
	ycover[Yrx*Ymax+Yrb] = 1
	ycover[Yrl*Ymax+Yrb] = 1 // but not Yrl32

	ycover[Ycl*Ymax+Ycx] = 1

	ycover[Yax*Ymax+Yrx] = 1
	ycover[Ycx*Ymax+Yrx] = 1

	ycover[Yax*Ymax+Yrl] = 1
	ycover[Ycx*Ymax+Yrl] = 1
	ycover[Yrx*Ymax+Yrl] = 1
	ycover[Yrl32*Ymax+Yrl] = 1

	ycover[Yf0*Ymax+Yrf] = 1

	ycover[Yal*Ymax+Ymb] = 1
	ycover[Ycl*Ymax+Ymb] = 1
	ycover[Yax*Ymax+Ymb] = 1
	ycover[Ycx*Ymax+Ymb] = 1
	ycover[Yrx*Ymax+Ymb] = 1
	ycover[Yrb*Ymax+Ymb] = 1
	ycover[Yrl*Ymax+Ymb] = 1 // but not Yrl32
	ycover[Ym*Ymax+Ymb] = 1

	ycover[Yax*Ymax+Yml] = 1
	ycover[Ycx*Ymax+Yml] = 1
	ycover[Yrx*Ymax+Yml] = 1
	ycover[Yrl*Ymax+Yml] = 1
	ycover[Yrl32*Ymax+Yml] = 1
	ycover[Ym*Ymax+Yml] = 1

	ycover[Yax*Ymax+Ymm] = 1
	ycover[Ycx*Ymax+Ymm] = 1
	ycover[Yrx*Ymax+Ymm] = 1
	ycover[Yrl*Ymax+Ymm] = 1
	ycover[Yrl32*Ymax+Ymm] = 1
	ycover[Ym*Ymax+Ymm] = 1
	ycover[Ymr*Ymax+Ymm] = 1

	ycover[Yxr0*Ymax+Yxr] = 1

	ycover[Ym*Ymax+Yxm] = 1
	ycover[Yxr0*Ymax+Yxm] = 1
	ycover[Yxr*Ymax+Yxm] = 1

	ycover[Ym*Ymax+Yym] = 1
	ycover[Yyr*Ymax+Yym] = 1

	ycover[Yxr0*Ymax+YxrEvex] = 1
	ycover[Yxr*Ymax+YxrEvex] = 1

	ycover[Ym*Ymax+YxmEvex] = 1
	ycover[Yxr0*Ymax+YxmEvex] = 1
	ycover[Yxr*Ymax+YxmEvex] = 1
	ycover[YxrEvex*Ymax+YxmEvex] = 1

	ycover[Yyr*Ymax+YyrEvex] = 1

	ycover[Ym*Ymax+YymEvex] = 1
	ycover[Yyr*Ymax+YymEvex] = 1
	ycover[YyrEvex*Ymax+YymEvex] = 1

	ycover[Ym*Ymax+Yzm] = 1
	ycover[Yzr*Ymax+Yzm] = 1

	ycover[Yk0*Ymax+Yk] = 1
	ycover[Yknot0*Ymax+Yk] = 1

	ycover[Yk0*Ymax+Ykm] = 1
	ycover[Yknot0*Ymax+Ykm] = 1
	ycover[Yk*Ymax+Ykm] = 1
	ycover[Ym*Ymax+Ykm] = 1

	ycover[Yxvm*Ymax+YxvmEvex] = 1

	ycover[Yyvm*Ymax+YyvmEvex] = 1

	// Per-register tables. reg[i] is -1 unless register i falls in one of
	// the ranges below, in which case it is the register's position within
	// its group reduced to three bits. regrex[i] collects extra bits for
	// registers in the upper part of a group.
	// NOTE(review): Rxr/Rxx/Rxb/RxrEvex are presumably REX and EVEX
	// extension bits, and 0x40 a bare REX prefix; confirm at their definitions.
	for i := 0; i < MAXREG; i++ {
		reg[i] = -1
		if i >= REG_AL && i <= REG_R15B {
			reg[i] = (i - REG_AL) & 7
			if i >= REG_SPB && i <= REG_DIB {
				regrex[i] = 0x40
			}
			if i >= REG_R8B && i <= REG_R15B {
				regrex[i] = Rxr | Rxx | Rxb
			}
		}

		// AH through BH are numbered starting from 4 rather than 0.
		if i >= REG_AH && i <= REG_BH {
			reg[i] = 4 + ((i - REG_AH) & 7)
		}
		if i >= REG_AX && i <= REG_R15 {
			reg[i] = (i - REG_AX) & 7
			if i >= REG_R8 {
				regrex[i] = Rxr | Rxx | Rxb
			}
		}

		if i >= REG_F0 && i <= REG_F0+7 {
			reg[i] = (i - REG_F0) & 7
		}
		if i >= REG_M0 && i <= REG_M0+7 {
			reg[i] = (i - REG_M0) & 7
		}
		if i >= REG_K0 && i <= REG_K0+7 {
			reg[i] = (i - REG_K0) & 7
		}
		// X, Y and Z registers follow one pattern: 0-7 need no extra bits,
		// 8-15 get Rxr|Rxx|Rxb, 16-23 get RxrEvex, and 24-31 get all four.
		if i >= REG_X0 && i <= REG_X0+15 {
			reg[i] = (i - REG_X0) & 7
			if i >= REG_X0+8 {
				regrex[i] = Rxr | Rxx | Rxb
			}
		}
		if i >= REG_X16 && i <= REG_X16+15 {
			reg[i] = (i - REG_X16) & 7
			if i >= REG_X16+8 {
				regrex[i] = Rxr | Rxx | Rxb | RxrEvex
			} else {
				regrex[i] = RxrEvex
			}
		}
		if i >= REG_Y0 && i <= REG_Y0+15 {
			reg[i] = (i - REG_Y0) & 7
			if i >= REG_Y0+8 {
				regrex[i] = Rxr | Rxx | Rxb
			}
		}
		if i >= REG_Y16 && i <= REG_Y16+15 {
			reg[i] = (i - REG_Y16) & 7
			if i >= REG_Y16+8 {
				regrex[i] = Rxr | Rxx | Rxb | RxrEvex
			} else {
				regrex[i] = RxrEvex
			}
		}
		if i >= REG_Z0 && i <= REG_Z0+15 {
			reg[i] = (i - REG_Z0) & 7
			if i > REG_Z0+7 {
				regrex[i] = Rxr | Rxx | Rxb
			}
		}
		if i >= REG_Z16 && i <= REG_Z16+15 {
			reg[i] = (i - REG_Z16) & 7
			if i >= REG_Z16+8 {
				regrex[i] = Rxr | Rxx | Rxb | RxrEvex
			} else {
				regrex[i] = RxrEvex
			}
		}

		// Registers REG_CR+8 through REG_CR+15 get only Rxr; reg[i] is
		// left at -1 by this function.
		if i >= REG_CR+8 && i <= REG_CR+15 {
			regrex[i] = Rxr
		}
	}
}
2466  
// isAndroid records whether objabi.GOOS is "android". prefixof consults it
// when choosing the segment prefix for TLS references.
var isAndroid = objabi.GOOS == "android"
2468  
// prefixof returns the segment override prefix byte needed to encode the
// operand a, or 0 when no prefix is required.
//
// The prefix is derived first from a.Reg (only for memory operands with no
// symbol name) and then from a.Index. The values returned are 0x2e for CS,
// 0x3e for DS, 0x26 for ES, 0x64 for FS and 0x65 for GS. REG_TLS is mapped
// to FS or GS depending on the architecture family, the head type, whether
// the target is Android, and ctxt.Flag_shared. Combinations with no known
// TLS base register end the program via log.Fatalf.
func prefixof(ctxt *obj.Link, a *obj.Addr) int {
	// NOTE(review): relies on the segment registers and REG_TLS all being
	// numbered at or above REG_CS; confirm against the register constants.
	if a.Reg < REG_CS && a.Index < REG_CS { // fast path
		return 0
	}
	if a.Type == obj.TYPE_MEM && a.Name == obj.NAME_NONE {
		switch a.Reg {
		case REG_CS:
			return 0x2e

		case REG_DS:
			return 0x3e

		case REG_ES:
			return 0x26

		case REG_FS:
			return 0x64

		case REG_GS:
			return 0x65

		case REG_TLS:
			// NOTE: Systems listed here should be only systems that
			// support direct TLS references like 8(TLS) implemented as
			// direct references from FS or GS. Systems that require
			// the initial-exec model, where you load the TLS base into
			// a register and then index from that register, do not reach
			// this code and should not be listed.
			if ctxt.Arch.Family == sys.I386 {
				switch ctxt.Headtype {
				default:
					if isAndroid {
						return 0x65 // GS
					}
					log.Fatalf("unknown TLS base register for %v", ctxt.Headtype)

				case objabi.Hdarwin,
					objabi.Hdragonfly,
					objabi.Hfreebsd,
					objabi.Hnetbsd,
					objabi.Hopenbsd:
					return 0x65 // GS
				}
			}

			// Not 386 (the 386 switch above either returns or exits).
			switch ctxt.Headtype {
			default:
				log.Fatalf("unknown TLS base register for %v", ctxt.Headtype)

			case objabi.Hlinux:
				if isAndroid {
					return 0x64 // FS
				}

				if ctxt.Flag_shared {
					log.Fatalf("unknown TLS base register for linux with -shared")
				} else {
					return 0x64 // FS
				}

			case objabi.Hdragonfly,
				objabi.Hfreebsd,
				objabi.Hnetbsd,
				objabi.Hopenbsd,
				objabi.Hsolaris:
				return 0x64 // FS

			case objabi.Hdarwin:
				return 0x65 // GS
			}
		}
	}

	// From here on only the index register can call for a prefix.
	if ctxt.Arch.Family == sys.I386 {
		if a.Index == REG_TLS && ctxt.Flag_shared {
			// When building for inclusion into a shared library, an instruction of the form
			//     MOVL off(CX)(TLS*1), AX
			// becomes
			//     mov %gs:off(%ecx), %eax
			// which assumes that the correct TLS offset has been loaded into %ecx (today
			// there is only one TLS variable -- g -- so this is OK). When not building for
			// a shared library the instruction it becomes
			//     mov 0x0(%ecx), %eax
			// and a R_TLS_LE relocation, and so does not require a prefix.
			return 0x65 // GS
		}
		return 0
	}

	switch a.Index {
	case REG_CS:
		return 0x2e

	case REG_DS:
		return 0x3e

	case REG_ES:
		return 0x26

	case REG_TLS:
		if ctxt.Flag_shared && ctxt.Headtype != objabi.Hwindows {
			// When building for inclusion into a shared library, an instruction of the form
			//     MOV off(CX)(TLS*1), AX
			// becomes
			//     mov %fs:off(%rcx), %rax
			// which assumes that the correct TLS offset has been loaded into %rcx (today
			// there is only one TLS variable -- g -- so this is OK). When not building for
			// a shared library the instruction does not require a prefix.
			return 0x64
		}
		// Otherwise fall out of the switch and return 0 below.

	case REG_FS:
		return 0x64

	case REG_GS:
		return 0x65
	}

	return 0
}
2589  
2590  // oclassRegList returns multisource operand class for addr.
2591  func oclassRegList(ctxt *obj.Link, addr *obj.Addr) int {
2592  	// TODO(quasilyte): when oclass register case is refactored into
2593  	// lookup table, use it here to get register kind more easily.
2594  	// Helper functions like regIsXmm should go away too (they will become redundant).
2595  
2596  	regIsXmm := func(r int) bool { return r >= REG_X0 && r <= REG_X31 }
2597  	regIsYmm := func(r int) bool { return r >= REG_Y0 && r <= REG_Y31 }
2598  	regIsZmm := func(r int) bool { return r >= REG_Z0 && r <= REG_Z31 }
2599  
2600  	reg0, reg1 := decodeRegisterRange(addr.Offset)
2601  	low := regIndex(int16(reg0))
2602  	high := regIndex(int16(reg1))
2603  
2604  	if ctxt.Arch.Family == sys.I386 {
2605  		if low >= 8 || high >= 8 {
2606  			return Yxxx
2607  		}
2608  	}
2609  
2610  	switch high - low {
2611  	case 3:
2612  		switch {
2613  		case regIsXmm(reg0) && regIsXmm(reg1):
2614  			return YxrEvexMulti4
2615  		case regIsYmm(reg0) && regIsYmm(reg1):
2616  			return YyrEvexMulti4
2617  		case regIsZmm(reg0) && regIsZmm(reg1):
2618  			return YzrMulti4
2619  		default:
2620  			return Yxxx
2621  		}
2622  	default:
2623  		return Yxxx
2624  	}
2625  }
2626  
2627  // oclassVMem returns V-mem (vector memory with VSIB) operand class.
2628  // For addr that is not V-mem returns (Yxxx, false).
2629  func oclassVMem(ctxt *obj.Link, addr *obj.Addr) (int, bool) {
2630  	switch addr.Index {
2631  	case REG_X0 + 0,
2632  		REG_X0 + 1,
2633  		REG_X0 + 2,
2634  		REG_X0 + 3,
2635  		REG_X0 + 4,
2636  		REG_X0 + 5,
2637  		REG_X0 + 6,
2638  		REG_X0 + 7:
2639  		return Yxvm, true
2640  	case REG_X8 + 0,
2641  		REG_X8 + 1,
2642  		REG_X8 + 2,
2643  		REG_X8 + 3,
2644  		REG_X8 + 4,
2645  		REG_X8 + 5,
2646  		REG_X8 + 6,
2647  		REG_X8 + 7:
2648  		if ctxt.Arch.Family == sys.I386 {
2649  			return Yxxx, true
2650  		}
2651  		return Yxvm, true
2652  	case REG_X16 + 0,
2653  		REG_X16 + 1,
2654  		REG_X16 + 2,
2655  		REG_X16 + 3,
2656  		REG_X16 + 4,
2657  		REG_X16 + 5,
2658  		REG_X16 + 6,
2659  		REG_X16 + 7,
2660  		REG_X16 + 8,
2661  		REG_X16 + 9,
2662  		REG_X16 + 10,
2663  		REG_X16 + 11,
2664  		REG_X16 + 12,
2665  		REG_X16 + 13,
2666  		REG_X16 + 14,
2667  		REG_X16 + 15:
2668  		if ctxt.Arch.Family == sys.I386 {
2669  			return Yxxx, true
2670  		}
2671  		return YxvmEvex, true
2672  
2673  	case REG_Y0 + 0,
2674  		REG_Y0 + 1,
2675  		REG_Y0 + 2,
2676  		REG_Y0 + 3,
2677  		REG_Y0 + 4,
2678  		REG_Y0 + 5,
2679  		REG_Y0 + 6,
2680  		REG_Y0 + 7:
2681  		return Yyvm, true
2682  	case REG_Y8 + 0,
2683  		REG_Y8 + 1,
2684  		REG_Y8 + 2,
2685  		REG_Y8 + 3,
2686  		REG_Y8 + 4,
2687  		REG_Y8 + 5,
2688  		REG_Y8 + 6,
2689  		REG_Y8 + 7:
2690  		if ctxt.Arch.Family == sys.I386 {
2691  			return Yxxx, true
2692  		}
2693  		return Yyvm, true
2694  	case REG_Y16 + 0,
2695  		REG_Y16 + 1,
2696  		REG_Y16 + 2,
2697  		REG_Y16 + 3,
2698  		REG_Y16 + 4,
2699  		REG_Y16 + 5,
2700  		REG_Y16 + 6,
2701  		REG_Y16 + 7,
2702  		REG_Y16 + 8,
2703  		REG_Y16 + 9,
2704  		REG_Y16 + 10,
2705  		REG_Y16 + 11,
2706  		REG_Y16 + 12,
2707  		REG_Y16 + 13,
2708  		REG_Y16 + 14,
2709  		REG_Y16 + 15:
2710  		if ctxt.Arch.Family == sys.I386 {
2711  			return Yxxx, true
2712  		}
2713  		return YyvmEvex, true
2714  
2715  	case REG_Z0 + 0,
2716  		REG_Z0 + 1,
2717  		REG_Z0 + 2,
2718  		REG_Z0 + 3,
2719  		REG_Z0 + 4,
2720  		REG_Z0 + 5,
2721  		REG_Z0 + 6,
2722  		REG_Z0 + 7:
2723  		return Yzvm, true
2724  	case REG_Z8 + 0,
2725  		REG_Z8 + 1,
2726  		REG_Z8 + 2,
2727  		REG_Z8 + 3,
2728  		REG_Z8 + 4,
2729  		REG_Z8 + 5,
2730  		REG_Z8 + 6,
2731  		REG_Z8 + 7,
2732  		REG_Z8 + 8,
2733  		REG_Z8 + 9,
2734  		REG_Z8 + 10,
2735  		REG_Z8 + 11,
2736  		REG_Z8 + 12,
2737  		REG_Z8 + 13,
2738  		REG_Z8 + 14,
2739  		REG_Z8 + 15,
2740  		REG_Z8 + 16,
2741  		REG_Z8 + 17,
2742  		REG_Z8 + 18,
2743  		REG_Z8 + 19,
2744  		REG_Z8 + 20,
2745  		REG_Z8 + 21,
2746  		REG_Z8 + 22,
2747  		REG_Z8 + 23:
2748  		if ctxt.Arch.Family == sys.I386 {
2749  			return Yxxx, true
2750  		}
2751  		return Yzvm, true
2752  	}
2753  
2754  	return Yxxx, false
2755  }
2756  
2757  func oclass(ctxt *obj.Link, p *obj.Prog, a *obj.Addr) int {
2758  	switch a.Type {
2759  	case obj.TYPE_REGLIST:
2760  		return oclassRegList(ctxt, a)
2761  
2762  	case obj.TYPE_NONE:
2763  		return Ynone
2764  
2765  	case obj.TYPE_BRANCH:
2766  		return Ybr
2767  
2768  	case obj.TYPE_INDIR:
2769  		if a.Name != obj.NAME_NONE && a.Reg == REG_NONE && a.Index == REG_NONE && a.Scale == 0 {
2770  			return Yindir
2771  		}
2772  		return Yxxx
2773  
2774  	case obj.TYPE_MEM:
2775  		// Pseudo registers have negative index, but SP is
2776  		// not pseudo on x86, hence REG_SP check is not redundant.
2777  		if a.Index == REG_SP || a.Index < 0 {
2778  			// Can't use FP/SB/PC/SP as the index register.
2779  			return Yxxx
2780  		}
2781  
2782  		if vmem, ok := oclassVMem(ctxt, a); ok {
2783  			return vmem
2784  		}
2785  
2786  		if ctxt.Arch.Family == sys.AMD64 {
2787  			switch a.Name {
2788  			case obj.NAME_EXTERN, obj.NAME_STATIC, obj.NAME_GOTREF:
2789  				// Global variables can't use index registers and their
2790  				// base register is %rip (%rip is encoded as REG_NONE).
2791  				if a.Reg != REG_NONE || a.Index != REG_NONE || a.Scale != 0 {
2792  					return Yxxx
2793  				}
2794  			case obj.NAME_AUTO, obj.NAME_PARAM:
2795  				// These names must have a base of SP.  The old compiler
2796  				// uses 0 for the base register. SSA uses REG_SP.
2797  				if a.Reg != REG_SP && a.Reg != 0 {
2798  					return Yxxx
2799  				}
2800  			case obj.NAME_NONE:
2801  				// everything is ok
2802  			default:
2803  				// unknown name
2804  				return Yxxx
2805  			}
2806  		}
2807  		return Ym
2808  
2809  	case obj.TYPE_ADDR:
2810  		switch a.Name {
2811  		case obj.NAME_GOTREF:
2812  			ctxt.Diag("unexpected TYPE_ADDR with NAME_GOTREF")
2813  			return Yxxx
2814  
2815  		case obj.NAME_EXTERN,
2816  			obj.NAME_STATIC:
2817  			if a.Sym != nil && useAbs(ctxt, a.Sym) {
2818  				return Yi32
2819  			}
2820  			return Yiauto // use pc-relative addressing
2821  
2822  		case obj.NAME_AUTO,
2823  			obj.NAME_PARAM:
2824  			return Yiauto
2825  		}
2826  
2827  		// TODO(rsc): DUFFZERO/DUFFCOPY encoding forgot to set a->index
2828  		// and got Yi32 in an earlier version of this code.
2829  		// Keep doing that until we fix yduff etc.
2830  		if a.Sym != nil && strings.HasPrefix(a.Sym.Name, "runtime.duff") {
2831  			return Yi32
2832  		}
2833  
2834  		if a.Sym != nil || a.Name != obj.NAME_NONE {
2835  			ctxt.Diag("unexpected addr: %v", obj.Dconv(p, a))
2836  		}
2837  		fallthrough
2838  
2839  	case obj.TYPE_CONST:
2840  		if a.Sym != nil {
2841  			ctxt.Diag("TYPE_CONST with symbol: %v", obj.Dconv(p, a))
2842  		}
2843  
2844  		v := a.Offset
2845  		if ctxt.Arch.Family == sys.I386 {
2846  			v = int64(int32(v))
2847  		}
2848  		switch {
2849  		case v == 0:
2850  			return Yi0
2851  		case v == 1:
2852  			return Yi1
2853  		case v >= 0 && v <= 3:
2854  			return Yu2
2855  		case v >= 0 && v <= 127:
2856  			return Yu7
2857  		case v >= 0 && v <= 255:
2858  			return Yu8
2859  		case v >= -128 && v <= 127:
2860  			return Yi8
2861  		}
2862  		if ctxt.Arch.Family == sys.I386 {
2863  			return Yi32
2864  		}
2865  		l := int32(v)
2866  		if int64(l) == v {
2867  			return Ys32 // can sign extend
2868  		}
2869  		if v>>32 == 0 {
2870  			return Yi32 // unsigned
2871  		}
2872  		return Yi64
2873  
2874  	case obj.TYPE_TEXTSIZE:
2875  		return Ytextsize
2876  	}
2877  
2878  	if a.Type != obj.TYPE_REG {
2879  		ctxt.Diag("unexpected addr1: type=%d %v", a.Type, obj.Dconv(p, a))
2880  		return Yxxx
2881  	}
2882  
2883  	switch a.Reg {
2884  	case REG_AL:
2885  		return Yal
2886  
2887  	case REG_AX:
2888  		return Yax
2889  
2890  		/*
2891  			case REG_SPB:
2892  		*/
2893  	case REG_BPB,
2894  		REG_SIB,
2895  		REG_DIB,
2896  		REG_R8B,
2897  		REG_R9B,
2898  		REG_R10B,
2899  		REG_R11B,
2900  		REG_R12B,
2901  		REG_R13B,
2902  		REG_R14B,
2903  		REG_R15B:
2904  		if ctxt.Arch.Family == sys.I386 {
2905  			return Yxxx
2906  		}
2907  		fallthrough
2908  
2909  	case REG_DL,
2910  		REG_BL,
2911  		REG_AH,
2912  		REG_CH,
2913  		REG_DH,
2914  		REG_BH:
2915  		return Yrb
2916  
2917  	case REG_CL:
2918  		return Ycl
2919  
2920  	case REG_CX:
2921  		return Ycx
2922  
2923  	case REG_DX, REG_BX:
2924  		return Yrx
2925  
2926  	case REG_R8, // not really Yrl
2927  		REG_R9,
2928  		REG_R10,
2929  		REG_R11,
2930  		REG_R12,
2931  		REG_R13,
2932  		REG_R14,
2933  		REG_R15:
2934  		if ctxt.Arch.Family == sys.I386 {
2935  			return Yxxx
2936  		}
2937  		fallthrough
2938  
2939  	case REG_SP, REG_BP, REG_SI, REG_DI:
2940  		if ctxt.Arch.Family == sys.I386 {
2941  			return Yrl32
2942  		}
2943  		return Yrl
2944  
2945  	case REG_F0 + 0:
2946  		return Yf0
2947  
2948  	case REG_F0 + 1,
2949  		REG_F0 + 2,
2950  		REG_F0 + 3,
2951  		REG_F0 + 4,
2952  		REG_F0 + 5,
2953  		REG_F0 + 6,
2954  		REG_F0 + 7:
2955  		return Yrf
2956  
2957  	case REG_M0 + 0,
2958  		REG_M0 + 1,
2959  		REG_M0 + 2,
2960  		REG_M0 + 3,
2961  		REG_M0 + 4,
2962  		REG_M0 + 5,
2963  		REG_M0 + 6,
2964  		REG_M0 + 7:
2965  		return Ymr
2966  
2967  	case REG_X0:
2968  		return Yxr0
2969  
2970  	case REG_X0 + 1,
2971  		REG_X0 + 2,
2972  		REG_X0 + 3,
2973  		REG_X0 + 4,
2974  		REG_X0 + 5,
2975  		REG_X0 + 6,
2976  		REG_X0 + 7,
2977  		REG_X0 + 8,
2978  		REG_X0 + 9,
2979  		REG_X0 + 10,
2980  		REG_X0 + 11,
2981  		REG_X0 + 12,
2982  		REG_X0 + 13,
2983  		REG_X0 + 14,
2984  		REG_X0 + 15:
2985  		return Yxr
2986  
2987  	case REG_X0 + 16,
2988  		REG_X0 + 17,
2989  		REG_X0 + 18,
2990  		REG_X0 + 19,
2991  		REG_X0 + 20,
2992  		REG_X0 + 21,
2993  		REG_X0 + 22,
2994  		REG_X0 + 23,
2995  		REG_X0 + 24,
2996  		REG_X0 + 25,
2997  		REG_X0 + 26,
2998  		REG_X0 + 27,
2999  		REG_X0 + 28,
3000  		REG_X0 + 29,
3001  		REG_X0 + 30,
3002  		REG_X0 + 31:
3003  		return YxrEvex
3004  
3005  	case REG_Y0 + 0,
3006  		REG_Y0 + 1,
3007  		REG_Y0 + 2,
3008  		REG_Y0 + 3,
3009  		REG_Y0 + 4,
3010  		REG_Y0 + 5,
3011  		REG_Y0 + 6,
3012  		REG_Y0 + 7,
3013  		REG_Y0 + 8,
3014  		REG_Y0 + 9,
3015  		REG_Y0 + 10,
3016  		REG_Y0 + 11,
3017  		REG_Y0 + 12,
3018  		REG_Y0 + 13,
3019  		REG_Y0 + 14,
3020  		REG_Y0 + 15:
3021  		return Yyr
3022  
3023  	case REG_Y0 + 16,
3024  		REG_Y0 + 17,
3025  		REG_Y0 + 18,
3026  		REG_Y0 + 19,
3027  		REG_Y0 + 20,
3028  		REG_Y0 + 21,
3029  		REG_Y0 + 22,
3030  		REG_Y0 + 23,
3031  		REG_Y0 + 24,
3032  		REG_Y0 + 25,
3033  		REG_Y0 + 26,
3034  		REG_Y0 + 27,
3035  		REG_Y0 + 28,
3036  		REG_Y0 + 29,
3037  		REG_Y0 + 30,
3038  		REG_Y0 + 31:
3039  		return YyrEvex
3040  
3041  	case REG_Z0 + 0,
3042  		REG_Z0 + 1,
3043  		REG_Z0 + 2,
3044  		REG_Z0 + 3,
3045  		REG_Z0 + 4,
3046  		REG_Z0 + 5,
3047  		REG_Z0 + 6,
3048  		REG_Z0 + 7:
3049  		return Yzr
3050  
3051  	case REG_Z0 + 8,
3052  		REG_Z0 + 9,
3053  		REG_Z0 + 10,
3054  		REG_Z0 + 11,
3055  		REG_Z0 + 12,
3056  		REG_Z0 + 13,
3057  		REG_Z0 + 14,
3058  		REG_Z0 + 15,
3059  		REG_Z0 + 16,
3060  		REG_Z0 + 17,
3061  		REG_Z0 + 18,
3062  		REG_Z0 + 19,
3063  		REG_Z0 + 20,
3064  		REG_Z0 + 21,
3065  		REG_Z0 + 22,
3066  		REG_Z0 + 23,
3067  		REG_Z0 + 24,
3068  		REG_Z0 + 25,
3069  		REG_Z0 + 26,
3070  		REG_Z0 + 27,
3071  		REG_Z0 + 28,
3072  		REG_Z0 + 29,
3073  		REG_Z0 + 30,
3074  		REG_Z0 + 31:
3075  		if ctxt.Arch.Family == sys.I386 {
3076  			return Yxxx
3077  		}
3078  		return Yzr
3079  
3080  	case REG_K0:
3081  		return Yk0
3082  
3083  	case REG_K0 + 1,
3084  		REG_K0 + 2,
3085  		REG_K0 + 3,
3086  		REG_K0 + 4,
3087  		REG_K0 + 5,
3088  		REG_K0 + 6,
3089  		REG_K0 + 7:
3090  		return Yknot0
3091  
3092  	case REG_CS:
3093  		return Ycs
3094  	case REG_SS:
3095  		return Yss
3096  	case REG_DS:
3097  		return Yds
3098  	case REG_ES:
3099  		return Yes
3100  	case REG_FS:
3101  		return Yfs
3102  	case REG_GS:
3103  		return Ygs
3104  	case REG_TLS:
3105  		return Ytls
3106  
3107  	case REG_GDTR:
3108  		return Ygdtr
3109  	case REG_IDTR:
3110  		return Yidtr
3111  	case REG_LDTR:
3112  		return Yldtr
3113  	case REG_MSW:
3114  		return Ymsw
3115  	case REG_TASK:
3116  		return Ytask
3117  
3118  	case REG_CR + 0:
3119  		return Ycr0
3120  	case REG_CR + 1:
3121  		return Ycr1
3122  	case REG_CR + 2:
3123  		return Ycr2
3124  	case REG_CR + 3:
3125  		return Ycr3
3126  	case REG_CR + 4:
3127  		return Ycr4
3128  	case REG_CR + 5:
3129  		return Ycr5
3130  	case REG_CR + 6:
3131  		return Ycr6
3132  	case REG_CR + 7:
3133  		return Ycr7
3134  	case REG_CR + 8:
3135  		return Ycr8
3136  
3137  	case REG_DR + 0:
3138  		return Ydr0
3139  	case REG_DR + 1:
3140  		return Ydr1
3141  	case REG_DR + 2:
3142  		return Ydr2
3143  	case REG_DR + 3:
3144  		return Ydr3
3145  	case REG_DR + 4:
3146  		return Ydr4
3147  	case REG_DR + 5:
3148  		return Ydr5
3149  	case REG_DR + 6:
3150  		return Ydr6
3151  	case REG_DR + 7:
3152  		return Ydr7
3153  
3154  	case REG_TR + 0:
3155  		return Ytr0
3156  	case REG_TR + 1:
3157  		return Ytr1
3158  	case REG_TR + 2:
3159  		return Ytr2
3160  	case REG_TR + 3:
3161  		return Ytr3
3162  	case REG_TR + 4:
3163  		return Ytr4
3164  	case REG_TR + 5:
3165  		return Ytr5
3166  	case REG_TR + 6:
3167  		return Ytr6
3168  	case REG_TR + 7:
3169  		return Ytr7
3170  	}
3171  
3172  	return Yxxx
3173  }
3174  
3175  // AsmBuf is a simple buffer to assemble variable-length x86 instructions into
3176  // and hold assembly state.
3177  type AsmBuf struct {
3178  	buf      [100]byte
3179  	off      int
3180  	rexflag  int
3181  	vexflag  bool // Per inst: true for VEX-encoded
3182  	evexflag bool // Per inst: true for EVEX-encoded
3183  	rep      bool
3184  	repn     bool
3185  	lock     bool
3186  
3187  	evex evexBits // Initialized when evexflag is true
3188  }
3189  
3190  // Put1 appends one byte to the end of the buffer.
3191  func (ab *AsmBuf) Put1(x byte) {
3192  	ab.buf[ab.off] = x
3193  	ab.off++
3194  }
3195  
3196  // Put2 appends two bytes to the end of the buffer.
3197  func (ab *AsmBuf) Put2(x, y byte) {
3198  	ab.buf[ab.off+0] = x
3199  	ab.buf[ab.off+1] = y
3200  	ab.off += 2
3201  }
3202  
3203  // Put3 appends three bytes to the end of the buffer.
3204  func (ab *AsmBuf) Put3(x, y, z byte) {
3205  	ab.buf[ab.off+0] = x
3206  	ab.buf[ab.off+1] = y
3207  	ab.buf[ab.off+2] = z
3208  	ab.off += 3
3209  }
3210  
3211  // Put4 appends four bytes to the end of the buffer.
3212  func (ab *AsmBuf) Put4(x, y, z, w byte) {
3213  	ab.buf[ab.off+0] = x
3214  	ab.buf[ab.off+1] = y
3215  	ab.buf[ab.off+2] = z
3216  	ab.buf[ab.off+3] = w
3217  	ab.off += 4
3218  }
3219  
3220  // PutInt16 writes v into the buffer using little-endian encoding.
3221  func (ab *AsmBuf) PutInt16(v int16) {
3222  	ab.buf[ab.off+0] = byte(v)
3223  	ab.buf[ab.off+1] = byte(v >> 8)
3224  	ab.off += 2
3225  }
3226  
3227  // PutInt32 writes v into the buffer using little-endian encoding.
3228  func (ab *AsmBuf) PutInt32(v int32) {
3229  	ab.buf[ab.off+0] = byte(v)
3230  	ab.buf[ab.off+1] = byte(v >> 8)
3231  	ab.buf[ab.off+2] = byte(v >> 16)
3232  	ab.buf[ab.off+3] = byte(v >> 24)
3233  	ab.off += 4
3234  }
3235  
3236  // PutInt64 writes v into the buffer using little-endian encoding.
3237  func (ab *AsmBuf) PutInt64(v int64) {
3238  	ab.buf[ab.off+0] = byte(v)
3239  	ab.buf[ab.off+1] = byte(v >> 8)
3240  	ab.buf[ab.off+2] = byte(v >> 16)
3241  	ab.buf[ab.off+3] = byte(v >> 24)
3242  	ab.buf[ab.off+4] = byte(v >> 32)
3243  	ab.buf[ab.off+5] = byte(v >> 40)
3244  	ab.buf[ab.off+6] = byte(v >> 48)
3245  	ab.buf[ab.off+7] = byte(v >> 56)
3246  	ab.off += 8
3247  }
3248  
3249  // Put copies b into the buffer.
3250  func (ab *AsmBuf) Put(b []byte) {
3251  	copy(ab.buf[ab.off:], b)
3252  	ab.off += len(b)
3253  }
3254  
3255  // PutOpBytesLit writes zero terminated sequence of bytes from op,
3256  // starting at specified offset (e.g. z counter value).
3257  // Trailing 0 is not written.
3258  //
3259  // Intended to be used for literal Z cases.
3260  // Literal Z cases usually have "Zlit" in their name (Zlit, Zlitr_m, Zlitm_r).
3261  func (ab *AsmBuf) PutOpBytesLit(offset int, op *opBytes) {
3262  	for int(op[offset]) != 0 {
3263  		ab.Put1(byte(op[offset]))
3264  		offset++
3265  	}
3266  }
3267  
3268  // Insert inserts b at offset i.
3269  func (ab *AsmBuf) Insert(i int, b byte) {
3270  	ab.off++
3271  	copy(ab.buf[i+1:ab.off], ab.buf[i:ab.off-1])
3272  	ab.buf[i] = b
3273  }
3274  
3275  // Last returns the byte at the end of the buffer.
3276  func (ab *AsmBuf) Last() byte { return ab.buf[ab.off-1] }
3277  
3278  // Len returns the length of the buffer.
3279  func (ab *AsmBuf) Len() int { return ab.off }
3280  
3281  // Bytes returns the contents of the buffer.
3282  func (ab *AsmBuf) Bytes() []byte { return ab.buf[:ab.off] }
3283  
3284  // Reset empties the buffer.
3285  func (ab *AsmBuf) Reset() { ab.off = 0 }
3286  
3287  // At returns the byte at offset i.
3288  func (ab *AsmBuf) At(i int) byte { return ab.buf[i] }
3289  
3290  // asmidx emits SIB byte.
3291  func (ab *AsmBuf) asmidx(ctxt *obj.Link, scale int, index int, base int) {
3292  	var i int
3293  
3294  	// X/Y index register is used in VSIB.
3295  	switch index {
3296  	default:
3297  		goto bad
3298  
3299  	case REG_NONE:
3300  		i = 4 << 3
3301  		goto bas
3302  
3303  	case REG_R8,
3304  		REG_R9,
3305  		REG_R10,
3306  		REG_R11,
3307  		REG_R12,
3308  		REG_R13,
3309  		REG_R14,
3310  		REG_R15,
3311  		REG_X8,
3312  		REG_X9,
3313  		REG_X10,
3314  		REG_X11,
3315  		REG_X12,
3316  		REG_X13,
3317  		REG_X14,
3318  		REG_X15,
3319  		REG_X16,
3320  		REG_X17,
3321  		REG_X18,
3322  		REG_X19,
3323  		REG_X20,
3324  		REG_X21,
3325  		REG_X22,
3326  		REG_X23,
3327  		REG_X24,
3328  		REG_X25,
3329  		REG_X26,
3330  		REG_X27,
3331  		REG_X28,
3332  		REG_X29,
3333  		REG_X30,
3334  		REG_X31,
3335  		REG_Y8,
3336  		REG_Y9,
3337  		REG_Y10,
3338  		REG_Y11,
3339  		REG_Y12,
3340  		REG_Y13,
3341  		REG_Y14,
3342  		REG_Y15,
3343  		REG_Y16,
3344  		REG_Y17,
3345  		REG_Y18,
3346  		REG_Y19,
3347  		REG_Y20,
3348  		REG_Y21,
3349  		REG_Y22,
3350  		REG_Y23,
3351  		REG_Y24,
3352  		REG_Y25,
3353  		REG_Y26,
3354  		REG_Y27,
3355  		REG_Y28,
3356  		REG_Y29,
3357  		REG_Y30,
3358  		REG_Y31,
3359  		REG_Z8,
3360  		REG_Z9,
3361  		REG_Z10,
3362  		REG_Z11,
3363  		REG_Z12,
3364  		REG_Z13,
3365  		REG_Z14,
3366  		REG_Z15,
3367  		REG_Z16,
3368  		REG_Z17,
3369  		REG_Z18,
3370  		REG_Z19,
3371  		REG_Z20,
3372  		REG_Z21,
3373  		REG_Z22,
3374  		REG_Z23,
3375  		REG_Z24,
3376  		REG_Z25,
3377  		REG_Z26,
3378  		REG_Z27,
3379  		REG_Z28,
3380  		REG_Z29,
3381  		REG_Z30,
3382  		REG_Z31:
3383  		if ctxt.Arch.Family == sys.I386 {
3384  			goto bad
3385  		}
3386  		fallthrough
3387  
3388  	case REG_AX,
3389  		REG_CX,
3390  		REG_DX,
3391  		REG_BX,
3392  		REG_BP,
3393  		REG_SI,
3394  		REG_DI,
3395  		REG_X0,
3396  		REG_X1,
3397  		REG_X2,
3398  		REG_X3,
3399  		REG_X4,
3400  		REG_X5,
3401  		REG_X6,
3402  		REG_X7,
3403  		REG_Y0,
3404  		REG_Y1,
3405  		REG_Y2,
3406  		REG_Y3,
3407  		REG_Y4,
3408  		REG_Y5,
3409  		REG_Y6,
3410  		REG_Y7,
3411  		REG_Z0,
3412  		REG_Z1,
3413  		REG_Z2,
3414  		REG_Z3,
3415  		REG_Z4,
3416  		REG_Z5,
3417  		REG_Z6,
3418  		REG_Z7:
3419  		i = reg[index] << 3
3420  	}
3421  
3422  	switch scale {
3423  	default:
3424  		goto bad
3425  
3426  	case 1:
3427  		break
3428  
3429  	case 2:
3430  		i |= 1 << 6
3431  
3432  	case 4:
3433  		i |= 2 << 6
3434  
3435  	case 8:
3436  		i |= 3 << 6
3437  	}
3438  
3439  bas:
3440  	switch base {
3441  	default:
3442  		goto bad
3443  
3444  	case REG_NONE: // must be mod=00
3445  		i |= 5
3446  
3447  	case REG_R8,
3448  		REG_R9,
3449  		REG_R10,
3450  		REG_R11,
3451  		REG_R12,
3452  		REG_R13,
3453  		REG_R14,
3454  		REG_R15:
3455  		if ctxt.Arch.Family == sys.I386 {
3456  			goto bad
3457  		}
3458  		fallthrough
3459  
3460  	case REG_AX,
3461  		REG_CX,
3462  		REG_DX,
3463  		REG_BX,
3464  		REG_SP,
3465  		REG_BP,
3466  		REG_SI,
3467  		REG_DI:
3468  		i |= reg[base]
3469  	}
3470  
3471  	ab.Put1(byte(i))
3472  	return
3473  
3474  bad:
3475  	ctxt.Diag("asmidx: bad address %d/%d/%d", scale, index, base)
3476  	ab.Put1(0)
3477  }
3478  
3479  func (ab *AsmBuf) relput4(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj.Addr) {
3480  	var rel obj.Reloc
3481  
3482  	v := vaddr(ctxt, p, a, &rel)
3483  	if rel.Siz != 0 {
3484  		if rel.Siz != 4 {
3485  			ctxt.Diag("bad reloc")
3486  		}
3487  		r := obj.Addrel(cursym)
3488  		*r = rel
3489  		r.Off = int32(p.Pc + int64(ab.Len()))
3490  	}
3491  
3492  	ab.PutInt32(int32(v))
3493  }
3494  
3495  func vaddr(ctxt *obj.Link, p *obj.Prog, a *obj.Addr, r *obj.Reloc) int64 {
3496  	if r != nil {
3497  		*r = obj.Reloc{}
3498  	}
3499  
3500  	switch a.Name {
3501  	case obj.NAME_STATIC,
3502  		obj.NAME_GOTREF,
3503  		obj.NAME_EXTERN:
3504  		s := a.Sym
3505  		if r == nil {
3506  			ctxt.Diag("need reloc for %v", obj.Dconv(p, a))
3507  			log.Fatalf("reloc")
3508  		}
3509  
3510  		if a.Name == obj.NAME_GOTREF {
3511  			r.Siz = 4
3512  			r.Type = objabi.R_GOTPCREL
3513  		} else if useAbs(ctxt, s) {
3514  			r.Siz = 4
3515  			r.Type = objabi.R_ADDR
3516  		} else {
3517  			r.Siz = 4
3518  			r.Type = objabi.R_PCREL
3519  		}
3520  
3521  		r.Off = -1 // caller must fill in
3522  		r.Sym = s
3523  		r.Add = a.Offset
3524  
3525  		return 0
3526  	}
3527  
3528  	if (a.Type == obj.TYPE_MEM || a.Type == obj.TYPE_ADDR) && a.Reg == REG_TLS {
3529  		if r == nil {
3530  			ctxt.Diag("need reloc for %v", obj.Dconv(p, a))
3531  			log.Fatalf("reloc")
3532  		}
3533  
3534  		if !ctxt.Flag_shared || isAndroid || ctxt.Headtype == objabi.Hdarwin {
3535  			r.Type = objabi.R_TLS_LE
3536  			r.Siz = 4
3537  			r.Off = -1 // caller must fill in
3538  			r.Add = a.Offset
3539  		}
3540  		return 0
3541  	}
3542  
3543  	return a.Offset
3544  }
3545  
// asmandsz emits the ModRM byte for operand a, followed by a SIB byte
// and/or a displacement when the addressing form calls for them, and ORs
// the REX bits required by the operand's registers into ab.rexflag.
//
// r is stored in bits 3-5 of the ModRM byte. rex carries REX bits supplied
// by the caller; only its 0x40 and Rxr bits are kept. m64 is not read by
// this function. If the displacement needs a relocation, it is added to
// cursym at the position of the displacement. Operands that cannot be
// encoded are reported through ctxt.Diag ("asmand: bad address").
func (ab *AsmBuf) asmandsz(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj.Addr, r int, rex int, m64 int) {
	var base int
	var rel obj.Reloc

	// Discard every caller-supplied REX bit except 0x40 and Rxr.
	rex &= 0x40 | Rxr
	if a.Offset != int64(int32(a.Offset)) {
		// The rules are slightly different for 386 and AMD64,
		// mostly for historical reasons. We may unify them later,
		// but it must be discussed beforehand.
		//
		// For 64bit mode only LEAL is allowed to overflow.
		// It's how https://golang.org/cl/59630 made it.
		// crypto/sha1/sha1block_amd64.s depends on this feature.
		//
		// For 32bit mode rules are more permissive.
		// If offset fits uint32, it's permitted.
		// This is allowed for assembly that wants to use 32-bit hex
		// constants, e.g. LEAL 0x99999999(AX), AX.
		overflowOK := (ctxt.Arch.Family == sys.AMD64 && p.As == ALEAL) ||
			(ctxt.Arch.Family != sys.AMD64 &&
				int64(uint32(a.Offset)) == a.Offset &&
				ab.rexflag&Rxw == 0)
		if !overflowOK {
			ctxt.Diag("offset too large in %s", p)
		}
	}
	// v is the displacement, truncated to 32 bits.
	v := int32(a.Offset)
	rel.Siz = 0

	switch a.Type {
	case obj.TYPE_ADDR:
		// Address operands are never encodable here; the extra
		// diagnostics only narrow down why.
		if a.Name == obj.NAME_NONE {
			ctxt.Diag("unexpected TYPE_ADDR with NAME_NONE")
		}
		if a.Index == REG_TLS {
			ctxt.Diag("unexpected TYPE_ADDR with index==REG_TLS")
		}
		goto bad

	case obj.TYPE_REG:
		// Register-direct operand: a single ModRM byte with mod=3.
		const regFirst = REG_AL
		const regLast = REG_Z31
		if a.Reg < regFirst || regLast < a.Reg {
			goto bad
		}
		if v != 0 {
			goto bad
		}
		ab.Put1(byte(3<<6 | reg[a.Reg]<<0 | r<<3))
		ab.rexflag |= regrex[a.Reg]&(0x40|Rxb) | rex
		return
	}

	if a.Type != obj.TYPE_MEM {
		goto bad
	}

	// Memory operand with a real index register: every form below uses
	// rm=4 in the ModRM byte and lets asmidx emit the SIB byte.
	if a.Index != REG_NONE && a.Index != REG_TLS {
		base := int(a.Reg)
		switch a.Name {
		case obj.NAME_EXTERN,
			obj.NAME_GOTREF,
			obj.NAME_STATIC:
			if !useAbs(ctxt, a.Sym) && ctxt.Arch.Family == sys.AMD64 {
				goto bad
			}
			if ctxt.Arch.Family == sys.I386 && ctxt.Flag_shared {
				// The base register has already been set. It holds the PC
				// of this instruction returned by a PC-reading thunk.
				// See obj6.go:rewriteToPcrel.
			} else {
				base = REG_NONE
			}
			v = int32(vaddr(ctxt, p, a, &rel))

		case obj.NAME_AUTO,
			obj.NAME_PARAM:
			base = REG_SP
		}

		ab.rexflag |= regrex[int(a.Index)]&Rxx | regrex[base]&Rxb | rex
		if base == REG_NONE {
			// No base register: mod=0 followed by a 32-bit displacement.
			ab.Put1(byte(0<<6 | 4<<0 | r<<3))
			ab.asmidx(ctxt, int(a.Scale), int(a.Index), base)
			goto putrelv
		}

		// mod=0: no displacement bytes at all.
		// NOTE(review): BP and R13 are excluded here, presumably because
		// their base encoding under mod=0 is taken by the no-base form
		// (asmidx encodes REG_NONE as base value 5) — confirm against
		// the reg table.
		if v == 0 && rel.Siz == 0 && base != REG_BP && base != REG_R13 {
			ab.Put1(byte(0<<6 | 4<<0 | r<<3))
			ab.asmidx(ctxt, int(a.Scale), int(a.Index), base)
			return
		}

		// mod=1: one displacement byte, as computed by toDisp8.
		if disp8, ok := toDisp8(v, p, ab); ok && rel.Siz == 0 {
			ab.Put1(byte(1<<6 | 4<<0 | r<<3))
			ab.asmidx(ctxt, int(a.Scale), int(a.Index), base)
			ab.Put1(disp8)
			return
		}

		// mod=2: 32-bit displacement.
		ab.Put1(byte(2<<6 | 4<<0 | r<<3))
		ab.asmidx(ctxt, int(a.Scale), int(a.Index), base)
		goto putrelv
	}

	// Memory operand without an index register (or with the REG_TLS
	// pseudo index).
	base = int(a.Reg)
	switch a.Name {
	case obj.NAME_STATIC,
		obj.NAME_GOTREF,
		obj.NAME_EXTERN:
		if a.Sym == nil {
			ctxt.Diag("bad addr: %v", p)
		}
		if ctxt.Arch.Family == sys.I386 && ctxt.Flag_shared {
			// The base register has already been set. It holds the PC
			// of this instruction returned by a PC-reading thunk.
			// See obj6.go:rewriteToPcrel.
		} else {
			base = REG_NONE
		}
		v = int32(vaddr(ctxt, p, a, &rel))

	case obj.NAME_AUTO,
		obj.NAME_PARAM:
		base = REG_SP
	}

	if base == REG_TLS {
		v = int32(vaddr(ctxt, p, a, &rel))
	}

	ab.rexflag |= regrex[base]&Rxb | rex
	// No base, a segment register base, or TLS: the address is just a
	// 32-bit displacement.
	if base == REG_NONE || (REG_CS <= base && base <= REG_GS) || base == REG_TLS {
		// Symbol references that do not use absolute addressing, and
		// everything on non-AMD64 targets, use mod=0 rm=5 followed by
		// the 32-bit displacement.
		if (a.Sym == nil || !useAbs(ctxt, a.Sym)) && base == REG_NONE && (a.Name == obj.NAME_STATIC || a.Name == obj.NAME_EXTERN || a.Name == obj.NAME_GOTREF) || ctxt.Arch.Family != sys.AMD64 {
			if a.Name == obj.NAME_GOTREF && (a.Offset != 0 || a.Index != 0 || a.Scale != 0) {
				ctxt.Diag("%v has offset against gotref", p)
			}
			ab.Put1(byte(0<<6 | 5<<0 | r<<3))
			goto putrelv
		}

		// temporary
		ab.Put2(
			byte(0<<6|4<<0|r<<3), // sib present
			0<<6|4<<3|5<<0,       // DS:d32
		)
		goto putrelv
	}

	// SP and R12 as base always get a SIB byte (emitted by asmidx with
	// no index register).
	if base == REG_SP || base == REG_R12 {
		if v == 0 {
			ab.Put1(byte(0<<6 | reg[base]<<0 | r<<3))
			ab.asmidx(ctxt, int(a.Scale), REG_NONE, base)
			return
		}

		if disp8, ok := toDisp8(v, p, ab); ok {
			ab.Put1(byte(1<<6 | reg[base]<<0 | r<<3))
			ab.asmidx(ctxt, int(a.Scale), REG_NONE, base)
			ab.Put1(disp8)
			return
		}

		ab.Put1(byte(2<<6 | reg[base]<<0 | r<<3))
		ab.asmidx(ctxt, int(a.Scale), REG_NONE, base)
		goto putrelv
	}

	// Any other general-purpose base register: ModRM only, with no,
	// 8-bit or 32-bit displacement.
	if REG_AX <= base && base <= REG_R15 {
		if a.Index == REG_TLS && !ctxt.Flag_shared && !isAndroid {
			// Turn the displacement into the addend of an R_TLS_LE
			// relocation; the emitted displacement bytes are zero.
			rel = obj.Reloc{}
			rel.Type = objabi.R_TLS_LE
			rel.Siz = 4
			rel.Sym = nil
			rel.Add = int64(v)
			v = 0
		}

		if v == 0 && rel.Siz == 0 && base != REG_BP && base != REG_R13 {
			ab.Put1(byte(0<<6 | reg[base]<<0 | r<<3))
			return
		}

		if disp8, ok := toDisp8(v, p, ab); ok && rel.Siz == 0 {
			ab.Put2(byte(1<<6|reg[base]<<0|r<<3), disp8)
			return
		}

		ab.Put1(byte(2<<6 | reg[base]<<0 | r<<3))
		goto putrelv
	}

	goto bad

putrelv:
	// Emit the 32-bit displacement. A pending relocation is attached to
	// cursym first, positioned at the bytes about to be written.
	if rel.Siz != 0 {
		if rel.Siz != 4 {
			ctxt.Diag("bad rel")
			goto bad
		}

		r := obj.Addrel(cursym)
		*r = rel
		r.Off = int32(p.Pc + int64(ab.Len()))
	}

	ab.PutInt32(v)
	return

bad:
	// Nothing further is emitted for an operand that cannot be encoded.
	ctxt.Diag("asmand: bad address %v", obj.Dconv(p, a))
}
3758  
3759  func (ab *AsmBuf) asmand(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj.Addr, ra *obj.Addr) {
3760  	ab.asmandsz(ctxt, cursym, p, a, reg[ra.Reg], regrex[ra.Reg], 0)
3761  }
3762  
3763  func (ab *AsmBuf) asmando(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj.Addr, o int) {
3764  	ab.asmandsz(ctxt, cursym, p, a, o, 0, 0)
3765  }
3766  
3767  func bytereg(a *obj.Addr, t *uint8) {
3768  	if a.Type == obj.TYPE_REG && a.Index == REG_NONE && (REG_AX <= a.Reg && a.Reg <= REG_R15) {
3769  		a.Reg += REG_AL - REG_AX
3770  		*t = 0
3771  	}
3772  }
3773  
3774  func unbytereg(a *obj.Addr, t *uint8) {
3775  	if a.Type == obj.TYPE_REG && a.Index == REG_NONE && (REG_AL <= a.Reg && a.Reg <= REG_R15B) {
3776  		a.Reg += REG_AX - REG_AL
3777  		*t = 0
3778  	}
3779  }
3780  
// Encoding kinds for ymovtab. Each ymovtab entry carries one of these
// uint8 values as its fifth element, ahead of its opcode bytes.
const (
	movLit uint8 = iota // Like Zlit
	movRegMem           // ymovtab: rows whose source is a segment register and whose destination is Yml
	movMemReg           // ymovtab: rows whose source is Yml and whose destination is a segment register
	movRegMem2op        // ymovtab: rows reading a control/debug/test register; NOTE(review): "2op" presumably means two opcode bytes — confirm
	movMemReg2op        // ymovtab: rows writing a control/debug/test register
	movFullPtr // Load full pointer, trash heap (unsupported)
	movDoubleShift
	movTLSReg
)
3791  
3792  var ymovtab = []movtab{
3793  	// push
3794  	{APUSHL, Ycs, Ynone, Ynone, movLit, [4]uint8{0x0e, 0}},
3795  	{APUSHL, Yss, Ynone, Ynone, movLit, [4]uint8{0x16, 0}},
3796  	{APUSHL, Yds, Ynone, Ynone, movLit, [4]uint8{0x1e, 0}},
3797  	{APUSHL, Yes, Ynone, Ynone, movLit, [4]uint8{0x06, 0}},
3798  	{APUSHL, Yfs, Ynone, Ynone, movLit, [4]uint8{0x0f, 0xa0, 0}},
3799  	{APUSHL, Ygs, Ynone, Ynone, movLit, [4]uint8{0x0f, 0xa8, 0}},
3800  	{APUSHQ, Yfs, Ynone, Ynone, movLit, [4]uint8{0x0f, 0xa0, 0}},
3801  	{APUSHQ, Ygs, Ynone, Ynone, movLit, [4]uint8{0x0f, 0xa8, 0}},
3802  	{APUSHW, Ycs, Ynone, Ynone, movLit, [4]uint8{Pe, 0x0e, 0}},
3803  	{APUSHW, Yss, Ynone, Ynone, movLit, [4]uint8{Pe, 0x16, 0}},
3804  	{APUSHW, Yds, Ynone, Ynone, movLit, [4]uint8{Pe, 0x1e, 0}},
3805  	{APUSHW, Yes, Ynone, Ynone, movLit, [4]uint8{Pe, 0x06, 0}},
3806  	{APUSHW, Yfs, Ynone, Ynone, movLit, [4]uint8{Pe, 0x0f, 0xa0, 0}},
3807  	{APUSHW, Ygs, Ynone, Ynone, movLit, [4]uint8{Pe, 0x0f, 0xa8, 0}},
3808  
3809  	// pop
3810  	{APOPL, Ynone, Ynone, Yds, movLit, [4]uint8{0x1f, 0}},
3811  	{APOPL, Ynone, Ynone, Yes, movLit, [4]uint8{0x07, 0}},
3812  	{APOPL, Ynone, Ynone, Yss, movLit, [4]uint8{0x17, 0}},
3813  	{APOPL, Ynone, Ynone, Yfs, movLit, [4]uint8{0x0f, 0xa1, 0}},
3814  	{APOPL, Ynone, Ynone, Ygs, movLit, [4]uint8{0x0f, 0xa9, 0}},
3815  	{APOPQ, Ynone, Ynone, Yfs, movLit, [4]uint8{0x0f, 0xa1, 0}},
3816  	{APOPQ, Ynone, Ynone, Ygs, movLit, [4]uint8{0x0f, 0xa9, 0}},
3817  	{APOPW, Ynone, Ynone, Yds, movLit, [4]uint8{Pe, 0x1f, 0}},
3818  	{APOPW, Ynone, Ynone, Yes, movLit, [4]uint8{Pe, 0x07, 0}},
3819  	{APOPW, Ynone, Ynone, Yss, movLit, [4]uint8{Pe, 0x17, 0}},
3820  	{APOPW, Ynone, Ynone, Yfs, movLit, [4]uint8{Pe, 0x0f, 0xa1, 0}},
3821  	{APOPW, Ynone, Ynone, Ygs, movLit, [4]uint8{Pe, 0x0f, 0xa9, 0}},
3822  
3823  	// mov seg
3824  	{AMOVW, Yes, Ynone, Yml, movRegMem, [4]uint8{0x8c, 0, 0, 0}},
3825  	{AMOVW, Ycs, Ynone, Yml, movRegMem, [4]uint8{0x8c, 1, 0, 0}},
3826  	{AMOVW, Yss, Ynone, Yml, movRegMem, [4]uint8{0x8c, 2, 0, 0}},
3827  	{AMOVW, Yds, Ynone, Yml, movRegMem, [4]uint8{0x8c, 3, 0, 0}},
3828  	{AMOVW, Yfs, Ynone, Yml, movRegMem, [4]uint8{0x8c, 4, 0, 0}},
3829  	{AMOVW, Ygs, Ynone, Yml, movRegMem, [4]uint8{0x8c, 5, 0, 0}},
3830  	{AMOVW, Yml, Ynone, Yes, movMemReg, [4]uint8{0x8e, 0, 0, 0}},
3831  	{AMOVW, Yml, Ynone, Ycs, movMemReg, [4]uint8{0x8e, 1, 0, 0}},
3832  	{AMOVW, Yml, Ynone, Yss, movMemReg, [4]uint8{0x8e, 2, 0, 0}},
3833  	{AMOVW, Yml, Ynone, Yds, movMemReg, [4]uint8{0x8e, 3, 0, 0}},
3834  	{AMOVW, Yml, Ynone, Yfs, movMemReg, [4]uint8{0x8e, 4, 0, 0}},
3835  	{AMOVW, Yml, Ynone, Ygs, movMemReg, [4]uint8{0x8e, 5, 0, 0}},
3836  
3837  	// mov cr
3838  	{AMOVL, Ycr0, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 0, 0}},
3839  	{AMOVL, Ycr2, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 2, 0}},
3840  	{AMOVL, Ycr3, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 3, 0}},
3841  	{AMOVL, Ycr4, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 4, 0}},
3842  	{AMOVL, Ycr8, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 8, 0}},
3843  	{AMOVQ, Ycr0, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 0, 0}},
3844  	{AMOVQ, Ycr2, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 2, 0}},
3845  	{AMOVQ, Ycr3, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 3, 0}},
3846  	{AMOVQ, Ycr4, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 4, 0}},
3847  	{AMOVQ, Ycr8, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x20, 8, 0}},
3848  	{AMOVL, Yrl, Ynone, Ycr0, movMemReg2op, [4]uint8{0x0f, 0x22, 0, 0}},
3849  	{AMOVL, Yrl, Ynone, Ycr2, movMemReg2op, [4]uint8{0x0f, 0x22, 2, 0}},
3850  	{AMOVL, Yrl, Ynone, Ycr3, movMemReg2op, [4]uint8{0x0f, 0x22, 3, 0}},
3851  	{AMOVL, Yrl, Ynone, Ycr4, movMemReg2op, [4]uint8{0x0f, 0x22, 4, 0}},
3852  	{AMOVL, Yrl, Ynone, Ycr8, movMemReg2op, [4]uint8{0x0f, 0x22, 8, 0}},
3853  	{AMOVQ, Yrl, Ynone, Ycr0, movMemReg2op, [4]uint8{0x0f, 0x22, 0, 0}},
3854  	{AMOVQ, Yrl, Ynone, Ycr2, movMemReg2op, [4]uint8{0x0f, 0x22, 2, 0}},
3855  	{AMOVQ, Yrl, Ynone, Ycr3, movMemReg2op, [4]uint8{0x0f, 0x22, 3, 0}},
3856  	{AMOVQ, Yrl, Ynone, Ycr4, movMemReg2op, [4]uint8{0x0f, 0x22, 4, 0}},
3857  	{AMOVQ, Yrl, Ynone, Ycr8, movMemReg2op, [4]uint8{0x0f, 0x22, 8, 0}},
3858  
3859  	// mov dr
3860  	{AMOVL, Ydr0, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 0, 0}},
3861  	{AMOVL, Ydr6, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 6, 0}},
3862  	{AMOVL, Ydr7, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 7, 0}},
3863  	{AMOVQ, Ydr0, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 0, 0}},
3864  	{AMOVQ, Ydr2, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 2, 0}},
3865  	{AMOVQ, Ydr3, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 3, 0}},
3866  	{AMOVQ, Ydr6, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 6, 0}},
3867  	{AMOVQ, Ydr7, Ynone, Yrl, movRegMem2op, [4]uint8{0x0f, 0x21, 7, 0}},
3868  	{AMOVL, Yrl, Ynone, Ydr0, movMemReg2op, [4]uint8{0x0f, 0x23, 0, 0}},
3869  	{AMOVL, Yrl, Ynone, Ydr6, movMemReg2op, [4]uint8{0x0f, 0x23, 6, 0}},
3870  	{AMOVL, Yrl, Ynone, Ydr7, movMemReg2op, [4]uint8{0x0f, 0x23, 7, 0}},
3871  	{AMOVQ, Yrl, Ynone, Ydr0, movMemReg2op, [4]uint8{0x0f, 0x23, 0, 0}},
3872  	{AMOVQ, Yrl, Ynone, Ydr2, movMemReg2op, [4]uint8{0x0f, 0x23, 2, 0}},
3873  	{AMOVQ, Yrl, Ynone, Ydr3, movMemReg2op, [4]uint8{0x0f, 0x23, 3, 0}},
3874  	{AMOVQ, Yrl, Ynone, Ydr6, movMemReg2op, [4]uint8{0x0f, 0x23, 6, 0}},
3875  	{AMOVQ, Yrl, Ynone, Ydr7, movMemReg2op, [4]uint8{0x0f, 0x23, 7, 0}},
3876  
3877  	// mov tr
3878  	{AMOVL, Ytr6, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x24, 6, 0}},
3879  	{AMOVL, Ytr7, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x24, 7, 0}},
3880  	{AMOVL, Yml, Ynone, Ytr6, movMemReg2op, [4]uint8{0x0f, 0x26, 6, 0xff}},
3881  	{AMOVL, Yml, Ynone, Ytr7, movMemReg2op, [4]uint8{0x0f, 0x26, 7, 0xff}},
3882  
3883  	// lgdt, sgdt, lidt, sidt
3884  	{AMOVL, Ym, Ynone, Ygdtr, movMemReg2op, [4]uint8{0x0f, 0x01, 2, 0}},
3885  	{AMOVL, Ygdtr, Ynone, Ym, movRegMem2op, [4]uint8{0x0f, 0x01, 0, 0}},
3886  	{AMOVL, Ym, Ynone, Yidtr, movMemReg2op, [4]uint8{0x0f, 0x01, 3, 0}},
3887  	{AMOVL, Yidtr, Ynone, Ym, movRegMem2op, [4]uint8{0x0f, 0x01, 1, 0}},
3888  	{AMOVQ, Ym, Ynone, Ygdtr, movMemReg2op, [4]uint8{0x0f, 0x01, 2, 0}},
3889  	{AMOVQ, Ygdtr, Ynone, Ym, movRegMem2op, [4]uint8{0x0f, 0x01, 0, 0}},
3890  	{AMOVQ, Ym, Ynone, Yidtr, movMemReg2op, [4]uint8{0x0f, 0x01, 3, 0}},
3891  	{AMOVQ, Yidtr, Ynone, Ym, movRegMem2op, [4]uint8{0x0f, 0x01, 1, 0}},
3892  
3893  	// lldt, sldt
3894  	{AMOVW, Yml, Ynone, Yldtr, movMemReg2op, [4]uint8{0x0f, 0x00, 2, 0}},
3895  	{AMOVW, Yldtr, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x00, 0, 0}},
3896  
3897  	// lmsw, smsw
3898  	{AMOVW, Yml, Ynone, Ymsw, movMemReg2op, [4]uint8{0x0f, 0x01, 6, 0}},
3899  	{AMOVW, Ymsw, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x01, 4, 0}},
3900  
3901  	// ltr, str
3902  	{AMOVW, Yml, Ynone, Ytask, movMemReg2op, [4]uint8{0x0f, 0x00, 3, 0}},
3903  	{AMOVW, Ytask, Ynone, Yml, movRegMem2op, [4]uint8{0x0f, 0x00, 1, 0}},
3904  
3905  	/* load full pointer - unsupported
3906  	{AMOVL, Yml, Ycol, movFullPtr, [4]uint8{0, 0, 0, 0}},
3907  	{AMOVW, Yml, Ycol, movFullPtr, [4]uint8{Pe, 0, 0, 0}},
3908  	*/
3909  
3910  	// double shift
3911  	{ASHLL, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{0xa4, 0xa5, 0, 0}},
3912  	{ASHLL, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{0xa4, 0xa5, 0, 0}},
3913  	{ASHLL, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{0xa4, 0xa5, 0, 0}},
3914  	{ASHRL, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{0xac, 0xad, 0, 0}},
3915  	{ASHRL, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{0xac, 0xad, 0, 0}},
3916  	{ASHRL, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{0xac, 0xad, 0, 0}},
3917  	{ASHLQ, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xa4, 0xa5, 0}},
3918  	{ASHLQ, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xa4, 0xa5, 0}},
3919  	{ASHLQ, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xa4, 0xa5, 0}},
3920  	{ASHRQ, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xac, 0xad, 0}},
3921  	{ASHRQ, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xac, 0xad, 0}},
3922  	{ASHRQ, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{Pw, 0xac, 0xad, 0}},
3923  	{ASHLW, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xa4, 0xa5, 0}},
3924  	{ASHLW, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xa4, 0xa5, 0}},
3925  	{ASHLW, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xa4, 0xa5, 0}},
3926  	{ASHRW, Yi8, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xac, 0xad, 0}},
3927  	{ASHRW, Ycl, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xac, 0xad, 0}},
3928  	{ASHRW, Ycx, Yrl, Yml, movDoubleShift, [4]uint8{Pe, 0xac, 0xad, 0}},
3929  
3930  	// load TLS base
3931  	{AMOVL, Ytls, Ynone, Yrl, movTLSReg, [4]uint8{0, 0, 0, 0}},
3932  	{AMOVQ, Ytls, Ynone, Yrl, movTLSReg, [4]uint8{0, 0, 0, 0}},
3933  	{0, 0, 0, 0, 0, [4]uint8{}},
3934  }
3935  
3936  func isax(a *obj.Addr) bool {
3937  	switch a.Reg {
3938  	case REG_AX, REG_AL, REG_AH:
3939  		return true
3940  	}
3941  
3942  	if a.Index == REG_AX {
3943  		return true
3944  	}
3945  	return false
3946  }
3947  
3948  func subreg(p *obj.Prog, from int, to int) {
3949  	if false { /* debug['Q'] */
3950  		fmt.Printf("\n%v\ts/%v/%v/\n", p, rconv(from), rconv(to))
3951  	}
3952  
3953  	if int(p.From.Reg) == from {
3954  		p.From.Reg = int16(to)
3955  		p.Ft = 0
3956  	}
3957  
3958  	if int(p.To.Reg) == from {
3959  		p.To.Reg = int16(to)
3960  		p.Tt = 0
3961  	}
3962  
3963  	if int(p.From.Index) == from {
3964  		p.From.Index = int16(to)
3965  		p.Ft = 0
3966  	}
3967  
3968  	if int(p.To.Index) == from {
3969  		p.To.Index = int16(to)
3970  		p.Tt = 0
3971  	}
3972  
3973  	if false { /* debug['Q'] */
3974  		fmt.Printf("%v\n", p)
3975  	}
3976  }
3977  
3978  func (ab *AsmBuf) mediaop(ctxt *obj.Link, o *Optab, op int, osize int, z int) int {
3979  	switch op {
3980  	case Pm, Pe, Pf2, Pf3:
3981  		if osize != 1 {
3982  			if op != Pm {
3983  				ab.Put1(byte(op))
3984  			}
3985  			ab.Put1(Pm)
3986  			z++
3987  			op = int(o.op[z])
3988  			break
3989  		}
3990  		fallthrough
3991  
3992  	default:
3993  		if ab.Len() == 0 || ab.Last() != Pm {
3994  			ab.Put1(Pm)
3995  		}
3996  	}
3997  
3998  	ab.Put1(byte(op))
3999  	return z
4000  }
4001  
// bpduff1 is the pre-encoded instruction sequence that doasm emits
// immediately before a Zcallduff call on AMD64. It saves BP below the
// stack pointer and points BP at the saved slot, so the frame pointer is
// maintained around the call into duffcopy/duffzero.
var bpduff1 = []byte{
	0x48, 0x89, 0x6c, 0x24, 0xf0, // MOVQ BP, -16(SP)
	0x48, 0x8d, 0x6c, 0x24, 0xf0, // LEAQ -16(SP), BP
}
4006  
// bpduff2 is the pre-encoded instruction that reloads BP from the slot
// it currently points at.
// NOTE(review): presumably emitted right after a Zcallduff call to undo
// bpduff1; confirm against the use site in doasm.
var bpduff2 = []byte{
	0x48, 0x8b, 0x6d, 0x00, // MOVQ 0(BP), BP
}
4010  
// asmevex emits the EVEX prefix and the opcode byte.
// In addition to the r/m, vvvv and reg fields that asmvex takes, it also
// accepts an optional K-masking register.
//
// Each of rm, v, r and k may be nil when the instruction form has no such
// operand. Problems with the instruction suffix (zeroing, rounding,
// broadcast, SAE) are reported through ctxt.Diag; the prefix is still
// emitted afterwards.
//
// Expects ab.evex to be properly initialized.
func (ab *AsmBuf) asmevex(ctxt *obj.Link, p *obj.Prog, rm, v, r, k *obj.Addr) {
	ab.evexflag = true
	evex := ab.evex

	// The R, R', X and B bits start at 1 and are cleared to 0 when the
	// corresponding register-extension bit is needed.
	rexR := byte(1)
	evexR := byte(1)
	rexX := byte(1)
	rexB := byte(1)
	if r != nil {
		if regrex[r.Reg]&Rxr != 0 {
			rexR = 0 // "ModR/M.reg" selector 4th bit.
		}
		if regrex[r.Reg]&RxrEvex != 0 {
			evexR = 0 // "ModR/M.reg" selector 5th bit.
		}
	}
	if rm != nil {
		// Without an index register the X bit is driven by the RxrEvex
		// flag of rm.Reg; otherwise by the Rxx flag of the index register.
		if rm.Index == REG_NONE && regrex[rm.Reg]&RxrEvex != 0 {
			rexX = 0
		} else if regrex[rm.Index]&Rxx != 0 {
			rexX = 0
		}
		if regrex[rm.Reg]&Rxb != 0 {
			rexB = 0
		}
	}
	// P0 = [R][X][B][R'][00][mm]
	p0 := (rexR << 7) |
		(rexX << 6) |
		(rexB << 5) |
		(evexR << 4) |
		(0 << 2) |
		(evex.M() << 0)

	vexV := byte(0)
	if v != nil {
		// 4bit-wide reg index.
		vexV = byte(reg[v.Reg]|(regrex[v.Reg]&Rxr)<<1) & 0xF
	}
	// The vvvv field is stored with all four bits inverted.
	vexV ^= 0x0F
	// P1 = [W][vvvv][1][pp]
	p1 := (evex.W() << 7) |
		(vexV << 3) |
		(1 << 2) |
		(evex.P() << 0)

	// The instruction suffix (looked up via p.Scond) selects zeroing and
	// at most one of rounding, broadcast or SAE.
	suffix := evexSuffixMap[p.Scond]
	evexZ := byte(0)
	evexLL := evex.L()
	evexB := byte(0)
	evexV := byte(1)
	evexA := byte(0)
	if suffix.zeroing {
		if !evex.ZeroingEnabled() {
			ctxt.Diag("unsupported zeroing: %v", p)
		}
		evexZ = 1
	}
	switch {
	case suffix.rounding != rcUnset:
		if rm != nil && rm.Type == obj.TYPE_MEM {
			ctxt.Diag("illegal rounding with memory argument: %v", p)
		} else if !evex.RoundingEnabled() {
			ctxt.Diag("unsupported rounding: %v", p)
		}
		evexB = 1
		// With rounding, the L'L field carries the rounding mode
		// instead of the value from evex.L().
		evexLL = suffix.rounding
	case suffix.broadcast:
		if rm == nil || rm.Type != obj.TYPE_MEM {
			ctxt.Diag("illegal broadcast without memory argument: %v", p)
		} else if !evex.BroadcastEnabled() {
			ctxt.Diag("unsupported broadcast: %v", p)
		}
		evexB = 1
	case suffix.sae:
		if rm != nil && rm.Type == obj.TYPE_MEM {
			ctxt.Diag("illegal SAE with memory argument: %v", p)
		} else if !evex.SaeEnabled() {
			ctxt.Diag("unsupported SAE: %v", p)
		}
		evexB = 1
	}
	// V' is cleared for the RxrEvex flag of the index register when it is
	// set, and otherwise for the RxrEvex flag of the vvvv register.
	if rm != nil && regrex[rm.Index]&RxrEvex != 0 {
		evexV = 0
	} else if v != nil && regrex[v.Reg]&RxrEvex != 0 {
		evexV = 0 // VSR selector 5th bit.
	}
	if k != nil {
		evexA = byte(reg[k.Reg])
	}
	// P2 = [z][L'L][b][V'][aaa]
	p2 := (evexZ << 7) |
		(evexLL << 5) |
		(evexB << 4) |
		(evexV << 3) |
		(evexA << 0)

	const evexEscapeByte = 0x62
	ab.Put4(evexEscapeByte, p0, p1, p2)
	ab.Put1(evex.opcode)
}
4117  
4118  // Emit VEX prefix and opcode byte.
4119  // The three addresses are the r/m, vvvv, and reg fields.
4120  // The reg and rm arguments appear in the same order as the
4121  // arguments to asmand, which typically follows the call to asmvex.
4122  // The final two arguments are the VEX prefix (see encoding above)
4123  // and the opcode byte.
4124  // For details about vex prefix see:
4125  // https://en.wikipedia.org/wiki/VEX_prefix#Technical_description
4126  func (ab *AsmBuf) asmvex(ctxt *obj.Link, rm, v, r *obj.Addr, vex, opcode uint8) {
4127  	ab.vexflag = true
4128  	rexR := 0
4129  	if r != nil {
4130  		rexR = regrex[r.Reg] & Rxr
4131  	}
4132  	rexB := 0
4133  	rexX := 0
4134  	if rm != nil {
4135  		rexB = regrex[rm.Reg] & Rxb
4136  		rexX = regrex[rm.Index] & Rxx
4137  	}
4138  	vexM := (vex >> 3) & 0x7
4139  	vexWLP := vex & 0x87
4140  	vexV := byte(0)
4141  	if v != nil {
4142  		vexV = byte(reg[v.Reg]|(regrex[v.Reg]&Rxr)<<1) & 0xF
4143  	}
4144  	vexV ^= 0xF
4145  	if vexM == 1 && (rexX|rexB) == 0 && vex&vexW1 == 0 {
4146  		// Can use 2-byte encoding.
4147  		ab.Put2(0xc5, byte(rexR<<5)^0x80|vexV<<3|vexWLP)
4148  	} else {
4149  		// Must use 3-byte encoding.
4150  		ab.Put3(0xc4,
4151  			(byte(rexR|rexX|rexB)<<5)^0xE0|vexM,
4152  			vexV<<3|vexWLP,
4153  		)
4154  	}
4155  	ab.Put1(opcode)
4156  }
4157  
4158  // regIndex returns register index that fits in 5 bits.
4159  //
4160  //	R         : 3 bit | legacy instructions     | N/A
4161  //	[R/V]EX.R : 1 bit | REX / VEX extension bit | Rxr
4162  //	EVEX.R    : 1 bit | EVEX extension bit      | RxrEvex
4163  //
4164  // Examples:
4165  //	REG_Z30 => 30
4166  //	REG_X15 => 15
4167  //	REG_R9  => 9
4168  //	REG_AX  => 0
4169  //
4170  func regIndex(r int16) int {
4171  	lower3bits := reg[r]
4172  	high4bit := regrex[r] & Rxr << 1
4173  	high5bit := regrex[r] & RxrEvex << 0
4174  	return lower3bits | high4bit | high5bit
4175  }
4176  
4177  // avx2gatherValid reports whether p satisfies AVX2 gather constraints.
4178  // Reports errors via ctxt.
4179  func avx2gatherValid(ctxt *obj.Link, p *obj.Prog) bool {
4180  	// If any pair of the index, mask, or destination registers
4181  	// are the same, illegal instruction trap (#UD) is triggered.
4182  	index := regIndex(p.GetFrom3().Index)
4183  	mask := regIndex(p.From.Reg)
4184  	dest := regIndex(p.To.Reg)
4185  	if dest == mask || dest == index || mask == index {
4186  		ctxt.Diag("mask, index, and destination registers should be distinct: %v", p)
4187  		return false
4188  	}
4189  
4190  	return true
4191  }
4192  
4193  // avx512gatherValid reports whether p satisfies AVX512 gather constraints.
4194  // Reports errors via ctxt.
4195  func avx512gatherValid(ctxt *obj.Link, p *obj.Prog) bool {
4196  	// Illegal instruction trap (#UD) is triggered if the destination vector
4197  	// register is the same as index vector in VSIB.
4198  	index := regIndex(p.From.Index)
4199  	dest := regIndex(p.To.Reg)
4200  	if dest == index {
4201  		ctxt.Diag("index and destination registers should be distinct: %v", p)
4202  		return false
4203  	}
4204  
4205  	return true
4206  }
4207  
4208  func (ab *AsmBuf) doasm(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog) {
4209  	o := opindex[p.As&obj.AMask]
4210  
4211  	if o == nil {
4212  		ctxt.Diag("asmins: missing op %v", p)
4213  		return
4214  	}
4215  
4216  	if pre := prefixof(ctxt, &p.From); pre != 0 {
4217  		ab.Put1(byte(pre))
4218  	}
4219  	if pre := prefixof(ctxt, &p.To); pre != 0 {
4220  		ab.Put1(byte(pre))
4221  	}
4222  
4223  	// Checks to warn about instruction/arguments combinations that
4224  	// will unconditionally trigger illegal instruction trap (#UD).
4225  	switch p.As {
4226  	case AVGATHERDPD,
4227  		AVGATHERQPD,
4228  		AVGATHERDPS,
4229  		AVGATHERQPS,
4230  		AVPGATHERDD,
4231  		AVPGATHERQD,
4232  		AVPGATHERDQ,
4233  		AVPGATHERQQ:
4234  		// AVX512 gather requires explicit K mask.
4235  		if p.GetFrom3().Reg >= REG_K0 && p.GetFrom3().Reg <= REG_K7 {
4236  			if !avx512gatherValid(ctxt, p) {
4237  				return
4238  			}
4239  		} else {
4240  			if !avx2gatherValid(ctxt, p) {
4241  				return
4242  			}
4243  		}
4244  	}
4245  
4246  	if p.Ft == 0 {
4247  		p.Ft = uint8(oclass(ctxt, p, &p.From))
4248  	}
4249  	if p.Tt == 0 {
4250  		p.Tt = uint8(oclass(ctxt, p, &p.To))
4251  	}
4252  
4253  	ft := int(p.Ft) * Ymax
4254  	var f3t int
4255  	tt := int(p.Tt) * Ymax
4256  
4257  	xo := obj.Bool2int(o.op[0] == 0x0f)
4258  	z := 0
4259  	var a *obj.Addr
4260  	var l int
4261  	var op int
4262  	var q *obj.Prog
4263  	var r *obj.Reloc
4264  	var rel obj.Reloc
4265  	var v int64
4266  
4267  	args := make([]int, 0, argListMax)
4268  	if ft != Ynone*Ymax {
4269  		args = append(args, ft)
4270  	}
4271  	for i := range p.RestArgs {
4272  		args = append(args, oclass(ctxt, p, &p.RestArgs[i])*Ymax)
4273  	}
4274  	if tt != Ynone*Ymax {
4275  		args = append(args, tt)
4276  	}
4277  
4278  	for _, yt := range o.ytab {
4279  		// ytab matching is purely args-based,
4280  		// but AVX512 suffixes like "Z" or "RU_SAE" will
4281  		// add EVEX-only filter that will reject non-EVEX matches.
4282  		//
4283  		// Consider "VADDPD.BCST 2032(DX), X0, X0".
4284  		// Without this rule, operands will lead to VEX-encoded form
4285  		// and produce "c5b15813" encoding.
4286  		if !yt.match(args) {
4287  			// "xo" is always zero for VEX/EVEX encoded insts.
4288  			z += int(yt.zoffset) + xo
4289  		} else {
4290  			if p.Scond != 0 && !evexZcase(yt.zcase) {
4291  				// Do not signal error and continue to search
4292  				// for matching EVEX-encoded form.
4293  				z += int(yt.zoffset)
4294  				continue
4295  			}
4296  
4297  			switch o.prefix {
4298  			case Px1: // first option valid only in 32-bit mode
4299  				if ctxt.Arch.Family == sys.AMD64 && z == 0 {
4300  					z += int(yt.zoffset) + xo
4301  					continue
4302  				}
4303  			case Pq: // 16 bit escape and opcode escape
4304  				ab.Put2(Pe, Pm)
4305  
4306  			case Pq3: // 16 bit escape and opcode escape + REX.W
4307  				ab.rexflag |= Pw
4308  				ab.Put2(Pe, Pm)
4309  
4310  			case Pq4: // 66 0F 38
4311  				ab.Put3(0x66, 0x0F, 0x38)
4312  
4313  			case Pq4w: // 66 0F 38 + REX.W
4314  				ab.rexflag |= Pw
4315  				ab.Put3(0x66, 0x0F, 0x38)
4316  
4317  			case Pq5: // F3 0F 38
4318  				ab.Put3(0xF3, 0x0F, 0x38)
4319  
4320  			case Pq5w: //  F3 0F 38 + REX.W
4321  				ab.rexflag |= Pw
4322  				ab.Put3(0xF3, 0x0F, 0x38)
4323  
4324  			case Pf2, // xmm opcode escape
4325  				Pf3:
4326  				ab.Put2(o.prefix, Pm)
4327  
4328  			case Pef3:
4329  				ab.Put3(Pe, Pf3, Pm)
4330  
4331  			case Pfw: // xmm opcode escape + REX.W
4332  				ab.rexflag |= Pw
4333  				ab.Put2(Pf3, Pm)
4334  
4335  			case Pm: // opcode escape
4336  				ab.Put1(Pm)
4337  
4338  			case Pe: // 16 bit escape
4339  				ab.Put1(Pe)
4340  
4341  			case Pw: // 64-bit escape
4342  				if ctxt.Arch.Family != sys.AMD64 {
4343  					ctxt.Diag("asmins: illegal 64: %v", p)
4344  				}
4345  				ab.rexflag |= Pw
4346  
4347  			case Pw8: // 64-bit escape if z >= 8
4348  				if z >= 8 {
4349  					if ctxt.Arch.Family != sys.AMD64 {
4350  						ctxt.Diag("asmins: illegal 64: %v", p)
4351  					}
4352  					ab.rexflag |= Pw
4353  				}
4354  
4355  			case Pb: // botch
4356  				if ctxt.Arch.Family != sys.AMD64 && (isbadbyte(&p.From) || isbadbyte(&p.To)) {
4357  					goto bad
4358  				}
4359  				// NOTE(rsc): This is probably safe to do always,
4360  				// but when enabled it chooses different encodings
4361  				// than the old cmd/internal/obj/i386 code did,
4362  				// which breaks our "same bits out" checks.
4363  				// In particular, CMPB AX, $0 encodes as 80 f8 00
4364  				// in the original obj/i386, and it would encode
4365  				// (using a valid, shorter form) as 3c 00 if we enabled
4366  				// the call to bytereg here.
4367  				if ctxt.Arch.Family == sys.AMD64 {
4368  					bytereg(&p.From, &p.Ft)
4369  					bytereg(&p.To, &p.Tt)
4370  				}
4371  
4372  			case P32: // 32 bit but illegal if 64-bit mode
4373  				if ctxt.Arch.Family == sys.AMD64 {
4374  					ctxt.Diag("asmins: illegal in 64-bit mode: %v", p)
4375  				}
4376  
4377  			case Py: // 64-bit only, no prefix
4378  				if ctxt.Arch.Family != sys.AMD64 {
4379  					ctxt.Diag("asmins: illegal in %d-bit mode: %v", ctxt.Arch.RegSize*8, p)
4380  				}
4381  
4382  			case Py1: // 64-bit only if z < 1, no prefix
4383  				if z < 1 && ctxt.Arch.Family != sys.AMD64 {
4384  					ctxt.Diag("asmins: illegal in %d-bit mode: %v", ctxt.Arch.RegSize*8, p)
4385  				}
4386  
4387  			case Py3: // 64-bit only if z < 3, no prefix
4388  				if z < 3 && ctxt.Arch.Family != sys.AMD64 {
4389  					ctxt.Diag("asmins: illegal in %d-bit mode: %v", ctxt.Arch.RegSize*8, p)
4390  				}
4391  			}
4392  
4393  			if z >= len(o.op) {
4394  				log.Fatalf("asmins bad table %v", p)
4395  			}
4396  			op = int(o.op[z])
4397  			if op == 0x0f {
4398  				ab.Put1(byte(op))
4399  				z++
4400  				op = int(o.op[z])
4401  			}
4402  
4403  			switch yt.zcase {
4404  			default:
4405  				ctxt.Diag("asmins: unknown z %d %v", yt.zcase, p)
4406  				return
4407  
4408  			case Zpseudo:
4409  				break
4410  
4411  			case Zlit:
4412  				ab.PutOpBytesLit(z, &o.op)
4413  
4414  			case Zlitr_m:
4415  				ab.PutOpBytesLit(z, &o.op)
4416  				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4417  
4418  			case Zlitm_r:
4419  				ab.PutOpBytesLit(z, &o.op)
4420  				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4421  
4422  			case Zlit_m_r:
4423  				ab.PutOpBytesLit(z, &o.op)
4424  				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
4425  
4426  			case Zmb_r:
4427  				bytereg(&p.From, &p.Ft)
4428  				fallthrough
4429  
4430  			case Zm_r:
4431  				ab.Put1(byte(op))
4432  				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4433  
4434  			case Z_m_r:
4435  				ab.Put1(byte(op))
4436  				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
4437  
4438  			case Zm2_r:
4439  				ab.Put2(byte(op), o.op[z+1])
4440  				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4441  
4442  			case Zm_r_xm:
4443  				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
4444  				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4445  
4446  			case Zm_r_xm_nr:
4447  				ab.rexflag = 0
4448  				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
4449  				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4450  
4451  			case Zm_r_i_xm:
4452  				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
4453  				ab.asmand(ctxt, cursym, p, &p.From, p.GetFrom3())
4454  				ab.Put1(byte(p.To.Offset))
4455  
4456  			case Zibm_r, Zibr_m:
4457  				ab.PutOpBytesLit(z, &o.op)
4458  				if yt.zcase == Zibr_m {
4459  					ab.asmand(ctxt, cursym, p, &p.To, p.GetFrom3())
4460  				} else {
4461  					ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
4462  				}
4463  				switch {
4464  				default:
4465  					ab.Put1(byte(p.From.Offset))
4466  				case yt.args[0] == Yi32 && o.prefix == Pe:
4467  					ab.PutInt16(int16(p.From.Offset))
4468  				case yt.args[0] == Yi32:
4469  					ab.PutInt32(int32(p.From.Offset))
4470  				}
4471  
4472  			case Zaut_r:
4473  				ab.Put1(0x8d) // leal
4474  				if p.From.Type != obj.TYPE_ADDR {
4475  					ctxt.Diag("asmins: Zaut sb type ADDR")
4476  				}
4477  				p.From.Type = obj.TYPE_MEM
4478  				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4479  				p.From.Type = obj.TYPE_ADDR
4480  
4481  			case Zm_o:
4482  				ab.Put1(byte(op))
4483  				ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+1]))
4484  
4485  			case Zr_m:
4486  				ab.Put1(byte(op))
4487  				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4488  
4489  			case Zvex:
4490  				ab.asmvex(ctxt, &p.From, p.GetFrom3(), &p.To, o.op[z], o.op[z+1])
4491  
4492  			case Zvex_rm_v_r:
4493  				ab.asmvex(ctxt, &p.From, p.GetFrom3(), &p.To, o.op[z], o.op[z+1])
4494  				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4495  
4496  			case Zvex_rm_v_ro:
4497  				ab.asmvex(ctxt, &p.From, p.GetFrom3(), &p.To, o.op[z], o.op[z+1])
4498  				ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+2]))
4499  
4500  			case Zvex_i_rm_vo:
4501  				ab.asmvex(ctxt, p.GetFrom3(), &p.To, nil, o.op[z], o.op[z+1])
4502  				ab.asmando(ctxt, cursym, p, p.GetFrom3(), int(o.op[z+2]))
4503  				ab.Put1(byte(p.From.Offset))
4504  
4505  			case Zvex_i_r_v:
4506  				ab.asmvex(ctxt, p.GetFrom3(), &p.To, nil, o.op[z], o.op[z+1])
4507  				regnum := byte(0x7)
4508  				if p.GetFrom3().Reg >= REG_X0 && p.GetFrom3().Reg <= REG_X15 {
4509  					regnum &= byte(p.GetFrom3().Reg - REG_X0)
4510  				} else {
4511  					regnum &= byte(p.GetFrom3().Reg - REG_Y0)
4512  				}
4513  				ab.Put1(o.op[z+2] | regnum)
4514  				ab.Put1(byte(p.From.Offset))
4515  
4516  			case Zvex_i_rm_v_r:
4517  				imm, from, from3, to := unpackOps4(p)
4518  				ab.asmvex(ctxt, from, from3, to, o.op[z], o.op[z+1])
4519  				ab.asmand(ctxt, cursym, p, from, to)
4520  				ab.Put1(byte(imm.Offset))
4521  
4522  			case Zvex_i_rm_r:
4523  				ab.asmvex(ctxt, p.GetFrom3(), nil, &p.To, o.op[z], o.op[z+1])
4524  				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
4525  				ab.Put1(byte(p.From.Offset))
4526  
4527  			case Zvex_v_rm_r:
4528  				ab.asmvex(ctxt, p.GetFrom3(), &p.From, &p.To, o.op[z], o.op[z+1])
4529  				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
4530  
4531  			case Zvex_r_v_rm:
4532  				ab.asmvex(ctxt, &p.To, p.GetFrom3(), &p.From, o.op[z], o.op[z+1])
4533  				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4534  
4535  			case Zvex_rm_r_vo:
4536  				ab.asmvex(ctxt, &p.From, &p.To, p.GetFrom3(), o.op[z], o.op[z+1])
4537  				ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+2]))
4538  
4539  			case Zvex_i_r_rm:
4540  				ab.asmvex(ctxt, &p.To, nil, p.GetFrom3(), o.op[z], o.op[z+1])
4541  				ab.asmand(ctxt, cursym, p, &p.To, p.GetFrom3())
4542  				ab.Put1(byte(p.From.Offset))
4543  
4544  			case Zvex_hr_rm_v_r:
4545  				hr, from, from3, to := unpackOps4(p)
4546  				ab.asmvex(ctxt, from, from3, to, o.op[z], o.op[z+1])
4547  				ab.asmand(ctxt, cursym, p, from, to)
4548  				ab.Put1(byte(regIndex(hr.Reg) << 4))
4549  
4550  			case Zevex_k_rmo:
4551  				ab.evex = newEVEXBits(z, &o.op)
4552  				ab.asmevex(ctxt, p, &p.To, nil, nil, &p.From)
4553  				ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+3]))
4554  
4555  			case Zevex_i_rm_vo:
4556  				ab.evex = newEVEXBits(z, &o.op)
4557  				ab.asmevex(ctxt, p, p.GetFrom3(), &p.To, nil, nil)
4558  				ab.asmando(ctxt, cursym, p, p.GetFrom3(), int(o.op[z+3]))
4559  				ab.Put1(byte(p.From.Offset))
4560  
4561  			case Zevex_i_rm_k_vo:
4562  				imm, from, kmask, to := unpackOps4(p)
4563  				ab.evex = newEVEXBits(z, &o.op)
4564  				ab.asmevex(ctxt, p, from, to, nil, kmask)
4565  				ab.asmando(ctxt, cursym, p, from, int(o.op[z+3]))
4566  				ab.Put1(byte(imm.Offset))
4567  
4568  			case Zevex_i_r_rm:
4569  				ab.evex = newEVEXBits(z, &o.op)
4570  				ab.asmevex(ctxt, p, &p.To, nil, p.GetFrom3(), nil)
4571  				ab.asmand(ctxt, cursym, p, &p.To, p.GetFrom3())
4572  				ab.Put1(byte(p.From.Offset))
4573  
4574  			case Zevex_i_r_k_rm:
4575  				imm, from, kmask, to := unpackOps4(p)
4576  				ab.evex = newEVEXBits(z, &o.op)
4577  				ab.asmevex(ctxt, p, to, nil, from, kmask)
4578  				ab.asmand(ctxt, cursym, p, to, from)
4579  				ab.Put1(byte(imm.Offset))
4580  
4581  			case Zevex_i_rm_r:
4582  				ab.evex = newEVEXBits(z, &o.op)
4583  				ab.asmevex(ctxt, p, p.GetFrom3(), nil, &p.To, nil)
4584  				ab.asmand(ctxt, cursym, p, p.GetFrom3(), &p.To)
4585  				ab.Put1(byte(p.From.Offset))
4586  
4587  			case Zevex_i_rm_k_r:
4588  				imm, from, kmask, to := unpackOps4(p)
4589  				ab.evex = newEVEXBits(z, &o.op)
4590  				ab.asmevex(ctxt, p, from, nil, to, kmask)
4591  				ab.asmand(ctxt, cursym, p, from, to)
4592  				ab.Put1(byte(imm.Offset))
4593  
4594  			case Zevex_i_rm_v_r:
4595  				imm, from, from3, to := unpackOps4(p)
4596  				ab.evex = newEVEXBits(z, &o.op)
4597  				ab.asmevex(ctxt, p, from, from3, to, nil)
4598  				ab.asmand(ctxt, cursym, p, from, to)
4599  				ab.Put1(byte(imm.Offset))
4600  
4601  			case Zevex_i_rm_v_k_r:
4602  				imm, from, from3, kmask, to := unpackOps5(p)
4603  				ab.evex = newEVEXBits(z, &o.op)
4604  				ab.asmevex(ctxt, p, from, from3, to, kmask)
4605  				ab.asmand(ctxt, cursym, p, from, to)
4606  				ab.Put1(byte(imm.Offset))
4607  
4608  			case Zevex_r_v_rm:
4609  				ab.evex = newEVEXBits(z, &o.op)
4610  				ab.asmevex(ctxt, p, &p.To, p.GetFrom3(), &p.From, nil)
4611  				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4612  
4613  			case Zevex_rm_v_r:
4614  				ab.evex = newEVEXBits(z, &o.op)
4615  				ab.asmevex(ctxt, p, &p.From, p.GetFrom3(), &p.To, nil)
4616  				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4617  
4618  			case Zevex_rm_k_r:
4619  				ab.evex = newEVEXBits(z, &o.op)
4620  				ab.asmevex(ctxt, p, &p.From, nil, &p.To, p.GetFrom3())
4621  				ab.asmand(ctxt, cursym, p, &p.From, &p.To)
4622  
4623  			case Zevex_r_k_rm:
4624  				ab.evex = newEVEXBits(z, &o.op)
4625  				ab.asmevex(ctxt, p, &p.To, nil, &p.From, p.GetFrom3())
4626  				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4627  
4628  			case Zevex_rm_v_k_r:
4629  				from, from3, kmask, to := unpackOps4(p)
4630  				ab.evex = newEVEXBits(z, &o.op)
4631  				ab.asmevex(ctxt, p, from, from3, to, kmask)
4632  				ab.asmand(ctxt, cursym, p, from, to)
4633  
4634  			case Zevex_r_v_k_rm:
4635  				from, from3, kmask, to := unpackOps4(p)
4636  				ab.evex = newEVEXBits(z, &o.op)
4637  				ab.asmevex(ctxt, p, to, from3, from, kmask)
4638  				ab.asmand(ctxt, cursym, p, to, from)
4639  
4640  			case Zr_m_xm:
4641  				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
4642  				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4643  
4644  			case Zr_m_xm_nr:
4645  				ab.rexflag = 0
4646  				ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
4647  				ab.asmand(ctxt, cursym, p, &p.To, &p.From)
4648  
4649  			case Zo_m:
4650  				ab.Put1(byte(op))
4651  				ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+1]))
4652  
4653  			case Zcallindreg:
4654  				r = obj.Addrel(cursym)
4655  				r.Off = int32(p.Pc)
4656  				r.Type = objabi.R_CALLIND
4657  				r.Siz = 0
4658  				fallthrough
4659  
4660  			case Zo_m64:
4661  				ab.Put1(byte(op))
4662  				ab.asmandsz(ctxt, cursym, p, &p.To, int(o.op[z+1]), 0, 1)
4663  
4664  			case Zm_ibo:
4665  				ab.Put1(byte(op))
4666  				ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+1]))
4667  				ab.Put1(byte(vaddr(ctxt, p, &p.To, nil)))
4668  
4669  			case Zibo_m:
4670  				ab.Put1(byte(op))
4671  				ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+1]))
4672  				ab.Put1(byte(vaddr(ctxt, p, &p.From, nil)))
4673  
4674  			case Zibo_m_xm:
4675  				z = ab.mediaop(ctxt, o, op, int(yt.zoffset), z)
4676  				ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+1]))
4677  				ab.Put1(byte(vaddr(ctxt, p, &p.From, nil)))
4678  
4679  			case Z_ib, Zib_:
4680  				if yt.zcase == Zib_ {
4681  					a = &p.From
4682  				} else {
4683  					a = &p.To
4684  				}
4685  				ab.Put1(byte(op))
4686  				if p.As == AXABORT {
4687  					ab.Put1(o.op[z+1])
4688  				}
4689  				ab.Put1(byte(vaddr(ctxt, p, a, nil)))
4690  
4691  			case Zib_rp:
4692  				ab.rexflag |= regrex[p.To.Reg] & (Rxb | 0x40)
4693  				ab.Put2(byte(op+reg[p.To.Reg]), byte(vaddr(ctxt, p, &p.From, nil)))
4694  
4695  			case Zil_rp:
4696  				ab.rexflag |= regrex[p.To.Reg] & Rxb
4697  				ab.Put1(byte(op + reg[p.To.Reg]))
4698  				if o.prefix == Pe {
4699  					v = vaddr(ctxt, p, &p.From, nil)
4700  					ab.PutInt16(int16(v))
4701  				} else {
4702  					ab.relput4(ctxt, cursym, p, &p.From)
4703  				}
4704  
4705  			case Zo_iw:
4706  				ab.Put1(byte(op))
4707  				if p.From.Type != obj.TYPE_NONE {
4708  					v = vaddr(ctxt, p, &p.From, nil)
4709  					ab.PutInt16(int16(v))
4710  				}
4711  
4712  			case Ziq_rp:
4713  				v = vaddr(ctxt, p, &p.From, &rel)
4714  				l = int(v >> 32)
4715  				if l == 0 && rel.Siz != 8 {
4716  					ab.rexflag &^= (0x40 | Rxw)
4717  
4718  					ab.rexflag |= regrex[p.To.Reg] & Rxb
4719  					ab.Put1(byte(0xb8 + reg[p.To.Reg]))
4720  					if rel.Type != 0 {
4721  						r = obj.Addrel(cursym)
4722  						*r = rel
4723  						r.Off = int32(p.Pc + int64(ab.Len()))
4724  					}
4725  
4726  					ab.PutInt32(int32(v))
4727  				} else if l == -1 && uint64(v)&(uint64(1)<<31) != 0 { // sign extend
4728  					ab.Put1(0xc7)
4729  					ab.asmando(ctxt, cursym, p, &p.To, 0)
4730  
4731  					ab.PutInt32(int32(v)) // need all 8
4732  				} else {
4733  					ab.rexflag |= regrex[p.To.Reg] & Rxb
4734  					ab.Put1(byte(op + reg[p.To.Reg]))
4735  					if rel.Type != 0 {
4736  						r = obj.Addrel(cursym)
4737  						*r = rel
4738  						r.Off = int32(p.Pc + int64(ab.Len()))
4739  					}
4740  
4741  					ab.PutInt64(v)
4742  				}
4743  
4744  			case Zib_rr:
4745  				ab.Put1(byte(op))
4746  				ab.asmand(ctxt, cursym, p, &p.To, &p.To)
4747  				ab.Put1(byte(vaddr(ctxt, p, &p.From, nil)))
4748  
4749  			case Z_il, Zil_:
4750  				if yt.zcase == Zil_ {
4751  					a = &p.From
4752  				} else {
4753  					a = &p.To
4754  				}
4755  				ab.Put1(byte(op))
4756  				if o.prefix == Pe {
4757  					v = vaddr(ctxt, p, a, nil)
4758  					ab.PutInt16(int16(v))
4759  				} else {
4760  					ab.relput4(ctxt, cursym, p, a)
4761  				}
4762  
4763  			case Zm_ilo, Zilo_m:
4764  				ab.Put1(byte(op))
4765  				if yt.zcase == Zilo_m {
4766  					a = &p.From
4767  					ab.asmando(ctxt, cursym, p, &p.To, int(o.op[z+1]))
4768  				} else {
4769  					a = &p.To
4770  					ab.asmando(ctxt, cursym, p, &p.From, int(o.op[z+1]))
4771  				}
4772  
4773  				if o.prefix == Pe {
4774  					v = vaddr(ctxt, p, a, nil)
4775  					ab.PutInt16(int16(v))
4776  				} else {
4777  					ab.relput4(ctxt, cursym, p, a)
4778  				}
4779  
4780  			case Zil_rr:
4781  				ab.Put1(byte(op))
4782  				ab.asmand(ctxt, cursym, p, &p.To, &p.To)
4783  				if o.prefix == Pe {
4784  					v = vaddr(ctxt, p, &p.From, nil)
4785  					ab.PutInt16(int16(v))
4786  				} else {
4787  					ab.relput4(ctxt, cursym, p, &p.From)
4788  				}
4789  
4790  			case Z_rp:
4791  				ab.rexflag |= regrex[p.To.Reg] & (Rxb | 0x40)
4792  				ab.Put1(byte(op + reg[p.To.Reg]))
4793  
4794  			case Zrp_:
4795  				ab.rexflag |= regrex[p.From.Reg] & (Rxb | 0x40)
4796  				ab.Put1(byte(op + reg[p.From.Reg]))
4797  
4798  			case Zcallcon, Zjmpcon:
4799  				if yt.zcase == Zcallcon {
4800  					ab.Put1(byte(op))
4801  				} else {
4802  					ab.Put1(o.op[z+1])
4803  				}
4804  				r = obj.Addrel(cursym)
4805  				r.Off = int32(p.Pc + int64(ab.Len()))
4806  				r.Type = objabi.R_PCREL
4807  				r.Siz = 4
4808  				r.Add = p.To.Offset
4809  				ab.PutInt32(0)
4810  
4811  			case Zcallind:
4812  				ab.Put2(byte(op), o.op[z+1])
4813  				r = obj.Addrel(cursym)
4814  				r.Off = int32(p.Pc + int64(ab.Len()))
4815  				if ctxt.Arch.Family == sys.AMD64 {
4816  					r.Type = objabi.R_PCREL
4817  				} else {
4818  					r.Type = objabi.R_ADDR
4819  				}
4820  				r.Siz = 4
4821  				r.Add = p.To.Offset
4822  				r.Sym = p.To.Sym
4823  				ab.PutInt32(0)
4824  
4825  			case Zcall, Zcallduff:
4826  				if p.To.Sym == nil {
4827  					ctxt.Diag("call without target")
4828  					ctxt.DiagFlush()
4829  					log.Fatalf("bad code")
4830  				}
4831  
4832  				if yt.zcase == Zcallduff && ctxt.Flag_dynlink {
4833  					ctxt.Diag("directly calling duff when dynamically linking Go")
4834  				}
4835  
4836  				if yt.zcase == Zcallduff && ctxt.Arch.Family == sys.AMD64 {
4837  					// Maintain BP around call, since duffcopy/duffzero can't do it
4838  					// (the call jumps into the middle of the function).
4839  					// This makes it possible to see call sites for duffcopy/duffzero in
4840  					// BP-based profiling tools like Linux perf (which is the
4841  					// whole point of maintaining frame pointers in Go).
4842  					// MOVQ BP, -16(SP)
4843  					// LEAQ -16(SP), BP
4844  					ab.Put(bpduff1)
4845  				}
4846  				ab.Put1(byte(op))
4847  				r = obj.Addrel(cursym)
4848  				r.Off = int32(p.Pc + int64(ab.Len()))
4849  				r.Sym = p.To.Sym
4850  				r.Add = p.To.Offset
4851  				r.Type = objabi.R_CALL
4852  				r.Siz = 4
4853  				ab.PutInt32(0)
4854  
4855  				if yt.zcase == Zcallduff && ctxt.Arch.Family == sys.AMD64 {
4856  					// Pop BP pushed above.
4857  					// MOVQ 0(BP), BP
4858  					ab.Put(bpduff2)
4859  				}
4860  
4861  			// TODO: jump across functions needs reloc
4862  			case Zbr, Zjmp, Zloop:
4863  				if p.As == AXBEGIN {
4864  					ab.Put1(byte(op))
4865  				}
4866  				if p.To.Sym != nil {
4867  					if yt.zcase != Zjmp {
4868  						ctxt.Diag("branch to ATEXT")
4869  						ctxt.DiagFlush()
4870  						log.Fatalf("bad code")
4871  					}
4872  
4873  					ab.Put1(o.op[z+1])
4874  					r = obj.Addrel(cursym)
4875  					r.Off = int32(p.Pc + int64(ab.Len()))
4876  					r.Sym = p.To.Sym
4877  					// Note: R_CALL instead of R_PCREL. R_CALL is more permissive in that
4878  					// it can point to a trampoline instead of the destination itself.
4879  					r.Type = objabi.R_CALL
4880  					r.Siz = 4
4881  					ab.PutInt32(0)
4882  					break
4883  				}
4884  
4885  				// Assumes q is in this function.
4886  				// TODO: Check in input, preserve in brchain.
4887  
4888  				// Fill in backward jump now.
4889  				q = p.To.Target()
4890  
4891  				if q == nil {
4892  					ctxt.Diag("jmp/branch/loop without target")
4893  					ctxt.DiagFlush()
4894  					log.Fatalf("bad code")
4895  				}
4896  
4897  				if p.Back&branchBackwards != 0 {
4898  					v = q.Pc - (p.Pc + 2)
4899  					if v >= -128 && p.As != AXBEGIN {
4900  						if p.As == AJCXZL {
4901  							ab.Put1(0x67)
4902  						}
4903  						ab.Put2(byte(op), byte(v))
4904  					} else if yt.zcase == Zloop {
4905  						ctxt.Diag("loop too far: %v", p)
4906  					} else {
4907  						v -= 5 - 2
4908  						if p.As == AXBEGIN {
4909  							v--
4910  						}
4911  						if yt.zcase == Zbr {
4912  							ab.Put1(0x0f)
4913  							v--
4914  						}
4915  
4916  						ab.Put1(o.op[z+1])
4917  						ab.PutInt32(int32(v))
4918  					}
4919  
4920  					break
4921  				}
4922  
4923  				// Annotate target; will fill in later.
4924  				p.Forwd = q.Rel
4925  
4926  				q.Rel = p
4927  				if p.Back&branchShort != 0 && p.As != AXBEGIN {
4928  					if p.As == AJCXZL {
4929  						ab.Put1(0x67)
4930  					}
4931  					ab.Put2(byte(op), 0)
4932  				} else if yt.zcase == Zloop {
4933  					ctxt.Diag("loop too far: %v", p)
4934  				} else {
4935  					if yt.zcase == Zbr {
4936  						ab.Put1(0x0f)
4937  					}
4938  					ab.Put1(o.op[z+1])
4939  					ab.PutInt32(0)
4940  				}
4941  
4942  			case Zbyte:
4943  				v = vaddr(ctxt, p, &p.From, &rel)
4944  				if rel.Siz != 0 {
4945  					rel.Siz = uint8(op)
4946  					r = obj.Addrel(cursym)
4947  					*r = rel
4948  					r.Off = int32(p.Pc + int64(ab.Len()))
4949  				}
4950  
4951  				ab.Put1(byte(v))
4952  				if op > 1 {
4953  					ab.Put1(byte(v >> 8))
4954  					if op > 2 {
4955  						ab.PutInt16(int16(v >> 16))
4956  						if op > 4 {
4957  							ab.PutInt32(int32(v >> 32))
4958  						}
4959  					}
4960  				}
4961  			}
4962  
4963  			return
4964  		}
4965  	}
4966  	f3t = Ynone * Ymax
4967  	if p.GetFrom3() != nil {
4968  		f3t = oclass(ctxt, p, p.GetFrom3()) * Ymax
4969  	}
4970  	for mo := ymovtab; mo[0].as != 0; mo = mo[1:] {
4971  		var pp obj.Prog
4972  		var t []byte
4973  		if p.As == mo[0].as {
4974  			if ycover[ft+int(mo[0].ft)] != 0 && ycover[f3t+int(mo[0].f3t)] != 0 && ycover[tt+int(mo[0].tt)] != 0 {
4975  				t = mo[0].op[:]
4976  				switch mo[0].code {
4977  				default:
4978  					ctxt.Diag("asmins: unknown mov %d %v", mo[0].code, p)
4979  
4980  				case movLit:
4981  					for z = 0; t[z] != 0; z++ {
4982  						ab.Put1(t[z])
4983  					}
4984  
4985  				case movRegMem:
4986  					ab.Put1(t[0])
4987  					ab.asmando(ctxt, cursym, p, &p.To, int(t[1]))
4988  
4989  				case movMemReg:
4990  					ab.Put1(t[0])
4991  					ab.asmando(ctxt, cursym, p, &p.From, int(t[1]))
4992  
4993  				case movRegMem2op: // r,m - 2op
4994  					ab.Put2(t[0], t[1])
4995  					ab.asmando(ctxt, cursym, p, &p.To, int(t[2]))
4996  					ab.rexflag |= regrex[p.From.Reg] & (Rxr | 0x40)
4997  
4998  				case movMemReg2op:
4999  					ab.Put2(t[0], t[1])
5000  					ab.asmando(ctxt, cursym, p, &p.From, int(t[2]))
5001  					ab.rexflag |= regrex[p.To.Reg] & (Rxr | 0x40)
5002  
5003  				case movFullPtr:
5004  					if t[0] != 0 {
5005  						ab.Put1(t[0])
5006  					}
5007  					switch p.To.Index {
5008  					default:
5009  						goto bad
5010  
5011  					case REG_DS:
5012  						ab.Put1(0xc5)
5013  
5014  					case REG_SS:
5015  						ab.Put2(0x0f, 0xb2)
5016  
5017  					case REG_ES:
5018  						ab.Put1(0xc4)
5019  
5020  					case REG_FS:
5021  						ab.Put2(0x0f, 0xb4)
5022  
5023  					case REG_GS:
5024  						ab.Put2(0x0f, 0xb5)
5025  					}
5026  
5027  					ab.asmand(ctxt, cursym, p, &p.From, &p.To)
5028  
5029  				case movDoubleShift:
5030  					if t[0] == Pw {
5031  						if ctxt.Arch.Family != sys.AMD64 {
5032  							ctxt.Diag("asmins: illegal 64: %v", p)
5033  						}
5034  						ab.rexflag |= Pw
5035  						t = t[1:]
5036  					} else if t[0] == Pe {
5037  						ab.Put1(Pe)
5038  						t = t[1:]
5039  					}
5040  
5041  					switch p.From.Type {
5042  					default:
5043  						goto bad
5044  
5045  					case obj.TYPE_CONST:
5046  						ab.Put2(0x0f, t[0])
5047  						ab.asmandsz(ctxt, cursym, p, &p.To, reg[p.GetFrom3().Reg], regrex[p.GetFrom3().Reg], 0)
5048  						ab.Put1(byte(p.From.Offset))
5049  
5050  					case obj.TYPE_REG:
5051  						switch p.From.Reg {
5052  						default:
5053  							goto bad
5054  
5055  						case REG_CL, REG_CX:
5056  							ab.Put2(0x0f, t[1])
5057  							ab.asmandsz(ctxt, cursym, p, &p.To, reg[p.GetFrom3().Reg], regrex[p.GetFrom3().Reg], 0)
5058  						}
5059  					}
5060  
5061  				// NOTE: The systems listed here are the ones that use the "TLS initial exec" model,
5062  				// where you load the TLS base register into a register and then index off that
5063  				// register to access the actual TLS variables. Systems that allow direct TLS access
5064  				// are handled in prefixof above and should not be listed here.
5065  				case movTLSReg:
5066  					if ctxt.Arch.Family == sys.AMD64 && p.As != AMOVQ || ctxt.Arch.Family == sys.I386 && p.As != AMOVL {
5067  						ctxt.Diag("invalid load of TLS: %v", p)
5068  					}
5069  
5070  					if ctxt.Arch.Family == sys.I386 {
5071  						// NOTE: The systems listed here are the ones that use the "TLS initial exec" model,
5072  						// where you load the TLS base register into a register and then index off that
5073  						// register to access the actual TLS variables. Systems that allow direct TLS access
5074  						// are handled in prefixof above and should not be listed here.
5075  						switch ctxt.Headtype {
5076  						default:
5077  							log.Fatalf("unknown TLS base location for %v", ctxt.Headtype)
5078  
5079  						case objabi.Hlinux, objabi.Hfreebsd:
5080  							if ctxt.Flag_shared {
5081  								// Note that this is not generating the same insns as the other cases.
5082  								//     MOV TLS, dst
5083  								// becomes
5084  								//     call __x86.get_pc_thunk.dst
5085  								//     movl (gotpc + g@gotntpoff)(dst), dst
5086  								// which is encoded as
5087  								//     call __x86.get_pc_thunk.dst
5088  								//     movq 0(dst), dst
5089  								// and R_CALL & R_TLS_IE relocs. This all assumes the only tls variable we access
5090  								// is g, which we can't check here, but will when we assemble the second
5091  								// instruction.
5092  								dst := p.To.Reg
5093  								ab.Put1(0xe8)
5094  								r = obj.Addrel(cursym)
5095  								r.Off = int32(p.Pc + int64(ab.Len()))
5096  								r.Type = objabi.R_CALL
5097  								r.Siz = 4
5098  								r.Sym = ctxt.Lookup("__x86.get_pc_thunk." + strings.ToLower(rconv(int(dst))))
5099  								ab.PutInt32(0)
5100  
5101  								ab.Put2(0x8B, byte(2<<6|reg[dst]|(reg[dst]<<3)))
5102  								r = obj.Addrel(cursym)
5103  								r.Off = int32(p.Pc + int64(ab.Len()))
5104  								r.Type = objabi.R_TLS_IE
5105  								r.Siz = 4
5106  								r.Add = 2
5107  								ab.PutInt32(0)
5108  							} else {
5109  								// ELF TLS base is 0(GS).
5110  								pp.From = p.From
5111  
5112  								pp.From.Type = obj.TYPE_MEM
5113  								pp.From.Reg = REG_GS
5114  								pp.From.Offset = 0
5115  								pp.From.Index = REG_NONE
5116  								pp.From.Scale = 0
5117  								ab.Put2(0x65, // GS
5118  									0x8B)
5119  								ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
5120  							}
5121  						case objabi.Hplan9:
5122  							pp.From = obj.Addr{}
5123  							pp.From.Type = obj.TYPE_MEM
5124  							pp.From.Name = obj.NAME_EXTERN
5125  							pp.From.Sym = plan9privates
5126  							pp.From.Offset = 0
5127  							pp.From.Index = REG_NONE
5128  							ab.Put1(0x8B)
5129  							ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
5130  
5131  						case objabi.Hwindows:
5132  							// Windows TLS base is always 0x14(FS).
5133  							pp.From = p.From
5134  
5135  							pp.From.Type = obj.TYPE_MEM
5136  							pp.From.Reg = REG_FS
5137  							pp.From.Offset = 0x14
5138  							pp.From.Index = REG_NONE
5139  							pp.From.Scale = 0
5140  							ab.Put2(0x64, // FS
5141  								0x8B)
5142  							ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
5143  						}
5144  						break
5145  					}
5146  
5147  					switch ctxt.Headtype {
5148  					default:
5149  						log.Fatalf("unknown TLS base location for %v", ctxt.Headtype)
5150  
5151  					case objabi.Hlinux, objabi.Hfreebsd:
5152  						if !ctxt.Flag_shared {
5153  							log.Fatalf("unknown TLS base location for linux/freebsd without -shared")
5154  						}
5155  						// Note that this is not generating the same insn as the other cases.
5156  						//     MOV TLS, R_to
5157  						// becomes
5158  						//     movq g@gottpoff(%rip), R_to
5159  						// which is encoded as
5160  						//     movq 0(%rip), R_to
5161  						// and a R_TLS_IE reloc. This all assumes the only tls variable we access
5162  						// is g, which we can't check here, but will when we assemble the second
5163  						// instruction.
5164  						ab.rexflag = Pw | (regrex[p.To.Reg] & Rxr)
5165  
5166  						ab.Put2(0x8B, byte(0x05|(reg[p.To.Reg]<<3)))
5167  						r = obj.Addrel(cursym)
5168  						r.Off = int32(p.Pc + int64(ab.Len()))
5169  						r.Type = objabi.R_TLS_IE
5170  						r.Siz = 4
5171  						r.Add = -4
5172  						ab.PutInt32(0)
5173  
5174  					case objabi.Hplan9:
5175  						pp.From = obj.Addr{}
5176  						pp.From.Type = obj.TYPE_MEM
5177  						pp.From.Name = obj.NAME_EXTERN
5178  						pp.From.Sym = plan9privates
5179  						pp.From.Offset = 0
5180  						pp.From.Index = REG_NONE
5181  						ab.rexflag |= Pw
5182  						ab.Put1(0x8B)
5183  						ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
5184  
5185  					case objabi.Hsolaris: // TODO(rsc): Delete Hsolaris from list. Should not use this code. See progedit in obj6.c.
5186  						// TLS base is 0(FS).
5187  						pp.From = p.From
5188  
5189  						pp.From.Type = obj.TYPE_MEM
5190  						pp.From.Name = obj.NAME_NONE
5191  						pp.From.Reg = REG_NONE
5192  						pp.From.Offset = 0
5193  						pp.From.Index = REG_NONE
5194  						pp.From.Scale = 0
5195  						ab.rexflag |= Pw
5196  						ab.Put2(0x64, // FS
5197  							0x8B)
5198  						ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
5199  
5200  					case objabi.Hwindows:
5201  						// Windows TLS base is always 0x28(GS).
5202  						pp.From = p.From
5203  
5204  						pp.From.Type = obj.TYPE_MEM
5205  						pp.From.Name = obj.NAME_NONE
5206  						pp.From.Reg = REG_GS
5207  						pp.From.Offset = 0x28
5208  						pp.From.Index = REG_NONE
5209  						pp.From.Scale = 0
5210  						ab.rexflag |= Pw
5211  						ab.Put2(0x65, // GS
5212  							0x8B)
5213  						ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
5214  					}
5215  				}
5216  				return
5217  			}
5218  		}
5219  	}
5220  	goto bad
5221  
5222  bad:
5223  	if ctxt.Arch.Family != sys.AMD64 {
5224  		// here, the assembly has failed.
5225  		// if it's a byte instruction that has
5226  		// unaddressable registers, try to
5227  		// exchange registers and reissue the
5228  		// instruction with the operands renamed.
5229  		pp := *p
5230  
5231  		unbytereg(&pp.From, &pp.Ft)
5232  		unbytereg(&pp.To, &pp.Tt)
5233  
5234  		z := int(p.From.Reg)
5235  		if p.From.Type == obj.TYPE_REG && z >= REG_BP && z <= REG_DI {
5236  			// TODO(rsc): Use this code for x86-64 too. It has bug fixes not present in the amd64 code base.
5237  			// For now, different to keep bit-for-bit compatibility.
5238  			if ctxt.Arch.Family == sys.I386 {
5239  				breg := byteswapreg(ctxt, &p.To)
5240  				if breg != REG_AX {
5241  					ab.Put1(0x87) // xchg lhs,bx
5242  					ab.asmando(ctxt, cursym, p, &p.From, reg[breg])
5243  					subreg(&pp, z, breg)
5244  					ab.doasm(ctxt, cursym, &pp)
5245  					ab.Put1(0x87) // xchg lhs,bx
5246  					ab.asmando(ctxt, cursym, p, &p.From, reg[breg])
5247  				} else {
5248  					ab.Put1(byte(0x90 + reg[z])) // xchg lsh,ax
5249  					subreg(&pp, z, REG_AX)
5250  					ab.doasm(ctxt, cursym, &pp)
5251  					ab.Put1(byte(0x90 + reg[z])) // xchg lsh,ax
5252  				}
5253  				return
5254  			}
5255  
5256  			if isax(&p.To) || p.To.Type == obj.TYPE_NONE {
5257  				// We certainly don't want to exchange
5258  				// with AX if the op is MUL or DIV.
5259  				ab.Put1(0x87) // xchg lhs,bx
5260  				ab.asmando(ctxt, cursym, p, &p.From, reg[REG_BX])
5261  				subreg(&pp, z, REG_BX)
5262  				ab.doasm(ctxt, cursym, &pp)
5263  				ab.Put1(0x87) // xchg lhs,bx
5264  				ab.asmando(ctxt, cursym, p, &p.From, reg[REG_BX])
5265  			} else {
5266  				ab.Put1(byte(0x90 + reg[z])) // xchg lsh,ax
5267  				subreg(&pp, z, REG_AX)
5268  				ab.doasm(ctxt, cursym, &pp)
5269  				ab.Put1(byte(0x90 + reg[z])) // xchg lsh,ax
5270  			}
5271  			return
5272  		}
5273  
5274  		z = int(p.To.Reg)
5275  		if p.To.Type == obj.TYPE_REG && z >= REG_BP && z <= REG_DI {
5276  			// TODO(rsc): Use this code for x86-64 too. It has bug fixes not present in the amd64 code base.
5277  			// For now, different to keep bit-for-bit compatibility.
5278  			if ctxt.Arch.Family == sys.I386 {
5279  				breg := byteswapreg(ctxt, &p.From)
5280  				if breg != REG_AX {
5281  					ab.Put1(0x87) //xchg rhs,bx
5282  					ab.asmando(ctxt, cursym, p, &p.To, reg[breg])
5283  					subreg(&pp, z, breg)
5284  					ab.doasm(ctxt, cursym, &pp)
5285  					ab.Put1(0x87) // xchg rhs,bx
5286  					ab.asmando(ctxt, cursym, p, &p.To, reg[breg])
5287  				} else {
5288  					ab.Put1(byte(0x90 + reg[z])) // xchg rsh,ax
5289  					subreg(&pp, z, REG_AX)
5290  					ab.doasm(ctxt, cursym, &pp)
5291  					ab.Put1(byte(0x90 + reg[z])) // xchg rsh,ax
5292  				}
5293  				return
5294  			}
5295  
5296  			if isax(&p.From) {
5297  				ab.Put1(0x87) // xchg rhs,bx
5298  				ab.asmando(ctxt, cursym, p, &p.To, reg[REG_BX])
5299  				subreg(&pp, z, REG_BX)
5300  				ab.doasm(ctxt, cursym, &pp)
5301  				ab.Put1(0x87) // xchg rhs,bx
5302  				ab.asmando(ctxt, cursym, p, &p.To, reg[REG_BX])
5303  			} else {
5304  				ab.Put1(byte(0x90 + reg[z])) // xchg rsh,ax
5305  				subreg(&pp, z, REG_AX)
5306  				ab.doasm(ctxt, cursym, &pp)
5307  				ab.Put1(byte(0x90 + reg[z])) // xchg rsh,ax
5308  			}
5309  			return
5310  		}
5311  	}
5312  
5313  	ctxt.Diag("invalid instruction: %v", p)
5314  }
5315  
5316  // byteswapreg returns a byte-addressable register (AX, BX, CX, DX)
5317  // which is not referenced in a.
5318  // If a is empty, it returns BX to account for MULB-like instructions
5319  // that might use DX and AX.
5320  func byteswapreg(ctxt *obj.Link, a *obj.Addr) int {
5321  	cana, canb, canc, cand := true, true, true, true
5322  	if a.Type == obj.TYPE_NONE {
5323  		cana, cand = false, false
5324  	}
5325  
5326  	if a.Type == obj.TYPE_REG || ((a.Type == obj.TYPE_MEM || a.Type == obj.TYPE_ADDR) && a.Name == obj.NAME_NONE) {
5327  		switch a.Reg {
5328  		case REG_NONE:
5329  			cana, cand = false, false
5330  		case REG_AX, REG_AL, REG_AH:
5331  			cana = false
5332  		case REG_BX, REG_BL, REG_BH:
5333  			canb = false
5334  		case REG_CX, REG_CL, REG_CH:
5335  			canc = false
5336  		case REG_DX, REG_DL, REG_DH:
5337  			cand = false
5338  		}
5339  	}
5340  
5341  	if a.Type == obj.TYPE_MEM || a.Type == obj.TYPE_ADDR {
5342  		switch a.Index {
5343  		case REG_AX:
5344  			cana = false
5345  		case REG_BX:
5346  			canb = false
5347  		case REG_CX:
5348  			canc = false
5349  		case REG_DX:
5350  			cand = false
5351  		}
5352  	}
5353  
5354  	switch {
5355  	case cana:
5356  		return REG_AX
5357  	case canb:
5358  		return REG_BX
5359  	case canc:
5360  		return REG_CX
5361  	case cand:
5362  		return REG_DX
5363  	default:
5364  		ctxt.Diag("impossible byte register")
5365  		ctxt.DiagFlush()
5366  		log.Fatalf("bad code")
5367  		return 0
5368  	}
5369  }
5370  
5371  func isbadbyte(a *obj.Addr) bool {
5372  	return a.Type == obj.TYPE_REG && (REG_BP <= a.Reg && a.Reg <= REG_DI || REG_BPB <= a.Reg && a.Reg <= REG_DIB)
5373  }
5374  
5375  func (ab *AsmBuf) asmins(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog) {
5376  	ab.Reset()
5377  
5378  	ab.rexflag = 0
5379  	ab.vexflag = false
5380  	ab.evexflag = false
5381  	mark := ab.Len()
5382  	ab.doasm(ctxt, cursym, p)
5383  	if ab.rexflag != 0 && !ab.vexflag && !ab.evexflag {
5384  		// as befits the whole approach of the architecture,
5385  		// the rex prefix must appear before the first opcode byte
5386  		// (and thus after any 66/67/f2/f3/26/2e/3e prefix bytes, but
5387  		// before the 0f opcode escape!), or it might be ignored.
5388  		// note that the handbook often misleadingly shows 66/f2/f3 in `opcode'.
5389  		if ctxt.Arch.Family != sys.AMD64 {
5390  			ctxt.Diag("asmins: illegal in mode %d: %v (%d %d)", ctxt.Arch.RegSize*8, p, p.Ft, p.Tt)
5391  		}
5392  		n := ab.Len()
5393  		var np int
5394  		for np = mark; np < n; np++ {
5395  			c := ab.At(np)
5396  			if c != 0xf2 && c != 0xf3 && (c < 0x64 || c > 0x67) && c != 0x2e && c != 0x3e && c != 0x26 {
5397  				break
5398  			}
5399  		}
5400  		ab.Insert(np, byte(0x40|ab.rexflag))
5401  	}
5402  
5403  	n := ab.Len()
5404  	for i := len(cursym.R) - 1; i >= 0; i-- {
5405  		r := &cursym.R[i]
5406  		if int64(r.Off) < p.Pc {
5407  			break
5408  		}
5409  		if ab.rexflag != 0 && !ab.vexflag && !ab.evexflag {
5410  			r.Off++
5411  		}
5412  		if r.Type == objabi.R_PCREL {
5413  			if ctxt.Arch.Family == sys.AMD64 || p.As == obj.AJMP || p.As == obj.ACALL {
5414  				// PC-relative addressing is relative to the end of the instruction,
5415  				// but the relocations applied by the linker are relative to the end
5416  				// of the relocation. Because immediate instruction
5417  				// arguments can follow the PC-relative memory reference in the
5418  				// instruction encoding, the two may not coincide. In this case,
5419  				// adjust addend so that linker can keep relocating relative to the
5420  				// end of the relocation.
5421  				r.Add -= p.Pc + int64(n) - (int64(r.Off) + int64(r.Siz))
5422  			} else if ctxt.Arch.Family == sys.I386 {
5423  				// On 386 PC-relative addressing (for non-call/jmp instructions)
5424  				// assumes that the previous instruction loaded the PC of the end
5425  				// of that instruction into CX, so the adjustment is relative to
5426  				// that.
5427  				r.Add += int64(r.Off) - p.Pc + int64(r.Siz)
5428  			}
5429  		}
5430  		if r.Type == objabi.R_GOTPCREL && ctxt.Arch.Family == sys.I386 {
5431  			// On 386, R_GOTPCREL makes the same assumptions as R_PCREL.
5432  			r.Add += int64(r.Off) - p.Pc + int64(r.Siz)
5433  		}
5434  
5435  	}
5436  }
5437  
5438  // unpackOps4 extracts 4 operands from p.
5439  func unpackOps4(p *obj.Prog) (arg0, arg1, arg2, dst *obj.Addr) {
5440  	return &p.From, &p.RestArgs[0], &p.RestArgs[1], &p.To
5441  }
5442  
5443  // unpackOps5 extracts 5 operands from p.
5444  func unpackOps5(p *obj.Prog) (arg0, arg1, arg2, arg3, dst *obj.Addr) {
5445  	return &p.From, &p.RestArgs[0], &p.RestArgs[1], &p.RestArgs[2], &p.To
5446  }
5447