amd64.s raw

   1  // Code generated by command: go run src.go -out ../amd64.s -stubs ../stubs_amd64.go -pkg common. DO NOT EDIT.
   2  
   3  //go:build amd64 && !purego
   4  
   5  #include "textflag.h"
   6  
   7  // func addAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
   8  // Requires: AVX, AVX2
      //
      // addAVX2 stores the element-wise sum a[i] + b[i] of the 256 int16 values
      // at a and b into p. VPADDW is a wrapping 16-bit add, so a sum that does
      // not fit in int16 wraps around; no modular reduction is performed here.
      //
      // The 512 bytes are handled as two 256-byte halves. Each half is loaded
      // completely (a into the even Y registers, b into the odd ones) before
      // anything is stored, so p may be the very same array as a or b.
      //
      // Registers: AX = p, CX = a, DX = b. Y0-Y15 are all overwritten.
      //
      // NOTE: the file header marks this file as generated; the VZEROUPPER
      // added before RET must also be emitted by the generator or it will be
      // lost the next time the file is regenerated.
   9  TEXT ·addAVX2(SB), NOSPLIT, $0-24
  10  	MOVQ    p+0(FP), AX
  11  	MOVQ    a+8(FP), CX
  12  	MOVQ    b+16(FP), DX
      	// First half: a[0:128] -> Y0, Y2, ..., Y14.
  13  	VMOVDQU (CX), Y0
  14  	VMOVDQU 32(CX), Y2
  15  	VMOVDQU 64(CX), Y4
  16  	VMOVDQU 96(CX), Y6
  17  	VMOVDQU 128(CX), Y8
  18  	VMOVDQU 160(CX), Y10
  19  	VMOVDQU 192(CX), Y12
  20  	VMOVDQU 224(CX), Y14
      	// First half: b[0:128] -> Y1, Y3, ..., Y15.
  21  	VMOVDQU (DX), Y1
  22  	VMOVDQU 32(DX), Y3
  23  	VMOVDQU 64(DX), Y5
  24  	VMOVDQU 96(DX), Y7
  25  	VMOVDQU 128(DX), Y9
  26  	VMOVDQU 160(DX), Y11
  27  	VMOVDQU 192(DX), Y13
  28  	VMOVDQU 224(DX), Y15
      	// odd = even + odd, i.e. a + b, sixteen 16-bit lanes per register.
  29  	VPADDW  Y0, Y1, Y1
  30  	VPADDW  Y2, Y3, Y3
  31  	VPADDW  Y4, Y5, Y5
  32  	VPADDW  Y6, Y7, Y7
  33  	VPADDW  Y8, Y9, Y9
  34  	VPADDW  Y10, Y11, Y11
  35  	VPADDW  Y12, Y13, Y13
  36  	VPADDW  Y14, Y15, Y15
      	// Write p[0:128].
  37  	VMOVDQU Y1, (AX)
  38  	VMOVDQU Y3, 32(AX)
  39  	VMOVDQU Y5, 64(AX)
  40  	VMOVDQU Y7, 96(AX)
  41  	VMOVDQU Y9, 128(AX)
  42  	VMOVDQU Y11, 160(AX)
  43  	VMOVDQU Y13, 192(AX)
  44  	VMOVDQU Y15, 224(AX)
      	// Second half: same sequence at byte offsets 256-511.
  45  	VMOVDQU 256(CX), Y0
  46  	VMOVDQU 288(CX), Y2
  47  	VMOVDQU 320(CX), Y4
  48  	VMOVDQU 352(CX), Y6
  49  	VMOVDQU 384(CX), Y8
  50  	VMOVDQU 416(CX), Y10
  51  	VMOVDQU 448(CX), Y12
  52  	VMOVDQU 480(CX), Y14
  53  	VMOVDQU 256(DX), Y1
  54  	VMOVDQU 288(DX), Y3
  55  	VMOVDQU 320(DX), Y5
  56  	VMOVDQU 352(DX), Y7
  57  	VMOVDQU 384(DX), Y9
  58  	VMOVDQU 416(DX), Y11
  59  	VMOVDQU 448(DX), Y13
  60  	VMOVDQU 480(DX), Y15
  61  	VPADDW  Y0, Y1, Y1
  62  	VPADDW  Y2, Y3, Y3
  63  	VPADDW  Y4, Y5, Y5
  64  	VPADDW  Y6, Y7, Y7
  65  	VPADDW  Y8, Y9, Y9
  66  	VPADDW  Y10, Y11, Y11
  67  	VPADDW  Y12, Y13, Y13
  68  	VPADDW  Y14, Y15, Y15
  69  	VMOVDQU Y1, 256(AX)
  70  	VMOVDQU Y3, 288(AX)
  71  	VMOVDQU Y5, 320(AX)
  72  	VMOVDQU Y7, 352(AX)
  73  	VMOVDQU Y9, 384(AX)
  74  	VMOVDQU Y11, 416(AX)
  75  	VMOVDQU Y13, 448(AX)
  76  	VMOVDQU Y15, 480(AX)
      	// Clear the upper YMM halves so that legacy-SSE instructions executed
      	// after return do not incur AVX/SSE transition penalties.
      	VZEROUPPER
  77  	RET
  78  
  79  // func subAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
  80  // Requires: AVX, AVX2
      //
      // subAVX2 stores the element-wise difference a[i] - b[i] of the 256 int16
      // values at a and b into p. VPSUBW is a wrapping 16-bit subtract, so a
      // difference that does not fit in int16 wraps around; no modular
      // reduction is performed here.
      //
      // The 512 bytes are handled as two 256-byte halves. Each half is loaded
      // completely (a into the even Y registers, b into the odd ones) before
      // anything is stored, so p may be the very same array as a or b.
      //
      // Registers: AX = p, CX = a, DX = b. Y0-Y15 are all overwritten.
      //
      // NOTE: the file header marks this file as generated; the VZEROUPPER
      // added before RET must also be emitted by the generator or it will be
      // lost the next time the file is regenerated.
  81  TEXT ·subAVX2(SB), NOSPLIT, $0-24
  82  	MOVQ    p+0(FP), AX
  83  	MOVQ    a+8(FP), CX
  84  	MOVQ    b+16(FP), DX
      	// First half: a[0:128] -> Y0, Y2, ..., Y14.
  85  	VMOVDQU (CX), Y0
  86  	VMOVDQU 32(CX), Y2
  87  	VMOVDQU 64(CX), Y4
  88  	VMOVDQU 96(CX), Y6
  89  	VMOVDQU 128(CX), Y8
  90  	VMOVDQU 160(CX), Y10
  91  	VMOVDQU 192(CX), Y12
  92  	VMOVDQU 224(CX), Y14
      	// First half: b[0:128] -> Y1, Y3, ..., Y15.
  93  	VMOVDQU (DX), Y1
  94  	VMOVDQU 32(DX), Y3
  95  	VMOVDQU 64(DX), Y5
  96  	VMOVDQU 96(DX), Y7
  97  	VMOVDQU 128(DX), Y9
  98  	VMOVDQU 160(DX), Y11
  99  	VMOVDQU 192(DX), Y13
 100  	VMOVDQU 224(DX), Y15
      	// Go operand order is (subtrahend, minuend, destination):
      	// odd = even - odd, i.e. a - b.
 101  	VPSUBW  Y1, Y0, Y1
 102  	VPSUBW  Y3, Y2, Y3
 103  	VPSUBW  Y5, Y4, Y5
 104  	VPSUBW  Y7, Y6, Y7
 105  	VPSUBW  Y9, Y8, Y9
 106  	VPSUBW  Y11, Y10, Y11
 107  	VPSUBW  Y13, Y12, Y13
 108  	VPSUBW  Y15, Y14, Y15
      	// Write p[0:128].
 109  	VMOVDQU Y1, (AX)
 110  	VMOVDQU Y3, 32(AX)
 111  	VMOVDQU Y5, 64(AX)
 112  	VMOVDQU Y7, 96(AX)
 113  	VMOVDQU Y9, 128(AX)
 114  	VMOVDQU Y11, 160(AX)
 115  	VMOVDQU Y13, 192(AX)
 116  	VMOVDQU Y15, 224(AX)
      	// Second half: same sequence at byte offsets 256-511.
 117  	VMOVDQU 256(CX), Y0
 118  	VMOVDQU 288(CX), Y2
 119  	VMOVDQU 320(CX), Y4
 120  	VMOVDQU 352(CX), Y6
 121  	VMOVDQU 384(CX), Y8
 122  	VMOVDQU 416(CX), Y10
 123  	VMOVDQU 448(CX), Y12
 124  	VMOVDQU 480(CX), Y14
 125  	VMOVDQU 256(DX), Y1
 126  	VMOVDQU 288(DX), Y3
 127  	VMOVDQU 320(DX), Y5
 128  	VMOVDQU 352(DX), Y7
 129  	VMOVDQU 384(DX), Y9
 130  	VMOVDQU 416(DX), Y11
 131  	VMOVDQU 448(DX), Y13
 132  	VMOVDQU 480(DX), Y15
 133  	VPSUBW  Y1, Y0, Y1
 134  	VPSUBW  Y3, Y2, Y3
 135  	VPSUBW  Y5, Y4, Y5
 136  	VPSUBW  Y7, Y6, Y7
 137  	VPSUBW  Y9, Y8, Y9
 138  	VPSUBW  Y11, Y10, Y11
 139  	VPSUBW  Y13, Y12, Y13
 140  	VPSUBW  Y15, Y14, Y15
 141  	VMOVDQU Y1, 256(AX)
 142  	VMOVDQU Y3, 288(AX)
 143  	VMOVDQU Y5, 320(AX)
 144  	VMOVDQU Y7, 352(AX)
 145  	VMOVDQU Y9, 384(AX)
 146  	VMOVDQU Y11, 416(AX)
 147  	VMOVDQU Y13, 448(AX)
 148  	VMOVDQU Y15, 480(AX)
      	// Clear the upper YMM halves so that legacy-SSE instructions executed
      	// after return do not incur AVX/SSE transition penalties.
      	VZEROUPPER
 149  	RET
 150  
 151  // func nttAVX2(p *[256]int16)
 152  // Requires: AVX, AVX2
      //
      // nttAVX2 rewrites the 256 int16 values at p in place by running seven
      // rounds of multiply-then-add/subtract butterflies. The multipliers come
      // from the ·ZetasAVX2 table, which is declared outside this file.
      // NOTE(review): the name suggests a forward number-theoretic transform;
      // confirm the exact transform and table layout against the generator.
      //
      // Register roles:
      //   AX      = p
      //   CX      = &ZetasAVX2
      //   Y15     = 3329 (0x0d01) in every 16-bit lane, presumably the modulus q
      //   Y7-Y14  = the eight coefficient vectors currently being processed
      //   Y0-Y3   = multipliers for the current round
      //   Y4-Y6   = scratch (Y0 is reused as scratch once consumed)
      //
      // Every multiply step computes, per 16-bit lane,
      //   t = hi16(x*c1) - hi16(3329 * lo16(x*c0))
      // where c0 and c1 are two table values loaded together. This has the
      // shape of a Montgomery product with c0 = c1 * q^-1 mod 2^16; that
      // relation between table entries is an assumption to be confirmed. The
      // butterfly then replaces (u, x) with (u + t, u - t) using wrapping
      // 16-bit arithmetic.
      //
      // Round 1 pairs the first 256 bytes with the last 256 bytes. Rounds 2-7
      // are then run on each 256-byte half separately, entirely in registers.
      // Rounds 4-7 first shuffle register pairs at 128-, 64-, 32- and 16-bit
      // granularity; the results are stored without undoing those shuffles, so
      // each half ends up in the permuted order the shuffles produce.
      //
      // NOTE: the file header marks this file as generated; the VZEROUPPER
      // added before RET must also be emitted by the generator or it will be
      // lost the next time the file is regenerated.
 153  TEXT ·nttAVX2(SB), NOSPLIT, $0-8
 154  	MOVQ         p+0(FP), AX
 155  	LEAQ         ·ZetasAVX2+0(SB), CX
      	// Y15 = 3329 broadcast to all sixteen 16-bit lanes.
 156  	MOVL         $0x00000d01, DX
 157  	VMOVD        DX, X0
 158  	VPBROADCASTW X0, Y15
      	// Round 1: multipliers are table words 0 and 1, broadcast to every lane.
 159  	VPBROADCASTW (CX), Y0
 160  	VPBROADCASTW 2(CX), Y1
      	// Round 1, part 1: bytes 0-127 (Y7-Y10) against bytes 256-383 (Y11-Y14).
 161  	VMOVDQU      (AX), Y7
 162  	VMOVDQU      32(AX), Y8
 163  	VMOVDQU      64(AX), Y9
 164  	VMOVDQU      96(AX), Y10
 165  	VMOVDQU      256(AX), Y11
 166  	VMOVDQU      288(AX), Y12
 167  	VMOVDQU      320(AX), Y13
 168  	VMOVDQU      352(AX), Y14
      	// t = hi16(x*Y1) - hi16(3329 * lo16(x*Y0)) for x in Y11-Y14 -> Y2-Y5.
 169  	VPMULLW      Y11, Y0, Y2
 170  	VPMULLW      Y12, Y0, Y3
 171  	VPMULLW      Y13, Y0, Y4
 172  	VPMULLW      Y14, Y0, Y5
 173  	VPMULHW      Y11, Y1, Y11
 174  	VPMULHW      Y12, Y1, Y12
 175  	VPMULHW      Y13, Y1, Y13
 176  	VPMULHW      Y14, Y1, Y14
 177  	VPMULHW      Y2, Y15, Y2
 178  	VPMULHW      Y3, Y15, Y3
 179  	VPMULHW      Y4, Y15, Y4
 180  	VPMULHW      Y5, Y15, Y5
 181  	VPSUBW       Y2, Y11, Y2
 182  	VPSUBW       Y3, Y12, Y3
 183  	VPSUBW       Y4, Y13, Y4
 184  	VPSUBW       Y5, Y14, Y5
      	// Butterfly: Y11-Y14 = Y7-Y10 - t, then Y7-Y10 = Y7-Y10 + t.
 185  	VPSUBW       Y2, Y7, Y11
 186  	VPSUBW       Y3, Y8, Y12
 187  	VPSUBW       Y4, Y9, Y13
 188  	VPSUBW       Y5, Y10, Y14
 189  	VPADDW       Y2, Y7, Y7
 190  	VPADDW       Y3, Y8, Y8
 191  	VPADDW       Y4, Y9, Y9
 192  	VPADDW       Y5, Y10, Y10
 193  	VMOVDQU      Y7, (AX)
 194  	VMOVDQU      Y8, 32(AX)
 195  	VMOVDQU      Y9, 64(AX)
 196  	VMOVDQU      Y10, 96(AX)
 197  	VMOVDQU      Y11, 256(AX)
 198  	VMOVDQU      Y12, 288(AX)
 199  	VMOVDQU      Y13, 320(AX)
 200  	VMOVDQU      Y14, 352(AX)
      	// Round 1, part 2: bytes 128-255 against bytes 384-511, same multipliers.
 201  	VMOVDQU      128(AX), Y7
 202  	VMOVDQU      160(AX), Y8
 203  	VMOVDQU      192(AX), Y9
 204  	VMOVDQU      224(AX), Y10
 205  	VMOVDQU      384(AX), Y11
 206  	VMOVDQU      416(AX), Y12
 207  	VMOVDQU      448(AX), Y13
 208  	VMOVDQU      480(AX), Y14
 209  	VPMULLW      Y11, Y0, Y2
 210  	VPMULLW      Y12, Y0, Y3
 211  	VPMULLW      Y13, Y0, Y4
 212  	VPMULLW      Y14, Y0, Y5
 213  	VPMULHW      Y11, Y1, Y11
 214  	VPMULHW      Y12, Y1, Y12
 215  	VPMULHW      Y13, Y1, Y13
 216  	VPMULHW      Y14, Y1, Y14
 217  	VPMULHW      Y2, Y15, Y2
 218  	VPMULHW      Y3, Y15, Y3
 219  	VPMULHW      Y4, Y15, Y4
 220  	VPMULHW      Y5, Y15, Y5
 221  	VPSUBW       Y2, Y11, Y2
 222  	VPSUBW       Y3, Y12, Y3
 223  	VPSUBW       Y4, Y13, Y4
 224  	VPSUBW       Y5, Y14, Y5
 225  	VPSUBW       Y2, Y7, Y11
 226  	VPSUBW       Y3, Y8, Y12
 227  	VPSUBW       Y4, Y9, Y13
 228  	VPSUBW       Y5, Y10, Y14
 229  	VPADDW       Y2, Y7, Y7
 230  	VPADDW       Y3, Y8, Y8
 231  	VPADDW       Y4, Y9, Y9
 232  	VPADDW       Y5, Y10, Y10
 233  	VMOVDQU      Y7, 128(AX)
 234  	VMOVDQU      Y8, 160(AX)
 235  	VMOVDQU      Y9, 192(AX)
 236  	VMOVDQU      Y10, 224(AX)
 237  	VMOVDQU      Y11, 384(AX)
 238  	VMOVDQU      Y12, 416(AX)
 239  	VMOVDQU      Y13, 448(AX)
 240  	VMOVDQU      Y14, 480(AX)
      	// ---- First half (bytes 0-255): rounds 2-7 stay in Y7-Y14. ----
      	// Round 2: multipliers are table words 2 and 3; Y7-Y10 against Y11-Y14.
 241  	VPBROADCASTW 4(CX), Y0
 242  	VPBROADCASTW 6(CX), Y1
 243  	VMOVDQU      (AX), Y7
 244  	VMOVDQU      32(AX), Y8
 245  	VMOVDQU      64(AX), Y9
 246  	VMOVDQU      96(AX), Y10
 247  	VMOVDQU      128(AX), Y11
 248  	VMOVDQU      160(AX), Y12
 249  	VMOVDQU      192(AX), Y13
 250  	VMOVDQU      224(AX), Y14
 251  	VPMULLW      Y11, Y0, Y2
 252  	VPMULLW      Y12, Y0, Y3
 253  	VPMULLW      Y13, Y0, Y4
 254  	VPMULLW      Y14, Y0, Y5
 255  	VPMULHW      Y11, Y1, Y11
 256  	VPMULHW      Y12, Y1, Y12
 257  	VPMULHW      Y13, Y1, Y13
 258  	VPMULHW      Y14, Y1, Y14
 259  	VPMULHW      Y2, Y15, Y2
 260  	VPMULHW      Y3, Y15, Y3
 261  	VPMULHW      Y4, Y15, Y4
 262  	VPMULHW      Y5, Y15, Y5
 263  	VPSUBW       Y2, Y11, Y2
 264  	VPSUBW       Y3, Y12, Y3
 265  	VPSUBW       Y4, Y13, Y4
 266  	VPSUBW       Y5, Y14, Y5
 267  	VPSUBW       Y2, Y7, Y11
 268  	VPSUBW       Y3, Y8, Y12
 269  	VPSUBW       Y4, Y9, Y13
 270  	VPSUBW       Y5, Y10, Y14
 271  	VPADDW       Y2, Y7, Y7
 272  	VPADDW       Y3, Y8, Y8
 273  	VPADDW       Y4, Y9, Y9
 274  	VPADDW       Y5, Y10, Y10
      	// Round 3: (Y7,Y8) against (Y9,Y10) with table words 6/7 in Y0/Y1, and
      	// (Y11,Y12) against (Y13,Y14) with table words 8/9 in Y2/Y3. Y0 is
      	// reused as the fourth scratch register after its last multiplier use.
 275  	VPBROADCASTW 12(CX), Y0
 276  	VPBROADCASTW 14(CX), Y1
 277  	VPBROADCASTW 16(CX), Y2
 278  	VPBROADCASTW 18(CX), Y3
 279  	VPMULLW      Y9, Y0, Y4
 280  	VPMULLW      Y10, Y0, Y5
 281  	VPMULLW      Y13, Y2, Y6
 282  	VPMULLW      Y14, Y2, Y0
 283  	VPMULHW      Y9, Y1, Y9
 284  	VPMULHW      Y10, Y1, Y10
 285  	VPMULHW      Y13, Y3, Y13
 286  	VPMULHW      Y14, Y3, Y14
 287  	VPMULHW      Y4, Y15, Y4
 288  	VPMULHW      Y5, Y15, Y5
 289  	VPMULHW      Y6, Y15, Y6
 290  	VPMULHW      Y0, Y15, Y0
 291  	VPSUBW       Y4, Y9, Y4
 292  	VPSUBW       Y5, Y10, Y5
 293  	VPSUBW       Y6, Y13, Y6
 294  	VPSUBW       Y0, Y14, Y0
 295  	VPSUBW       Y4, Y7, Y9
 296  	VPSUBW       Y5, Y8, Y10
 297  	VPSUBW       Y6, Y11, Y13
 298  	VPSUBW       Y0, Y12, Y14
 299  	VPADDW       Y4, Y7, Y7
 300  	VPADDW       Y5, Y8, Y8
 301  	VPADDW       Y6, Y11, Y11
 302  	VPADDW       Y0, Y12, Y12
      	// Round 4: per-lane multipliers from table bytes 32-159.
 303  	VMOVDQU      32(CX), Y0
 304  	VMOVDQU      64(CX), Y1
 305  	VMOVDQU      96(CX), Y2
 306  	VMOVDQU      128(CX), Y3
      	// Regroup 128-bit halves of the pairs (Y7,Y9), (Y8,Y10), (Y11,Y13),
      	// (Y12,Y14): first register gets both low halves, second both high.
 307  	VPERM2I128   $0x20, Y9, Y7, Y4
 308  	VPERM2I128   $0x31, Y9, Y7, Y9
 309  	VMOVDQA      Y4, Y7
 310  	VPERM2I128   $0x20, Y10, Y8, Y4
 311  	VPERM2I128   $0x31, Y10, Y8, Y10
 312  	VMOVDQA      Y4, Y8
 313  	VPERM2I128   $0x20, Y13, Y11, Y4
 314  	VPERM2I128   $0x31, Y13, Y11, Y13
 315  	VMOVDQA      Y4, Y11
 316  	VPERM2I128   $0x20, Y14, Y12, Y4
 317  	VPERM2I128   $0x31, Y14, Y12, Y14
 318  	VMOVDQA      Y4, Y12
      	// Butterflies on (Y7,Y8), (Y9,Y10), (Y11,Y12), (Y13,Y14).
 319  	VPMULLW      Y8, Y0, Y4
 320  	VPMULLW      Y10, Y0, Y5
 321  	VPMULLW      Y12, Y2, Y6
 322  	VPMULLW      Y14, Y2, Y0
 323  	VPMULHW      Y8, Y1, Y8
 324  	VPMULHW      Y10, Y1, Y10
 325  	VPMULHW      Y12, Y3, Y12
 326  	VPMULHW      Y14, Y3, Y14
 327  	VPMULHW      Y4, Y15, Y4
 328  	VPMULHW      Y5, Y15, Y5
 329  	VPMULHW      Y6, Y15, Y6
 330  	VPMULHW      Y0, Y15, Y0
 331  	VPSUBW       Y4, Y8, Y4
 332  	VPSUBW       Y5, Y10, Y5
 333  	VPSUBW       Y6, Y12, Y6
 334  	VPSUBW       Y0, Y14, Y0
 335  	VPSUBW       Y4, Y7, Y8
 336  	VPSUBW       Y5, Y9, Y10
 337  	VPSUBW       Y6, Y11, Y12
 338  	VPSUBW       Y0, Y13, Y14
 339  	VPADDW       Y4, Y7, Y7
 340  	VPADDW       Y5, Y9, Y9
 341  	VPADDW       Y6, Y11, Y11
 342  	VPADDW       Y0, Y13, Y13
      	// Round 5: per-lane multipliers from table bytes 288-415.
 343  	VMOVDQU      288(CX), Y0
 344  	VMOVDQU      320(CX), Y1
 345  	VMOVDQU      352(CX), Y2
 346  	VMOVDQU      384(CX), Y3
      	// Regroup 64-bit words of the pairs (Y7,Y8), (Y9,Y10), (Y11,Y12),
      	// (Y13,Y14): first register gets the low quadword of each 128-bit
      	// lane from both inputs, second register the high quadwords.
 347  	VPUNPCKLQDQ  Y8, Y7, Y4
 348  	VPUNPCKHQDQ  Y8, Y7, Y8
 349  	VMOVDQA      Y4, Y7
 350  	VPUNPCKLQDQ  Y10, Y9, Y4
 351  	VPUNPCKHQDQ  Y10, Y9, Y10
 352  	VMOVDQA      Y4, Y9
 353  	VPUNPCKLQDQ  Y12, Y11, Y4
 354  	VPUNPCKHQDQ  Y12, Y11, Y12
 355  	VMOVDQA      Y4, Y11
 356  	VPUNPCKLQDQ  Y14, Y13, Y4
 357  	VPUNPCKHQDQ  Y14, Y13, Y14
 358  	VMOVDQA      Y4, Y13
      	// Butterflies on (Y7,Y9), (Y8,Y10), (Y11,Y13), (Y12,Y14).
 359  	VPMULLW      Y9, Y0, Y4
 360  	VPMULLW      Y10, Y0, Y5
 361  	VPMULLW      Y13, Y2, Y6
 362  	VPMULLW      Y14, Y2, Y0
 363  	VPMULHW      Y9, Y1, Y9
 364  	VPMULHW      Y10, Y1, Y10
 365  	VPMULHW      Y13, Y3, Y13
 366  	VPMULHW      Y14, Y3, Y14
 367  	VPMULHW      Y4, Y15, Y4
 368  	VPMULHW      Y5, Y15, Y5
 369  	VPMULHW      Y6, Y15, Y6
 370  	VPMULHW      Y0, Y15, Y0
 371  	VPSUBW       Y4, Y9, Y4
 372  	VPSUBW       Y5, Y10, Y5
 373  	VPSUBW       Y6, Y13, Y6
 374  	VPSUBW       Y0, Y14, Y0
 375  	VPSUBW       Y4, Y7, Y9
 376  	VPSUBW       Y5, Y8, Y10
 377  	VPSUBW       Y6, Y11, Y13
 378  	VPSUBW       Y0, Y12, Y14
 379  	VPADDW       Y4, Y7, Y7
 380  	VPADDW       Y5, Y8, Y8
 381  	VPADDW       Y6, Y11, Y11
 382  	VPADDW       Y0, Y12, Y12
      	// Round 6: per-lane multipliers from table bytes 544-671.
 383  	VMOVDQU      544(CX), Y0
 384  	VMOVDQU      576(CX), Y1
 385  	VMOVDQU      608(CX), Y2
 386  	VMOVDQU      640(CX), Y3
      	// Regroup 32-bit words of the pairs (Y7,Y9), (Y8,Y10), (Y11,Y13),
      	// (Y12,Y14): first register gets the even dwords of both inputs,
      	// second register the odd dwords.
 387  	VMOVSLDUP    Y9, Y4
 388  	VPBLENDD     $0xaa, Y4, Y7, Y4
 389  	VPSRLQ       $0x20, Y7, Y7
 390  	VPBLENDD     $0xaa, Y9, Y7, Y9
 391  	VMOVDQA      Y4, Y7
 392  	VMOVSLDUP    Y10, Y4
 393  	VPBLENDD     $0xaa, Y4, Y8, Y4
 394  	VPSRLQ       $0x20, Y8, Y8
 395  	VPBLENDD     $0xaa, Y10, Y8, Y10
 396  	VMOVDQA      Y4, Y8
 397  	VMOVSLDUP    Y13, Y4
 398  	VPBLENDD     $0xaa, Y4, Y11, Y4
 399  	VPSRLQ       $0x20, Y11, Y11
 400  	VPBLENDD     $0xaa, Y13, Y11, Y13
 401  	VMOVDQA      Y4, Y11
 402  	VMOVSLDUP    Y14, Y4
 403  	VPBLENDD     $0xaa, Y4, Y12, Y4
 404  	VPSRLQ       $0x20, Y12, Y12
 405  	VPBLENDD     $0xaa, Y14, Y12, Y14
 406  	VMOVDQA      Y4, Y12
      	// Butterflies on (Y7,Y8), (Y9,Y10), (Y11,Y12), (Y13,Y14).
 407  	VPMULLW      Y8, Y0, Y4
 408  	VPMULLW      Y10, Y0, Y5
 409  	VPMULLW      Y12, Y2, Y6
 410  	VPMULLW      Y14, Y2, Y0
 411  	VPMULHW      Y8, Y1, Y8
 412  	VPMULHW      Y10, Y1, Y10
 413  	VPMULHW      Y12, Y3, Y12
 414  	VPMULHW      Y14, Y3, Y14
 415  	VPMULHW      Y4, Y15, Y4
 416  	VPMULHW      Y5, Y15, Y5
 417  	VPMULHW      Y6, Y15, Y6
 418  	VPMULHW      Y0, Y15, Y0
 419  	VPSUBW       Y4, Y8, Y4
 420  	VPSUBW       Y5, Y10, Y5
 421  	VPSUBW       Y6, Y12, Y6
 422  	VPSUBW       Y0, Y14, Y0
 423  	VPSUBW       Y4, Y7, Y8
 424  	VPSUBW       Y5, Y9, Y10
 425  	VPSUBW       Y6, Y11, Y12
 426  	VPSUBW       Y0, Y13, Y14
 427  	VPADDW       Y4, Y7, Y7
 428  	VPADDW       Y5, Y9, Y9
 429  	VPADDW       Y6, Y11, Y11
 430  	VPADDW       Y0, Y13, Y13
      	// Round 7: per-lane multipliers from table bytes 800-927.
 431  	VMOVDQU      800(CX), Y0
 432  	VMOVDQU      832(CX), Y1
 433  	VMOVDQU      864(CX), Y2
 434  	VMOVDQU      896(CX), Y3
      	// Regroup 16-bit words of the pairs (Y7,Y8), (Y9,Y10), (Y11,Y12),
      	// (Y13,Y14): first register gets the even words of both inputs,
      	// second register the odd words.
 435  	VPSLLD       $0x10, Y8, Y4
 436  	VPBLENDW     $0xaa, Y4, Y7, Y4
 437  	VPSRLD       $0x10, Y7, Y7
 438  	VPBLENDW     $0xaa, Y8, Y7, Y8
 439  	VMOVDQA      Y4, Y7
 440  	VPSLLD       $0x10, Y10, Y4
 441  	VPBLENDW     $0xaa, Y4, Y9, Y4
 442  	VPSRLD       $0x10, Y9, Y9
 443  	VPBLENDW     $0xaa, Y10, Y9, Y10
 444  	VMOVDQA      Y4, Y9
 445  	VPSLLD       $0x10, Y12, Y4
 446  	VPBLENDW     $0xaa, Y4, Y11, Y4
 447  	VPSRLD       $0x10, Y11, Y11
 448  	VPBLENDW     $0xaa, Y12, Y11, Y12
 449  	VMOVDQA      Y4, Y11
 450  	VPSLLD       $0x10, Y14, Y4
 451  	VPBLENDW     $0xaa, Y4, Y13, Y4
 452  	VPSRLD       $0x10, Y13, Y13
 453  	VPBLENDW     $0xaa, Y14, Y13, Y14
 454  	VMOVDQA      Y4, Y13
      	// Butterflies on (Y7,Y9), (Y8,Y10), (Y11,Y13), (Y12,Y14).
 455  	VPMULLW      Y9, Y0, Y4
 456  	VPMULLW      Y10, Y0, Y5
 457  	VPMULLW      Y13, Y2, Y6
 458  	VPMULLW      Y14, Y2, Y0
 459  	VPMULHW      Y9, Y1, Y9
 460  	VPMULHW      Y10, Y1, Y10
 461  	VPMULHW      Y13, Y3, Y13
 462  	VPMULHW      Y14, Y3, Y14
 463  	VPMULHW      Y4, Y15, Y4
 464  	VPMULHW      Y5, Y15, Y5
 465  	VPMULHW      Y6, Y15, Y6
 466  	VPMULHW      Y0, Y15, Y0
 467  	VPSUBW       Y4, Y9, Y4
 468  	VPSUBW       Y5, Y10, Y5
 469  	VPSUBW       Y6, Y13, Y6
 470  	VPSUBW       Y0, Y14, Y0
 471  	VPSUBW       Y4, Y7, Y9
 472  	VPSUBW       Y5, Y8, Y10
 473  	VPSUBW       Y6, Y11, Y13
 474  	VPSUBW       Y0, Y12, Y14
 475  	VPADDW       Y4, Y7, Y7
 476  	VPADDW       Y5, Y8, Y8
 477  	VPADDW       Y6, Y11, Y11
 478  	VPADDW       Y0, Y12, Y12
      	// Store the first half; the shuffles above are not undone.
 479  	VMOVDQU      Y7, (AX)
 480  	VMOVDQU      Y8, 32(AX)
 481  	VMOVDQU      Y9, 64(AX)
 482  	VMOVDQU      Y10, 96(AX)
 483  	VMOVDQU      Y11, 128(AX)
 484  	VMOVDQU      Y12, 160(AX)
 485  	VMOVDQU      Y13, 192(AX)
 486  	VMOVDQU      Y14, 224(AX)
      	// ---- Second half (bytes 256-511): rounds 2-7, same structure. ----
      	// Round 2: multipliers are table words 4 and 5.
 487  	VPBROADCASTW 8(CX), Y0
 488  	VPBROADCASTW 10(CX), Y1
 489  	VMOVDQU      256(AX), Y7
 490  	VMOVDQU      288(AX), Y8
 491  	VMOVDQU      320(AX), Y9
 492  	VMOVDQU      352(AX), Y10
 493  	VMOVDQU      384(AX), Y11
 494  	VMOVDQU      416(AX), Y12
 495  	VMOVDQU      448(AX), Y13
 496  	VMOVDQU      480(AX), Y14
 497  	VPMULLW      Y11, Y0, Y2
 498  	VPMULLW      Y12, Y0, Y3
 499  	VPMULLW      Y13, Y0, Y4
 500  	VPMULLW      Y14, Y0, Y5
 501  	VPMULHW      Y11, Y1, Y11
 502  	VPMULHW      Y12, Y1, Y12
 503  	VPMULHW      Y13, Y1, Y13
 504  	VPMULHW      Y14, Y1, Y14
 505  	VPMULHW      Y2, Y15, Y2
 506  	VPMULHW      Y3, Y15, Y3
 507  	VPMULHW      Y4, Y15, Y4
 508  	VPMULHW      Y5, Y15, Y5
 509  	VPSUBW       Y2, Y11, Y2
 510  	VPSUBW       Y3, Y12, Y3
 511  	VPSUBW       Y4, Y13, Y4
 512  	VPSUBW       Y5, Y14, Y5
 513  	VPSUBW       Y2, Y7, Y11
 514  	VPSUBW       Y3, Y8, Y12
 515  	VPSUBW       Y4, Y9, Y13
 516  	VPSUBW       Y5, Y10, Y14
 517  	VPADDW       Y2, Y7, Y7
 518  	VPADDW       Y3, Y8, Y8
 519  	VPADDW       Y4, Y9, Y9
 520  	VPADDW       Y5, Y10, Y10
      	// Round 3: table words 10/11 for (Y9,Y10), words 12/13 for (Y13,Y14).
 521  	VPBROADCASTW 20(CX), Y0
 522  	VPBROADCASTW 22(CX), Y1
 523  	VPBROADCASTW 24(CX), Y2
 524  	VPBROADCASTW 26(CX), Y3
 525  	VPMULLW      Y9, Y0, Y4
 526  	VPMULLW      Y10, Y0, Y5
 527  	VPMULLW      Y13, Y2, Y6
 528  	VPMULLW      Y14, Y2, Y0
 529  	VPMULHW      Y9, Y1, Y9
 530  	VPMULHW      Y10, Y1, Y10
 531  	VPMULHW      Y13, Y3, Y13
 532  	VPMULHW      Y14, Y3, Y14
 533  	VPMULHW      Y4, Y15, Y4
 534  	VPMULHW      Y5, Y15, Y5
 535  	VPMULHW      Y6, Y15, Y6
 536  	VPMULHW      Y0, Y15, Y0
 537  	VPSUBW       Y4, Y9, Y4
 538  	VPSUBW       Y5, Y10, Y5
 539  	VPSUBW       Y6, Y13, Y6
 540  	VPSUBW       Y0, Y14, Y0
 541  	VPSUBW       Y4, Y7, Y9
 542  	VPSUBW       Y5, Y8, Y10
 543  	VPSUBW       Y6, Y11, Y13
 544  	VPSUBW       Y0, Y12, Y14
 545  	VPADDW       Y4, Y7, Y7
 546  	VPADDW       Y5, Y8, Y8
 547  	VPADDW       Y6, Y11, Y11
 548  	VPADDW       Y0, Y12, Y12
      	// Round 4: per-lane multipliers from table bytes 160-287; 128-bit regroup.
 549  	VMOVDQU      160(CX), Y0
 550  	VMOVDQU      192(CX), Y1
 551  	VMOVDQU      224(CX), Y2
 552  	VMOVDQU      256(CX), Y3
 553  	VPERM2I128   $0x20, Y9, Y7, Y4
 554  	VPERM2I128   $0x31, Y9, Y7, Y9
 555  	VMOVDQA      Y4, Y7
 556  	VPERM2I128   $0x20, Y10, Y8, Y4
 557  	VPERM2I128   $0x31, Y10, Y8, Y10
 558  	VMOVDQA      Y4, Y8
 559  	VPERM2I128   $0x20, Y13, Y11, Y4
 560  	VPERM2I128   $0x31, Y13, Y11, Y13
 561  	VMOVDQA      Y4, Y11
 562  	VPERM2I128   $0x20, Y14, Y12, Y4
 563  	VPERM2I128   $0x31, Y14, Y12, Y14
 564  	VMOVDQA      Y4, Y12
 565  	VPMULLW      Y8, Y0, Y4
 566  	VPMULLW      Y10, Y0, Y5
 567  	VPMULLW      Y12, Y2, Y6
 568  	VPMULLW      Y14, Y2, Y0
 569  	VPMULHW      Y8, Y1, Y8
 570  	VPMULHW      Y10, Y1, Y10
 571  	VPMULHW      Y12, Y3, Y12
 572  	VPMULHW      Y14, Y3, Y14
 573  	VPMULHW      Y4, Y15, Y4
 574  	VPMULHW      Y5, Y15, Y5
 575  	VPMULHW      Y6, Y15, Y6
 576  	VPMULHW      Y0, Y15, Y0
 577  	VPSUBW       Y4, Y8, Y4
 578  	VPSUBW       Y5, Y10, Y5
 579  	VPSUBW       Y6, Y12, Y6
 580  	VPSUBW       Y0, Y14, Y0
 581  	VPSUBW       Y4, Y7, Y8
 582  	VPSUBW       Y5, Y9, Y10
 583  	VPSUBW       Y6, Y11, Y12
 584  	VPSUBW       Y0, Y13, Y14
 585  	VPADDW       Y4, Y7, Y7
 586  	VPADDW       Y5, Y9, Y9
 587  	VPADDW       Y6, Y11, Y11
 588  	VPADDW       Y0, Y13, Y13
      	// Round 5: per-lane multipliers from table bytes 416-543; 64-bit regroup.
 589  	VMOVDQU      416(CX), Y0
 590  	VMOVDQU      448(CX), Y1
 591  	VMOVDQU      480(CX), Y2
 592  	VMOVDQU      512(CX), Y3
 593  	VPUNPCKLQDQ  Y8, Y7, Y4
 594  	VPUNPCKHQDQ  Y8, Y7, Y8
 595  	VMOVDQA      Y4, Y7
 596  	VPUNPCKLQDQ  Y10, Y9, Y4
 597  	VPUNPCKHQDQ  Y10, Y9, Y10
 598  	VMOVDQA      Y4, Y9
 599  	VPUNPCKLQDQ  Y12, Y11, Y4
 600  	VPUNPCKHQDQ  Y12, Y11, Y12
 601  	VMOVDQA      Y4, Y11
 602  	VPUNPCKLQDQ  Y14, Y13, Y4
 603  	VPUNPCKHQDQ  Y14, Y13, Y14
 604  	VMOVDQA      Y4, Y13
 605  	VPMULLW      Y9, Y0, Y4
 606  	VPMULLW      Y10, Y0, Y5
 607  	VPMULLW      Y13, Y2, Y6
 608  	VPMULLW      Y14, Y2, Y0
 609  	VPMULHW      Y9, Y1, Y9
 610  	VPMULHW      Y10, Y1, Y10
 611  	VPMULHW      Y13, Y3, Y13
 612  	VPMULHW      Y14, Y3, Y14
 613  	VPMULHW      Y4, Y15, Y4
 614  	VPMULHW      Y5, Y15, Y5
 615  	VPMULHW      Y6, Y15, Y6
 616  	VPMULHW      Y0, Y15, Y0
 617  	VPSUBW       Y4, Y9, Y4
 618  	VPSUBW       Y5, Y10, Y5
 619  	VPSUBW       Y6, Y13, Y6
 620  	VPSUBW       Y0, Y14, Y0
 621  	VPSUBW       Y4, Y7, Y9
 622  	VPSUBW       Y5, Y8, Y10
 623  	VPSUBW       Y6, Y11, Y13
 624  	VPSUBW       Y0, Y12, Y14
 625  	VPADDW       Y4, Y7, Y7
 626  	VPADDW       Y5, Y8, Y8
 627  	VPADDW       Y6, Y11, Y11
 628  	VPADDW       Y0, Y12, Y12
      	// Round 6: per-lane multipliers from table bytes 672-799; 32-bit regroup.
 629  	VMOVDQU      672(CX), Y0
 630  	VMOVDQU      704(CX), Y1
 631  	VMOVDQU      736(CX), Y2
 632  	VMOVDQU      768(CX), Y3
 633  	VMOVSLDUP    Y9, Y4
 634  	VPBLENDD     $0xaa, Y4, Y7, Y4
 635  	VPSRLQ       $0x20, Y7, Y7
 636  	VPBLENDD     $0xaa, Y9, Y7, Y9
 637  	VMOVDQA      Y4, Y7
 638  	VMOVSLDUP    Y10, Y4
 639  	VPBLENDD     $0xaa, Y4, Y8, Y4
 640  	VPSRLQ       $0x20, Y8, Y8
 641  	VPBLENDD     $0xaa, Y10, Y8, Y10
 642  	VMOVDQA      Y4, Y8
 643  	VMOVSLDUP    Y13, Y4
 644  	VPBLENDD     $0xaa, Y4, Y11, Y4
 645  	VPSRLQ       $0x20, Y11, Y11
 646  	VPBLENDD     $0xaa, Y13, Y11, Y13
 647  	VMOVDQA      Y4, Y11
 648  	VMOVSLDUP    Y14, Y4
 649  	VPBLENDD     $0xaa, Y4, Y12, Y4
 650  	VPSRLQ       $0x20, Y12, Y12
 651  	VPBLENDD     $0xaa, Y14, Y12, Y14
 652  	VMOVDQA      Y4, Y12
 653  	VPMULLW      Y8, Y0, Y4
 654  	VPMULLW      Y10, Y0, Y5
 655  	VPMULLW      Y12, Y2, Y6
 656  	VPMULLW      Y14, Y2, Y0
 657  	VPMULHW      Y8, Y1, Y8
 658  	VPMULHW      Y10, Y1, Y10
 659  	VPMULHW      Y12, Y3, Y12
 660  	VPMULHW      Y14, Y3, Y14
 661  	VPMULHW      Y4, Y15, Y4
 662  	VPMULHW      Y5, Y15, Y5
 663  	VPMULHW      Y6, Y15, Y6
 664  	VPMULHW      Y0, Y15, Y0
 665  	VPSUBW       Y4, Y8, Y4
 666  	VPSUBW       Y5, Y10, Y5
 667  	VPSUBW       Y6, Y12, Y6
 668  	VPSUBW       Y0, Y14, Y0
 669  	VPSUBW       Y4, Y7, Y8
 670  	VPSUBW       Y5, Y9, Y10
 671  	VPSUBW       Y6, Y11, Y12
 672  	VPSUBW       Y0, Y13, Y14
 673  	VPADDW       Y4, Y7, Y7
 674  	VPADDW       Y5, Y9, Y9
 675  	VPADDW       Y6, Y11, Y11
 676  	VPADDW       Y0, Y13, Y13
      	// Round 7: per-lane multipliers from table bytes 928-1055; 16-bit regroup.
 677  	VMOVDQU      928(CX), Y0
 678  	VMOVDQU      960(CX), Y1
 679  	VMOVDQU      992(CX), Y2
 680  	VMOVDQU      1024(CX), Y3
 681  	VPSLLD       $0x10, Y8, Y4
 682  	VPBLENDW     $0xaa, Y4, Y7, Y4
 683  	VPSRLD       $0x10, Y7, Y7
 684  	VPBLENDW     $0xaa, Y8, Y7, Y8
 685  	VMOVDQA      Y4, Y7
 686  	VPSLLD       $0x10, Y10, Y4
 687  	VPBLENDW     $0xaa, Y4, Y9, Y4
 688  	VPSRLD       $0x10, Y9, Y9
 689  	VPBLENDW     $0xaa, Y10, Y9, Y10
 690  	VMOVDQA      Y4, Y9
 691  	VPSLLD       $0x10, Y12, Y4
 692  	VPBLENDW     $0xaa, Y4, Y11, Y4
 693  	VPSRLD       $0x10, Y11, Y11
 694  	VPBLENDW     $0xaa, Y12, Y11, Y12
 695  	VMOVDQA      Y4, Y11
 696  	VPSLLD       $0x10, Y14, Y4
 697  	VPBLENDW     $0xaa, Y4, Y13, Y4
 698  	VPSRLD       $0x10, Y13, Y13
 699  	VPBLENDW     $0xaa, Y14, Y13, Y14
 700  	VMOVDQA      Y4, Y13
 701  	VPMULLW      Y9, Y0, Y4
 702  	VPMULLW      Y10, Y0, Y5
 703  	VPMULLW      Y13, Y2, Y6
 704  	VPMULLW      Y14, Y2, Y0
 705  	VPMULHW      Y9, Y1, Y9
 706  	VPMULHW      Y10, Y1, Y10
 707  	VPMULHW      Y13, Y3, Y13
 708  	VPMULHW      Y14, Y3, Y14
 709  	VPMULHW      Y4, Y15, Y4
 710  	VPMULHW      Y5, Y15, Y5
 711  	VPMULHW      Y6, Y15, Y6
 712  	VPMULHW      Y0, Y15, Y0
 713  	VPSUBW       Y4, Y9, Y4
 714  	VPSUBW       Y5, Y10, Y5
 715  	VPSUBW       Y6, Y13, Y6
 716  	VPSUBW       Y0, Y14, Y0
 717  	VPSUBW       Y4, Y7, Y9
 718  	VPSUBW       Y5, Y8, Y10
 719  	VPSUBW       Y6, Y11, Y13
 720  	VPSUBW       Y0, Y12, Y14
 721  	VPADDW       Y4, Y7, Y7
 722  	VPADDW       Y5, Y8, Y8
 723  	VPADDW       Y6, Y11, Y11
 724  	VPADDW       Y0, Y12, Y12
      	// Store the second half; the shuffles above are not undone.
 725  	VMOVDQU      Y7, 256(AX)
 726  	VMOVDQU      Y8, 288(AX)
 727  	VMOVDQU      Y9, 320(AX)
 728  	VMOVDQU      Y10, 352(AX)
 729  	VMOVDQU      Y11, 384(AX)
 730  	VMOVDQU      Y12, 416(AX)
 731  	VMOVDQU      Y13, 448(AX)
 732  	VMOVDQU      Y14, 480(AX)
      	// Clear the upper YMM halves so that legacy-SSE instructions executed
      	// after return do not incur AVX/SSE transition penalties.
      	VZEROUPPER
 733  	RET
 734  
 735  // func invNttAVX2(p *[256]int16)
 736  // Requires: AVX, AVX2
 737  TEXT ·invNttAVX2(SB), NOSPLIT, $0-8
 738  	MOVQ         p+0(FP), AX
 739  	LEAQ         ·ZetasAVX2+0(SB), CX
 740  	MOVL         $0x00000d01, DX
 741  	VMOVD        DX, X0
 742  	VPBROADCASTW X0, Y15
 743  	VMOVDQU      (AX), Y7
 744  	VMOVDQU      32(AX), Y8
 745  	VMOVDQU      64(AX), Y9
 746  	VMOVDQU      96(AX), Y10
 747  	VMOVDQU      128(AX), Y11
 748  	VMOVDQU      160(AX), Y12
 749  	VMOVDQU      192(AX), Y13
 750  	VMOVDQU      224(AX), Y14
 751  	VMOVDQU      1056(CX), Y0
 752  	VMOVDQU      1088(CX), Y1
 753  	VMOVDQU      1120(CX), Y2
 754  	VMOVDQU      1152(CX), Y3
 755  	VPSUBW       Y7, Y9, Y4
 756  	VPSUBW       Y8, Y10, Y5
 757  	VPSUBW       Y11, Y13, Y6
 758  	VPADDW       Y7, Y9, Y7
 759  	VPADDW       Y8, Y10, Y8
 760  	VPADDW       Y11, Y13, Y11
 761  	VPMULLW      Y4, Y0, Y9
 762  	VPMULLW      Y5, Y0, Y10
 763  	VPSUBW       Y12, Y14, Y0
 764  	VPMULLW      Y6, Y2, Y13
 765  	VPADDW       Y12, Y14, Y12
 766  	VPMULLW      Y0, Y2, Y14
 767  	VPMULHW      Y4, Y1, Y4
 768  	VPMULHW      Y5, Y1, Y5
 769  	VPMULHW      Y6, Y3, Y6
 770  	VPMULHW      Y0, Y3, Y0
 771  	VPMULHW      Y9, Y15, Y9
 772  	VPMULHW      Y10, Y15, Y10
 773  	VPMULHW      Y13, Y15, Y13
 774  	VPMULHW      Y14, Y15, Y14
 775  	VPSUBW       Y9, Y4, Y9
 776  	VPSUBW       Y10, Y5, Y10
 777  	VPSUBW       Y13, Y6, Y13
 778  	VPSUBW       Y14, Y0, Y14
 779  	VMOVDQU      1312(CX), Y0
 780  	VMOVDQU      1344(CX), Y1
 781  	VMOVDQU      1376(CX), Y2
 782  	VMOVDQU      1408(CX), Y3
 783  	VPSLLD       $0x10, Y8, Y4
 784  	VPBLENDW     $0xaa, Y4, Y7, Y4
 785  	VPSRLD       $0x10, Y7, Y7
 786  	VPBLENDW     $0xaa, Y8, Y7, Y8
 787  	VMOVDQA      Y4, Y7
 788  	VPSLLD       $0x10, Y10, Y4
 789  	VPBLENDW     $0xaa, Y4, Y9, Y4
 790  	VPSRLD       $0x10, Y9, Y9
 791  	VPBLENDW     $0xaa, Y10, Y9, Y10
 792  	VMOVDQA      Y4, Y9
 793  	VPSLLD       $0x10, Y12, Y4
 794  	VPBLENDW     $0xaa, Y4, Y11, Y4
 795  	VPSRLD       $0x10, Y11, Y11
 796  	VPBLENDW     $0xaa, Y12, Y11, Y12
 797  	VMOVDQA      Y4, Y11
 798  	VPSLLD       $0x10, Y14, Y4
 799  	VPBLENDW     $0xaa, Y4, Y13, Y4
 800  	VPSRLD       $0x10, Y13, Y13
 801  	VPBLENDW     $0xaa, Y14, Y13, Y14
 802  	VMOVDQA      Y4, Y13
 803  	VPSUBW       Y7, Y8, Y4
 804  	VPSUBW       Y9, Y10, Y5
 805  	VPSUBW       Y11, Y12, Y6
 806  	VPADDW       Y7, Y8, Y7
 807  	VPADDW       Y9, Y10, Y9
 808  	VPADDW       Y11, Y12, Y11
 809  	VPMULLW      Y4, Y0, Y8
 810  	VPMULLW      Y5, Y0, Y10
 811  	VPSUBW       Y13, Y14, Y0
 812  	VPMULLW      Y6, Y2, Y12
 813  	VPADDW       Y13, Y14, Y13
 814  	VPMULLW      Y0, Y2, Y14
 815  	VPMULHW      Y4, Y1, Y4
 816  	VPMULHW      Y5, Y1, Y5
 817  	VPMULHW      Y6, Y3, Y6
 818  	VPMULHW      Y0, Y3, Y0
 819  	VPMULHW      Y8, Y15, Y8
 820  	VPMULHW      Y10, Y15, Y10
 821  	VPMULHW      Y12, Y15, Y12
 822  	VPMULHW      Y14, Y15, Y14
 823  	VPSUBW       Y8, Y4, Y8
 824  	VPSUBW       Y10, Y5, Y10
 825  	VPSUBW       Y12, Y6, Y12
 826  	VPSUBW       Y14, Y0, Y14
 827  	VMOVDQU      1568(CX), Y0
 828  	VMOVDQU      1600(CX), Y1
 829  	VMOVDQU      1632(CX), Y2
 830  	VMOVDQU      1664(CX), Y3
 831  	VMOVSLDUP    Y9, Y4
 832  	VPBLENDD     $0xaa, Y4, Y7, Y4
 833  	VPSRLQ       $0x20, Y7, Y7
 834  	VPBLENDD     $0xaa, Y9, Y7, Y9
 835  	VMOVDQA      Y4, Y7
 836  	VMOVSLDUP    Y10, Y4
 837  	VPBLENDD     $0xaa, Y4, Y8, Y4
 838  	VPSRLQ       $0x20, Y8, Y8
 839  	VPBLENDD     $0xaa, Y10, Y8, Y10
 840  	VMOVDQA      Y4, Y8
 841  	VMOVSLDUP    Y13, Y4
 842  	VPBLENDD     $0xaa, Y4, Y11, Y4
 843  	VPSRLQ       $0x20, Y11, Y11
 844  	VPBLENDD     $0xaa, Y13, Y11, Y13
 845  	VMOVDQA      Y4, Y11
 846  	VMOVSLDUP    Y14, Y4
 847  	VPBLENDD     $0xaa, Y4, Y12, Y4
 848  	VPSRLQ       $0x20, Y12, Y12
 849  	VPBLENDD     $0xaa, Y14, Y12, Y14
 850  	VMOVDQA      Y4, Y12
 851  	VPSUBW       Y7, Y9, Y4
 852  	VPSUBW       Y8, Y10, Y5
 853  	VPSUBW       Y11, Y13, Y6
 854  	VPADDW       Y7, Y9, Y7
 855  	VPADDW       Y8, Y10, Y8
 856  	VPADDW       Y11, Y13, Y11
 857  	VPMULLW      Y4, Y0, Y9
 858  	VPMULLW      Y5, Y0, Y10
 859  	VPSUBW       Y12, Y14, Y0
 860  	VPMULLW      Y6, Y2, Y13
 861  	VPADDW       Y12, Y14, Y12
 862  	VPMULLW      Y0, Y2, Y14
 863  	VPMULHW      Y4, Y1, Y4
 864  	VPMULHW      Y5, Y1, Y5
 865  	VPMULHW      Y6, Y3, Y6
 866  	VPMULHW      Y0, Y3, Y0
 867  	VPMULHW      Y9, Y15, Y9
 868  	VPMULHW      Y10, Y15, Y10
 869  	VPMULHW      Y13, Y15, Y13
 870  	VPMULHW      Y14, Y15, Y14
 871  	VPSUBW       Y9, Y4, Y9
 872  	VPSUBW       Y10, Y5, Y10
 873  	VPSUBW       Y13, Y6, Y13
 874  	VPSUBW       Y14, Y0, Y14
 875  	MOVL         $0x00004ebf, DX
 876  	VMOVD        DX, X0
 877  	VPBROADCASTW X0, Y4
 878  	VPMULHW      Y4, Y7, Y5
 879  	VPSRAW       $0x0a, Y5, Y5
 880  	VPMULLW      Y15, Y5, Y5
 881  	VPSUBW       Y5, Y7, Y7
 882  	VPMULHW      Y4, Y11, Y5
 883  	VPSRAW       $0x0a, Y5, Y5
 884  	VPMULLW      Y15, Y5, Y5
 885  	VPSUBW       Y5, Y11, Y11
 886  	VMOVDQU      1824(CX), Y0
 887  	VMOVDQU      1856(CX), Y1
 888  	VMOVDQU      1888(CX), Y2
 889  	VMOVDQU      1920(CX), Y3
 890  	VPUNPCKLQDQ  Y8, Y7, Y4
 891  	VPUNPCKHQDQ  Y8, Y7, Y8
 892  	VMOVDQA      Y4, Y7
 893  	VPUNPCKLQDQ  Y10, Y9, Y4
 894  	VPUNPCKHQDQ  Y10, Y9, Y10
 895  	VMOVDQA      Y4, Y9
 896  	VPUNPCKLQDQ  Y12, Y11, Y4
 897  	VPUNPCKHQDQ  Y12, Y11, Y12
 898  	VMOVDQA      Y4, Y11
 899  	VPUNPCKLQDQ  Y14, Y13, Y4
 900  	VPUNPCKHQDQ  Y14, Y13, Y14
 901  	VMOVDQA      Y4, Y13
 902  	VPSUBW       Y7, Y8, Y4
 903  	VPSUBW       Y9, Y10, Y5
 904  	VPSUBW       Y11, Y12, Y6
 905  	VPADDW       Y7, Y8, Y7
 906  	VPADDW       Y9, Y10, Y9
 907  	VPADDW       Y11, Y12, Y11
 908  	VPMULLW      Y4, Y0, Y8
 909  	VPMULLW      Y5, Y0, Y10
 910  	VPSUBW       Y13, Y14, Y0
 911  	VPMULLW      Y6, Y2, Y12
 912  	VPADDW       Y13, Y14, Y13
 913  	VPMULLW      Y0, Y2, Y14
 914  	VPMULHW      Y4, Y1, Y4
 915  	VPMULHW      Y5, Y1, Y5
 916  	VPMULHW      Y6, Y3, Y6
 917  	VPMULHW      Y0, Y3, Y0
 918  	VPMULHW      Y8, Y15, Y8
 919  	VPMULHW      Y10, Y15, Y10
 920  	VPMULHW      Y12, Y15, Y12
 921  	VPMULHW      Y14, Y15, Y14
 922  	VPSUBW       Y8, Y4, Y8
 923  	VPSUBW       Y10, Y5, Y10
 924  	VPSUBW       Y12, Y6, Y12
 925  	VPSUBW       Y14, Y0, Y14
 926  	VPBROADCASTW 2080(CX), Y0
 927  	VPBROADCASTW 2082(CX), Y1
 928  	VPBROADCASTW 2084(CX), Y2
 929  	VPBROADCASTW 2086(CX), Y3
 930  	VPERM2I128   $0x20, Y9, Y7, Y4
 931  	VPERM2I128   $0x31, Y9, Y7, Y9
 932  	VMOVDQA      Y4, Y7
 933  	VPERM2I128   $0x20, Y10, Y8, Y4
 934  	VPERM2I128   $0x31, Y10, Y8, Y10
 935  	VMOVDQA      Y4, Y8
 936  	VPERM2I128   $0x20, Y13, Y11, Y4
 937  	VPERM2I128   $0x31, Y13, Y11, Y13
 938  	VMOVDQA      Y4, Y11
 939  	VPERM2I128   $0x20, Y14, Y12, Y4
 940  	VPERM2I128   $0x31, Y14, Y12, Y14
 941  	VMOVDQA      Y4, Y12
 942  	VPSUBW       Y7, Y9, Y4
 943  	VPSUBW       Y8, Y10, Y5
 944  	VPSUBW       Y11, Y13, Y6
 945  	VPADDW       Y7, Y9, Y7
 946  	VPADDW       Y8, Y10, Y8
 947  	VPADDW       Y11, Y13, Y11
 948  	VPMULLW      Y4, Y0, Y9
 949  	VPMULLW      Y5, Y0, Y10
 950  	VPSUBW       Y12, Y14, Y0
 951  	VPMULLW      Y6, Y2, Y13
 952  	VPADDW       Y12, Y14, Y12
 953  	VPMULLW      Y0, Y2, Y14
 954  	VPMULHW      Y4, Y1, Y4
 955  	VPMULHW      Y5, Y1, Y5
 956  	VPMULHW      Y6, Y3, Y6
 957  	VPMULHW      Y0, Y3, Y0
 958  	VPMULHW      Y9, Y15, Y9
 959  	VPMULHW      Y10, Y15, Y10
 960  	VPMULHW      Y13, Y15, Y13
 961  	VPMULHW      Y14, Y15, Y14
 962  	VPSUBW       Y9, Y4, Y9
 963  	VPSUBW       Y10, Y5, Y10
 964  	VPSUBW       Y13, Y6, Y13
 965  	VPSUBW       Y14, Y0, Y14
 966  	MOVL         $0x00004ebf, DX
 967  	VMOVD        DX, X0
 968  	VPBROADCASTW X0, Y4
 969  	VPMULHW      Y4, Y7, Y5
 970  	VPSRAW       $0x0a, Y5, Y5
 971  	VPMULLW      Y15, Y5, Y5
 972  	VPSUBW       Y5, Y7, Y7
 973  	VPMULHW      Y4, Y11, Y5
 974  	VPSRAW       $0x0a, Y5, Y5
 975  	VPMULLW      Y15, Y5, Y5
 976  	VPSUBW       Y5, Y11, Y11
 977  	VPBROADCASTW 2096(CX), Y0
 978  	VPBROADCASTW 2098(CX), Y1
 979  	VPSUBW       Y7, Y11, Y4
 980  	VPSUBW       Y8, Y12, Y5
 981  	VPSUBW       Y9, Y13, Y6
 982  	VPADDW       Y7, Y11, Y7
 983  	VPADDW       Y8, Y12, Y8
 984  	VPADDW       Y9, Y13, Y9
 985  	VPMULLW      Y4, Y0, Y11
 986  	VPMULLW      Y5, Y0, Y12
 987  	VPSUBW       Y10, Y14, Y2
 988  	VPMULLW      Y6, Y0, Y13
 989  	VPADDW       Y10, Y14, Y10
 990  	VPMULLW      Y2, Y0, Y14
 991  	VPMULHW      Y4, Y1, Y4
 992  	VPMULHW      Y5, Y1, Y5
 993  	VPMULHW      Y6, Y1, Y6
 994  	VPMULHW      Y2, Y1, Y2
 995  	VPMULHW      Y11, Y15, Y11
 996  	VPMULHW      Y12, Y15, Y12
 997  	VPMULHW      Y13, Y15, Y13
 998  	VPMULHW      Y14, Y15, Y14
 999  	VPSUBW       Y11, Y4, Y11
1000  	VPSUBW       Y12, Y5, Y12
1001  	VPSUBW       Y13, Y6, Y13
1002  	VPSUBW       Y14, Y2, Y14
1003  	VMOVDQU      Y7, (AX)
1004  	VMOVDQU      Y8, 32(AX)
1005  	VMOVDQU      Y9, 64(AX)
1006  	VMOVDQU      Y10, 96(AX)
1007  	VMOVDQU      Y11, 128(AX)
1008  	VMOVDQU      Y12, 160(AX)
1009  	VMOVDQU      Y13, 192(AX)
1010  	VMOVDQU      Y14, 224(AX)
1011  	VMOVDQU      256(AX), Y7
1012  	VMOVDQU      288(AX), Y8
1013  	VMOVDQU      320(AX), Y9
1014  	VMOVDQU      352(AX), Y10
1015  	VMOVDQU      384(AX), Y11
1016  	VMOVDQU      416(AX), Y12
1017  	VMOVDQU      448(AX), Y13
1018  	VMOVDQU      480(AX), Y14
1019  	VMOVDQU      1184(CX), Y0
1020  	VMOVDQU      1216(CX), Y1
1021  	VMOVDQU      1248(CX), Y2
1022  	VMOVDQU      1280(CX), Y3
1023  	VPSUBW       Y7, Y9, Y4
1024  	VPSUBW       Y8, Y10, Y5
1025  	VPSUBW       Y11, Y13, Y6
1026  	VPADDW       Y7, Y9, Y7
1027  	VPADDW       Y8, Y10, Y8
1028  	VPADDW       Y11, Y13, Y11
1029  	VPMULLW      Y4, Y0, Y9
1030  	VPMULLW      Y5, Y0, Y10
1031  	VPSUBW       Y12, Y14, Y0
1032  	VPMULLW      Y6, Y2, Y13
1033  	VPADDW       Y12, Y14, Y12
1034  	VPMULLW      Y0, Y2, Y14
1035  	VPMULHW      Y4, Y1, Y4
1036  	VPMULHW      Y5, Y1, Y5
1037  	VPMULHW      Y6, Y3, Y6
1038  	VPMULHW      Y0, Y3, Y0
1039  	VPMULHW      Y9, Y15, Y9
1040  	VPMULHW      Y10, Y15, Y10
1041  	VPMULHW      Y13, Y15, Y13
1042  	VPMULHW      Y14, Y15, Y14
1043  	VPSUBW       Y9, Y4, Y9
1044  	VPSUBW       Y10, Y5, Y10
1045  	VPSUBW       Y13, Y6, Y13
1046  	VPSUBW       Y14, Y0, Y14
1047  	VMOVDQU      1440(CX), Y0
1048  	VMOVDQU      1472(CX), Y1
1049  	VMOVDQU      1504(CX), Y2
1050  	VMOVDQU      1536(CX), Y3
1051  	VPSLLD       $0x10, Y8, Y4
1052  	VPBLENDW     $0xaa, Y4, Y7, Y4
1053  	VPSRLD       $0x10, Y7, Y7
1054  	VPBLENDW     $0xaa, Y8, Y7, Y8
1055  	VMOVDQA      Y4, Y7
1056  	VPSLLD       $0x10, Y10, Y4
1057  	VPBLENDW     $0xaa, Y4, Y9, Y4
1058  	VPSRLD       $0x10, Y9, Y9
1059  	VPBLENDW     $0xaa, Y10, Y9, Y10
1060  	VMOVDQA      Y4, Y9
1061  	VPSLLD       $0x10, Y12, Y4
1062  	VPBLENDW     $0xaa, Y4, Y11, Y4
1063  	VPSRLD       $0x10, Y11, Y11
1064  	VPBLENDW     $0xaa, Y12, Y11, Y12
1065  	VMOVDQA      Y4, Y11
1066  	VPSLLD       $0x10, Y14, Y4
1067  	VPBLENDW     $0xaa, Y4, Y13, Y4
1068  	VPSRLD       $0x10, Y13, Y13
1069  	VPBLENDW     $0xaa, Y14, Y13, Y14
1070  	VMOVDQA      Y4, Y13
1071  	VPSUBW       Y7, Y8, Y4
1072  	VPSUBW       Y9, Y10, Y5
1073  	VPSUBW       Y11, Y12, Y6
1074  	VPADDW       Y7, Y8, Y7
1075  	VPADDW       Y9, Y10, Y9
1076  	VPADDW       Y11, Y12, Y11
1077  	VPMULLW      Y4, Y0, Y8
1078  	VPMULLW      Y5, Y0, Y10
1079  	VPSUBW       Y13, Y14, Y0
1080  	VPMULLW      Y6, Y2, Y12
1081  	VPADDW       Y13, Y14, Y13
1082  	VPMULLW      Y0, Y2, Y14
1083  	VPMULHW      Y4, Y1, Y4
1084  	VPMULHW      Y5, Y1, Y5
1085  	VPMULHW      Y6, Y3, Y6
1086  	VPMULHW      Y0, Y3, Y0
1087  	VPMULHW      Y8, Y15, Y8
1088  	VPMULHW      Y10, Y15, Y10
1089  	VPMULHW      Y12, Y15, Y12
1090  	VPMULHW      Y14, Y15, Y14
1091  	VPSUBW       Y8, Y4, Y8
1092  	VPSUBW       Y10, Y5, Y10
1093  	VPSUBW       Y12, Y6, Y12
1094  	VPSUBW       Y14, Y0, Y14
1095  	VMOVDQU      1696(CX), Y0
1096  	VMOVDQU      1728(CX), Y1
1097  	VMOVDQU      1760(CX), Y2
1098  	VMOVDQU      1792(CX), Y3
1099  	VMOVSLDUP    Y9, Y4
1100  	VPBLENDD     $0xaa, Y4, Y7, Y4
1101  	VPSRLQ       $0x20, Y7, Y7
1102  	VPBLENDD     $0xaa, Y9, Y7, Y9
1103  	VMOVDQA      Y4, Y7
1104  	VMOVSLDUP    Y10, Y4
1105  	VPBLENDD     $0xaa, Y4, Y8, Y4
1106  	VPSRLQ       $0x20, Y8, Y8
1107  	VPBLENDD     $0xaa, Y10, Y8, Y10
1108  	VMOVDQA      Y4, Y8
1109  	VMOVSLDUP    Y13, Y4
1110  	VPBLENDD     $0xaa, Y4, Y11, Y4
1111  	VPSRLQ       $0x20, Y11, Y11
1112  	VPBLENDD     $0xaa, Y13, Y11, Y13
1113  	VMOVDQA      Y4, Y11
1114  	VMOVSLDUP    Y14, Y4
1115  	VPBLENDD     $0xaa, Y4, Y12, Y4
1116  	VPSRLQ       $0x20, Y12, Y12
1117  	VPBLENDD     $0xaa, Y14, Y12, Y14
1118  	VMOVDQA      Y4, Y12
1119  	VPSUBW       Y7, Y9, Y4
1120  	VPSUBW       Y8, Y10, Y5
1121  	VPSUBW       Y11, Y13, Y6
1122  	VPADDW       Y7, Y9, Y7
1123  	VPADDW       Y8, Y10, Y8
1124  	VPADDW       Y11, Y13, Y11
1125  	VPMULLW      Y4, Y0, Y9
1126  	VPMULLW      Y5, Y0, Y10
1127  	VPSUBW       Y12, Y14, Y0
1128  	VPMULLW      Y6, Y2, Y13
1129  	VPADDW       Y12, Y14, Y12
1130  	VPMULLW      Y0, Y2, Y14
1131  	VPMULHW      Y4, Y1, Y4
1132  	VPMULHW      Y5, Y1, Y5
1133  	VPMULHW      Y6, Y3, Y6
1134  	VPMULHW      Y0, Y3, Y0
1135  	VPMULHW      Y9, Y15, Y9
1136  	VPMULHW      Y10, Y15, Y10
1137  	VPMULHW      Y13, Y15, Y13
1138  	VPMULHW      Y14, Y15, Y14
1139  	VPSUBW       Y9, Y4, Y9
1140  	VPSUBW       Y10, Y5, Y10
1141  	VPSUBW       Y13, Y6, Y13
1142  	VPSUBW       Y14, Y0, Y14
1143  	MOVL         $0x00004ebf, DX
1144  	VMOVD        DX, X0
1145  	VPBROADCASTW X0, Y4
1146  	VPMULHW      Y4, Y7, Y5
1147  	VPSRAW       $0x0a, Y5, Y5
1148  	VPMULLW      Y15, Y5, Y5
1149  	VPSUBW       Y5, Y7, Y7
1150  	VPMULHW      Y4, Y11, Y5
1151  	VPSRAW       $0x0a, Y5, Y5
1152  	VPMULLW      Y15, Y5, Y5
1153  	VPSUBW       Y5, Y11, Y11
1154  	VMOVDQU      1952(CX), Y0
1155  	VMOVDQU      1984(CX), Y1
1156  	VMOVDQU      2016(CX), Y2
1157  	VMOVDQU      2048(CX), Y3
1158  	VPUNPCKLQDQ  Y8, Y7, Y4
1159  	VPUNPCKHQDQ  Y8, Y7, Y8
1160  	VMOVDQA      Y4, Y7
1161  	VPUNPCKLQDQ  Y10, Y9, Y4
1162  	VPUNPCKHQDQ  Y10, Y9, Y10
1163  	VMOVDQA      Y4, Y9
1164  	VPUNPCKLQDQ  Y12, Y11, Y4
1165  	VPUNPCKHQDQ  Y12, Y11, Y12
1166  	VMOVDQA      Y4, Y11
1167  	VPUNPCKLQDQ  Y14, Y13, Y4
1168  	VPUNPCKHQDQ  Y14, Y13, Y14
1169  	VMOVDQA      Y4, Y13
1170  	VPSUBW       Y7, Y8, Y4
1171  	VPSUBW       Y9, Y10, Y5
1172  	VPSUBW       Y11, Y12, Y6
1173  	VPADDW       Y7, Y8, Y7
1174  	VPADDW       Y9, Y10, Y9
1175  	VPADDW       Y11, Y12, Y11
1176  	VPMULLW      Y4, Y0, Y8
1177  	VPMULLW      Y5, Y0, Y10
1178  	VPSUBW       Y13, Y14, Y0
1179  	VPMULLW      Y6, Y2, Y12
1180  	VPADDW       Y13, Y14, Y13
1181  	VPMULLW      Y0, Y2, Y14
1182  	VPMULHW      Y4, Y1, Y4
1183  	VPMULHW      Y5, Y1, Y5
1184  	VPMULHW      Y6, Y3, Y6
1185  	VPMULHW      Y0, Y3, Y0
1186  	VPMULHW      Y8, Y15, Y8
1187  	VPMULHW      Y10, Y15, Y10
1188  	VPMULHW      Y12, Y15, Y12
1189  	VPMULHW      Y14, Y15, Y14
1190  	VPSUBW       Y8, Y4, Y8
1191  	VPSUBW       Y10, Y5, Y10
1192  	VPSUBW       Y12, Y6, Y12
1193  	VPSUBW       Y14, Y0, Y14
1194  	VPBROADCASTW 2088(CX), Y0
1195  	VPBROADCASTW 2090(CX), Y1
1196  	VPBROADCASTW 2092(CX), Y2
1197  	VPBROADCASTW 2094(CX), Y3
1198  	VPERM2I128   $0x20, Y9, Y7, Y4
1199  	VPERM2I128   $0x31, Y9, Y7, Y9
1200  	VMOVDQA      Y4, Y7
1201  	VPERM2I128   $0x20, Y10, Y8, Y4
1202  	VPERM2I128   $0x31, Y10, Y8, Y10
1203  	VMOVDQA      Y4, Y8
1204  	VPERM2I128   $0x20, Y13, Y11, Y4
1205  	VPERM2I128   $0x31, Y13, Y11, Y13
1206  	VMOVDQA      Y4, Y11
1207  	VPERM2I128   $0x20, Y14, Y12, Y4
1208  	VPERM2I128   $0x31, Y14, Y12, Y14
1209  	VMOVDQA      Y4, Y12
1210  	VPSUBW       Y7, Y9, Y4
1211  	VPSUBW       Y8, Y10, Y5
1212  	VPSUBW       Y11, Y13, Y6
1213  	VPADDW       Y7, Y9, Y7
1214  	VPADDW       Y8, Y10, Y8
1215  	VPADDW       Y11, Y13, Y11
1216  	VPMULLW      Y4, Y0, Y9
1217  	VPMULLW      Y5, Y0, Y10
1218  	VPSUBW       Y12, Y14, Y0
1219  	VPMULLW      Y6, Y2, Y13
1220  	VPADDW       Y12, Y14, Y12
1221  	VPMULLW      Y0, Y2, Y14
1222  	VPMULHW      Y4, Y1, Y4
1223  	VPMULHW      Y5, Y1, Y5
1224  	VPMULHW      Y6, Y3, Y6
1225  	VPMULHW      Y0, Y3, Y0
1226  	VPMULHW      Y9, Y15, Y9
1227  	VPMULHW      Y10, Y15, Y10
1228  	VPMULHW      Y13, Y15, Y13
1229  	VPMULHW      Y14, Y15, Y14
1230  	VPSUBW       Y9, Y4, Y9
1231  	VPSUBW       Y10, Y5, Y10
1232  	VPSUBW       Y13, Y6, Y13
1233  	VPSUBW       Y14, Y0, Y14
1234  	MOVL         $0x00004ebf, DX
1235  	VMOVD        DX, X0
1236  	VPBROADCASTW X0, Y4
1237  	VPMULHW      Y4, Y7, Y5
1238  	VPSRAW       $0x0a, Y5, Y5
1239  	VPMULLW      Y15, Y5, Y5
1240  	VPSUBW       Y5, Y7, Y7
1241  	VPMULHW      Y4, Y11, Y5
1242  	VPSRAW       $0x0a, Y5, Y5
1243  	VPMULLW      Y15, Y5, Y5
1244  	VPSUBW       Y5, Y11, Y11
1245  	VPBROADCASTW 2100(CX), Y0
1246  	VPBROADCASTW 2102(CX), Y1
1247  	VPSUBW       Y7, Y11, Y4
1248  	VPSUBW       Y8, Y12, Y5
1249  	VPSUBW       Y9, Y13, Y6
1250  	VPADDW       Y7, Y11, Y7
1251  	VPADDW       Y8, Y12, Y8
1252  	VPADDW       Y9, Y13, Y9
1253  	VPMULLW      Y4, Y0, Y11
1254  	VPMULLW      Y5, Y0, Y12
1255  	VPSUBW       Y10, Y14, Y2
1256  	VPMULLW      Y6, Y0, Y13
1257  	VPADDW       Y10, Y14, Y10
1258  	VPMULLW      Y2, Y0, Y14
1259  	VPMULHW      Y4, Y1, Y4
1260  	VPMULHW      Y5, Y1, Y5
1261  	VPMULHW      Y6, Y1, Y6
1262  	VPMULHW      Y2, Y1, Y2
1263  	VPMULHW      Y11, Y15, Y11
1264  	VPMULHW      Y12, Y15, Y12
1265  	VPMULHW      Y13, Y15, Y13
1266  	VPMULHW      Y14, Y15, Y14
1267  	VPSUBW       Y11, Y4, Y11
1268  	VPSUBW       Y12, Y5, Y12
1269  	VPSUBW       Y13, Y6, Y13
1270  	VPSUBW       Y14, Y2, Y14
1271  	VMOVDQU      Y7, 256(AX)
1272  	VMOVDQU      Y8, 288(AX)
1273  	VMOVDQU      Y9, 320(AX)
1274  	VMOVDQU      Y10, 352(AX)
1275  	VMOVDQU      Y11, 384(AX)
1276  	VMOVDQU      Y12, 416(AX)
1277  	VMOVDQU      Y13, 448(AX)
1278  	VMOVDQU      Y14, 480(AX)
1279  	VPBROADCASTW 2104(CX), Y0
1280  	VPBROADCASTW 2106(CX), Y1
1281  	VMOVDQU      (AX), Y7
1282  	VMOVDQU      32(AX), Y8
1283  	VMOVDQU      64(AX), Y9
1284  	VMOVDQU      96(AX), Y10
1285  	VMOVDQU      256(AX), Y11
1286  	VMOVDQU      288(AX), Y12
1287  	VMOVDQU      320(AX), Y13
1288  	VMOVDQU      352(AX), Y14
1289  	VPSUBW       Y7, Y11, Y2
1290  	VPSUBW       Y8, Y12, Y3
1291  	VPSUBW       Y9, Y13, Y4
1292  	VPADDW       Y7, Y11, Y7
1293  	VPADDW       Y8, Y12, Y8
1294  	VPADDW       Y9, Y13, Y9
1295  	VPMULLW      Y2, Y0, Y11
1296  	VPMULLW      Y3, Y0, Y12
1297  	VPSUBW       Y10, Y14, Y5
1298  	VPMULLW      Y4, Y0, Y13
1299  	VPADDW       Y10, Y14, Y10
1300  	VPMULLW      Y5, Y0, Y14
1301  	VPMULHW      Y2, Y1, Y2
1302  	VPMULHW      Y3, Y1, Y3
1303  	VPMULHW      Y4, Y1, Y4
1304  	VPMULHW      Y5, Y1, Y5
1305  	VPMULHW      Y11, Y15, Y11
1306  	VPMULHW      Y12, Y15, Y12
1307  	VPMULHW      Y13, Y15, Y13
1308  	VPMULHW      Y14, Y15, Y14
1309  	VPSUBW       Y11, Y2, Y11
1310  	VPSUBW       Y12, Y3, Y12
1311  	VPSUBW       Y13, Y4, Y13
1312  	VPSUBW       Y14, Y5, Y14
1313  	MOVL         $0xffffd8a1, DX
1314  	VMOVD        DX, X0
1315  	VPBROADCASTW X0, Y0
1316  	MOVL         $0x000005a1, DX
1317  	VMOVD        DX, X1
1318  	VPBROADCASTW X1, Y1
1319  	VPMULLW      Y7, Y0, Y2
1320  	VPMULLW      Y8, Y0, Y3
1321  	VPMULLW      Y9, Y0, Y4
1322  	VPMULLW      Y10, Y0, Y5
1323  	VPMULHW      Y7, Y1, Y7
1324  	VPMULHW      Y8, Y1, Y8
1325  	VPMULHW      Y9, Y1, Y9
1326  	VPMULHW      Y10, Y1, Y10
1327  	VPMULHW      Y2, Y15, Y2
1328  	VPMULHW      Y3, Y15, Y3
1329  	VPMULHW      Y4, Y15, Y4
1330  	VPMULHW      Y5, Y15, Y5
1331  	VPSUBW       Y2, Y7, Y7
1332  	VPSUBW       Y3, Y8, Y8
1333  	VPSUBW       Y4, Y9, Y9
1334  	VPSUBW       Y5, Y10, Y10
1335  	VPMULLW      Y11, Y0, Y2
1336  	VPMULLW      Y12, Y0, Y3
1337  	VPMULLW      Y13, Y0, Y4
1338  	VPMULLW      Y14, Y0, Y5
1339  	VPMULHW      Y11, Y1, Y11
1340  	VPMULHW      Y12, Y1, Y12
1341  	VPMULHW      Y13, Y1, Y13
1342  	VPMULHW      Y14, Y1, Y14
1343  	VPMULHW      Y2, Y15, Y2
1344  	VPMULHW      Y3, Y15, Y3
1345  	VPMULHW      Y4, Y15, Y4
1346  	VPMULHW      Y5, Y15, Y5
1347  	VPSUBW       Y2, Y11, Y11
1348  	VPSUBW       Y3, Y12, Y12
1349  	VPSUBW       Y4, Y13, Y13
1350  	VPSUBW       Y5, Y14, Y14
1351  	VMOVDQU      Y7, (AX)
1352  	VMOVDQU      Y8, 32(AX)
1353  	VMOVDQU      Y9, 64(AX)
1354  	VMOVDQU      Y10, 96(AX)
1355  	VMOVDQU      Y11, 256(AX)
1356  	VMOVDQU      Y12, 288(AX)
1357  	VMOVDQU      Y13, 320(AX)
1358  	VMOVDQU      Y14, 352(AX)
1359  	VPBROADCASTW 2104(CX), Y0
1360  	VPBROADCASTW 2106(CX), Y1
1361  	VMOVDQU      128(AX), Y7
1362  	VMOVDQU      160(AX), Y8
1363  	VMOVDQU      192(AX), Y9
1364  	VMOVDQU      224(AX), Y10
1365  	VMOVDQU      384(AX), Y11
1366  	VMOVDQU      416(AX), Y12
1367  	VMOVDQU      448(AX), Y13
1368  	VMOVDQU      480(AX), Y14
1369  	VPSUBW       Y7, Y11, Y2
1370  	VPSUBW       Y8, Y12, Y3
1371  	VPSUBW       Y9, Y13, Y4
1372  	VPADDW       Y7, Y11, Y7
1373  	VPADDW       Y8, Y12, Y8
1374  	VPADDW       Y9, Y13, Y9
1375  	VPMULLW      Y2, Y0, Y11
1376  	VPMULLW      Y3, Y0, Y12
1377  	VPSUBW       Y10, Y14, Y5
1378  	VPMULLW      Y4, Y0, Y13
1379  	VPADDW       Y10, Y14, Y10
1380  	VPMULLW      Y5, Y0, Y14
1381  	VPMULHW      Y2, Y1, Y2
1382  	VPMULHW      Y3, Y1, Y3
1383  	VPMULHW      Y4, Y1, Y4
1384  	VPMULHW      Y5, Y1, Y5
1385  	VPMULHW      Y11, Y15, Y11
1386  	VPMULHW      Y12, Y15, Y12
1387  	VPMULHW      Y13, Y15, Y13
1388  	VPMULHW      Y14, Y15, Y14
1389  	VPSUBW       Y11, Y2, Y11
1390  	VPSUBW       Y12, Y3, Y12
1391  	VPSUBW       Y13, Y4, Y13
1392  	VPSUBW       Y14, Y5, Y14
1393  	MOVL         $0xffffd8a1, CX
1394  	VMOVD        CX, X0
1395  	VPBROADCASTW X0, Y0
1396  	MOVL         $0x000005a1, CX
1397  	VMOVD        CX, X1
1398  	VPBROADCASTW X1, Y1
1399  	VPMULLW      Y7, Y0, Y2
1400  	VPMULLW      Y8, Y0, Y3
1401  	VPMULLW      Y9, Y0, Y4
1402  	VPMULLW      Y10, Y0, Y5
1403  	VPMULHW      Y7, Y1, Y7
1404  	VPMULHW      Y8, Y1, Y8
1405  	VPMULHW      Y9, Y1, Y9
1406  	VPMULHW      Y10, Y1, Y10
1407  	VPMULHW      Y2, Y15, Y2
1408  	VPMULHW      Y3, Y15, Y3
1409  	VPMULHW      Y4, Y15, Y4
1410  	VPMULHW      Y5, Y15, Y5
1411  	VPSUBW       Y2, Y7, Y7
1412  	VPSUBW       Y3, Y8, Y8
1413  	VPSUBW       Y4, Y9, Y9
1414  	VPSUBW       Y5, Y10, Y10
1415  	VPMULLW      Y11, Y0, Y2
1416  	VPMULLW      Y12, Y0, Y3
1417  	VPMULLW      Y13, Y0, Y4
1418  	VPMULLW      Y14, Y0, Y5
1419  	VPMULHW      Y11, Y1, Y11
1420  	VPMULHW      Y12, Y1, Y12
1421  	VPMULHW      Y13, Y1, Y13
1422  	VPMULHW      Y14, Y1, Y14
1423  	VPMULHW      Y2, Y15, Y2
1424  	VPMULHW      Y3, Y15, Y3
1425  	VPMULHW      Y4, Y15, Y4
1426  	VPMULHW      Y5, Y15, Y5
1427  	VPSUBW       Y2, Y11, Y11
1428  	VPSUBW       Y3, Y12, Y12
1429  	VPSUBW       Y4, Y13, Y13
1430  	VPSUBW       Y5, Y14, Y14
1431  	VMOVDQU      Y7, 128(AX)
1432  	VMOVDQU      Y8, 160(AX)
1433  	VMOVDQU      Y9, 192(AX)
1434  	VMOVDQU      Y10, 224(AX)
1435  	VMOVDQU      Y11, 384(AX)
1436  	VMOVDQU      Y12, 416(AX)
1437  	VMOVDQU      Y13, 448(AX)
1438  	VMOVDQU      Y14, 480(AX)
1439  	RET
1440  
// func mulHatAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
// Requires: AVX, AVX2
//
// Combines a and b 64 coefficients at a time in four unrolled passes (byte
// offsets 0, 128, 256 and 384) and stores each result at the same offsets
// of p. Every pass loads all of its a and b inputs before it stores to p.
//
// Per 16-bit lane, with
//
//	mont(x, y) = hi16(x*y) - hi16(lo16(lo16(x*y) * Y14) * Y15)
//
// and a0..a3 / b0..b3 the four consecutive 32-byte vectors of a pass:
//
//	p0 = mont(a0, b0) + zmul(mont(a1, b1))
//	p1 = mont(a0, b1) + mont(a1, b0)
//	p2 = mont(a2, b2) - zmul(mont(a3, b3))
//	p3 = mont(a2, b3) + mont(a3, b2)
//
// zmul(x) = hi16(x*T1) - hi16(lo16(x*T0) * Y15), where T0 and T1 are two
// consecutive 32-byte rows of ·ZetasAVX2 (T0 at byte offset 800 + 64*pass).
// NOTE(review): this equals mont(x, T1) only if T0 holds lo16(T1 * Y14)
// precomputed -- the table contents are not visible here; confirm.
//
// Registers: AX = p, CX = a, DX = b, BX = &ZetasAVX2, SI = scratch.
// Y14 = broadcast 0xf301 (-3327 as int16), Y15 = broadcast 0x0d01 (3329);
// 3329 * -3327 = 1 mod 2^16, so mont() is a Montgomery multiplication
// modulo 3329 with R = 2^16.
// No instruction in the body references the 8-byte local frame.
TEXT ·mulHatAVX2(SB), NOSPLIT, $8-24
	MOVQ         p+0(FP), AX
	MOVQ         a+8(FP), CX
	MOVQ         b+16(FP), DX
	LEAQ         ·ZetasAVX2+0(SB), BX

	// Y14 = 0xf301 in every word, Y15 = 0x0d01 in every word.
	MOVL         $0xfffff301, SI
	VMOVD        SI, X0
	VPBROADCASTW X0, Y14
	MOVL         $0x00000d01, SI
	VMOVD        SI, X0
	VPBROADCASTW X0, Y15

	// ---- pass 0: bytes 0..127, table rows at 800/832 ----
	VMOVDQU      (CX), Y0
	VMOVDQU      32(CX), Y1
	VMOVDQU      64(CX), Y2
	VMOVDQU      96(CX), Y3
	VMOVDQU      (DX), Y4
	VMOVDQU      32(DX), Y5
	VMOVDQU      64(DX), Y6
	VMOVDQU      96(DX), Y7

	// Low halves of a1*b1, a0*b0, a0*b1, a1*b0, then times Y14.
	VPMULLW      Y1, Y5, Y8
	VPMULLW      Y0, Y4, Y9
	VPMULLW      Y0, Y5, Y10
	VPMULLW      Y1, Y4, Y11
	VPMULLW      Y8, Y14, Y8
	VPMULLW      Y9, Y14, Y9
	VPMULLW      Y10, Y14, Y10
	VPMULLW      Y11, Y14, Y11

	// High halves of the same four products (Y4/Y5 are reused as outputs).
	VPMULHW      Y1, Y5, Y12
	VPMULHW      Y0, Y4, Y13
	VPMULHW      Y0, Y5, Y0
	VPMULHW      Y1, Y4, Y1
	VMOVDQA      Y12, Y4
	VMOVDQA      Y13, Y5

	// Finish mont(): subtract hi16(m * Y15) from the high halves.
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y11, Y15, Y11
	VPSUBW       Y8, Y4, Y4
	VPSUBW       Y9, Y5, Y5
	VPSUBW       Y10, Y0, Y0
	VPSUBW       Y11, Y1, Y1

	// Y4 = zmul(mont(a1, b1)).
	VMOVDQU      800(BX), Y12
	VMOVDQU      832(BX), Y13
	VPMULLW      Y4, Y12, Y8
	VPMULHW      Y4, Y13, Y4
	VPMULHW      Y8, Y15, Y8
	VPSUBW       Y8, Y4, Y4

	// p0 and p1.
	VPADDW       Y4, Y5, Y4
	VPADDW       Y0, Y1, Y5

	// Same sequence for the (a2, a3) x (b2, b3) pair.
	VPMULLW      Y3, Y7, Y8
	VPMULLW      Y2, Y6, Y9
	VPMULLW      Y2, Y7, Y10
	VPMULLW      Y3, Y6, Y11
	VPMULLW      Y8, Y14, Y8
	VPMULLW      Y9, Y14, Y9
	VPMULLW      Y10, Y14, Y10
	VPMULLW      Y11, Y14, Y11
	VPMULHW      Y3, Y7, Y12
	VPMULHW      Y2, Y6, Y13
	VPMULHW      Y2, Y7, Y2
	VPMULHW      Y3, Y6, Y3
	VMOVDQA      Y12, Y6
	VMOVDQA      Y13, Y7
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y11, Y15, Y11
	VPSUBW       Y8, Y6, Y6
	VPSUBW       Y9, Y7, Y7
	VPSUBW       Y10, Y2, Y2
	VPSUBW       Y11, Y3, Y3

	// Y12/Y13 were clobbered above, so the table rows are reloaded.
	VMOVDQU      800(BX), Y12
	VMOVDQU      832(BX), Y13
	VPMULLW      Y6, Y12, Y8
	VPMULHW      Y6, Y13, Y6
	VPMULHW      Y8, Y15, Y8
	VPSUBW       Y8, Y6, Y6

	// p2 subtracts the zmul term (Y7 - Y6); p3 is the cross-term sum.
	VPSUBW       Y6, Y7, Y6
	VPADDW       Y2, Y3, Y7
	VMOVDQU      Y4, (AX)
	VMOVDQU      Y5, 32(AX)
	VMOVDQU      Y6, 64(AX)
	VMOVDQU      Y7, 96(AX)

	// ---- pass 1: bytes 128..255, table rows at 864/896 ----
	VMOVDQU      128(CX), Y0
	VMOVDQU      160(CX), Y1
	VMOVDQU      192(CX), Y2
	VMOVDQU      224(CX), Y3
	VMOVDQU      128(DX), Y4
	VMOVDQU      160(DX), Y5
	VMOVDQU      192(DX), Y6
	VMOVDQU      224(DX), Y7
	VPMULLW      Y1, Y5, Y8
	VPMULLW      Y0, Y4, Y9
	VPMULLW      Y0, Y5, Y10
	VPMULLW      Y1, Y4, Y11
	VPMULLW      Y8, Y14, Y8
	VPMULLW      Y9, Y14, Y9
	VPMULLW      Y10, Y14, Y10
	VPMULLW      Y11, Y14, Y11
	VPMULHW      Y1, Y5, Y12
	VPMULHW      Y0, Y4, Y13
	VPMULHW      Y0, Y5, Y0
	VPMULHW      Y1, Y4, Y1
	VMOVDQA      Y12, Y4
	VMOVDQA      Y13, Y5
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y11, Y15, Y11
	VPSUBW       Y8, Y4, Y4
	VPSUBW       Y9, Y5, Y5
	VPSUBW       Y10, Y0, Y0
	VPSUBW       Y11, Y1, Y1
	VMOVDQU      864(BX), Y12
	VMOVDQU      896(BX), Y13
	VPMULLW      Y4, Y12, Y8
	VPMULHW      Y4, Y13, Y4
	VPMULHW      Y8, Y15, Y8
	VPSUBW       Y8, Y4, Y4
	VPADDW       Y4, Y5, Y4
	VPADDW       Y0, Y1, Y5
	VPMULLW      Y3, Y7, Y8
	VPMULLW      Y2, Y6, Y9
	VPMULLW      Y2, Y7, Y10
	VPMULLW      Y3, Y6, Y11
	VPMULLW      Y8, Y14, Y8
	VPMULLW      Y9, Y14, Y9
	VPMULLW      Y10, Y14, Y10
	VPMULLW      Y11, Y14, Y11
	VPMULHW      Y3, Y7, Y12
	VPMULHW      Y2, Y6, Y13
	VPMULHW      Y2, Y7, Y2
	VPMULHW      Y3, Y6, Y3
	VMOVDQA      Y12, Y6
	VMOVDQA      Y13, Y7
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y11, Y15, Y11
	VPSUBW       Y8, Y6, Y6
	VPSUBW       Y9, Y7, Y7
	VPSUBW       Y10, Y2, Y2
	VPSUBW       Y11, Y3, Y3
	VMOVDQU      864(BX), Y12
	VMOVDQU      896(BX), Y13
	VPMULLW      Y6, Y12, Y8
	VPMULHW      Y6, Y13, Y6
	VPMULHW      Y8, Y15, Y8
	VPSUBW       Y8, Y6, Y6
	VPSUBW       Y6, Y7, Y6
	VPADDW       Y2, Y3, Y7
	VMOVDQU      Y4, 128(AX)
	VMOVDQU      Y5, 160(AX)
	VMOVDQU      Y6, 192(AX)
	VMOVDQU      Y7, 224(AX)

	// ---- pass 2: bytes 256..383, table rows at 928/960 ----
	VMOVDQU      256(CX), Y0
	VMOVDQU      288(CX), Y1
	VMOVDQU      320(CX), Y2
	VMOVDQU      352(CX), Y3
	VMOVDQU      256(DX), Y4
	VMOVDQU      288(DX), Y5
	VMOVDQU      320(DX), Y6
	VMOVDQU      352(DX), Y7
	VPMULLW      Y1, Y5, Y8
	VPMULLW      Y0, Y4, Y9
	VPMULLW      Y0, Y5, Y10
	VPMULLW      Y1, Y4, Y11
	VPMULLW      Y8, Y14, Y8
	VPMULLW      Y9, Y14, Y9
	VPMULLW      Y10, Y14, Y10
	VPMULLW      Y11, Y14, Y11
	VPMULHW      Y1, Y5, Y12
	VPMULHW      Y0, Y4, Y13
	VPMULHW      Y0, Y5, Y0
	VPMULHW      Y1, Y4, Y1
	VMOVDQA      Y12, Y4
	VMOVDQA      Y13, Y5
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y11, Y15, Y11
	VPSUBW       Y8, Y4, Y4
	VPSUBW       Y9, Y5, Y5
	VPSUBW       Y10, Y0, Y0
	VPSUBW       Y11, Y1, Y1
	VMOVDQU      928(BX), Y12
	VMOVDQU      960(BX), Y13
	VPMULLW      Y4, Y12, Y8
	VPMULHW      Y4, Y13, Y4
	VPMULHW      Y8, Y15, Y8
	VPSUBW       Y8, Y4, Y4
	VPADDW       Y4, Y5, Y4
	VPADDW       Y0, Y1, Y5
	VPMULLW      Y3, Y7, Y8
	VPMULLW      Y2, Y6, Y9
	VPMULLW      Y2, Y7, Y10
	VPMULLW      Y3, Y6, Y11
	VPMULLW      Y8, Y14, Y8
	VPMULLW      Y9, Y14, Y9
	VPMULLW      Y10, Y14, Y10
	VPMULLW      Y11, Y14, Y11
	VPMULHW      Y3, Y7, Y12
	VPMULHW      Y2, Y6, Y13
	VPMULHW      Y2, Y7, Y2
	VPMULHW      Y3, Y6, Y3
	VMOVDQA      Y12, Y6
	VMOVDQA      Y13, Y7
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y11, Y15, Y11
	VPSUBW       Y8, Y6, Y6
	VPSUBW       Y9, Y7, Y7
	VPSUBW       Y10, Y2, Y2
	VPSUBW       Y11, Y3, Y3
	VMOVDQU      928(BX), Y12
	VMOVDQU      960(BX), Y13
	VPMULLW      Y6, Y12, Y8
	VPMULHW      Y6, Y13, Y6
	VPMULHW      Y8, Y15, Y8
	VPSUBW       Y8, Y6, Y6
	VPSUBW       Y6, Y7, Y6
	VPADDW       Y2, Y3, Y7
	VMOVDQU      Y4, 256(AX)
	VMOVDQU      Y5, 288(AX)
	VMOVDQU      Y6, 320(AX)
	VMOVDQU      Y7, 352(AX)

	// ---- pass 3: bytes 384..511, table rows at 992/1024 ----
	VMOVDQU      384(CX), Y0
	VMOVDQU      416(CX), Y1
	VMOVDQU      448(CX), Y2
	VMOVDQU      480(CX), Y3
	VMOVDQU      384(DX), Y4
	VMOVDQU      416(DX), Y5
	VMOVDQU      448(DX), Y6
	VMOVDQU      480(DX), Y7
	VPMULLW      Y1, Y5, Y8
	VPMULLW      Y0, Y4, Y9
	VPMULLW      Y0, Y5, Y10
	VPMULLW      Y1, Y4, Y11
	VPMULLW      Y8, Y14, Y8
	VPMULLW      Y9, Y14, Y9
	VPMULLW      Y10, Y14, Y10
	VPMULLW      Y11, Y14, Y11
	VPMULHW      Y1, Y5, Y12
	VPMULHW      Y0, Y4, Y13
	VPMULHW      Y0, Y5, Y0
	VPMULHW      Y1, Y4, Y1
	VMOVDQA      Y12, Y4
	VMOVDQA      Y13, Y5
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y11, Y15, Y11
	VPSUBW       Y8, Y4, Y4
	VPSUBW       Y9, Y5, Y5
	VPSUBW       Y10, Y0, Y0
	VPSUBW       Y11, Y1, Y1
	VMOVDQU      992(BX), Y12
	VMOVDQU      1024(BX), Y13
	VPMULLW      Y4, Y12, Y8
	VPMULHW      Y4, Y13, Y4
	VPMULHW      Y8, Y15, Y8
	VPSUBW       Y8, Y4, Y4
	VPADDW       Y4, Y5, Y4
	VPADDW       Y0, Y1, Y5
	VPMULLW      Y3, Y7, Y8
	VPMULLW      Y2, Y6, Y9
	VPMULLW      Y2, Y7, Y10
	VPMULLW      Y3, Y6, Y11
	VPMULLW      Y8, Y14, Y8
	VPMULLW      Y9, Y14, Y9
	VPMULLW      Y10, Y14, Y10
	VPMULLW      Y11, Y14, Y11
	VPMULHW      Y3, Y7, Y12
	VPMULHW      Y2, Y6, Y13
	VPMULHW      Y2, Y7, Y2
	VPMULHW      Y3, Y6, Y3
	VMOVDQA      Y12, Y6
	VMOVDQA      Y13, Y7
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y11, Y15, Y11
	VPSUBW       Y8, Y6, Y6
	VPSUBW       Y9, Y7, Y7
	VPSUBW       Y10, Y2, Y2
	VPSUBW       Y11, Y3, Y3
	VMOVDQU      992(BX), Y12
	VMOVDQU      1024(BX), Y13
	VPMULLW      Y6, Y12, Y8
	VPMULHW      Y6, Y13, Y6
	VPMULHW      Y8, Y15, Y8
	VPSUBW       Y8, Y6, Y6
	VPSUBW       Y6, Y7, Y6
	VPADDW       Y2, Y3, Y7
	VMOVDQU      Y4, 384(AX)
	VMOVDQU      Y5, 416(AX)
	VMOVDQU      Y6, 448(AX)
	VMOVDQU      Y7, 480(AX)

	// Clear the upper YMM halves so legacy-SSE code executed after the
	// return does not pay an AVX/SSE transition penalty.
	VZEROUPPER
	RET
1743  
// func detangleAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// Permutes p in place. The array is processed as two independent halves
// (bytes 0..255, then bytes 256..511); each half is loaded into Y0..Y7,
// run through four interleave stages of growing granularity (16-bit words,
// 32-bit dwords, 64-bit qwords, 128-bit lanes) and stored back.
//
// Net effect, for g in 0..3, t in 0..15, r in 0..3:
//
//	out[64*g + 4*t + r] = in[64*g + 16*r + t]
//
// i.e. every group of 64 coefficients, viewed as 4 rows of 16, is
// transposed into 16 rows of 4.
//
// Registers: AX = p, Y8 = scratch. No stack is used.
TEXT ·detangleAVX2(SB), NOSPLIT, $0-8
	MOVQ        p+0(FP), AX

	// ---- first half: bytes 0..255 ----
	VMOVDQU     (AX), Y0
	VMOVDQU     32(AX), Y1
	VMOVDQU     64(AX), Y2
	VMOVDQU     96(AX), Y3
	VMOVDQU     128(AX), Y4
	VMOVDQU     160(AX), Y5
	VMOVDQU     192(AX), Y6
	VMOVDQU     224(AX), Y7

	// Stage 1, 16-bit words, pairs (Y0,Y1) (Y2,Y3) (Y4,Y5) (Y6,Y7):
	// first register gets the even words of both, second gets the odd words.
	VPSLLD      $0x10, Y1, Y8
	VPBLENDW    $0xaa, Y8, Y0, Y8
	VPSRLD      $0x10, Y0, Y0
	VPBLENDW    $0xaa, Y1, Y0, Y1
	VMOVDQA     Y8, Y0
	VPSLLD      $0x10, Y3, Y8
	VPBLENDW    $0xaa, Y8, Y2, Y8
	VPSRLD      $0x10, Y2, Y2
	VPBLENDW    $0xaa, Y3, Y2, Y3
	VMOVDQA     Y8, Y2
	VPSLLD      $0x10, Y5, Y8
	VPBLENDW    $0xaa, Y8, Y4, Y8
	VPSRLD      $0x10, Y4, Y4
	VPBLENDW    $0xaa, Y5, Y4, Y5
	VMOVDQA     Y8, Y4
	VPSLLD      $0x10, Y7, Y8
	VPBLENDW    $0xaa, Y8, Y6, Y8
	VPSRLD      $0x10, Y6, Y6
	VPBLENDW    $0xaa, Y7, Y6, Y7
	VMOVDQA     Y8, Y6

	// Stage 2, 32-bit dwords, pairs (Y0,Y2) (Y1,Y3) (Y4,Y6) (Y5,Y7):
	// first register gets the even dwords of both, second gets the odd ones.
	VMOVSLDUP   Y2, Y8
	VPBLENDD    $0xaa, Y8, Y0, Y8
	VPSRLQ      $0x20, Y0, Y0
	VPBLENDD    $0xaa, Y2, Y0, Y2
	VMOVDQA     Y8, Y0
	VMOVSLDUP   Y3, Y8
	VPBLENDD    $0xaa, Y8, Y1, Y8
	VPSRLQ      $0x20, Y1, Y1
	VPBLENDD    $0xaa, Y3, Y1, Y3
	VMOVDQA     Y8, Y1
	VMOVSLDUP   Y6, Y8
	VPBLENDD    $0xaa, Y8, Y4, Y8
	VPSRLQ      $0x20, Y4, Y4
	VPBLENDD    $0xaa, Y6, Y4, Y6
	VMOVDQA     Y8, Y4
	VMOVSLDUP   Y7, Y8
	VPBLENDD    $0xaa, Y8, Y5, Y8
	VPSRLQ      $0x20, Y5, Y5
	VPBLENDD    $0xaa, Y7, Y5, Y7
	VMOVDQA     Y8, Y5

	// Stage 3, 64-bit qwords, pairs (Y0,Y1) (Y2,Y3) (Y4,Y5) (Y6,Y7):
	// first register gets the low qword of each lane, second the high qword.
	VPUNPCKLQDQ Y1, Y0, Y8
	VPUNPCKHQDQ Y1, Y0, Y1
	VMOVDQA     Y8, Y0
	VPUNPCKLQDQ Y3, Y2, Y8
	VPUNPCKHQDQ Y3, Y2, Y3
	VMOVDQA     Y8, Y2
	VPUNPCKLQDQ Y5, Y4, Y8
	VPUNPCKHQDQ Y5, Y4, Y5
	VMOVDQA     Y8, Y4
	VPUNPCKLQDQ Y7, Y6, Y8
	VPUNPCKHQDQ Y7, Y6, Y7
	VMOVDQA     Y8, Y6

	// Stage 4, 128-bit lanes, pairs (Y0,Y2) (Y1,Y3) (Y4,Y6) (Y5,Y7):
	// first register gets both low lanes ($0x20), second both high ($0x31).
	VPERM2I128  $0x20, Y2, Y0, Y8
	VPERM2I128  $0x31, Y2, Y0, Y2
	VMOVDQA     Y8, Y0
	VPERM2I128  $0x20, Y3, Y1, Y8
	VPERM2I128  $0x31, Y3, Y1, Y3
	VMOVDQA     Y8, Y1
	VPERM2I128  $0x20, Y6, Y4, Y8
	VPERM2I128  $0x31, Y6, Y4, Y6
	VMOVDQA     Y8, Y4
	VPERM2I128  $0x20, Y7, Y5, Y8
	VPERM2I128  $0x31, Y7, Y5, Y7
	VMOVDQA     Y8, Y5

	VMOVDQU     Y0, (AX)
	VMOVDQU     Y1, 32(AX)
	VMOVDQU     Y2, 64(AX)
	VMOVDQU     Y3, 96(AX)
	VMOVDQU     Y4, 128(AX)
	VMOVDQU     Y5, 160(AX)
	VMOVDQU     Y6, 192(AX)
	VMOVDQU     Y7, 224(AX)

	// ---- second half: bytes 256..511, same four stages ----
	VMOVDQU     256(AX), Y0
	VMOVDQU     288(AX), Y1
	VMOVDQU     320(AX), Y2
	VMOVDQU     352(AX), Y3
	VMOVDQU     384(AX), Y4
	VMOVDQU     416(AX), Y5
	VMOVDQU     448(AX), Y6
	VMOVDQU     480(AX), Y7

	// Stage 1: 16-bit words.
	VPSLLD      $0x10, Y1, Y8
	VPBLENDW    $0xaa, Y8, Y0, Y8
	VPSRLD      $0x10, Y0, Y0
	VPBLENDW    $0xaa, Y1, Y0, Y1
	VMOVDQA     Y8, Y0
	VPSLLD      $0x10, Y3, Y8
	VPBLENDW    $0xaa, Y8, Y2, Y8
	VPSRLD      $0x10, Y2, Y2
	VPBLENDW    $0xaa, Y3, Y2, Y3
	VMOVDQA     Y8, Y2
	VPSLLD      $0x10, Y5, Y8
	VPBLENDW    $0xaa, Y8, Y4, Y8
	VPSRLD      $0x10, Y4, Y4
	VPBLENDW    $0xaa, Y5, Y4, Y5
	VMOVDQA     Y8, Y4
	VPSLLD      $0x10, Y7, Y8
	VPBLENDW    $0xaa, Y8, Y6, Y8
	VPSRLD      $0x10, Y6, Y6
	VPBLENDW    $0xaa, Y7, Y6, Y7
	VMOVDQA     Y8, Y6

	// Stage 2: 32-bit dwords.
	VMOVSLDUP   Y2, Y8
	VPBLENDD    $0xaa, Y8, Y0, Y8
	VPSRLQ      $0x20, Y0, Y0
	VPBLENDD    $0xaa, Y2, Y0, Y2
	VMOVDQA     Y8, Y0
	VMOVSLDUP   Y3, Y8
	VPBLENDD    $0xaa, Y8, Y1, Y8
	VPSRLQ      $0x20, Y1, Y1
	VPBLENDD    $0xaa, Y3, Y1, Y3
	VMOVDQA     Y8, Y1
	VMOVSLDUP   Y6, Y8
	VPBLENDD    $0xaa, Y8, Y4, Y8
	VPSRLQ      $0x20, Y4, Y4
	VPBLENDD    $0xaa, Y6, Y4, Y6
	VMOVDQA     Y8, Y4
	VMOVSLDUP   Y7, Y8
	VPBLENDD    $0xaa, Y8, Y5, Y8
	VPSRLQ      $0x20, Y5, Y5
	VPBLENDD    $0xaa, Y7, Y5, Y7
	VMOVDQA     Y8, Y5

	// Stage 3: 64-bit qwords.
	VPUNPCKLQDQ Y1, Y0, Y8
	VPUNPCKHQDQ Y1, Y0, Y1
	VMOVDQA     Y8, Y0
	VPUNPCKLQDQ Y3, Y2, Y8
	VPUNPCKHQDQ Y3, Y2, Y3
	VMOVDQA     Y8, Y2
	VPUNPCKLQDQ Y5, Y4, Y8
	VPUNPCKHQDQ Y5, Y4, Y5
	VMOVDQA     Y8, Y4
	VPUNPCKLQDQ Y7, Y6, Y8
	VPUNPCKHQDQ Y7, Y6, Y7
	VMOVDQA     Y8, Y6

	// Stage 4: 128-bit lanes.
	VPERM2I128  $0x20, Y2, Y0, Y8
	VPERM2I128  $0x31, Y2, Y0, Y2
	VMOVDQA     Y8, Y0
	VPERM2I128  $0x20, Y3, Y1, Y8
	VPERM2I128  $0x31, Y3, Y1, Y3
	VMOVDQA     Y8, Y1
	VPERM2I128  $0x20, Y6, Y4, Y8
	VPERM2I128  $0x31, Y6, Y4, Y6
	VMOVDQA     Y8, Y4
	VPERM2I128  $0x20, Y7, Y5, Y8
	VPERM2I128  $0x31, Y7, Y5, Y7
	VMOVDQA     Y8, Y5

	VMOVDQU     Y0, 256(AX)
	VMOVDQU     Y1, 288(AX)
	VMOVDQU     Y2, 320(AX)
	VMOVDQU     Y3, 352(AX)
	VMOVDQU     Y4, 384(AX)
	VMOVDQU     Y5, 416(AX)
	VMOVDQU     Y6, 448(AX)
	VMOVDQU     Y7, 480(AX)

	// Clear the upper YMM halves so legacy-SSE code executed after the
	// return does not pay an AVX/SSE transition penalty.
	VZEROUPPER
	RET
1909  
1910  // func tangleAVX2(p *[256]int16)
1911  // Requires: AVX, AVX2
1912  TEXT ·tangleAVX2(SB), NOSPLIT, $0-8
1913  	MOVQ        p+0(FP), AX
1914  	VMOVDQU     (AX), Y0
1915  	VMOVDQU     32(AX), Y1
1916  	VMOVDQU     64(AX), Y2
1917  	VMOVDQU     96(AX), Y3
1918  	VMOVDQU     128(AX), Y4
1919  	VMOVDQU     160(AX), Y5
1920  	VMOVDQU     192(AX), Y6
1921  	VMOVDQU     224(AX), Y7
1922  	VPERM2I128  $0x20, Y2, Y0, Y8
1923  	VPERM2I128  $0x31, Y2, Y0, Y2
1924  	VMOVDQA     Y8, Y0
1925  	VPERM2I128  $0x20, Y3, Y1, Y8
1926  	VPERM2I128  $0x31, Y3, Y1, Y3
1927  	VMOVDQA     Y8, Y1
1928  	VPERM2I128  $0x20, Y6, Y4, Y8
1929  	VPERM2I128  $0x31, Y6, Y4, Y6
1930  	VMOVDQA     Y8, Y4
1931  	VPERM2I128  $0x20, Y7, Y5, Y8
1932  	VPERM2I128  $0x31, Y7, Y5, Y7
1933  	VMOVDQA     Y8, Y5
1934  	VPUNPCKLQDQ Y1, Y0, Y8
1935  	VPUNPCKHQDQ Y1, Y0, Y1
1936  	VMOVDQA     Y8, Y0
1937  	VPUNPCKLQDQ Y3, Y2, Y8
1938  	VPUNPCKHQDQ Y3, Y2, Y3
1939  	VMOVDQA     Y8, Y2
1940  	VPUNPCKLQDQ Y5, Y4, Y8
1941  	VPUNPCKHQDQ Y5, Y4, Y5
1942  	VMOVDQA     Y8, Y4
1943  	VPUNPCKLQDQ Y7, Y6, Y8
1944  	VPUNPCKHQDQ Y7, Y6, Y7
1945  	VMOVDQA     Y8, Y6
1946  	VMOVSLDUP   Y2, Y8
1947  	VPBLENDD    $0xaa, Y8, Y0, Y8
1948  	VPSRLQ      $0x20, Y0, Y0
1949  	VPBLENDD    $0xaa, Y2, Y0, Y2
1950  	VMOVDQA     Y8, Y0
1951  	VMOVSLDUP   Y3, Y8
1952  	VPBLENDD    $0xaa, Y8, Y1, Y8
1953  	VPSRLQ      $0x20, Y1, Y1
1954  	VPBLENDD    $0xaa, Y3, Y1, Y3
1955  	VMOVDQA     Y8, Y1
1956  	VMOVSLDUP   Y6, Y8
1957  	VPBLENDD    $0xaa, Y8, Y4, Y8
1958  	VPSRLQ      $0x20, Y4, Y4
1959  	VPBLENDD    $0xaa, Y6, Y4, Y6
1960  	VMOVDQA     Y8, Y4
1961  	VMOVSLDUP   Y7, Y8
1962  	VPBLENDD    $0xaa, Y8, Y5, Y8
1963  	VPSRLQ      $0x20, Y5, Y5
1964  	VPBLENDD    $0xaa, Y7, Y5, Y7
1965  	VMOVDQA     Y8, Y5
1966  	VPSLLD      $0x10, Y1, Y8
1967  	VPBLENDW    $0xaa, Y8, Y0, Y8
1968  	VPSRLD      $0x10, Y0, Y0
1969  	VPBLENDW    $0xaa, Y1, Y0, Y1
1970  	VMOVDQA     Y8, Y0
1971  	VPSLLD      $0x10, Y3, Y8
1972  	VPBLENDW    $0xaa, Y8, Y2, Y8
1973  	VPSRLD      $0x10, Y2, Y2
1974  	VPBLENDW    $0xaa, Y3, Y2, Y3
1975  	VMOVDQA     Y8, Y2
1976  	VPSLLD      $0x10, Y5, Y8
1977  	VPBLENDW    $0xaa, Y8, Y4, Y8
1978  	VPSRLD      $0x10, Y4, Y4
1979  	VPBLENDW    $0xaa, Y5, Y4, Y5
1980  	VMOVDQA     Y8, Y4
1981  	VPSLLD      $0x10, Y7, Y8
1982  	VPBLENDW    $0xaa, Y8, Y6, Y8
1983  	VPSRLD      $0x10, Y6, Y6
1984  	VPBLENDW    $0xaa, Y7, Y6, Y7
1985  	VMOVDQA     Y8, Y6
1986  	VMOVDQU     Y0, (AX)
1987  	VMOVDQU     Y1, 32(AX)
1988  	VMOVDQU     Y2, 64(AX)
1989  	VMOVDQU     Y3, 96(AX)
1990  	VMOVDQU     Y4, 128(AX)
1991  	VMOVDQU     Y5, 160(AX)
1992  	VMOVDQU     Y6, 192(AX)
1993  	VMOVDQU     Y7, 224(AX)
1994  	VMOVDQU     256(AX), Y0
1995  	VMOVDQU     288(AX), Y1
1996  	VMOVDQU     320(AX), Y2
1997  	VMOVDQU     352(AX), Y3
1998  	VMOVDQU     384(AX), Y4
1999  	VMOVDQU     416(AX), Y5
2000  	VMOVDQU     448(AX), Y6
2001  	VMOVDQU     480(AX), Y7
2002  	VPERM2I128  $0x20, Y2, Y0, Y8
2003  	VPERM2I128  $0x31, Y2, Y0, Y2
2004  	VMOVDQA     Y8, Y0
2005  	VPERM2I128  $0x20, Y3, Y1, Y8
2006  	VPERM2I128  $0x31, Y3, Y1, Y3
2007  	VMOVDQA     Y8, Y1
2008  	VPERM2I128  $0x20, Y6, Y4, Y8
2009  	VPERM2I128  $0x31, Y6, Y4, Y6
2010  	VMOVDQA     Y8, Y4
2011  	VPERM2I128  $0x20, Y7, Y5, Y8
2012  	VPERM2I128  $0x31, Y7, Y5, Y7
2013  	VMOVDQA     Y8, Y5
2014  	VPUNPCKLQDQ Y1, Y0, Y8
2015  	VPUNPCKHQDQ Y1, Y0, Y1
2016  	VMOVDQA     Y8, Y0
2017  	VPUNPCKLQDQ Y3, Y2, Y8
2018  	VPUNPCKHQDQ Y3, Y2, Y3
2019  	VMOVDQA     Y8, Y2
2020  	VPUNPCKLQDQ Y5, Y4, Y8
2021  	VPUNPCKHQDQ Y5, Y4, Y5
2022  	VMOVDQA     Y8, Y4
2023  	VPUNPCKLQDQ Y7, Y6, Y8
2024  	VPUNPCKHQDQ Y7, Y6, Y7
2025  	VMOVDQA     Y8, Y6
2026  	VMOVSLDUP   Y2, Y8
2027  	VPBLENDD    $0xaa, Y8, Y0, Y8
2028  	VPSRLQ      $0x20, Y0, Y0
2029  	VPBLENDD    $0xaa, Y2, Y0, Y2
2030  	VMOVDQA     Y8, Y0
2031  	VMOVSLDUP   Y3, Y8
2032  	VPBLENDD    $0xaa, Y8, Y1, Y8
2033  	VPSRLQ      $0x20, Y1, Y1
2034  	VPBLENDD    $0xaa, Y3, Y1, Y3
2035  	VMOVDQA     Y8, Y1
2036  	VMOVSLDUP   Y6, Y8
2037  	VPBLENDD    $0xaa, Y8, Y4, Y8
2038  	VPSRLQ      $0x20, Y4, Y4
2039  	VPBLENDD    $0xaa, Y6, Y4, Y6
2040  	VMOVDQA     Y8, Y4
2041  	VMOVSLDUP   Y7, Y8
2042  	VPBLENDD    $0xaa, Y8, Y5, Y8
2043  	VPSRLQ      $0x20, Y5, Y5
2044  	VPBLENDD    $0xaa, Y7, Y5, Y7
2045  	VMOVDQA     Y8, Y5
2046  	VPSLLD      $0x10, Y1, Y8
2047  	VPBLENDW    $0xaa, Y8, Y0, Y8
2048  	VPSRLD      $0x10, Y0, Y0
2049  	VPBLENDW    $0xaa, Y1, Y0, Y1
2050  	VMOVDQA     Y8, Y0
2051  	VPSLLD      $0x10, Y3, Y8
2052  	VPBLENDW    $0xaa, Y8, Y2, Y8
2053  	VPSRLD      $0x10, Y2, Y2
2054  	VPBLENDW    $0xaa, Y3, Y2, Y3
2055  	VMOVDQA     Y8, Y2
2056  	VPSLLD      $0x10, Y5, Y8
2057  	VPBLENDW    $0xaa, Y8, Y4, Y8
2058  	VPSRLD      $0x10, Y4, Y4
2059  	VPBLENDW    $0xaa, Y5, Y4, Y5
2060  	VMOVDQA     Y8, Y4
2061  	VPSLLD      $0x10, Y7, Y8
2062  	VPBLENDW    $0xaa, Y8, Y6, Y8
2063  	VPSRLD      $0x10, Y6, Y6
2064  	VPBLENDW    $0xaa, Y7, Y6, Y7
2065  	VMOVDQA     Y8, Y6
2066  	VMOVDQU     Y0, 256(AX)
2067  	VMOVDQU     Y1, 288(AX)
2068  	VMOVDQU     Y2, 320(AX)
2069  	VMOVDQU     Y3, 352(AX)
2070  	VMOVDQU     Y4, 384(AX)
2071  	VMOVDQU     Y5, 416(AX)
2072  	VMOVDQU     Y6, 448(AX)
2073  	VMOVDQU     Y7, 480(AX)
2074  	RET
2075  
// func barrettReduceAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// Rewrites every 16-bit coefficient x of p, in place, as
//
//	x - ((x * 0x4ebf) >> 26) * 0x0d01
//
// evaluated lane-wise with wrapping int16 arithmetic. 0x0d01 is 3329 and
// 0x4ebf is 20159 = round(2^26 / 3329), so the subtracted term is an
// estimate of the multiple of 3329 contained in x.
//
// The 512-byte array is processed in four passes of four 32-byte vectors
// (64 coefficients per pass).
//
// Register roles:
//	AX     p
//	Y9     3329 in all 16 word lanes
//	Y8     20159 in all 16 word lanes
//	Y0-Y3  coefficients of the current pass
//	Y4-Y7  quotient estimates, then quotient * 3329
TEXT ·barrettReduceAVX2(SB), NOSPLIT, $0-8
	MOVQ         p+0(FP), AX

	// Broadcast the modulus and the reciprocal constant to every word lane.
	MOVL         $0x00000d01, CX
	VMOVD        CX, X0
	VPBROADCASTW X0, Y9
	MOVL         $0x00004ebf, CX
	VMOVD        CX, X0
	VPBROADCASTW X0, Y8

	// Pass 1: coefficients 0..63 (bytes 0..127).
	VMOVDQU      (AX), Y0
	VMOVDQU      32(AX), Y1
	VMOVDQU      64(AX), Y2
	VMOVDQU      96(AX), Y3

	// Signed high half of x*20159, i.e. (x*20159) >> 16 ...
	VPMULHW      Y8, Y0, Y4
	VPMULHW      Y8, Y1, Y5
	VPMULHW      Y8, Y2, Y6
	VPMULHW      Y8, Y3, Y7

	// ... then an arithmetic shift by 10 more, for a total shift of 26.
	VPSRAW       $0x0a, Y4, Y4
	VPSRAW       $0x0a, Y5, Y5
	VPSRAW       $0x0a, Y6, Y6
	VPSRAW       $0x0a, Y7, Y7

	// Low 16 bits of quotient * 3329.
	VPMULLW      Y9, Y4, Y4
	VPMULLW      Y9, Y5, Y5
	VPMULLW      Y9, Y6, Y6
	VPMULLW      Y9, Y7, Y7

	// x -= quotient * 3329.
	VPSUBW       Y4, Y0, Y0
	VPSUBW       Y5, Y1, Y1
	VPSUBW       Y6, Y2, Y2
	VPSUBW       Y7, Y3, Y3
	VMOVDQU      Y0, (AX)
	VMOVDQU      Y1, 32(AX)
	VMOVDQU      Y2, 64(AX)
	VMOVDQU      Y3, 96(AX)

	// Pass 2: coefficients 64..127 (bytes 128..255); same steps as pass 1.
	VMOVDQU      128(AX), Y0
	VMOVDQU      160(AX), Y1
	VMOVDQU      192(AX), Y2
	VMOVDQU      224(AX), Y3
	VPMULHW      Y8, Y0, Y4
	VPMULHW      Y8, Y1, Y5
	VPMULHW      Y8, Y2, Y6
	VPMULHW      Y8, Y3, Y7
	VPSRAW       $0x0a, Y4, Y4
	VPSRAW       $0x0a, Y5, Y5
	VPSRAW       $0x0a, Y6, Y6
	VPSRAW       $0x0a, Y7, Y7
	VPMULLW      Y9, Y4, Y4
	VPMULLW      Y9, Y5, Y5
	VPMULLW      Y9, Y6, Y6
	VPMULLW      Y9, Y7, Y7
	VPSUBW       Y4, Y0, Y0
	VPSUBW       Y5, Y1, Y1
	VPSUBW       Y6, Y2, Y2
	VPSUBW       Y7, Y3, Y3
	VMOVDQU      Y0, 128(AX)
	VMOVDQU      Y1, 160(AX)
	VMOVDQU      Y2, 192(AX)
	VMOVDQU      Y3, 224(AX)

	// Pass 3: coefficients 128..191 (bytes 256..383).
	VMOVDQU      256(AX), Y0
	VMOVDQU      288(AX), Y1
	VMOVDQU      320(AX), Y2
	VMOVDQU      352(AX), Y3
	VPMULHW      Y8, Y0, Y4
	VPMULHW      Y8, Y1, Y5
	VPMULHW      Y8, Y2, Y6
	VPMULHW      Y8, Y3, Y7
	VPSRAW       $0x0a, Y4, Y4
	VPSRAW       $0x0a, Y5, Y5
	VPSRAW       $0x0a, Y6, Y6
	VPSRAW       $0x0a, Y7, Y7
	VPMULLW      Y9, Y4, Y4
	VPMULLW      Y9, Y5, Y5
	VPMULLW      Y9, Y6, Y6
	VPMULLW      Y9, Y7, Y7
	VPSUBW       Y4, Y0, Y0
	VPSUBW       Y5, Y1, Y1
	VPSUBW       Y6, Y2, Y2
	VPSUBW       Y7, Y3, Y3
	VMOVDQU      Y0, 256(AX)
	VMOVDQU      Y1, 288(AX)
	VMOVDQU      Y2, 320(AX)
	VMOVDQU      Y3, 352(AX)

	// Pass 4: coefficients 192..255 (bytes 384..511).
	VMOVDQU      384(AX), Y0
	VMOVDQU      416(AX), Y1
	VMOVDQU      448(AX), Y2
	VMOVDQU      480(AX), Y3
	VPMULHW      Y8, Y0, Y4
	VPMULHW      Y8, Y1, Y5
	VPMULHW      Y8, Y2, Y6
	VPMULHW      Y8, Y3, Y7
	VPSRAW       $0x0a, Y4, Y4
	VPSRAW       $0x0a, Y5, Y5
	VPSRAW       $0x0a, Y6, Y6
	VPSRAW       $0x0a, Y7, Y7
	VPMULLW      Y9, Y4, Y4
	VPMULLW      Y9, Y5, Y5
	VPMULLW      Y9, Y6, Y6
	VPMULLW      Y9, Y7, Y7
	VPSUBW       Y4, Y0, Y0
	VPSUBW       Y5, Y1, Y1
	VPSUBW       Y6, Y2, Y2
	VPSUBW       Y7, Y3, Y3
	VMOVDQU      Y0, 384(AX)
	VMOVDQU      Y1, 416(AX)
	VMOVDQU      Y2, 448(AX)
	VMOVDQU      Y3, 480(AX)

	// Clear the upper YMM halves before returning so that legacy-SSE code
	// executed by the caller does not incur AVX/SSE transition penalties.
	VZEROUPPER
	RET
2183  
// func normalizeAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// Rewrites every 16-bit coefficient x of p, in place, using wrapping int16
// arithmetic in each lane:
//
//	x -= ((x * 0x4ebf) >> 26) * 0x0d01   // subtract estimated multiple of 3329
//	x -= 0x0d01
//	x += (x >> 15) & 0x0d01              // add 3329 back if x went negative
//
// 0x0d01 is 3329 and 0x4ebf is 20159 = round(2^26 / 3329).
//
// The 512-byte array is processed in four passes of four 32-byte vectors
// (64 coefficients per pass).
//
// Register roles:
//	AX     p
//	Y9     3329 in all 16 word lanes
//	Y8     20159 in all 16 word lanes
//	Y0-Y3  coefficients of the current pass
//	Y4-Y7  scratch: quotient estimates, then sign masks
TEXT ·normalizeAVX2(SB), NOSPLIT, $0-8
	MOVQ         p+0(FP), AX

	// Broadcast the modulus and the reciprocal constant to every word lane.
	MOVL         $0x00000d01, CX
	VMOVD        CX, X0
	VPBROADCASTW X0, Y9
	MOVL         $0x00004ebf, CX
	VMOVD        CX, X0
	VPBROADCASTW X0, Y8

	// Pass 1: coefficients 0..63 (bytes 0..127).
	VMOVDQU      (AX), Y0
	VMOVDQU      32(AX), Y1
	VMOVDQU      64(AX), Y2
	VMOVDQU      96(AX), Y3

	// Quotient estimate: signed (x*20159) >> 16, then >> 10 (26 in total).
	VPMULHW      Y8, Y0, Y4
	VPMULHW      Y8, Y1, Y5
	VPMULHW      Y8, Y2, Y6
	VPMULHW      Y8, Y3, Y7
	VPSRAW       $0x0a, Y4, Y4
	VPSRAW       $0x0a, Y5, Y5
	VPSRAW       $0x0a, Y6, Y6
	VPSRAW       $0x0a, Y7, Y7

	// x -= quotient * 3329.
	VPMULLW      Y9, Y4, Y4
	VPMULLW      Y9, Y5, Y5
	VPMULLW      Y9, Y6, Y6
	VPMULLW      Y9, Y7, Y7
	VPSUBW       Y4, Y0, Y0
	VPSUBW       Y5, Y1, Y1
	VPSUBW       Y6, Y2, Y2
	VPSUBW       Y7, Y3, Y3

	// x -= 3329.
	VPSUBW       Y9, Y0, Y0
	VPSUBW       Y9, Y1, Y1
	VPSUBW       Y9, Y2, Y2
	VPSUBW       Y9, Y3, Y3

	// Sign mask: all ones in lanes where x is now negative, zero elsewhere.
	VPSRAW       $0x0f, Y0, Y4
	VPSRAW       $0x0f, Y1, Y5
	VPSRAW       $0x0f, Y2, Y6
	VPSRAW       $0x0f, Y3, Y7

	// Branch-free correction: add 3329 only to the negative lanes.
	VPAND        Y4, Y9, Y4
	VPAND        Y5, Y9, Y5
	VPAND        Y6, Y9, Y6
	VPAND        Y7, Y9, Y7
	VPADDW       Y0, Y4, Y0
	VPADDW       Y1, Y5, Y1
	VPADDW       Y2, Y6, Y2
	VPADDW       Y3, Y7, Y3
	VMOVDQU      Y0, (AX)
	VMOVDQU      Y1, 32(AX)
	VMOVDQU      Y2, 64(AX)
	VMOVDQU      Y3, 96(AX)

	// Pass 2: coefficients 64..127 (bytes 128..255); same steps as pass 1.
	VMOVDQU      128(AX), Y0
	VMOVDQU      160(AX), Y1
	VMOVDQU      192(AX), Y2
	VMOVDQU      224(AX), Y3
	VPMULHW      Y8, Y0, Y4
	VPMULHW      Y8, Y1, Y5
	VPMULHW      Y8, Y2, Y6
	VPMULHW      Y8, Y3, Y7
	VPSRAW       $0x0a, Y4, Y4
	VPSRAW       $0x0a, Y5, Y5
	VPSRAW       $0x0a, Y6, Y6
	VPSRAW       $0x0a, Y7, Y7
	VPMULLW      Y9, Y4, Y4
	VPMULLW      Y9, Y5, Y5
	VPMULLW      Y9, Y6, Y6
	VPMULLW      Y9, Y7, Y7
	VPSUBW       Y4, Y0, Y0
	VPSUBW       Y5, Y1, Y1
	VPSUBW       Y6, Y2, Y2
	VPSUBW       Y7, Y3, Y3
	VPSUBW       Y9, Y0, Y0
	VPSUBW       Y9, Y1, Y1
	VPSUBW       Y9, Y2, Y2
	VPSUBW       Y9, Y3, Y3
	VPSRAW       $0x0f, Y0, Y4
	VPSRAW       $0x0f, Y1, Y5
	VPSRAW       $0x0f, Y2, Y6
	VPSRAW       $0x0f, Y3, Y7
	VPAND        Y4, Y9, Y4
	VPAND        Y5, Y9, Y5
	VPAND        Y6, Y9, Y6
	VPAND        Y7, Y9, Y7
	VPADDW       Y0, Y4, Y0
	VPADDW       Y1, Y5, Y1
	VPADDW       Y2, Y6, Y2
	VPADDW       Y3, Y7, Y3
	VMOVDQU      Y0, 128(AX)
	VMOVDQU      Y1, 160(AX)
	VMOVDQU      Y2, 192(AX)
	VMOVDQU      Y3, 224(AX)

	// Pass 3: coefficients 128..191 (bytes 256..383).
	VMOVDQU      256(AX), Y0
	VMOVDQU      288(AX), Y1
	VMOVDQU      320(AX), Y2
	VMOVDQU      352(AX), Y3
	VPMULHW      Y8, Y0, Y4
	VPMULHW      Y8, Y1, Y5
	VPMULHW      Y8, Y2, Y6
	VPMULHW      Y8, Y3, Y7
	VPSRAW       $0x0a, Y4, Y4
	VPSRAW       $0x0a, Y5, Y5
	VPSRAW       $0x0a, Y6, Y6
	VPSRAW       $0x0a, Y7, Y7
	VPMULLW      Y9, Y4, Y4
	VPMULLW      Y9, Y5, Y5
	VPMULLW      Y9, Y6, Y6
	VPMULLW      Y9, Y7, Y7
	VPSUBW       Y4, Y0, Y0
	VPSUBW       Y5, Y1, Y1
	VPSUBW       Y6, Y2, Y2
	VPSUBW       Y7, Y3, Y3
	VPSUBW       Y9, Y0, Y0
	VPSUBW       Y9, Y1, Y1
	VPSUBW       Y9, Y2, Y2
	VPSUBW       Y9, Y3, Y3
	VPSRAW       $0x0f, Y0, Y4
	VPSRAW       $0x0f, Y1, Y5
	VPSRAW       $0x0f, Y2, Y6
	VPSRAW       $0x0f, Y3, Y7
	VPAND        Y4, Y9, Y4
	VPAND        Y5, Y9, Y5
	VPAND        Y6, Y9, Y6
	VPAND        Y7, Y9, Y7
	VPADDW       Y0, Y4, Y0
	VPADDW       Y1, Y5, Y1
	VPADDW       Y2, Y6, Y2
	VPADDW       Y3, Y7, Y3
	VMOVDQU      Y0, 256(AX)
	VMOVDQU      Y1, 288(AX)
	VMOVDQU      Y2, 320(AX)
	VMOVDQU      Y3, 352(AX)

	// Pass 4: coefficients 192..255 (bytes 384..511).
	VMOVDQU      384(AX), Y0
	VMOVDQU      416(AX), Y1
	VMOVDQU      448(AX), Y2
	VMOVDQU      480(AX), Y3
	VPMULHW      Y8, Y0, Y4
	VPMULHW      Y8, Y1, Y5
	VPMULHW      Y8, Y2, Y6
	VPMULHW      Y8, Y3, Y7
	VPSRAW       $0x0a, Y4, Y4
	VPSRAW       $0x0a, Y5, Y5
	VPSRAW       $0x0a, Y6, Y6
	VPSRAW       $0x0a, Y7, Y7
	VPMULLW      Y9, Y4, Y4
	VPMULLW      Y9, Y5, Y5
	VPMULLW      Y9, Y6, Y6
	VPMULLW      Y9, Y7, Y7
	VPSUBW       Y4, Y0, Y0
	VPSUBW       Y5, Y1, Y1
	VPSUBW       Y6, Y2, Y2
	VPSUBW       Y7, Y3, Y3
	VPSUBW       Y9, Y0, Y0
	VPSUBW       Y9, Y1, Y1
	VPSUBW       Y9, Y2, Y2
	VPSUBW       Y9, Y3, Y3
	VPSRAW       $0x0f, Y0, Y4
	VPSRAW       $0x0f, Y1, Y5
	VPSRAW       $0x0f, Y2, Y6
	VPSRAW       $0x0f, Y3, Y7
	VPAND        Y4, Y9, Y4
	VPAND        Y5, Y9, Y5
	VPAND        Y6, Y9, Y6
	VPAND        Y7, Y9, Y7
	VPADDW       Y0, Y4, Y0
	VPADDW       Y1, Y5, Y1
	VPADDW       Y2, Y6, Y2
	VPADDW       Y3, Y7, Y3
	VMOVDQU      Y0, 384(AX)
	VMOVDQU      Y1, 416(AX)
	VMOVDQU      Y2, 448(AX)
	VMOVDQU      Y3, 480(AX)

	// Clear the upper YMM halves before returning so that legacy-SSE code
	// executed by the caller does not incur AVX/SSE transition penalties.
	VZEROUPPER
	RET
2355