// Code generated by command: go run src.go -out ../amd64.s -stubs ../stubs_amd64.go -pkg common. DO NOT EDIT.

//go:build amd64 && !purego

#include "textflag.h"

// func addAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
// Requires: AVX, AVX2
//
// Writes the lane-wise 16-bit sum a[i] + b[i] to p[i] for all 256 elements.
// VPADDW is a plain wrapping add; no modular reduction is applied here.
//
// The 512 bytes are handled as two 256-byte halves. Within each half every
// load from a and b is issued before the first store to p.
//
// Registers: AX = p, CX = a, DX = b; even Y registers hold words of a, odd Y
// registers hold words of b and then the sums.
//
// NOTE(review): line 1 marks this file as generated; mirror any change made
// here in the generator so it is not lost on regeneration.
TEXT ·addAVX2(SB), NOSPLIT, $0-24
	MOVQ    p+0(FP), AX
	MOVQ    a+8(FP), CX
	MOVQ    b+16(FP), DX

	// First half: bytes 0..255 (elements 0..127).
	VMOVDQU (CX), Y0
	VMOVDQU 32(CX), Y2
	VMOVDQU 64(CX), Y4
	VMOVDQU 96(CX), Y6
	VMOVDQU 128(CX), Y8
	VMOVDQU 160(CX), Y10
	VMOVDQU 192(CX), Y12
	VMOVDQU 224(CX), Y14
	VMOVDQU (DX), Y1
	VMOVDQU 32(DX), Y3
	VMOVDQU 64(DX), Y5
	VMOVDQU 96(DX), Y7
	VMOVDQU 128(DX), Y9
	VMOVDQU 160(DX), Y11
	VMOVDQU 192(DX), Y13
	VMOVDQU 224(DX), Y15
	VPADDW  Y0, Y1, Y1
	VPADDW  Y2, Y3, Y3
	VPADDW  Y4, Y5, Y5
	VPADDW  Y6, Y7, Y7
	VPADDW  Y8, Y9, Y9
	VPADDW  Y10, Y11, Y11
	VPADDW  Y12, Y13, Y13
	VPADDW  Y14, Y15, Y15
	VMOVDQU Y1, (AX)
	VMOVDQU Y3, 32(AX)
	VMOVDQU Y5, 64(AX)
	VMOVDQU Y7, 96(AX)
	VMOVDQU Y9, 128(AX)
	VMOVDQU Y11, 160(AX)
	VMOVDQU Y13, 192(AX)
	VMOVDQU Y15, 224(AX)

	// Second half: bytes 256..511 (elements 128..255).
	VMOVDQU 256(CX), Y0
	VMOVDQU 288(CX), Y2
	VMOVDQU 320(CX), Y4
	VMOVDQU 352(CX), Y6
	VMOVDQU 384(CX), Y8
	VMOVDQU 416(CX), Y10
	VMOVDQU 448(CX), Y12
	VMOVDQU 480(CX), Y14
	VMOVDQU 256(DX), Y1
	VMOVDQU 288(DX), Y3
	VMOVDQU 320(DX), Y5
	VMOVDQU 352(DX), Y7
	VMOVDQU 384(DX), Y9
	VMOVDQU 416(DX), Y11
	VMOVDQU 448(DX), Y13
	VMOVDQU 480(DX), Y15
	VPADDW  Y0, Y1, Y1
	VPADDW  Y2, Y3, Y3
	VPADDW  Y4, Y5, Y5
	VPADDW  Y6, Y7, Y7
	VPADDW  Y8, Y9, Y9
	VPADDW  Y10, Y11, Y11
	VPADDW  Y12, Y13, Y13
	VPADDW  Y14, Y15, Y15
	VMOVDQU Y1, 256(AX)
	VMOVDQU Y3, 288(AX)
	VMOVDQU Y5, 320(AX)
	VMOVDQU Y7, 352(AX)
	VMOVDQU Y9, 384(AX)
	VMOVDQU Y11, 416(AX)
	VMOVDQU Y13, 448(AX)
	VMOVDQU Y15, 480(AX)

	// Clear the upper YMM halves before returning so that legacy-encoded SSE
	// instructions in the caller do not run with dirty upper state
	// (AVX/SSE transition penalty or false dependency, depending on the CPU).
	VZEROUPPER
	RET

// func subAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
// Requires: AVX, AVX2
//
// Writes the lane-wise 16-bit difference a[i] - b[i] to p[i] for all 256
// elements. VPSUBW is a plain wrapping subtract; no modular reduction is
// applied here.
//
// Operand order reminder (Go assembler): "VPSUBW Y1, Y0, Y1" computes
// Y1 = Y0 - Y1, i.e. (word of a) - (word of b).
//
// The 512 bytes are handled as two 256-byte halves. Within each half every
// load from a and b is issued before the first store to p.
//
// Registers: AX = p, CX = a, DX = b; even Y registers hold words of a, odd Y
// registers hold words of b and then the differences.
//
// NOTE(review): line 1 marks this file as generated; mirror any change made
// here in the generator so it is not lost on regeneration.
TEXT ·subAVX2(SB), NOSPLIT, $0-24
	MOVQ    p+0(FP), AX
	MOVQ    a+8(FP), CX
	MOVQ    b+16(FP), DX

	// First half: bytes 0..255 (elements 0..127).
	VMOVDQU (CX), Y0
	VMOVDQU 32(CX), Y2
	VMOVDQU 64(CX), Y4
	VMOVDQU 96(CX), Y6
	VMOVDQU 128(CX), Y8
	VMOVDQU 160(CX), Y10
	VMOVDQU 192(CX), Y12
	VMOVDQU 224(CX), Y14
	VMOVDQU (DX), Y1
	VMOVDQU 32(DX), Y3
	VMOVDQU 64(DX), Y5
	VMOVDQU 96(DX), Y7
	VMOVDQU 128(DX), Y9
	VMOVDQU 160(DX), Y11
	VMOVDQU 192(DX), Y13
	VMOVDQU 224(DX), Y15
	VPSUBW  Y1, Y0, Y1
	VPSUBW  Y3, Y2, Y3
	VPSUBW  Y5, Y4, Y5
	VPSUBW  Y7, Y6, Y7
	VPSUBW  Y9, Y8, Y9
	VPSUBW  Y11, Y10, Y11
	VPSUBW  Y13, Y12, Y13
	VPSUBW  Y15, Y14, Y15
	VMOVDQU Y1, (AX)
	VMOVDQU Y3, 32(AX)
	VMOVDQU Y5, 64(AX)
	VMOVDQU Y7, 96(AX)
	VMOVDQU Y9, 128(AX)
	VMOVDQU Y11, 160(AX)
	VMOVDQU Y13, 192(AX)
	VMOVDQU Y15, 224(AX)

	// Second half: bytes 256..511 (elements 128..255).
	VMOVDQU 256(CX), Y0
	VMOVDQU 288(CX), Y2
	VMOVDQU 320(CX), Y4
	VMOVDQU 352(CX), Y6
	VMOVDQU 384(CX), Y8
	VMOVDQU 416(CX), Y10
	VMOVDQU 448(CX), Y12
	VMOVDQU 480(CX), Y14
	VMOVDQU 256(DX), Y1
	VMOVDQU 288(DX), Y3
	VMOVDQU 320(DX), Y5
	VMOVDQU 352(DX), Y7
	VMOVDQU 384(DX), Y9
	VMOVDQU 416(DX), Y11
	VMOVDQU 448(DX), Y13
	VMOVDQU 480(DX), Y15
	VPSUBW  Y1, Y0, Y1
	VPSUBW  Y3, Y2, Y3
	VPSUBW  Y5, Y4, Y5
	VPSUBW  Y7, Y6, Y7
	VPSUBW  Y9, Y8, Y9
	VPSUBW  Y11, Y10, Y11
	VPSUBW  Y13, Y12, Y13
	VPSUBW  Y15, Y14, Y15
	VMOVDQU Y1, 256(AX)
	VMOVDQU Y3, 288(AX)
	VMOVDQU Y5, 320(AX)
	VMOVDQU Y7, 352(AX)
	VMOVDQU Y9, 384(AX)
	VMOVDQU Y11, 416(AX)
	VMOVDQU Y13, 448(AX)
	VMOVDQU Y15, 480(AX)

	// Clear the upper YMM halves before returning so that legacy-encoded SSE
	// instructions in the caller do not run with dirty upper state
	// (AVX/SSE transition penalty or false dependency, depending on the CPU).
	VZEROUPPER
	RET

// func nttAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// In-place transform of the 256 int16 values at p, made of seven layers of
// butterflies driven by the external table ·ZetasAVX2.
//
// Every butterfly on a pair (a, b) with table values (zA, zB) computes
//
//	lo = low16(b * zA)                 (VPMULLW)
//	hi = high16(b * zB)                (VPMULHW)
//	t  = hi - high16(lo * Y15)         (VPMULHW, VPSUBW)
//	a' = a + t,  b' = a - t            (VPADDW, VPSUBW)
//
// where every lane of Y15 holds 0x0d01 (= 3329).
// NOTE(review): this has the shape of a signed Montgomery multiplication by a
// twiddle factor modulo 3329, with zA/zB presumably being the pre-multiplied
// and plain forms of the same factor — confirm against the table generator.
//
// Schedule:
//   - layer 1 pairs element i with element i+128 over the whole array;
//   - layers 2..7 then run entirely in registers, first on bytes 0..255 and
//     afterwards on bytes 256..511, with lane shuffles between layers 3..7.
//
// No un-shuffle instructions appear before the final stores, so the output is
// presumably left in the lane order produced by those shuffles — confirm with
// the Go-side caller.
//
// Registers: AX = p, CX = &ZetasAVX2, Y15 = broadcast 0x0d01,
// Y7..Y14 = data, Y0..Y6 = table values and temporaries.
//
// NOTE(review): line 1 marks this file as generated; mirror any change made
// here in the generator so it is not lost on regeneration.
TEXT ·nttAVX2(SB), NOSPLIT, $0-8
	MOVQ         p+0(FP), AX
	LEAQ         ·ZetasAVX2+0(SB), CX
	MOVL         $0x00000d01, DX
	VMOVD        DX, X0
	VPBROADCASTW X0, Y15

	// Layer 1, part 1: bytes 0..127 against bytes 256..383, one table pair
	// (table bytes 0 and 2) broadcast to all lanes.
	VPBROADCASTW (CX), Y0
	VPBROADCASTW 2(CX), Y1
	VMOVDQU      (AX), Y7
	VMOVDQU      32(AX), Y8
	VMOVDQU      64(AX), Y9
	VMOVDQU      96(AX), Y10
	VMOVDQU      256(AX), Y11
	VMOVDQU      288(AX), Y12
	VMOVDQU      320(AX), Y13
	VMOVDQU      352(AX), Y14
	VPMULLW      Y11, Y0, Y2
	VPMULLW      Y12, Y0, Y3
	VPMULLW      Y13, Y0, Y4
	VPMULLW      Y14, Y0, Y5
	VPMULHW      Y11, Y1, Y11
	VPMULHW      Y12, Y1, Y12
	VPMULHW      Y13, Y1, Y13
	VPMULHW      Y14, Y1, Y14
	VPMULHW      Y2, Y15, Y2
	VPMULHW      Y3, Y15, Y3
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPSUBW       Y2, Y11, Y2
	VPSUBW       Y3, Y12, Y3
	VPSUBW       Y4, Y13, Y4
	VPSUBW       Y5, Y14, Y5
	VPSUBW       Y2, Y7, Y11
	VPSUBW       Y3, Y8, Y12
	VPSUBW       Y4, Y9, Y13
	VPSUBW       Y5, Y10, Y14
	VPADDW       Y2, Y7, Y7
	VPADDW       Y3, Y8, Y8
	VPADDW       Y4, Y9, Y9
	VPADDW       Y5, Y10, Y10
	VMOVDQU      Y7, (AX)
	VMOVDQU      Y8, 32(AX)
	VMOVDQU      Y9, 64(AX)
	VMOVDQU      Y10, 96(AX)
	VMOVDQU      Y11, 256(AX)
	VMOVDQU      Y12, 288(AX)
	VMOVDQU      Y13, 320(AX)
	VMOVDQU      Y14, 352(AX)

	// Layer 1, part 2: bytes 128..255 against bytes 384..511, same table pair
	// (Y0/Y1 are still live from part 1).
	VMOVDQU      128(AX), Y7
	VMOVDQU      160(AX), Y8
	VMOVDQU      192(AX), Y9
	VMOVDQU      224(AX), Y10
	VMOVDQU      384(AX), Y11
	VMOVDQU      416(AX), Y12
	VMOVDQU      448(AX), Y13
	VMOVDQU      480(AX), Y14
	VPMULLW      Y11, Y0, Y2
	VPMULLW      Y12, Y0, Y3
	VPMULLW      Y13, Y0, Y4
	VPMULLW      Y14, Y0, Y5
	VPMULHW      Y11, Y1, Y11
	VPMULHW      Y12, Y1, Y12
	VPMULHW      Y13, Y1, Y13
	VPMULHW      Y14, Y1, Y14
	VPMULHW      Y2, Y15, Y2
	VPMULHW      Y3, Y15, Y3
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPSUBW       Y2, Y11, Y2
	VPSUBW       Y3, Y12, Y3
	VPSUBW       Y4, Y13, Y4
	VPSUBW       Y5, Y14, Y5
	VPSUBW       Y2, Y7, Y11
	VPSUBW       Y3, Y8, Y12
	VPSUBW       Y4, Y9, Y13
	VPSUBW       Y5, Y10, Y14
	VPADDW       Y2, Y7, Y7
	VPADDW       Y3, Y8, Y8
	VPADDW       Y4, Y9, Y9
	VPADDW       Y5, Y10, Y10
	VMOVDQU      Y7, 128(AX)
	VMOVDQU      Y8, 160(AX)
	VMOVDQU      Y9, 192(AX)
	VMOVDQU      Y10, 224(AX)
	VMOVDQU      Y11, 384(AX)
	VMOVDQU      Y12, 416(AX)
	VMOVDQU      Y13, 448(AX)
	VMOVDQU      Y14, 480(AX)

	// ---- First half of the array: bytes 0..255, kept in Y7..Y14 ----
	// Layer 2: Y7..Y10 against Y11..Y14, one broadcast table pair
	// (table bytes 4 and 6).
	VPBROADCASTW 4(CX), Y0
	VPBROADCASTW 6(CX), Y1
	VMOVDQU      (AX), Y7
	VMOVDQU      32(AX), Y8
	VMOVDQU      64(AX), Y9
	VMOVDQU      96(AX), Y10
	VMOVDQU      128(AX), Y11
	VMOVDQU      160(AX), Y12
	VMOVDQU      192(AX), Y13
	VMOVDQU      224(AX), Y14
	VPMULLW      Y11, Y0, Y2
	VPMULLW      Y12, Y0, Y3
	VPMULLW      Y13, Y0, Y4
	VPMULLW      Y14, Y0, Y5
	VPMULHW      Y11, Y1, Y11
	VPMULHW      Y12, Y1, Y12
	VPMULHW      Y13, Y1, Y13
	VPMULHW      Y14, Y1, Y14
	VPMULHW      Y2, Y15, Y2
	VPMULHW      Y3, Y15, Y3
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPSUBW       Y2, Y11, Y2
	VPSUBW       Y3, Y12, Y3
	VPSUBW       Y4, Y13, Y4
	VPSUBW       Y5, Y14, Y5
	VPSUBW       Y2, Y7, Y11
	VPSUBW       Y3, Y8, Y12
	VPSUBW       Y4, Y9, Y13
	VPSUBW       Y5, Y10, Y14
	VPADDW       Y2, Y7, Y7
	VPADDW       Y3, Y8, Y8
	VPADDW       Y4, Y9, Y9
	VPADDW       Y5, Y10, Y10

	// Layer 3: (Y7,Y8) against (Y9,Y10) with table bytes 12/14, and
	// (Y11,Y12) against (Y13,Y14) with table bytes 16/18.
	// Y0 is reused as a temporary once its table value is no longer needed.
	VPBROADCASTW 12(CX), Y0
	VPBROADCASTW 14(CX), Y1
	VPBROADCASTW 16(CX), Y2
	VPBROADCASTW 18(CX), Y3
	VPMULLW      Y9, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y13, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y9, Y1, Y9
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y13, Y3, Y13
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y9, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y13, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y9
	VPSUBW       Y5, Y8, Y10
	VPSUBW       Y6, Y11, Y13
	VPSUBW       Y0, Y12, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y8, Y8
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y12, Y12

	// Layer 4: per-lane table vectors from table bytes 32..159.
	VMOVDQU      32(CX), Y0
	VMOVDQU      64(CX), Y1
	VMOVDQU      96(CX), Y2
	VMOVDQU      128(CX), Y3

	// Exchange 128-bit halves within the pairs (Y7,Y9), (Y8,Y10), (Y11,Y13),
	// (Y12,Y14): the first register gets both low halves, the second both high.
	VPERM2I128   $0x20, Y9, Y7, Y4
	VPERM2I128   $0x31, Y9, Y7, Y9
	VMOVDQA      Y4, Y7
	VPERM2I128   $0x20, Y10, Y8, Y4
	VPERM2I128   $0x31, Y10, Y8, Y10
	VMOVDQA      Y4, Y8
	VPERM2I128   $0x20, Y13, Y11, Y4
	VPERM2I128   $0x31, Y13, Y11, Y13
	VMOVDQA      Y4, Y11
	VPERM2I128   $0x20, Y14, Y12, Y4
	VPERM2I128   $0x31, Y14, Y12, Y14
	VMOVDQA      Y4, Y12

	// Butterflies on (Y7,Y8), (Y9,Y10), (Y11,Y12), (Y13,Y14).
	VPMULLW      Y8, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y12, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y8, Y1, Y8
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y12, Y3, Y12
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y8, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y12, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y8
	VPSUBW       Y5, Y9, Y10
	VPSUBW       Y6, Y11, Y12
	VPSUBW       Y0, Y13, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y9, Y9
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y13, Y13

	// Layer 5: per-lane table vectors from table bytes 288..415.
	VMOVDQU      288(CX), Y0
	VMOVDQU      320(CX), Y1
	VMOVDQU      352(CX), Y2
	VMOVDQU      384(CX), Y3

	// Interleave 64-bit quadwords within the pairs (Y7,Y8), (Y9,Y10),
	// (Y11,Y12), (Y13,Y14).
	VPUNPCKLQDQ  Y8, Y7, Y4
	VPUNPCKHQDQ  Y8, Y7, Y8
	VMOVDQA      Y4, Y7
	VPUNPCKLQDQ  Y10, Y9, Y4
	VPUNPCKHQDQ  Y10, Y9, Y10
	VMOVDQA      Y4, Y9
	VPUNPCKLQDQ  Y12, Y11, Y4
	VPUNPCKHQDQ  Y12, Y11, Y12
	VMOVDQA      Y4, Y11
	VPUNPCKLQDQ  Y14, Y13, Y4
	VPUNPCKHQDQ  Y14, Y13, Y14
	VMOVDQA      Y4, Y13

	// Butterflies on (Y7,Y9), (Y8,Y10), (Y11,Y13), (Y12,Y14).
	VPMULLW      Y9, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y13, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y9, Y1, Y9
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y13, Y3, Y13
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y9, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y13, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y9
	VPSUBW       Y5, Y8, Y10
	VPSUBW       Y6, Y11, Y13
	VPSUBW       Y0, Y12, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y8, Y8
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y12, Y12

	// Layer 6: per-lane table vectors from table bytes 544..671.
	VMOVDQU      544(CX), Y0
	VMOVDQU      576(CX), Y1
	VMOVDQU      608(CX), Y2
	VMOVDQU      640(CX), Y3

	// Interleave 32-bit doublewords within the pairs (Y7,Y9), (Y8,Y10),
	// (Y11,Y13), (Y12,Y14).
	VMOVSLDUP    Y9, Y4
	VPBLENDD     $0xaa, Y4, Y7, Y4
	VPSRLQ       $0x20, Y7, Y7
	VPBLENDD     $0xaa, Y9, Y7, Y9
	VMOVDQA      Y4, Y7
	VMOVSLDUP    Y10, Y4
	VPBLENDD     $0xaa, Y4, Y8, Y4
	VPSRLQ       $0x20, Y8, Y8
	VPBLENDD     $0xaa, Y10, Y8, Y10
	VMOVDQA      Y4, Y8
	VMOVSLDUP    Y13, Y4
	VPBLENDD     $0xaa, Y4, Y11, Y4
	VPSRLQ       $0x20, Y11, Y11
	VPBLENDD     $0xaa, Y13, Y11, Y13
	VMOVDQA      Y4, Y11
	VMOVSLDUP    Y14, Y4
	VPBLENDD     $0xaa, Y4, Y12, Y4
	VPSRLQ       $0x20, Y12, Y12
	VPBLENDD     $0xaa, Y14, Y12, Y14
	VMOVDQA      Y4, Y12

	// Butterflies on (Y7,Y8), (Y9,Y10), (Y11,Y12), (Y13,Y14).
	VPMULLW      Y8, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y12, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y8, Y1, Y8
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y12, Y3, Y12
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y8, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y12, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y8
	VPSUBW       Y5, Y9, Y10
	VPSUBW       Y6, Y11, Y12
	VPSUBW       Y0, Y13, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y9, Y9
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y13, Y13

	// Layer 7: per-lane table vectors from table bytes 800..927.
	VMOVDQU      800(CX), Y0
	VMOVDQU      832(CX), Y1
	VMOVDQU      864(CX), Y2
	VMOVDQU      896(CX), Y3

	// Interleave 16-bit words within the pairs (Y7,Y8), (Y9,Y10), (Y11,Y12),
	// (Y13,Y14).
	VPSLLD       $0x10, Y8, Y4
	VPBLENDW     $0xaa, Y4, Y7, Y4
	VPSRLD       $0x10, Y7, Y7
	VPBLENDW     $0xaa, Y8, Y7, Y8
	VMOVDQA      Y4, Y7
	VPSLLD       $0x10, Y10, Y4
	VPBLENDW     $0xaa, Y4, Y9, Y4
	VPSRLD       $0x10, Y9, Y9
	VPBLENDW     $0xaa, Y10, Y9, Y10
	VMOVDQA      Y4, Y9
	VPSLLD       $0x10, Y12, Y4
	VPBLENDW     $0xaa, Y4, Y11, Y4
	VPSRLD       $0x10, Y11, Y11
	VPBLENDW     $0xaa, Y12, Y11, Y12
	VMOVDQA      Y4, Y11
	VPSLLD       $0x10, Y14, Y4
	VPBLENDW     $0xaa, Y4, Y13, Y4
	VPSRLD       $0x10, Y13, Y13
	VPBLENDW     $0xaa, Y14, Y13, Y14
	VMOVDQA      Y4, Y13

	// Butterflies on (Y7,Y9), (Y8,Y10), (Y11,Y13), (Y12,Y14).
	VPMULLW      Y9, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y13, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y9, Y1, Y9
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y13, Y3, Y13
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y9, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y13, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y9
	VPSUBW       Y5, Y8, Y10
	VPSUBW       Y6, Y11, Y13
	VPSUBW       Y0, Y12, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y8, Y8
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y12, Y12

	// Write the first half back (registers are stored as-is, still shuffled).
	VMOVDQU      Y7, (AX)
	VMOVDQU      Y8, 32(AX)
	VMOVDQU      Y9, 64(AX)
	VMOVDQU      Y10, 96(AX)
	VMOVDQU      Y11, 128(AX)
	VMOVDQU      Y12, 160(AX)
	VMOVDQU      Y13, 192(AX)
	VMOVDQU      Y14, 224(AX)

	// ---- Second half of the array: bytes 256..511, kept in Y7..Y14 ----
	// Layer 2: Y7..Y10 against Y11..Y14, one broadcast table pair
	// (table bytes 8 and 10).
	VPBROADCASTW 8(CX), Y0
	VPBROADCASTW 10(CX), Y1
	VMOVDQU      256(AX), Y7
	VMOVDQU      288(AX), Y8
	VMOVDQU      320(AX), Y9
	VMOVDQU      352(AX), Y10
	VMOVDQU      384(AX), Y11
	VMOVDQU      416(AX), Y12
	VMOVDQU      448(AX), Y13
	VMOVDQU      480(AX), Y14
	VPMULLW      Y11, Y0, Y2
	VPMULLW      Y12, Y0, Y3
	VPMULLW      Y13, Y0, Y4
	VPMULLW      Y14, Y0, Y5
	VPMULHW      Y11, Y1, Y11
	VPMULHW      Y12, Y1, Y12
	VPMULHW      Y13, Y1, Y13
	VPMULHW      Y14, Y1, Y14
	VPMULHW      Y2, Y15, Y2
	VPMULHW      Y3, Y15, Y3
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPSUBW       Y2, Y11, Y2
	VPSUBW       Y3, Y12, Y3
	VPSUBW       Y4, Y13, Y4
	VPSUBW       Y5, Y14, Y5
	VPSUBW       Y2, Y7, Y11
	VPSUBW       Y3, Y8, Y12
	VPSUBW       Y4, Y9, Y13
	VPSUBW       Y5, Y10, Y14
	VPADDW       Y2, Y7, Y7
	VPADDW       Y3, Y8, Y8
	VPADDW       Y4, Y9, Y9
	VPADDW       Y5, Y10, Y10

	// Layer 3: (Y7,Y8) against (Y9,Y10) with table bytes 20/22, and
	// (Y11,Y12) against (Y13,Y14) with table bytes 24/26.
	VPBROADCASTW 20(CX), Y0
	VPBROADCASTW 22(CX), Y1
	VPBROADCASTW 24(CX), Y2
	VPBROADCASTW 26(CX), Y3
	VPMULLW      Y9, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y13, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y9, Y1, Y9
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y13, Y3, Y13
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y9, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y13, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y9
	VPSUBW       Y5, Y8, Y10
	VPSUBW       Y6, Y11, Y13
	VPSUBW       Y0, Y12, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y8, Y8
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y12, Y12

	// Layer 4: per-lane table vectors from table bytes 160..287.
	VMOVDQU      160(CX), Y0
	VMOVDQU      192(CX), Y1
	VMOVDQU      224(CX), Y2
	VMOVDQU      256(CX), Y3

	// Exchange 128-bit halves within the pairs (Y7,Y9), (Y8,Y10), (Y11,Y13),
	// (Y12,Y14).
	VPERM2I128   $0x20, Y9, Y7, Y4
	VPERM2I128   $0x31, Y9, Y7, Y9
	VMOVDQA      Y4, Y7
	VPERM2I128   $0x20, Y10, Y8, Y4
	VPERM2I128   $0x31, Y10, Y8, Y10
	VMOVDQA      Y4, Y8
	VPERM2I128   $0x20, Y13, Y11, Y4
	VPERM2I128   $0x31, Y13, Y11, Y13
	VMOVDQA      Y4, Y11
	VPERM2I128   $0x20, Y14, Y12, Y4
	VPERM2I128   $0x31, Y14, Y12, Y14
	VMOVDQA      Y4, Y12

	// Butterflies on (Y7,Y8), (Y9,Y10), (Y11,Y12), (Y13,Y14).
	VPMULLW      Y8, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y12, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y8, Y1, Y8
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y12, Y3, Y12
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y8, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y12, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y8
	VPSUBW       Y5, Y9, Y10
	VPSUBW       Y6, Y11, Y12
	VPSUBW       Y0, Y13, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y9, Y9
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y13, Y13

	// Layer 5: per-lane table vectors from table bytes 416..543.
	VMOVDQU      416(CX), Y0
	VMOVDQU      448(CX), Y1
	VMOVDQU      480(CX), Y2
	VMOVDQU      512(CX), Y3

	// Interleave 64-bit quadwords within the pairs (Y7,Y8), (Y9,Y10),
	// (Y11,Y12), (Y13,Y14).
	VPUNPCKLQDQ  Y8, Y7, Y4
	VPUNPCKHQDQ  Y8, Y7, Y8
	VMOVDQA      Y4, Y7
	VPUNPCKLQDQ  Y10, Y9, Y4
	VPUNPCKHQDQ  Y10, Y9, Y10
	VMOVDQA      Y4, Y9
	VPUNPCKLQDQ  Y12, Y11, Y4
	VPUNPCKHQDQ  Y12, Y11, Y12
	VMOVDQA      Y4, Y11
	VPUNPCKLQDQ  Y14, Y13, Y4
	VPUNPCKHQDQ  Y14, Y13, Y14
	VMOVDQA      Y4, Y13

	// Butterflies on (Y7,Y9), (Y8,Y10), (Y11,Y13), (Y12,Y14).
	VPMULLW      Y9, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y13, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y9, Y1, Y9
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y13, Y3, Y13
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y9, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y13, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y9
	VPSUBW       Y5, Y8, Y10
	VPSUBW       Y6, Y11, Y13
	VPSUBW       Y0, Y12, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y8, Y8
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y12, Y12

	// Layer 6: per-lane table vectors from table bytes 672..799.
	VMOVDQU      672(CX), Y0
	VMOVDQU      704(CX), Y1
	VMOVDQU      736(CX), Y2
	VMOVDQU      768(CX), Y3

	// Interleave 32-bit doublewords within the pairs (Y7,Y9), (Y8,Y10),
	// (Y11,Y13), (Y12,Y14).
	VMOVSLDUP    Y9, Y4
	VPBLENDD     $0xaa, Y4, Y7, Y4
	VPSRLQ       $0x20, Y7, Y7
	VPBLENDD     $0xaa, Y9, Y7, Y9
	VMOVDQA      Y4, Y7
	VMOVSLDUP    Y10, Y4
	VPBLENDD     $0xaa, Y4, Y8, Y4
	VPSRLQ       $0x20, Y8, Y8
	VPBLENDD     $0xaa, Y10, Y8, Y10
	VMOVDQA      Y4, Y8
	VMOVSLDUP    Y13, Y4
	VPBLENDD     $0xaa, Y4, Y11, Y4
	VPSRLQ       $0x20, Y11, Y11
	VPBLENDD     $0xaa, Y13, Y11, Y13
	VMOVDQA      Y4, Y11
	VMOVSLDUP    Y14, Y4
	VPBLENDD     $0xaa, Y4, Y12, Y4
	VPSRLQ       $0x20, Y12, Y12
	VPBLENDD     $0xaa, Y14, Y12, Y14
	VMOVDQA      Y4, Y12

	// Butterflies on (Y7,Y8), (Y9,Y10), (Y11,Y12), (Y13,Y14).
	VPMULLW      Y8, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y12, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y8, Y1, Y8
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y12, Y3, Y12
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y8, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y12, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y8
	VPSUBW       Y5, Y9, Y10
	VPSUBW       Y6, Y11, Y12
	VPSUBW       Y0, Y13, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y9, Y9
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y13, Y13

	// Layer 7: per-lane table vectors from table bytes 928..1055.
	VMOVDQU      928(CX), Y0
	VMOVDQU      960(CX), Y1
	VMOVDQU      992(CX), Y2
	VMOVDQU      1024(CX), Y3

	// Interleave 16-bit words within the pairs (Y7,Y8), (Y9,Y10), (Y11,Y12),
	// (Y13,Y14).
	VPSLLD       $0x10, Y8, Y4
	VPBLENDW     $0xaa, Y4, Y7, Y4
	VPSRLD       $0x10, Y7, Y7
	VPBLENDW     $0xaa, Y8, Y7, Y8
	VMOVDQA      Y4, Y7
	VPSLLD       $0x10, Y10, Y4
	VPBLENDW     $0xaa, Y4, Y9, Y4
	VPSRLD       $0x10, Y9, Y9
	VPBLENDW     $0xaa, Y10, Y9, Y10
	VMOVDQA      Y4, Y9
	VPSLLD       $0x10, Y12, Y4
	VPBLENDW     $0xaa, Y4, Y11, Y4
	VPSRLD       $0x10, Y11, Y11
	VPBLENDW     $0xaa, Y12, Y11, Y12
	VMOVDQA      Y4, Y11
	VPSLLD       $0x10, Y14, Y4
	VPBLENDW     $0xaa, Y4, Y13, Y4
	VPSRLD       $0x10, Y13, Y13
	VPBLENDW     $0xaa, Y14, Y13, Y14
	VMOVDQA      Y4, Y13

	// Butterflies on (Y7,Y9), (Y8,Y10), (Y11,Y13), (Y12,Y14).
	VPMULLW      Y9, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULLW      Y13, Y2, Y6
	VPMULLW      Y14, Y2, Y0
	VPMULHW      Y9, Y1, Y9
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y13, Y3, Y13
	VPMULHW      Y14, Y3, Y14
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPMULHW      Y6, Y15, Y6
	VPMULHW      Y0, Y15, Y0
	VPSUBW       Y4, Y9, Y4
	VPSUBW       Y5, Y10, Y5
	VPSUBW       Y6, Y13, Y6
	VPSUBW       Y0, Y14, Y0
	VPSUBW       Y4, Y7, Y9
	VPSUBW       Y5, Y8, Y10
	VPSUBW       Y6, Y11, Y13
	VPSUBW       Y0, Y12, Y14
	VPADDW       Y4, Y7, Y7
	VPADDW       Y5, Y8, Y8
	VPADDW       Y6, Y11, Y11
	VPADDW       Y0, Y12, Y12

	// Write the second half back (registers are stored as-is, still shuffled).
	VMOVDQU      Y7, 256(AX)
	VMOVDQU      Y8, 288(AX)
	VMOVDQU      Y9, 320(AX)
	VMOVDQU      Y10, 352(AX)
	VMOVDQU      Y11, 384(AX)
	VMOVDQU      Y12, 416(AX)
	VMOVDQU      Y13, 448(AX)
	VMOVDQU      Y14, 480(AX)

	// Clear the upper YMM halves before returning so that legacy-encoded SSE
	// instructions in the caller do not run with dirty upper state
	// (AVX/SSE transition penalty or false dependency, depending on the CPU).
	VZEROUPPER
	RET

// func invNttAVX2(p *[256]int16)
// Requires: AVX, AVX2
TEXT ·invNttAVX2(SB), NOSPLIT, $0-8
	MOVQ         p+0(FP), AX
	LEAQ         ·ZetasAVX2+0(SB), CX
	MOVL         $0x00000d01, DX
	VMOVD        DX, X0
	VPBROADCASTW X0, Y15
	VMOVDQU      (AX), Y7
	VMOVDQU      32(AX), Y8
	VMOVDQU      64(AX), Y9
	VMOVDQU      96(AX), Y10
	VMOVDQU      128(AX), Y11
	VMOVDQU      160(AX), Y12
	VMOVDQU      192(AX), Y13
	VMOVDQU      224(AX), Y14
	VMOVDQU      1056(CX), Y0
	VMOVDQU      1088(CX), Y1
	VMOVDQU      1120(CX), Y2
	VMOVDQU      1152(CX), Y3
	VPSUBW       Y7, Y9, Y4
	VPSUBW       Y8, Y10, Y5
	VPSUBW       Y11, Y13, Y6
	VPADDW       Y7, Y9, Y7
	VPADDW       Y8, Y10, Y8
	VPADDW       Y11, Y13, Y11
	VPMULLW      Y4, Y0, Y9
	VPMULLW      Y5, Y0, Y10
	VPSUBW       Y12, Y14, Y0
	VPMULLW      Y6, Y2, Y13
	VPADDW       Y12, Y14, Y12
	VPMULLW      Y0, Y2, Y14
	VPMULHW      Y4, Y1, Y4
	VPMULHW      Y5, Y1, Y5
	VPMULHW      Y6, Y3, Y6
	VPMULHW      Y0, Y3, Y0
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y13, Y15, Y13
	VPMULHW      Y14, Y15, Y14
	VPSUBW       Y9, Y4, Y9
	VPSUBW       Y10, Y5, Y10
	VPSUBW       Y13, Y6, Y13
	VPSUBW       Y14, Y0, Y14
	VMOVDQU      1312(CX), Y0
	VMOVDQU      1344(CX), Y1
	VMOVDQU      1376(CX), Y2
	VMOVDQU      1408(CX), Y3
	VPSLLD       $0x10, Y8, Y4
	VPBLENDW     $0xaa, Y4, Y7, Y4
	VPSRLD       $0x10, Y7, Y7
	VPBLENDW     $0xaa, Y8, Y7, Y8
	VMOVDQA      Y4, Y7
	VPSLLD       $0x10, Y10, Y4
	VPBLENDW     $0xaa, Y4, Y9, Y4
	VPSRLD       $0x10, Y9, Y9
	VPBLENDW     $0xaa, Y10, Y9, Y10
	VMOVDQA      Y4, Y9
	VPSLLD       $0x10, Y12, Y4
	VPBLENDW     $0xaa, Y4, Y11, Y4
	VPSRLD       $0x10, Y11, Y11
	VPBLENDW     $0xaa, Y12, Y11, Y12
	VMOVDQA      Y4, Y11
	VPSLLD       $0x10, Y14, Y4
	VPBLENDW     $0xaa, Y4, Y13, Y4
	VPSRLD       $0x10, Y13, Y13
	VPBLENDW     $0xaa, Y14, Y13, Y14
	VMOVDQA      Y4, Y13
	VPSUBW       Y7, Y8, Y4
	VPSUBW       Y9, Y10, Y5
	VPSUBW       Y11, Y12, Y6
	VPADDW       Y7, Y8, Y7
	VPADDW       Y9, Y10, Y9
	VPADDW       Y11, Y12, Y11
	VPMULLW      Y4, Y0, Y8
	VPMULLW      Y5, Y0, Y10
	VPSUBW       Y13, Y14, Y0
	VPMULLW      Y6, Y2, Y12
	VPADDW       Y13, Y14, Y13
	VPMULLW      Y0, Y2, Y14
	VPMULHW      Y4, Y1, Y4
	VPMULHW      Y5, Y1, Y5
	VPMULHW      Y6, Y3, Y6
	VPMULHW      Y0, Y3, Y0
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y12, Y15, Y12
	VPMULHW      Y14, Y15, Y14
	VPSUBW       Y8, Y4, Y8
	VPSUBW       Y10, Y5, Y10
	VPSUBW       Y12, Y6, Y12
	VPSUBW       Y14, Y0, Y14
	VMOVDQU      1568(CX), Y0
	VMOVDQU      1600(CX), Y1
	VMOVDQU      1632(CX), Y2
	VMOVDQU      1664(CX), Y3
	VMOVSLDUP    Y9, Y4
	VPBLENDD     $0xaa, Y4, Y7, Y4
	VPSRLQ       $0x20, Y7, Y7
	VPBLENDD     $0xaa, Y9, Y7, Y9
	VMOVDQA      Y4, Y7
	VMOVSLDUP    Y10, Y4
	VPBLENDD     $0xaa, Y4, Y8, Y4
	VPSRLQ       $0x20, Y8, Y8
	VPBLENDD     $0xaa, Y10, Y8, Y10
	VMOVDQA      Y4, Y8
	VMOVSLDUP    Y13, Y4
	VPBLENDD     $0xaa, Y4, Y11, Y4
	VPSRLQ       $0x20, Y11, Y11
	VPBLENDD     $0xaa, Y13, Y11, Y13
	VMOVDQA      Y4, Y11
	VMOVSLDUP    Y14, Y4
	VPBLENDD     $0xaa, Y4, Y12, Y4
	VPSRLQ       $0x20, Y12, Y12
	VPBLENDD     $0xaa, Y14, Y12, Y14
	VMOVDQA      Y4, Y12
	VPSUBW       Y7, Y9, Y4
	VPSUBW       Y8, Y10, Y5
	VPSUBW       Y11, Y13, Y6
	VPADDW       Y7, Y9, Y7
	VPADDW       Y8, Y10, Y8
	VPADDW       Y11, Y13, Y11
	VPMULLW      Y4, Y0, Y9
	VPMULLW      Y5, Y0, Y10
	VPSUBW       Y12, Y14, Y0
	VPMULLW      Y6, Y2, Y13
	VPADDW       Y12, Y14, Y12
	VPMULLW      Y0, Y2, Y14
	VPMULHW      Y4, Y1, Y4
	VPMULHW      Y5, Y1, Y5
	VPMULHW      Y6, Y3, Y6
	VPMULHW      Y0, Y3, Y0
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y13, Y15, Y13
	VPMULHW      Y14, Y15, Y14
	VPSUBW       Y9, Y4, Y9
	VPSUBW       Y10, Y5, Y10
	VPSUBW       Y13, Y6, Y13
	VPSUBW       Y14, Y0, Y14
	MOVL         $0x00004ebf, DX
	VMOVD        DX, X0
	VPBROADCASTW X0, Y4
	VPMULHW      Y4, Y7, Y5
	VPSRAW       $0x0a, Y5, Y5
	VPMULLW      Y15, Y5, Y5
	VPSUBW       Y5, Y7, Y7
	VPMULHW      Y4, Y11, Y5
	VPSRAW       $0x0a, Y5, Y5
	VPMULLW      Y15, Y5, Y5
	VPSUBW       Y5, Y11, Y11
	VMOVDQU      1824(CX), Y0
	VMOVDQU      1856(CX), Y1
	VMOVDQU      1888(CX), Y2
	VMOVDQU      1920(CX), Y3
	VPUNPCKLQDQ  Y8, Y7, Y4
	VPUNPCKHQDQ  Y8, Y7, Y8
	VMOVDQA      Y4, Y7
	VPUNPCKLQDQ  Y10, Y9, Y4
	VPUNPCKHQDQ  Y10, Y9, Y10
	VMOVDQA      Y4, Y9
	VPUNPCKLQDQ  Y12, Y11, Y4
	VPUNPCKHQDQ  Y12, Y11, Y12
	VMOVDQA      Y4, Y11
	VPUNPCKLQDQ  Y14, Y13, Y4
	VPUNPCKHQDQ  Y14, Y13, Y14
	VMOVDQA      Y4, Y13
	VPSUBW       Y7, Y8, Y4
	VPSUBW       Y9, Y10, Y5
	VPSUBW       Y11, Y12, Y6
	VPADDW       Y7, Y8, Y7
	VPADDW       Y9, Y10, Y9
	VPADDW       Y11, Y12, Y11
	VPMULLW      Y4, Y0, Y8
	VPMULLW      Y5, Y0, Y10
	VPSUBW       Y13, Y14, Y0
	VPMULLW      Y6, Y2, Y12
	VPADDW       Y13, Y14, Y13
	VPMULLW      Y0, Y2, Y14
	VPMULHW      Y4, Y1, Y4
	VPMULHW      Y5, Y1, Y5
	VPMULHW      Y6, Y3, Y6
	VPMULHW      Y0, Y3, Y0
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y12, Y15, Y12
	VPMULHW      Y14, Y15, Y14
	VPSUBW       Y8, Y4, Y8
	VPSUBW       Y10, Y5, Y10
	VPSUBW       Y12, Y6, Y12
	VPSUBW       Y14, Y0, Y14
	VPBROADCASTW 2080(CX), Y0
	VPBROADCASTW 2082(CX), Y1
	VPBROADCASTW 2084(CX), Y2
	VPBROADCASTW 2086(CX), Y3
	VPERM2I128   $0x20, Y9, Y7, Y4
	VPERM2I128   $0x31, Y9, Y7, Y9
	VMOVDQA      Y4, Y7
	VPERM2I128   $0x20, Y10, Y8, Y4
	VPERM2I128   $0x31, Y10, Y8, Y10
	VMOVDQA      Y4, Y8
	VPERM2I128   $0x20, Y13, Y11, Y4
	VPERM2I128   $0x31, Y13, Y11, Y13
	VMOVDQA      Y4, Y11
	VPERM2I128   $0x20, Y14, Y12, Y4
	VPERM2I128   $0x31, Y14, Y12, Y14
	VMOVDQA      Y4, Y12
	VPSUBW       Y7, Y9, Y4
	VPSUBW       Y8, Y10, Y5
	VPSUBW       Y11, Y13, Y6
	VPADDW       Y7, Y9, Y7
	VPADDW       Y8, Y10, Y8
	VPADDW       Y11, Y13, Y11
	VPMULLW      Y4, Y0, Y9
	VPMULLW      Y5, Y0, Y10
	VPSUBW       Y12, Y14, Y0
	VPMULLW      Y6, Y2, Y13
	VPADDW       Y12, Y14, Y12
	VPMULLW      Y0, Y2, Y14
	VPMULHW      Y4, Y1, Y4
	VPMULHW      Y5, Y1, Y5
	VPMULHW      Y6, Y3, Y6
	VPMULHW      Y0, Y3, Y0
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y13, Y15, Y13
	VPMULHW      Y14, Y15, Y14
	VPSUBW       Y9, Y4, Y9
	VPSUBW       Y10, Y5, Y10
	VPSUBW       Y13, Y6, Y13
	VPSUBW       Y14, Y0, Y14
	MOVL         $0x00004ebf, DX
	VMOVD        DX, X0
	VPBROADCASTW X0, Y4
	VPMULHW      Y4, Y7, Y5
	VPSRAW       $0x0a, Y5, Y5
	VPMULLW      Y15, Y5, Y5
	VPSUBW       Y5, Y7, Y7
	VPMULHW      Y4, Y11, Y5
	VPSRAW       $0x0a, Y5, Y5
	VPMULLW      Y15, Y5, Y5
	VPSUBW       Y5, Y11, Y11
	VPBROADCASTW 2096(CX), Y0
	VPBROADCASTW 2098(CX), Y1
	VPSUBW       Y7, Y11, Y4
	VPSUBW       Y8, Y12, Y5
	VPSUBW       Y9, Y13, Y6
	VPADDW       Y7, Y11, Y7
	VPADDW       Y8, Y12, Y8
	VPADDW       Y9, Y13, Y9
	VPMULLW      Y4, Y0, Y11
	VPMULLW      Y5, Y0, Y12
	VPSUBW       Y10, Y14, Y2
	VPMULLW      Y6, Y0, Y13
	VPADDW       Y10, Y14, Y10
	VPMULLW      Y2, Y0, Y14
	VPMULHW      Y4, Y1, Y4
	VPMULHW      Y5, Y1, Y5
	VPMULHW      Y6, Y1, Y6
	VPMULHW      Y2, Y1, Y2
	VPMULHW      Y11, Y15, Y11
	VPMULHW      Y12, Y15, Y12
	VPMULHW      Y13, Y15, Y13
	VPMULHW      Y14, Y15, Y14
	VPSUBW       Y11, Y4, Y11
	VPSUBW       Y12, Y5, Y12
	VPSUBW       Y13, Y6, Y13
	VPSUBW       Y14, Y2, Y14
	VMOVDQU      Y7, (AX)
	VMOVDQU      Y8, 32(AX)
	VMOVDQU      Y9, 64(AX)
	VMOVDQU      Y10, 96(AX)
	VMOVDQU      Y11, 128(AX)
	VMOVDQU      Y12, 160(AX)
	VMOVDQU      Y13, 192(AX)
	VMOVDQU      Y14, 224(AX)
	VMOVDQU      256(AX), Y7
	VMOVDQU      288(AX), Y8
	VMOVDQU      320(AX), Y9
	VMOVDQU      352(AX), Y10
	VMOVDQU      384(AX), Y11
	VMOVDQU      416(AX), Y12
	VMOVDQU      448(AX), Y13
	VMOVDQU      480(AX), Y14
	VMOVDQU      1184(CX), Y0
	VMOVDQU      1216(CX), Y1
	VMOVDQU      1248(CX), Y2
	VMOVDQU      1280(CX), Y3
	VPSUBW       Y7, Y9, Y4
	VPSUBW       Y8, Y10, Y5
	VPSUBW       Y11, Y13, Y6
	VPADDW       Y7, Y9, Y7
	VPADDW       Y8, Y10, Y8
	VPADDW       Y11, Y13, Y11
	VPMULLW      Y4, Y0, Y9
	VPMULLW      Y5, Y0, Y10
	VPSUBW       Y12, Y14, Y0
	VPMULLW      Y6, Y2, Y13
	VPADDW       Y12, Y14, Y12
	VPMULLW      Y0, Y2, Y14
	VPMULHW      Y4, Y1, Y4
	VPMULHW      Y5, Y1, Y5
	VPMULHW      Y6, Y3, Y6
	VPMULHW      Y0, Y3, Y0
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y13, Y15, Y13
	VPMULHW      Y14, Y15, Y14
	VPSUBW       Y9, Y4, Y9
	VPSUBW       Y10, Y5, Y10
	VPSUBW       Y13, Y6, Y13
	VPSUBW       Y14, Y0, Y14
	VMOVDQU      1440(CX), Y0
	VMOVDQU      1472(CX), Y1
	VMOVDQU      1504(CX), Y2
	VMOVDQU      1536(CX), Y3
	VPSLLD       $0x10, Y8, Y4
	VPBLENDW     $0xaa, Y4, Y7, Y4
	VPSRLD       $0x10, Y7, Y7
	VPBLENDW     $0xaa, Y8, Y7, Y8
	VMOVDQA      Y4, Y7
	VPSLLD       $0x10, Y10, Y4
	VPBLENDW     $0xaa, Y4, Y9, Y4
	VPSRLD       $0x10, Y9, Y9
	VPBLENDW     $0xaa, Y10, Y9, Y10
	VMOVDQA      Y4, Y9
	VPSLLD       $0x10, Y12, Y4
	VPBLENDW     $0xaa, Y4, Y11, Y4
	VPSRLD       $0x10, Y11, Y11
	VPBLENDW     $0xaa, Y12, Y11, Y12
	VMOVDQA      Y4, Y11
	VPSLLD       $0x10, Y14, Y4
	VPBLENDW     $0xaa, Y4, Y13, Y4
	VPSRLD       $0x10, Y13, Y13
	VPBLENDW     $0xaa, Y14, Y13, Y14
	VMOVDQA      Y4, Y13
	VPSUBW       Y7, Y8, Y4
	VPSUBW       Y9, Y10, Y5
	VPSUBW       Y11, Y12, Y6
	VPADDW       Y7, Y8, Y7
	VPADDW       Y9, Y10, Y9
	VPADDW       Y11, Y12, Y11
	VPMULLW      Y4, Y0, Y8
	VPMULLW      Y5, Y0, Y10
	VPSUBW       Y13, Y14, Y0
	VPMULLW      Y6, Y2, Y12
	VPADDW       Y13, Y14, Y13
	VPMULLW      Y0, Y2, Y14
	VPMULHW      Y4, Y1, Y4
	VPMULHW      Y5, Y1, Y5
	VPMULHW      Y6, Y3, Y6
	VPMULHW      Y0, Y3, Y0
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y12, Y15, Y12
	VPMULHW      Y14, Y15, Y14
	VPSUBW       Y8, Y4, Y8
	VPSUBW       Y10, Y5, Y10
	VPSUBW       Y12, Y6, Y12
	VPSUBW       Y14, Y0, Y14
	VMOVDQU      1696(CX), Y0
	VMOVDQU      1728(CX), Y1
	VMOVDQU      1760(CX), Y2
	VMOVDQU      1792(CX), Y3
	VMOVSLDUP    Y9, Y4
	VPBLENDD     $0xaa, Y4, Y7, Y4
	VPSRLQ       $0x20, Y7, Y7
	VPBLENDD     $0xaa, Y9, Y7, Y9
	VMOVDQA      Y4, Y7
	VMOVSLDUP    Y10, Y4
	VPBLENDD     $0xaa, Y4, Y8, Y4
	VPSRLQ       $0x20, Y8, Y8
	VPBLENDD     $0xaa, Y10, Y8, Y10
	VMOVDQA      Y4, Y8
	VMOVSLDUP    Y13, Y4
	VPBLENDD     $0xaa, Y4, Y11, Y4
	VPSRLQ       $0x20, Y11, Y11
	VPBLENDD     $0xaa, Y13, Y11, Y13
	VMOVDQA      Y4, Y11
	VMOVSLDUP    Y14, Y4
	VPBLENDD     $0xaa, Y4, Y12, Y4
	VPSRLQ       $0x20, Y12, Y12
	VPBLENDD     $0xaa, Y14, Y12, Y14
	VMOVDQA      Y4, Y12
	VPSUBW       Y7, Y9, Y4
	VPSUBW       Y8, Y10, Y5
	VPSUBW       Y11, Y13, Y6
	VPADDW       Y7, Y9, Y7
	VPADDW       Y8, Y10, Y8
	VPADDW       Y11, Y13, Y11
	VPMULLW      Y4, Y0, Y9
	VPMULLW      Y5, Y0, Y10
	VPSUBW       Y12, Y14, Y0
	VPMULLW      Y6, Y2, Y13
	VPADDW       Y12, Y14, Y12
	VPMULLW      Y0, Y2, Y14
	VPMULHW      Y4, Y1, Y4
	VPMULHW      Y5, Y1, Y5
	VPMULHW      Y6, Y3, Y6
	VPMULHW      Y0, Y3, Y0
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y13, Y15, Y13
	VPMULHW      Y14, Y15, Y14
	VPSUBW       Y9, Y4, Y9
	VPSUBW       Y10, Y5, Y10
	VPSUBW       Y13, Y6, Y13
	VPSUBW       Y14, Y0, Y14
	MOVL         $0x00004ebf, DX
	VMOVD        DX, X0
	VPBROADCASTW X0, Y4
	VPMULHW      Y4, Y7, Y5
	VPSRAW       $0x0a, Y5, Y5
	VPMULLW      Y15, Y5, Y5
	VPSUBW       Y5, Y7, Y7
	VPMULHW      Y4, Y11, Y5
	VPSRAW       $0x0a, Y5, Y5
	VPMULLW      Y15, Y5, Y5
	VPSUBW       Y5, Y11, Y11
	VMOVDQU      1952(CX), Y0
	VMOVDQU      1984(CX), Y1
	VMOVDQU      2016(CX), Y2
	VMOVDQU      2048(CX), Y3
	VPUNPCKLQDQ  Y8, Y7, Y4
	VPUNPCKHQDQ  Y8, Y7, Y8
	VMOVDQA      Y4, Y7
	VPUNPCKLQDQ  Y10, Y9, Y4
	VPUNPCKHQDQ  Y10, Y9, Y10
	VMOVDQA      Y4, Y9
	VPUNPCKLQDQ  Y12, Y11, Y4
	VPUNPCKHQDQ  Y12, Y11, Y12
	VMOVDQA      Y4, Y11
	VPUNPCKLQDQ  Y14, Y13, Y4
	VPUNPCKHQDQ  Y14, Y13, Y14
	VMOVDQA      Y4, Y13
	VPSUBW       Y7, Y8, Y4
	VPSUBW       Y9, Y10, Y5
	VPSUBW       Y11, Y12, Y6
	VPADDW       Y7, Y8, Y7
	VPADDW       Y9, Y10, Y9
	VPADDW       Y11, Y12, Y11
	VPMULLW      Y4, Y0, Y8
	VPMULLW      Y5, Y0, Y10
	VPSUBW       Y13, Y14, Y0
	VPMULLW      Y6, Y2, Y12
	VPADDW       Y13, Y14, Y13
	VPMULLW      Y0, Y2, Y14
	VPMULHW      Y4, Y1, Y4
	VPMULHW      Y5, Y1, Y5
	VPMULHW      Y6, Y3, Y6
	VPMULHW      Y0, Y3, Y0
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y12, Y15, Y12
	VPMULHW      Y14, Y15, Y14
	VPSUBW       Y8, Y4, Y8
	VPSUBW       Y10, Y5, Y10
	VPSUBW       Y12, Y6, Y12
	VPSUBW       Y14, Y0, Y14
	VPBROADCASTW 2088(CX), Y0
	VPBROADCASTW 2090(CX), Y1
	VPBROADCASTW 2092(CX), Y2
	VPBROADCASTW 2094(CX), Y3
	VPERM2I128   $0x20, Y9, Y7, Y4
	VPERM2I128   $0x31, Y9, Y7, Y9
	VMOVDQA      Y4, Y7
	VPERM2I128   $0x20, Y10, Y8, Y4
	VPERM2I128   $0x31, Y10, Y8, Y10
	VMOVDQA      Y4, Y8
	VPERM2I128   $0x20, Y13, Y11, Y4
	VPERM2I128   $0x31, Y13, Y11, Y13
	VMOVDQA      Y4, Y11
	VPERM2I128   $0x20, Y14, Y12, Y4
	VPERM2I128   $0x31, Y14, Y12, Y14
	VMOVDQA      Y4, Y12
	VPSUBW       Y7, Y9, Y4
	VPSUBW       Y8, Y10, Y5
	VPSUBW       Y11, Y13, Y6
	VPADDW       Y7, Y9, Y7
	VPADDW       Y8, Y10, Y8
	VPADDW       Y11, Y13, Y11
	VPMULLW      Y4, Y0, Y9
	VPMULLW      Y5, Y0, Y10
	VPSUBW       Y12, Y14, Y0
	VPMULLW      Y6, Y2, Y13
	VPADDW       Y12, Y14, Y12
	VPMULLW      Y0, Y2, Y14
	VPMULHW      Y4, Y1, Y4
	VPMULHW      Y5, Y1, Y5
	VPMULHW      Y6, Y3, Y6
	VPMULHW      Y0, Y3, Y0
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y13, Y15, Y13
	VPMULHW      Y14, Y15, Y14
	VPSUBW       Y9, Y4, Y9
	VPSUBW       Y10, Y5, Y10
	VPSUBW       Y13, Y6, Y13
	VPSUBW       Y14, Y0, Y14
	MOVL         $0x00004ebf, DX
	VMOVD        DX, X0
	VPBROADCASTW X0, Y4
	VPMULHW      Y4, Y7, Y5
	VPSRAW       $0x0a, Y5, Y5
	VPMULLW      Y15, Y5, Y5
	VPSUBW       Y5, Y7, Y7
	VPMULHW      Y4, Y11, Y5
	VPSRAW       $0x0a, Y5, Y5
	VPMULLW      Y15, Y5, Y5
	VPSUBW       Y5, Y11, Y11
	VPBROADCASTW 2100(CX), Y0
	VPBROADCASTW 2102(CX), Y1
	VPSUBW       Y7, Y11, Y4
	VPSUBW       Y8, Y12, Y5
	VPSUBW       Y9, Y13, Y6
	VPADDW       Y7, Y11, Y7
	VPADDW       Y8, Y12, Y8
	VPADDW       Y9, Y13, Y9
	VPMULLW      Y4, Y0, Y11
	VPMULLW      Y5, Y0, Y12
	VPSUBW       Y10, Y14, Y2
	VPMULLW      Y6, Y0, Y13
	VPADDW       Y10, Y14, Y10
	VPMULLW      Y2, Y0, Y14
	VPMULHW      Y4, Y1, Y4
	VPMULHW      Y5, Y1, Y5
	VPMULHW      Y6, Y1, Y6
	VPMULHW      Y2, Y1, Y2
	VPMULHW      Y11, Y15, Y11
	VPMULHW      Y12, Y15, Y12
	VPMULHW      Y13, Y15, Y13
	VPMULHW      Y14, Y15, Y14
	VPSUBW       Y11, Y4, Y11
	VPSUBW       Y12, Y5, Y12
	VPSUBW       Y13, Y6, Y13
	VPSUBW       Y14, Y2, Y14
	VMOVDQU      Y7, 256(AX)
	VMOVDQU      Y8, 288(AX)
	VMOVDQU      Y9, 320(AX)
	VMOVDQU      Y10, 352(AX)
	VMOVDQU      Y11, 384(AX)
	VMOVDQU      Y12, 416(AX)
	VMOVDQU      Y13, 448(AX)
	VMOVDQU      Y14, 480(AX)
	VPBROADCASTW 2104(CX), Y0
	VPBROADCASTW 2106(CX), Y1
	VMOVDQU      (AX), Y7
	VMOVDQU      32(AX), Y8
	VMOVDQU      64(AX), Y9
	VMOVDQU      96(AX), Y10
	VMOVDQU      256(AX), Y11
	VMOVDQU      288(AX), Y12
	VMOVDQU      320(AX), Y13
	VMOVDQU      352(AX), Y14
	VPSUBW       Y7, Y11, Y2
	VPSUBW       Y8, Y12, Y3
	VPSUBW       Y9, Y13, Y4
	VPADDW       Y7, Y11, Y7
	VPADDW       Y8, Y12, Y8
	VPADDW       Y9, Y13, Y9
	VPMULLW      Y2, Y0, Y11
	VPMULLW      Y3, Y0, Y12
	VPSUBW       Y10, Y14, Y5
	VPMULLW      Y4, Y0, Y13
	VPADDW       Y10, Y14, Y10
	VPMULLW      Y5, Y0, Y14
	VPMULHW      Y2, Y1, Y2
	VPMULHW      Y3, Y1, Y3
	VPMULHW      Y4, Y1, Y4
	VPMULHW      Y5, Y1, Y5
	VPMULHW      Y11, Y15, Y11
	VPMULHW      Y12, Y15, Y12
	VPMULHW      Y13, Y15, Y13
	VPMULHW      Y14, Y15, Y14
	VPSUBW       Y11, Y2, Y11
	VPSUBW       Y12, Y3, Y12
	VPSUBW       Y13, Y4, Y13
	VPSUBW       Y14, Y5, Y14
	MOVL         $0xffffd8a1, DX
	VMOVD        DX, X0
	VPBROADCASTW X0, Y0
	MOVL         $0x000005a1, DX
	VMOVD        DX, X1
	VPBROADCASTW X1, Y1
	VPMULLW      Y7, Y0, Y2
	VPMULLW      Y8, Y0, Y3
	VPMULLW      Y9, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULHW      Y7, Y1, Y7
	VPMULHW      Y8, Y1, Y8
	VPMULHW      Y9, Y1, Y9
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y2, Y15, Y2
	VPMULHW      Y3, Y15, Y3
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPSUBW       Y2, Y7, Y7
	VPSUBW       Y3, Y8, Y8
	VPSUBW       Y4, Y9, Y9
	VPSUBW       Y5, Y10, Y10
	VPMULLW      Y11, Y0, Y2
	VPMULLW      Y12, Y0, Y3
	VPMULLW      Y13, Y0, Y4
	VPMULLW      Y14, Y0, Y5
	VPMULHW      Y11, Y1, Y11
	VPMULHW      Y12, Y1, Y12
	VPMULHW      Y13, Y1, Y13
	VPMULHW      Y14, Y1, Y14
	VPMULHW      Y2, Y15, Y2
	VPMULHW      Y3, Y15, Y3
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPSUBW       Y2, Y11, Y11
	VPSUBW       Y3, Y12, Y12
	VPSUBW       Y4, Y13, Y13
	VPSUBW       Y5, Y14, Y14
	VMOVDQU      Y7, (AX)
	VMOVDQU      Y8, 32(AX)
	VMOVDQU      Y9, 64(AX)
	VMOVDQU      Y10, 96(AX)
	VMOVDQU      Y11, 256(AX)
	VMOVDQU      Y12, 288(AX)
	VMOVDQU      Y13, 320(AX)
	VMOVDQU      Y14, 352(AX)
	VPBROADCASTW 2104(CX), Y0
	VPBROADCASTW 2106(CX), Y1
	VMOVDQU      128(AX), Y7
	VMOVDQU      160(AX), Y8
	VMOVDQU      192(AX), Y9
	VMOVDQU      224(AX), Y10
	VMOVDQU      384(AX), Y11
	VMOVDQU      416(AX), Y12
	VMOVDQU      448(AX), Y13
	VMOVDQU      480(AX), Y14
	VPSUBW       Y7, Y11, Y2
	VPSUBW       Y8, Y12, Y3
	VPSUBW       Y9, Y13, Y4
	VPADDW       Y7, Y11, Y7
	VPADDW       Y8, Y12, Y8
	VPADDW       Y9, Y13, Y9
	VPMULLW      Y2, Y0, Y11
	VPMULLW      Y3, Y0, Y12
	VPSUBW       Y10, Y14, Y5
	VPMULLW      Y4, Y0, Y13
	VPADDW       Y10, Y14, Y10
	VPMULLW      Y5, Y0, Y14
	VPMULHW      Y2, Y1, Y2
	VPMULHW      Y3, Y1, Y3
	VPMULHW      Y4, Y1, Y4
	VPMULHW      Y5, Y1, Y5
	VPMULHW      Y11, Y15, Y11
	VPMULHW      Y12, Y15, Y12
	VPMULHW      Y13, Y15, Y13
	VPMULHW      Y14, Y15, Y14
	VPSUBW       Y11, Y2, Y11
	VPSUBW       Y12, Y3, Y12
	VPSUBW       Y13, Y4, Y13
	VPSUBW       Y14, Y5, Y14
	MOVL         $0xffffd8a1, CX
	VMOVD        CX, X0
	VPBROADCASTW X0, Y0
	MOVL         $0x000005a1, CX
	VMOVD        CX, X1
	VPBROADCASTW X1, Y1
	VPMULLW      Y7, Y0, Y2
	VPMULLW      Y8, Y0, Y3
	VPMULLW      Y9, Y0, Y4
	VPMULLW      Y10, Y0, Y5
	VPMULHW      Y7, Y1, Y7
	VPMULHW      Y8, Y1, Y8
	VPMULHW      Y9, Y1, Y9
	VPMULHW      Y10, Y1, Y10
	VPMULHW      Y2, Y15, Y2
	VPMULHW      Y3, Y15, Y3
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPSUBW       Y2, Y7, Y7
	VPSUBW       Y3, Y8, Y8
	VPSUBW       Y4, Y9, Y9
	VPSUBW       Y5, Y10, Y10
	VPMULLW      Y11, Y0, Y2
	VPMULLW      Y12, Y0, Y3
	VPMULLW      Y13, Y0, Y4
	VPMULLW      Y14, Y0, Y5
	VPMULHW      Y11, Y1, Y11
	VPMULHW      Y12, Y1, Y12
	VPMULHW      Y13, Y1, Y13
	VPMULHW      Y14, Y1, Y14
	VPMULHW      Y2, Y15, Y2
	VPMULHW      Y3, Y15, Y3
	VPMULHW      Y4, Y15, Y4
	VPMULHW      Y5, Y15, Y5
	VPSUBW       Y2, Y11, Y11
	VPSUBW       Y3, Y12, Y12
	VPSUBW       Y4, Y13, Y13
	VPSUBW       Y5, Y14, Y14
	VMOVDQU      Y7, 128(AX)
	VMOVDQU      Y8, 160(AX)
	VMOVDQU      Y9, 192(AX)
	VMOVDQU      Y10, 224(AX)
	VMOVDQU      Y11, 384(AX)
	VMOVDQU      Y12, 416(AX)
	VMOVDQU      Y13, 448(AX)
	VMOVDQU      Y14, 480(AX)
	RET

// func mulHatAVX2(p *[256]int16, a *[256]int16, b *[256]int16)
// Requires: AVX, AVX2
//
// mulHatAVX2 stores into p a lane-wise combination of products of a and b.
// The 512 bytes of each array are processed as four 128-byte groups. Within a
// group, let A0..A3 and B0..B3 be the four consecutive 32-byte vectors of a
// and b, and let Z0, Z1 be the two table vectors read from ·ZetasAVX2 for that
// group (byte offsets 800/832, 864/896, 928/960 and 992/1024). Per int16 lane
// the four vectors written to p are:
//
//	P0 = red(A0*B0) + redz(red(A1*B1))
//	P1 = red(A0*B1) + red(A1*B0)
//	P2 = red(A2*B2) - redz(red(A3*B3))
//	P3 = red(A2*B3) + red(A3*B2)
//
// with
//
//	red(x*y) = hi16(x*y) - hi16(lo16(lo16(x*y)*QINV) * Q)
//	redz(x)  = hi16(x*Z1) - hi16(lo16(x*Z0) * Q)
//
// Q = 0x0d01 (3329) is broadcast into Y15 and QINV = 0xf301 into Y14;
// Q*QINV = 1 (mod 2^16), so red is a signed Montgomery reduction.
// Presumably Z0 holds Z1*QINV precomputed, which would make redz the same
// reduction applied to x*Z1 - verify against the ·ZetasAVX2 table layout.
//
// Register roles: AX = p, CX = a, DX = b, BX = &ZetasAVX2, SI = scratch for
// building the broadcast constants.
//
// NOTE(review): the TEXT line declares an 8-byte frame although no stack slot
// is referenced in this body; left as generated.
//
// The upper YMM state is cleared with VZEROUPPER before returning so that
// legacy-SSE instructions executed by the caller do not incur AVX/SSE
// transition penalties. This file is generated, so the generator should emit
// the same instruction.
TEXT ·mulHatAVX2(SB), NOSPLIT, $8-24
	MOVQ         p+0(FP), AX
	MOVQ         a+8(FP), CX
	MOVQ         b+16(FP), DX
	LEAQ         ·ZetasAVX2+0(SB), BX

	// Y14 = QINV (low 16 bits of the immediate, 0xf301) in every lane.
	MOVL         $0xfffff301, SI
	VMOVD        SI, X0
	VPBROADCASTW X0, Y14

	// Y15 = Q (0x0d01 = 3329) in every lane.
	MOVL         $0x00000d01, SI
	VMOVD        SI, X0
	VPBROADCASTW X0, Y15

	// ---- Group 0: bytes 0..127, table vectors at 800(BX) / 832(BX) ----
	VMOVDQU      (CX), Y0
	VMOVDQU      32(CX), Y1
	VMOVDQU      64(CX), Y2
	VMOVDQU      96(CX), Y3
	VMOVDQU      (DX), Y4
	VMOVDQU      32(DX), Y5
	VMOVDQU      64(DX), Y6
	VMOVDQU      96(DX), Y7

	// Low halves of A1*B1, A0*B0, A0*B1, A1*B0, then multiplied by QINV.
	VPMULLW      Y1, Y5, Y8
	VPMULLW      Y0, Y4, Y9
	VPMULLW      Y0, Y5, Y10
	VPMULLW      Y1, Y4, Y11
	VPMULLW      Y8, Y14, Y8
	VPMULLW      Y9, Y14, Y9
	VPMULLW      Y10, Y14, Y10
	VPMULLW      Y11, Y14, Y11

	// High halves of the same four products. Y12/Y13 are temporaries because
	// Y4/Y5 are still needed as inputs until the last VPMULHW has issued.
	VPMULHW      Y1, Y5, Y12
	VPMULHW      Y0, Y4, Y13
	VPMULHW      Y0, Y5, Y0
	VPMULHW      Y1, Y4, Y1
	VMOVDQA      Y12, Y4
	VMOVDQA      Y13, Y5

	// Finish red(): Y4 = red(A1*B1), Y5 = red(A0*B0),
	// Y0 = red(A0*B1), Y1 = red(A1*B0).
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y11, Y15, Y11
	VPSUBW       Y8, Y4, Y4
	VPSUBW       Y9, Y5, Y5
	VPSUBW       Y10, Y0, Y0
	VPSUBW       Y11, Y1, Y1

	// Y4 = redz(red(A1*B1)); then P0 = Y5 + Y4 and P1 = Y1 + Y0.
	VMOVDQU      800(BX), Y12
	VMOVDQU      832(BX), Y13
	VPMULLW      Y4, Y12, Y8
	VPMULHW      Y4, Y13, Y4
	VPMULHW      Y8, Y15, Y8
	VPSUBW       Y8, Y4, Y4
	VPADDW       Y4, Y5, Y4
	VPADDW       Y0, Y1, Y5

	// Same sequence for the A2/A3, B2/B3 pair.
	VPMULLW      Y3, Y7, Y8
	VPMULLW      Y2, Y6, Y9
	VPMULLW      Y2, Y7, Y10
	VPMULLW      Y3, Y6, Y11
	VPMULLW      Y8, Y14, Y8
	VPMULLW      Y9, Y14, Y9
	VPMULLW      Y10, Y14, Y10
	VPMULLW      Y11, Y14, Y11
	VPMULHW      Y3, Y7, Y12
	VPMULHW      Y2, Y6, Y13
	VPMULHW      Y2, Y7, Y2
	VPMULHW      Y3, Y6, Y3
	VMOVDQA      Y12, Y6
	VMOVDQA      Y13, Y7
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y11, Y15, Y11
	VPSUBW       Y8, Y6, Y6
	VPSUBW       Y9, Y7, Y7
	VPSUBW       Y10, Y2, Y2
	VPSUBW       Y11, Y3, Y3

	// The table vectors are reloaded because Y12/Y13 were reused as
	// temporaries above. Here the table term is subtracted: P2 = Y7 - Y6.
	VMOVDQU      800(BX), Y12
	VMOVDQU      832(BX), Y13
	VPMULLW      Y6, Y12, Y8
	VPMULHW      Y6, Y13, Y6
	VPMULHW      Y8, Y15, Y8
	VPSUBW       Y8, Y6, Y6
	VPSUBW       Y6, Y7, Y6
	VPADDW       Y2, Y3, Y7
	VMOVDQU      Y4, (AX)
	VMOVDQU      Y5, 32(AX)
	VMOVDQU      Y6, 64(AX)
	VMOVDQU      Y7, 96(AX)

	// ---- Group 1: bytes 128..255, table vectors at 864(BX) / 896(BX) ----
	VMOVDQU      128(CX), Y0
	VMOVDQU      160(CX), Y1
	VMOVDQU      192(CX), Y2
	VMOVDQU      224(CX), Y3
	VMOVDQU      128(DX), Y4
	VMOVDQU      160(DX), Y5
	VMOVDQU      192(DX), Y6
	VMOVDQU      224(DX), Y7
	VPMULLW      Y1, Y5, Y8
	VPMULLW      Y0, Y4, Y9
	VPMULLW      Y0, Y5, Y10
	VPMULLW      Y1, Y4, Y11
	VPMULLW      Y8, Y14, Y8
	VPMULLW      Y9, Y14, Y9
	VPMULLW      Y10, Y14, Y10
	VPMULLW      Y11, Y14, Y11
	VPMULHW      Y1, Y5, Y12
	VPMULHW      Y0, Y4, Y13
	VPMULHW      Y0, Y5, Y0
	VPMULHW      Y1, Y4, Y1
	VMOVDQA      Y12, Y4
	VMOVDQA      Y13, Y5
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y11, Y15, Y11
	VPSUBW       Y8, Y4, Y4
	VPSUBW       Y9, Y5, Y5
	VPSUBW       Y10, Y0, Y0
	VPSUBW       Y11, Y1, Y1
	VMOVDQU      864(BX), Y12
	VMOVDQU      896(BX), Y13
	VPMULLW      Y4, Y12, Y8
	VPMULHW      Y4, Y13, Y4
	VPMULHW      Y8, Y15, Y8
	VPSUBW       Y8, Y4, Y4
	VPADDW       Y4, Y5, Y4
	VPADDW       Y0, Y1, Y5
	VPMULLW      Y3, Y7, Y8
	VPMULLW      Y2, Y6, Y9
	VPMULLW      Y2, Y7, Y10
	VPMULLW      Y3, Y6, Y11
	VPMULLW      Y8, Y14, Y8
	VPMULLW      Y9, Y14, Y9
	VPMULLW      Y10, Y14, Y10
	VPMULLW      Y11, Y14, Y11
	VPMULHW      Y3, Y7, Y12
	VPMULHW      Y2, Y6, Y13
	VPMULHW      Y2, Y7, Y2
	VPMULHW      Y3, Y6, Y3
	VMOVDQA      Y12, Y6
	VMOVDQA      Y13, Y7
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y11, Y15, Y11
	VPSUBW       Y8, Y6, Y6
	VPSUBW       Y9, Y7, Y7
	VPSUBW       Y10, Y2, Y2
	VPSUBW       Y11, Y3, Y3
	VMOVDQU      864(BX), Y12
	VMOVDQU      896(BX), Y13
	VPMULLW      Y6, Y12, Y8
	VPMULHW      Y6, Y13, Y6
	VPMULHW      Y8, Y15, Y8
	VPSUBW       Y8, Y6, Y6
	VPSUBW       Y6, Y7, Y6
	VPADDW       Y2, Y3, Y7
	VMOVDQU      Y4, 128(AX)
	VMOVDQU      Y5, 160(AX)
	VMOVDQU      Y6, 192(AX)
	VMOVDQU      Y7, 224(AX)

	// ---- Group 2: bytes 256..383, table vectors at 928(BX) / 960(BX) ----
	VMOVDQU      256(CX), Y0
	VMOVDQU      288(CX), Y1
	VMOVDQU      320(CX), Y2
	VMOVDQU      352(CX), Y3
	VMOVDQU      256(DX), Y4
	VMOVDQU      288(DX), Y5
	VMOVDQU      320(DX), Y6
	VMOVDQU      352(DX), Y7
	VPMULLW      Y1, Y5, Y8
	VPMULLW      Y0, Y4, Y9
	VPMULLW      Y0, Y5, Y10
	VPMULLW      Y1, Y4, Y11
	VPMULLW      Y8, Y14, Y8
	VPMULLW      Y9, Y14, Y9
	VPMULLW      Y10, Y14, Y10
	VPMULLW      Y11, Y14, Y11
	VPMULHW      Y1, Y5, Y12
	VPMULHW      Y0, Y4, Y13
	VPMULHW      Y0, Y5, Y0
	VPMULHW      Y1, Y4, Y1
	VMOVDQA      Y12, Y4
	VMOVDQA      Y13, Y5
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y11, Y15, Y11
	VPSUBW       Y8, Y4, Y4
	VPSUBW       Y9, Y5, Y5
	VPSUBW       Y10, Y0, Y0
	VPSUBW       Y11, Y1, Y1
	VMOVDQU      928(BX), Y12
	VMOVDQU      960(BX), Y13
	VPMULLW      Y4, Y12, Y8
	VPMULHW      Y4, Y13, Y4
	VPMULHW      Y8, Y15, Y8
	VPSUBW       Y8, Y4, Y4
	VPADDW       Y4, Y5, Y4
	VPADDW       Y0, Y1, Y5
	VPMULLW      Y3, Y7, Y8
	VPMULLW      Y2, Y6, Y9
	VPMULLW      Y2, Y7, Y10
	VPMULLW      Y3, Y6, Y11
	VPMULLW      Y8, Y14, Y8
	VPMULLW      Y9, Y14, Y9
	VPMULLW      Y10, Y14, Y10
	VPMULLW      Y11, Y14, Y11
	VPMULHW      Y3, Y7, Y12
	VPMULHW      Y2, Y6, Y13
	VPMULHW      Y2, Y7, Y2
	VPMULHW      Y3, Y6, Y3
	VMOVDQA      Y12, Y6
	VMOVDQA      Y13, Y7
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y11, Y15, Y11
	VPSUBW       Y8, Y6, Y6
	VPSUBW       Y9, Y7, Y7
	VPSUBW       Y10, Y2, Y2
	VPSUBW       Y11, Y3, Y3
	VMOVDQU      928(BX), Y12
	VMOVDQU      960(BX), Y13
	VPMULLW      Y6, Y12, Y8
	VPMULHW      Y6, Y13, Y6
	VPMULHW      Y8, Y15, Y8
	VPSUBW       Y8, Y6, Y6
	VPSUBW       Y6, Y7, Y6
	VPADDW       Y2, Y3, Y7
	VMOVDQU      Y4, 256(AX)
	VMOVDQU      Y5, 288(AX)
	VMOVDQU      Y6, 320(AX)
	VMOVDQU      Y7, 352(AX)

	// ---- Group 3: bytes 384..511, table vectors at 992(BX) / 1024(BX) ----
	VMOVDQU      384(CX), Y0
	VMOVDQU      416(CX), Y1
	VMOVDQU      448(CX), Y2
	VMOVDQU      480(CX), Y3
	VMOVDQU      384(DX), Y4
	VMOVDQU      416(DX), Y5
	VMOVDQU      448(DX), Y6
	VMOVDQU      480(DX), Y7
	VPMULLW      Y1, Y5, Y8
	VPMULLW      Y0, Y4, Y9
	VPMULLW      Y0, Y5, Y10
	VPMULLW      Y1, Y4, Y11
	VPMULLW      Y8, Y14, Y8
	VPMULLW      Y9, Y14, Y9
	VPMULLW      Y10, Y14, Y10
	VPMULLW      Y11, Y14, Y11
	VPMULHW      Y1, Y5, Y12
	VPMULHW      Y0, Y4, Y13
	VPMULHW      Y0, Y5, Y0
	VPMULHW      Y1, Y4, Y1
	VMOVDQA      Y12, Y4
	VMOVDQA      Y13, Y5
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y11, Y15, Y11
	VPSUBW       Y8, Y4, Y4
	VPSUBW       Y9, Y5, Y5
	VPSUBW       Y10, Y0, Y0
	VPSUBW       Y11, Y1, Y1
	VMOVDQU      992(BX), Y12
	VMOVDQU      1024(BX), Y13
	VPMULLW      Y4, Y12, Y8
	VPMULHW      Y4, Y13, Y4
	VPMULHW      Y8, Y15, Y8
	VPSUBW       Y8, Y4, Y4
	VPADDW       Y4, Y5, Y4
	VPADDW       Y0, Y1, Y5
	VPMULLW      Y3, Y7, Y8
	VPMULLW      Y2, Y6, Y9
	VPMULLW      Y2, Y7, Y10
	VPMULLW      Y3, Y6, Y11
	VPMULLW      Y8, Y14, Y8
	VPMULLW      Y9, Y14, Y9
	VPMULLW      Y10, Y14, Y10
	VPMULLW      Y11, Y14, Y11
	VPMULHW      Y3, Y7, Y12
	VPMULHW      Y2, Y6, Y13
	VPMULHW      Y2, Y7, Y2
	VPMULHW      Y3, Y6, Y3
	VMOVDQA      Y12, Y6
	VMOVDQA      Y13, Y7
	VPMULHW      Y8, Y15, Y8
	VPMULHW      Y9, Y15, Y9
	VPMULHW      Y10, Y15, Y10
	VPMULHW      Y11, Y15, Y11
	VPSUBW       Y8, Y6, Y6
	VPSUBW       Y9, Y7, Y7
	VPSUBW       Y10, Y2, Y2
	VPSUBW       Y11, Y3, Y3
	VMOVDQU      992(BX), Y12
	VMOVDQU      1024(BX), Y13
	VPMULLW      Y6, Y12, Y8
	VPMULHW      Y6, Y13, Y6
	VPMULHW      Y8, Y15, Y8
	VPSUBW       Y8, Y6, Y6
	VPSUBW       Y6, Y7, Y6
	VPADDW       Y2, Y3, Y7
	VMOVDQU      Y4, 384(AX)
	VMOVDQU      Y5, 416(AX)
	VMOVDQU      Y6, 448(AX)
	VMOVDQU      Y7, 480(AX)

	// Clear the upper halves of the YMM registers before returning to
	// non-VEX code.
	VZEROUPPER
	RET

// func detangleAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// detangleAVX2 permutes the 256 int16 values of p in place. The array is
// handled as two independent 256-byte halves; each half is loaded into
// Y0..Y7 (one 32-byte vector per register), pushed through four exchange
// stages and stored back in register order. No arithmetic is done on the
// values - every stage only moves elements between a pair of registers:
//
//	1. 16-bit stage, pairs (Y0,Y1) (Y2,Y3) (Y4,Y5) (Y6,Y7):
//	   first register  <- even words of both inputs, interleaved
//	   second register <- odd words of both inputs, interleaved
//	2. 32-bit stage, pairs (Y0,Y2) (Y1,Y3) (Y4,Y6) (Y5,Y7):
//	   the same exchange on dwords
//	3. 64-bit stage, pairs (Y0,Y1) (Y2,Y3) (Y4,Y5) (Y6,Y7):
//	   low qwords / high qwords of each 128-bit lane
//	4. 128-bit stage, pairs (Y0,Y2) (Y1,Y3) (Y4,Y6) (Y5,Y7):
//	   low lanes / high lanes
//
// Y8 is the only temporary; each exchange builds the new first register in
// Y8, overwrites the second register, then copies Y8 back.
//
// The upper YMM state is cleared with VZEROUPPER before returning so that
// legacy-SSE instructions executed by the caller do not incur AVX/SSE
// transition penalties. This file is generated, so the generator should emit
// the same instruction.
TEXT ·detangleAVX2(SB), NOSPLIT, $0-8
	MOVQ        p+0(FP), AX

	// ---- First half: bytes 0..255 ----
	VMOVDQU     (AX), Y0
	VMOVDQU     32(AX), Y1
	VMOVDQU     64(AX), Y2
	VMOVDQU     96(AX), Y3
	VMOVDQU     128(AX), Y4
	VMOVDQU     160(AX), Y5
	VMOVDQU     192(AX), Y6
	VMOVDQU     224(AX), Y7

	// Stage 1: 16-bit exchange. The shifts line up the wanted words and
	// VPBLENDW $0xaa takes the odd words from its first source operand.
	VPSLLD      $0x10, Y1, Y8
	VPBLENDW    $0xaa, Y8, Y0, Y8
	VPSRLD      $0x10, Y0, Y0
	VPBLENDW    $0xaa, Y1, Y0, Y1
	VMOVDQA     Y8, Y0
	VPSLLD      $0x10, Y3, Y8
	VPBLENDW    $0xaa, Y8, Y2, Y8
	VPSRLD      $0x10, Y2, Y2
	VPBLENDW    $0xaa, Y3, Y2, Y3
	VMOVDQA     Y8, Y2
	VPSLLD      $0x10, Y5, Y8
	VPBLENDW    $0xaa, Y8, Y4, Y8
	VPSRLD      $0x10, Y4, Y4
	VPBLENDW    $0xaa, Y5, Y4, Y5
	VMOVDQA     Y8, Y4
	VPSLLD      $0x10, Y7, Y8
	VPBLENDW    $0xaa, Y8, Y6, Y8
	VPSRLD      $0x10, Y6, Y6
	VPBLENDW    $0xaa, Y7, Y6, Y7
	VMOVDQA     Y8, Y6

	// Stage 2: 32-bit exchange. VMOVSLDUP duplicates the even dwords and
	// VPSRLQ $0x20 moves the odd dwords down before the dword blend.
	VMOVSLDUP   Y2, Y8
	VPBLENDD    $0xaa, Y8, Y0, Y8
	VPSRLQ      $0x20, Y0, Y0
	VPBLENDD    $0xaa, Y2, Y0, Y2
	VMOVDQA     Y8, Y0
	VMOVSLDUP   Y3, Y8
	VPBLENDD    $0xaa, Y8, Y1, Y8
	VPSRLQ      $0x20, Y1, Y1
	VPBLENDD    $0xaa, Y3, Y1, Y3
	VMOVDQA     Y8, Y1
	VMOVSLDUP   Y6, Y8
	VPBLENDD    $0xaa, Y8, Y4, Y8
	VPSRLQ      $0x20, Y4, Y4
	VPBLENDD    $0xaa, Y6, Y4, Y6
	VMOVDQA     Y8, Y4
	VMOVSLDUP   Y7, Y8
	VPBLENDD    $0xaa, Y8, Y5, Y8
	VPSRLQ      $0x20, Y5, Y5
	VPBLENDD    $0xaa, Y7, Y5, Y7
	VMOVDQA     Y8, Y5

	// Stage 3: 64-bit exchange within each 128-bit lane.
	VPUNPCKLQDQ Y1, Y0, Y8
	VPUNPCKHQDQ Y1, Y0, Y1
	VMOVDQA     Y8, Y0
	VPUNPCKLQDQ Y3, Y2, Y8
	VPUNPCKHQDQ Y3, Y2, Y3
	VMOVDQA     Y8, Y2
	VPUNPCKLQDQ Y5, Y4, Y8
	VPUNPCKHQDQ Y5, Y4, Y5
	VMOVDQA     Y8, Y4
	VPUNPCKLQDQ Y7, Y6, Y8
	VPUNPCKHQDQ Y7, Y6, Y7
	VMOVDQA     Y8, Y6

	// Stage 4: 128-bit exchange. $0x20 gathers the two low lanes, $0x31 the
	// two high lanes.
	VPERM2I128  $0x20, Y2, Y0, Y8
	VPERM2I128  $0x31, Y2, Y0, Y2
	VMOVDQA     Y8, Y0
	VPERM2I128  $0x20, Y3, Y1, Y8
	VPERM2I128  $0x31, Y3, Y1, Y3
	VMOVDQA     Y8, Y1
	VPERM2I128  $0x20, Y6, Y4, Y8
	VPERM2I128  $0x31, Y6, Y4, Y6
	VMOVDQA     Y8, Y4
	VPERM2I128  $0x20, Y7, Y5, Y8
	VPERM2I128  $0x31, Y7, Y5, Y7
	VMOVDQA     Y8, Y5
	VMOVDQU     Y0, (AX)
	VMOVDQU     Y1, 32(AX)
	VMOVDQU     Y2, 64(AX)
	VMOVDQU     Y3, 96(AX)
	VMOVDQU     Y4, 128(AX)
	VMOVDQU     Y5, 160(AX)
	VMOVDQU     Y6, 192(AX)
	VMOVDQU     Y7, 224(AX)

	// ---- Second half: bytes 256..511, same four stages ----
	VMOVDQU     256(AX), Y0
	VMOVDQU     288(AX), Y1
	VMOVDQU     320(AX), Y2
	VMOVDQU     352(AX), Y3
	VMOVDQU     384(AX), Y4
	VMOVDQU     416(AX), Y5
	VMOVDQU     448(AX), Y6
	VMOVDQU     480(AX), Y7

	// Stage 1: 16-bit exchange.
	VPSLLD      $0x10, Y1, Y8
	VPBLENDW    $0xaa, Y8, Y0, Y8
	VPSRLD      $0x10, Y0, Y0
	VPBLENDW    $0xaa, Y1, Y0, Y1
	VMOVDQA     Y8, Y0
	VPSLLD      $0x10, Y3, Y8
	VPBLENDW    $0xaa, Y8, Y2, Y8
	VPSRLD      $0x10, Y2, Y2
	VPBLENDW    $0xaa, Y3, Y2, Y3
	VMOVDQA     Y8, Y2
	VPSLLD      $0x10, Y5, Y8
	VPBLENDW    $0xaa, Y8, Y4, Y8
	VPSRLD      $0x10, Y4, Y4
	VPBLENDW    $0xaa, Y5, Y4, Y5
	VMOVDQA     Y8, Y4
	VPSLLD      $0x10, Y7, Y8
	VPBLENDW    $0xaa, Y8, Y6, Y8
	VPSRLD      $0x10, Y6, Y6
	VPBLENDW    $0xaa, Y7, Y6, Y7
	VMOVDQA     Y8, Y6

	// Stage 2: 32-bit exchange.
	VMOVSLDUP   Y2, Y8
	VPBLENDD    $0xaa, Y8, Y0, Y8
	VPSRLQ      $0x20, Y0, Y0
	VPBLENDD    $0xaa, Y2, Y0, Y2
	VMOVDQA     Y8, Y0
	VMOVSLDUP   Y3, Y8
	VPBLENDD    $0xaa, Y8, Y1, Y8
	VPSRLQ      $0x20, Y1, Y1
	VPBLENDD    $0xaa, Y3, Y1, Y3
	VMOVDQA     Y8, Y1
	VMOVSLDUP   Y6, Y8
	VPBLENDD    $0xaa, Y8, Y4, Y8
	VPSRLQ      $0x20, Y4, Y4
	VPBLENDD    $0xaa, Y6, Y4, Y6
	VMOVDQA     Y8, Y4
	VMOVSLDUP   Y7, Y8
	VPBLENDD    $0xaa, Y8, Y5, Y8
	VPSRLQ      $0x20, Y5, Y5
	VPBLENDD    $0xaa, Y7, Y5, Y7
	VMOVDQA     Y8, Y5

	// Stage 3: 64-bit exchange.
	VPUNPCKLQDQ Y1, Y0, Y8
	VPUNPCKHQDQ Y1, Y0, Y1
	VMOVDQA     Y8, Y0
	VPUNPCKLQDQ Y3, Y2, Y8
	VPUNPCKHQDQ Y3, Y2, Y3
	VMOVDQA     Y8, Y2
	VPUNPCKLQDQ Y5, Y4, Y8
	VPUNPCKHQDQ Y5, Y4, Y5
	VMOVDQA     Y8, Y4
	VPUNPCKLQDQ Y7, Y6, Y8
	VPUNPCKHQDQ Y7, Y6, Y7
	VMOVDQA     Y8, Y6

	// Stage 4: 128-bit exchange.
	VPERM2I128  $0x20, Y2, Y0, Y8
	VPERM2I128  $0x31, Y2, Y0, Y2
	VMOVDQA     Y8, Y0
	VPERM2I128  $0x20, Y3, Y1, Y8
	VPERM2I128  $0x31, Y3, Y1, Y3
	VMOVDQA     Y8, Y1
	VPERM2I128  $0x20, Y6, Y4, Y8
	VPERM2I128  $0x31, Y6, Y4, Y6
	VMOVDQA     Y8, Y4
	VPERM2I128  $0x20, Y7, Y5, Y8
	VPERM2I128  $0x31, Y7, Y5, Y7
	VMOVDQA     Y8, Y5
	VMOVDQU     Y0, 256(AX)
	VMOVDQU     Y1, 288(AX)
	VMOVDQU     Y2, 320(AX)
	VMOVDQU     Y3, 352(AX)
	VMOVDQU     Y4, 384(AX)
	VMOVDQU     Y5, 416(AX)
	VMOVDQU     Y6, 448(AX)
	VMOVDQU     Y7, 480(AX)

	// Clear the upper halves of the YMM registers before returning to
	// non-VEX code.
	VZEROUPPER
	RET

// func tangleAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// tangleAVX2 permutes the 256 int16 values of p in place. The array is
// handled as two independent 256-byte halves; each half is loaded into
// Y0..Y7 (one 32-byte vector per register), pushed through four exchange
// stages and stored back in register order. No arithmetic is done on the
// values - every stage only moves elements between a pair of registers:
//
//	1. 128-bit stage, pairs (Y0,Y2) (Y1,Y3) (Y4,Y6) (Y5,Y7):
//	   first register <- both low lanes, second register <- both high lanes
//	2. 64-bit stage, pairs (Y0,Y1) (Y2,Y3) (Y4,Y5) (Y6,Y7):
//	   low qwords / high qwords of each 128-bit lane
//	3. 32-bit stage, pairs (Y0,Y2) (Y1,Y3) (Y4,Y6) (Y5,Y7):
//	   even dwords / odd dwords, interleaved
//	4. 16-bit stage, pairs (Y0,Y1) (Y2,Y3) (Y4,Y5) (Y6,Y7):
//	   even words / odd words, interleaved
//
// These are the exchange stages used by detangleAVX2, on the same register
// pairs, applied in the opposite order. Each exchange swaps the "odd" part of
// the first register with the "even" part of the second, so applying it twice
// restores the inputs; tangleAVX2 therefore undoes detangleAVX2.
//
// Y8 is the only temporary; each exchange builds the new first register in
// Y8, overwrites the second register, then copies Y8 back.
//
// The upper YMM state is cleared with VZEROUPPER before returning so that
// legacy-SSE instructions executed by the caller do not incur AVX/SSE
// transition penalties. This file is generated, so the generator should emit
// the same instruction.
TEXT ·tangleAVX2(SB), NOSPLIT, $0-8
	MOVQ        p+0(FP), AX

	// ---- First half: bytes 0..255 ----
	VMOVDQU     (AX), Y0
	VMOVDQU     32(AX), Y1
	VMOVDQU     64(AX), Y2
	VMOVDQU     96(AX), Y3
	VMOVDQU     128(AX), Y4
	VMOVDQU     160(AX), Y5
	VMOVDQU     192(AX), Y6
	VMOVDQU     224(AX), Y7

	// Stage 1: 128-bit exchange. $0x20 gathers the two low lanes, $0x31 the
	// two high lanes.
	VPERM2I128  $0x20, Y2, Y0, Y8
	VPERM2I128  $0x31, Y2, Y0, Y2
	VMOVDQA     Y8, Y0
	VPERM2I128  $0x20, Y3, Y1, Y8
	VPERM2I128  $0x31, Y3, Y1, Y3
	VMOVDQA     Y8, Y1
	VPERM2I128  $0x20, Y6, Y4, Y8
	VPERM2I128  $0x31, Y6, Y4, Y6
	VMOVDQA     Y8, Y4
	VPERM2I128  $0x20, Y7, Y5, Y8
	VPERM2I128  $0x31, Y7, Y5, Y7
	VMOVDQA     Y8, Y5

	// Stage 2: 64-bit exchange within each 128-bit lane.
	VPUNPCKLQDQ Y1, Y0, Y8
	VPUNPCKHQDQ Y1, Y0, Y1
	VMOVDQA     Y8, Y0
	VPUNPCKLQDQ Y3, Y2, Y8
	VPUNPCKHQDQ Y3, Y2, Y3
	VMOVDQA     Y8, Y2
	VPUNPCKLQDQ Y5, Y4, Y8
	VPUNPCKHQDQ Y5, Y4, Y5
	VMOVDQA     Y8, Y4
	VPUNPCKLQDQ Y7, Y6, Y8
	VPUNPCKHQDQ Y7, Y6, Y7
	VMOVDQA     Y8, Y6

	// Stage 3: 32-bit exchange. VMOVSLDUP duplicates the even dwords and
	// VPSRLQ $0x20 moves the odd dwords down before the dword blend.
	VMOVSLDUP   Y2, Y8
	VPBLENDD    $0xaa, Y8, Y0, Y8
	VPSRLQ      $0x20, Y0, Y0
	VPBLENDD    $0xaa, Y2, Y0, Y2
	VMOVDQA     Y8, Y0
	VMOVSLDUP   Y3, Y8
	VPBLENDD    $0xaa, Y8, Y1, Y8
	VPSRLQ      $0x20, Y1, Y1
	VPBLENDD    $0xaa, Y3, Y1, Y3
	VMOVDQA     Y8, Y1
	VMOVSLDUP   Y6, Y8
	VPBLENDD    $0xaa, Y8, Y4, Y8
	VPSRLQ      $0x20, Y4, Y4
	VPBLENDD    $0xaa, Y6, Y4, Y6
	VMOVDQA     Y8, Y4
	VMOVSLDUP   Y7, Y8
	VPBLENDD    $0xaa, Y8, Y5, Y8
	VPSRLQ      $0x20, Y5, Y5
	VPBLENDD    $0xaa, Y7, Y5, Y7
	VMOVDQA     Y8, Y5

	// Stage 4: 16-bit exchange. The shifts line up the wanted words and
	// VPBLENDW $0xaa takes the odd words from its first source operand.
	VPSLLD      $0x10, Y1, Y8
	VPBLENDW    $0xaa, Y8, Y0, Y8
	VPSRLD      $0x10, Y0, Y0
	VPBLENDW    $0xaa, Y1, Y0, Y1
	VMOVDQA     Y8, Y0
	VPSLLD      $0x10, Y3, Y8
	VPBLENDW    $0xaa, Y8, Y2, Y8
	VPSRLD      $0x10, Y2, Y2
	VPBLENDW    $0xaa, Y3, Y2, Y3
	VMOVDQA     Y8, Y2
	VPSLLD      $0x10, Y5, Y8
	VPBLENDW    $0xaa, Y8, Y4, Y8
	VPSRLD      $0x10, Y4, Y4
	VPBLENDW    $0xaa, Y5, Y4, Y5
	VMOVDQA     Y8, Y4
	VPSLLD      $0x10, Y7, Y8
	VPBLENDW    $0xaa, Y8, Y6, Y8
	VPSRLD      $0x10, Y6, Y6
	VPBLENDW    $0xaa, Y7, Y6, Y7
	VMOVDQA     Y8, Y6
	VMOVDQU     Y0, (AX)
	VMOVDQU     Y1, 32(AX)
	VMOVDQU     Y2, 64(AX)
	VMOVDQU     Y3, 96(AX)
	VMOVDQU     Y4, 128(AX)
	VMOVDQU     Y5, 160(AX)
	VMOVDQU     Y6, 192(AX)
	VMOVDQU     Y7, 224(AX)

	// ---- Second half: bytes 256..511, same four stages ----
	VMOVDQU     256(AX), Y0
	VMOVDQU     288(AX), Y1
	VMOVDQU     320(AX), Y2
	VMOVDQU     352(AX), Y3
	VMOVDQU     384(AX), Y4
	VMOVDQU     416(AX), Y5
	VMOVDQU     448(AX), Y6
	VMOVDQU     480(AX), Y7

	// Stage 1: 128-bit exchange.
	VPERM2I128  $0x20, Y2, Y0, Y8
	VPERM2I128  $0x31, Y2, Y0, Y2
	VMOVDQA     Y8, Y0
	VPERM2I128  $0x20, Y3, Y1, Y8
	VPERM2I128  $0x31, Y3, Y1, Y3
	VMOVDQA     Y8, Y1
	VPERM2I128  $0x20, Y6, Y4, Y8
	VPERM2I128  $0x31, Y6, Y4, Y6
	VMOVDQA     Y8, Y4
	VPERM2I128  $0x20, Y7, Y5, Y8
	VPERM2I128  $0x31, Y7, Y5, Y7
	VMOVDQA     Y8, Y5

	// Stage 2: 64-bit exchange.
	VPUNPCKLQDQ Y1, Y0, Y8
	VPUNPCKHQDQ Y1, Y0, Y1
	VMOVDQA     Y8, Y0
	VPUNPCKLQDQ Y3, Y2, Y8
	VPUNPCKHQDQ Y3, Y2, Y3
	VMOVDQA     Y8, Y2
	VPUNPCKLQDQ Y5, Y4, Y8
	VPUNPCKHQDQ Y5, Y4, Y5
	VMOVDQA     Y8, Y4
	VPUNPCKLQDQ Y7, Y6, Y8
	VPUNPCKHQDQ Y7, Y6, Y7
	VMOVDQA     Y8, Y6

	// Stage 3: 32-bit exchange.
	VMOVSLDUP   Y2, Y8
	VPBLENDD    $0xaa, Y8, Y0, Y8
	VPSRLQ      $0x20, Y0, Y0
	VPBLENDD    $0xaa, Y2, Y0, Y2
	VMOVDQA     Y8, Y0
	VMOVSLDUP   Y3, Y8
	VPBLENDD    $0xaa, Y8, Y1, Y8
	VPSRLQ      $0x20, Y1, Y1
	VPBLENDD    $0xaa, Y3, Y1, Y3
	VMOVDQA     Y8, Y1
	VMOVSLDUP   Y6, Y8
	VPBLENDD    $0xaa, Y8, Y4, Y8
	VPSRLQ      $0x20, Y4, Y4
	VPBLENDD    $0xaa, Y6, Y4, Y6
	VMOVDQA     Y8, Y4
	VMOVSLDUP   Y7, Y8
	VPBLENDD    $0xaa, Y8, Y5, Y8
	VPSRLQ      $0x20, Y5, Y5
	VPBLENDD    $0xaa, Y7, Y5, Y7
	VMOVDQA     Y8, Y5

	// Stage 4: 16-bit exchange.
	VPSLLD      $0x10, Y1, Y8
	VPBLENDW    $0xaa, Y8, Y0, Y8
	VPSRLD      $0x10, Y0, Y0
	VPBLENDW    $0xaa, Y1, Y0, Y1
	VMOVDQA     Y8, Y0
	VPSLLD      $0x10, Y3, Y8
	VPBLENDW    $0xaa, Y8, Y2, Y8
	VPSRLD      $0x10, Y2, Y2
	VPBLENDW    $0xaa, Y3, Y2, Y3
	VMOVDQA     Y8, Y2
	VPSLLD      $0x10, Y5, Y8
	VPBLENDW    $0xaa, Y8, Y4, Y8
	VPSRLD      $0x10, Y4, Y4
	VPBLENDW    $0xaa, Y5, Y4, Y5
	VMOVDQA     Y8, Y4
	VPSLLD      $0x10, Y7, Y8
	VPBLENDW    $0xaa, Y8, Y6, Y8
	VPSRLD      $0x10, Y6, Y6
	VPBLENDW    $0xaa, Y7, Y6, Y7
	VMOVDQA     Y8, Y6
	VMOVDQU     Y0, 256(AX)
	VMOVDQU     Y1, 288(AX)
	VMOVDQU     Y2, 320(AX)
	VMOVDQU     Y3, 352(AX)
	VMOVDQU     Y4, 384(AX)
	VMOVDQU     Y5, 416(AX)
	VMOVDQU     Y6, 448(AX)
	VMOVDQU     Y7, 480(AX)

	// Clear the upper halves of the YMM registers before returning to
	// non-VEX code.
	VZEROUPPER
	RET

// func barrettReduceAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// barrettReduceAVX2 replaces every int16 x of p, in place, with
//
//	x - ((hi16(x * 0x4ebf) >> 10) * 0x0d01)
//
// where hi16 is the signed high half of the 16x16-bit product and >> is an
// arithmetic shift. 0x0d01 is 3329 and 0x4ebf is 20159, which is about
// 2^26/3329, so the bracketed term approximates floor(x/3329)*3329 and the
// result is congruent to x modulo 3329. For negative inputs the quotient
// estimate can be one too low, in which case the result is exactly 3329
// (for example x = -3329 yields 3329).
//
// The array is processed in four 128-byte groups of four vectors each.
// Register roles: AX = p, CX = scratch, Y9 = 3329 in every lane,
// Y8 = 20159 in every lane, Y0..Y3 = data, Y4..Y7 = quotient estimates.
//
// The upper YMM state is cleared with VZEROUPPER before returning so that
// legacy-SSE instructions executed by the caller do not incur AVX/SSE
// transition penalties. This file is generated, so the generator should emit
// the same instruction.
TEXT ·barrettReduceAVX2(SB), NOSPLIT, $0-8
	MOVQ         p+0(FP), AX

	// Y9 = 3329 in every lane.
	MOVL         $0x00000d01, CX
	VMOVD        CX, X0
	VPBROADCASTW X0, Y9

	// Y8 = 20159 in every lane.
	MOVL         $0x00004ebf, CX
	VMOVD        CX, X0
	VPBROADCASTW X0, Y8

	// ---- Bytes 0..127 ----
	VMOVDQU      (AX), Y0
	VMOVDQU      32(AX), Y1
	VMOVDQU      64(AX), Y2
	VMOVDQU      96(AX), Y3

	// Quotient estimate: (x * 20159) >> 26, done as high multiply then >> 10.
	VPMULHW      Y8, Y0, Y4
	VPMULHW      Y8, Y1, Y5
	VPMULHW      Y8, Y2, Y6
	VPMULHW      Y8, Y3, Y7
	VPSRAW       $0x0a, Y4, Y4
	VPSRAW       $0x0a, Y5, Y5
	VPSRAW       $0x0a, Y6, Y6
	VPSRAW       $0x0a, Y7, Y7

	// x -= estimate * 3329.
	VPMULLW      Y9, Y4, Y4
	VPMULLW      Y9, Y5, Y5
	VPMULLW      Y9, Y6, Y6
	VPMULLW      Y9, Y7, Y7
	VPSUBW       Y4, Y0, Y0
	VPSUBW       Y5, Y1, Y1
	VPSUBW       Y6, Y2, Y2
	VPSUBW       Y7, Y3, Y3
	VMOVDQU      Y0, (AX)
	VMOVDQU      Y1, 32(AX)
	VMOVDQU      Y2, 64(AX)
	VMOVDQU      Y3, 96(AX)

	// ---- Bytes 128..255 ----
	VMOVDQU      128(AX), Y0
	VMOVDQU      160(AX), Y1
	VMOVDQU      192(AX), Y2
	VMOVDQU      224(AX), Y3
	VPMULHW      Y8, Y0, Y4
	VPMULHW      Y8, Y1, Y5
	VPMULHW      Y8, Y2, Y6
	VPMULHW      Y8, Y3, Y7
	VPSRAW       $0x0a, Y4, Y4
	VPSRAW       $0x0a, Y5, Y5
	VPSRAW       $0x0a, Y6, Y6
	VPSRAW       $0x0a, Y7, Y7
	VPMULLW      Y9, Y4, Y4
	VPMULLW      Y9, Y5, Y5
	VPMULLW      Y9, Y6, Y6
	VPMULLW      Y9, Y7, Y7
	VPSUBW       Y4, Y0, Y0
	VPSUBW       Y5, Y1, Y1
	VPSUBW       Y6, Y2, Y2
	VPSUBW       Y7, Y3, Y3
	VMOVDQU      Y0, 128(AX)
	VMOVDQU      Y1, 160(AX)
	VMOVDQU      Y2, 192(AX)
	VMOVDQU      Y3, 224(AX)

	// ---- Bytes 256..383 ----
	VMOVDQU      256(AX), Y0
	VMOVDQU      288(AX), Y1
	VMOVDQU      320(AX), Y2
	VMOVDQU      352(AX), Y3
	VPMULHW      Y8, Y0, Y4
	VPMULHW      Y8, Y1, Y5
	VPMULHW      Y8, Y2, Y6
	VPMULHW      Y8, Y3, Y7
	VPSRAW       $0x0a, Y4, Y4
	VPSRAW       $0x0a, Y5, Y5
	VPSRAW       $0x0a, Y6, Y6
	VPSRAW       $0x0a, Y7, Y7
	VPMULLW      Y9, Y4, Y4
	VPMULLW      Y9, Y5, Y5
	VPMULLW      Y9, Y6, Y6
	VPMULLW      Y9, Y7, Y7
	VPSUBW       Y4, Y0, Y0
	VPSUBW       Y5, Y1, Y1
	VPSUBW       Y6, Y2, Y2
	VPSUBW       Y7, Y3, Y3
	VMOVDQU      Y0, 256(AX)
	VMOVDQU      Y1, 288(AX)
	VMOVDQU      Y2, 320(AX)
	VMOVDQU      Y3, 352(AX)

	// ---- Bytes 384..511 ----
	VMOVDQU      384(AX), Y0
	VMOVDQU      416(AX), Y1
	VMOVDQU      448(AX), Y2
	VMOVDQU      480(AX), Y3
	VPMULHW      Y8, Y0, Y4
	VPMULHW      Y8, Y1, Y5
	VPMULHW      Y8, Y2, Y6
	VPMULHW      Y8, Y3, Y7
	VPSRAW       $0x0a, Y4, Y4
	VPSRAW       $0x0a, Y5, Y5
	VPSRAW       $0x0a, Y6, Y6
	VPSRAW       $0x0a, Y7, Y7
	VPMULLW      Y9, Y4, Y4
	VPMULLW      Y9, Y5, Y5
	VPMULLW      Y9, Y6, Y6
	VPMULLW      Y9, Y7, Y7
	VPSUBW       Y4, Y0, Y0
	VPSUBW       Y5, Y1, Y1
	VPSUBW       Y6, Y2, Y2
	VPSUBW       Y7, Y3, Y3
	VMOVDQU      Y0, 384(AX)
	VMOVDQU      Y1, 416(AX)
	VMOVDQU      Y2, 448(AX)
	VMOVDQU      Y3, 480(AX)

	// Clear the upper halves of the YMM registers before returning to
	// non-VEX code.
	VZEROUPPER
	RET

// func normalizeAVX2(p *[256]int16)
// Requires: AVX, AVX2
//
// normalizeAVX2 rewrites every int16 x of p in place in two steps:
//
//	1. r = x - ((hi16(x * 0x4ebf) >> 10) * 0x0d01)
//	   the same reduction step that barrettReduceAVX2 performs, with
//	   0x0d01 = 3329 and 0x4ebf = 20159 (about 2^26/3329);
//	2. r = r - 3329; if r is negative, r = r + 3329
//	   done without branches: an arithmetic shift by 15 turns the sign bit
//	   into an all-ones/all-zeros lane mask, which selects 3329 or 0 to add
//	   back.
//
// Step 2 leaves values in [0, 3328] unchanged and maps exactly 3329 to 0.
// Presumably step 1 always yields a value in [0, 3329], which would make the
// final result the canonical representative modulo 3329 - TODO confirm the
// bound for all int16 inputs.
//
// The array is processed in four 128-byte groups of four vectors each.
// Register roles: AX = p, CX = scratch, Y9 = 3329 in every lane,
// Y8 = 20159 in every lane, Y0..Y3 = data, Y4..Y7 = temporaries.
//
// The upper YMM state is cleared with VZEROUPPER before returning so that
// legacy-SSE instructions executed by the caller do not incur AVX/SSE
// transition penalties. This file is generated, so the generator should emit
// the same instruction.
TEXT ·normalizeAVX2(SB), NOSPLIT, $0-8
	MOVQ         p+0(FP), AX

	// Y9 = 3329 in every lane.
	MOVL         $0x00000d01, CX
	VMOVD        CX, X0
	VPBROADCASTW X0, Y9

	// Y8 = 20159 in every lane.
	MOVL         $0x00004ebf, CX
	VMOVD        CX, X0
	VPBROADCASTW X0, Y8

	// ---- Bytes 0..127 ----
	VMOVDQU      (AX), Y0
	VMOVDQU      32(AX), Y1
	VMOVDQU      64(AX), Y2
	VMOVDQU      96(AX), Y3

	// Step 1: quotient estimate (x * 20159) >> 26, then x -= estimate * 3329.
	VPMULHW      Y8, Y0, Y4
	VPMULHW      Y8, Y1, Y5
	VPMULHW      Y8, Y2, Y6
	VPMULHW      Y8, Y3, Y7
	VPSRAW       $0x0a, Y4, Y4
	VPSRAW       $0x0a, Y5, Y5
	VPSRAW       $0x0a, Y6, Y6
	VPSRAW       $0x0a, Y7, Y7
	VPMULLW      Y9, Y4, Y4
	VPMULLW      Y9, Y5, Y5
	VPMULLW      Y9, Y6, Y6
	VPMULLW      Y9, Y7, Y7
	VPSUBW       Y4, Y0, Y0
	VPSUBW       Y5, Y1, Y1
	VPSUBW       Y6, Y2, Y2
	VPSUBW       Y7, Y3, Y3

	// Step 2: subtract 3329, build the sign mask, add back (mask & 3329).
	VPSUBW       Y9, Y0, Y0
	VPSUBW       Y9, Y1, Y1
	VPSUBW       Y9, Y2, Y2
	VPSUBW       Y9, Y3, Y3
	VPSRAW       $0x0f, Y0, Y4
	VPSRAW       $0x0f, Y1, Y5
	VPSRAW       $0x0f, Y2, Y6
	VPSRAW       $0x0f, Y3, Y7
	VPAND        Y4, Y9, Y4
	VPAND        Y5, Y9, Y5
	VPAND        Y6, Y9, Y6
	VPAND        Y7, Y9, Y7
	VPADDW       Y0, Y4, Y0
	VPADDW       Y1, Y5, Y1
	VPADDW       Y2, Y6, Y2
	VPADDW       Y3, Y7, Y3
	VMOVDQU      Y0, (AX)
	VMOVDQU      Y1, 32(AX)
	VMOVDQU      Y2, 64(AX)
	VMOVDQU      Y3, 96(AX)

	// ---- Bytes 128..255 ----
	VMOVDQU      128(AX), Y0
	VMOVDQU      160(AX), Y1
	VMOVDQU      192(AX), Y2
	VMOVDQU      224(AX), Y3
	VPMULHW      Y8, Y0, Y4
	VPMULHW      Y8, Y1, Y5
	VPMULHW      Y8, Y2, Y6
	VPMULHW      Y8, Y3, Y7
	VPSRAW       $0x0a, Y4, Y4
	VPSRAW       $0x0a, Y5, Y5
	VPSRAW       $0x0a, Y6, Y6
	VPSRAW       $0x0a, Y7, Y7
	VPMULLW      Y9, Y4, Y4
	VPMULLW      Y9, Y5, Y5
	VPMULLW      Y9, Y6, Y6
	VPMULLW      Y9, Y7, Y7
	VPSUBW       Y4, Y0, Y0
	VPSUBW       Y5, Y1, Y1
	VPSUBW       Y6, Y2, Y2
	VPSUBW       Y7, Y3, Y3
	VPSUBW       Y9, Y0, Y0
	VPSUBW       Y9, Y1, Y1
	VPSUBW       Y9, Y2, Y2
	VPSUBW       Y9, Y3, Y3
	VPSRAW       $0x0f, Y0, Y4
	VPSRAW       $0x0f, Y1, Y5
	VPSRAW       $0x0f, Y2, Y6
	VPSRAW       $0x0f, Y3, Y7
	VPAND        Y4, Y9, Y4
	VPAND        Y5, Y9, Y5
	VPAND        Y6, Y9, Y6
	VPAND        Y7, Y9, Y7
	VPADDW       Y0, Y4, Y0
	VPADDW       Y1, Y5, Y1
	VPADDW       Y2, Y6, Y2
	VPADDW       Y3, Y7, Y3
	VMOVDQU      Y0, 128(AX)
	VMOVDQU      Y1, 160(AX)
	VMOVDQU      Y2, 192(AX)
	VMOVDQU      Y3, 224(AX)

	// ---- Bytes 256..383 ----
	VMOVDQU      256(AX), Y0
	VMOVDQU      288(AX), Y1
	VMOVDQU      320(AX), Y2
	VMOVDQU      352(AX), Y3
	VPMULHW      Y8, Y0, Y4
	VPMULHW      Y8, Y1, Y5
	VPMULHW      Y8, Y2, Y6
	VPMULHW      Y8, Y3, Y7
	VPSRAW       $0x0a, Y4, Y4
	VPSRAW       $0x0a, Y5, Y5
	VPSRAW       $0x0a, Y6, Y6
	VPSRAW       $0x0a, Y7, Y7
	VPMULLW      Y9, Y4, Y4
	VPMULLW      Y9, Y5, Y5
	VPMULLW      Y9, Y6, Y6
	VPMULLW      Y9, Y7, Y7
	VPSUBW       Y4, Y0, Y0
	VPSUBW       Y5, Y1, Y1
	VPSUBW       Y6, Y2, Y2
	VPSUBW       Y7, Y3, Y3
	VPSUBW       Y9, Y0, Y0
	VPSUBW       Y9, Y1, Y1
	VPSUBW       Y9, Y2, Y2
	VPSUBW       Y9, Y3, Y3
	VPSRAW       $0x0f, Y0, Y4
	VPSRAW       $0x0f, Y1, Y5
	VPSRAW       $0x0f, Y2, Y6
	VPSRAW       $0x0f, Y3, Y7
	VPAND        Y4, Y9, Y4
	VPAND        Y5, Y9, Y5
	VPAND        Y6, Y9, Y6
	VPAND        Y7, Y9, Y7
	VPADDW       Y0, Y4, Y0
	VPADDW       Y1, Y5, Y1
	VPADDW       Y2, Y6, Y2
	VPADDW       Y3, Y7, Y3
	VMOVDQU      Y0, 256(AX)
	VMOVDQU      Y1, 288(AX)
	VMOVDQU      Y2, 320(AX)
	VMOVDQU      Y3, 352(AX)

	// ---- Bytes 384..511 ----
	VMOVDQU      384(AX), Y0
	VMOVDQU      416(AX), Y1
	VMOVDQU      448(AX), Y2
	VMOVDQU      480(AX), Y3
	VPMULHW      Y8, Y0, Y4
	VPMULHW      Y8, Y1, Y5
	VPMULHW      Y8, Y2, Y6
	VPMULHW      Y8, Y3, Y7
	VPSRAW       $0x0a, Y4, Y4
	VPSRAW       $0x0a, Y5, Y5
	VPSRAW       $0x0a, Y6, Y6
	VPSRAW       $0x0a, Y7, Y7
	VPMULLW      Y9, Y4, Y4
	VPMULLW      Y9, Y5, Y5
	VPMULLW      Y9, Y6, Y6
	VPMULLW      Y9, Y7, Y7
	VPSUBW       Y4, Y0, Y0
	VPSUBW       Y5, Y1, Y1
	VPSUBW       Y6, Y2, Y2
	VPSUBW       Y7, Y3, Y3
	VPSUBW       Y9, Y0, Y0
	VPSUBW       Y9, Y1, Y1
	VPSUBW       Y9, Y2, Y2
	VPSUBW       Y9, Y3, Y3
	VPSRAW       $0x0f, Y0, Y4
	VPSRAW       $0x0f, Y1, Y5
	VPSRAW       $0x0f, Y2, Y6
	VPSRAW       $0x0f, Y3, Y7
	VPAND        Y4, Y9, Y4
	VPAND        Y5, Y9, Y5
	VPAND        Y6, Y9, Y6
	VPAND        Y7, Y9, Y7
	VPADDW       Y0, Y4, Y0
	VPADDW       Y1, Y5, Y1
	VPADDW       Y2, Y6, Y2
	VPADDW       Y3, Y7, Y3
	VMOVDQU      Y0, 384(AX)
	VMOVDQU      Y1, 416(AX)
	VMOVDQU      Y2, 448(AX)
	VMOVDQU      Y3, 480(AX)

	// Clear the upper halves of the YMM registers before returning to
	// non-VEX code.
	VZEROUPPER
	RET
