blake2s_386.s raw

   1  // Copyright 2016 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  //go:build 386 && gc && !purego
   6  
   7  #include "textflag.h"
   8  
      // iv0 holds the first four 32-bit words of the BLAKE2s initialization
      // vector as a 16-byte read-only constant. Both hashBlocks routines in this
      // file load it into X6 at the start of every block.
   9  DATA iv0<>+0x00(SB)/4, $0x6a09e667
  10  DATA iv0<>+0x04(SB)/4, $0xbb67ae85
  11  DATA iv0<>+0x08(SB)/4, $0x3c6ef372
  12  DATA iv0<>+0x0c(SB)/4, $0xa54ff53a
  13  GLOBL iv0<>(SB), (NOPTR+RODATA), $16
  14  
      // iv1 holds the last four 32-bit words of the BLAKE2s initialization
      // vector as a 16-byte read-only constant. Both hashBlocks routines in this
      // file load it into X7 at the start of every block and then XOR the
      // {c[0], c[1], flag, 0} vector into it.
  15  DATA iv1<>+0x00(SB)/4, $0x510e527f
  16  DATA iv1<>+0x04(SB)/4, $0x9b05688c
  17  DATA iv1<>+0x08(SB)/4, $0x1f83d9ab
  18  DATA iv1<>+0x0c(SB)/4, $0x5be0cd19
  19  GLOBL iv1<>(SB), (NOPTR+RODATA), $16
  20  
      // rol16 is a 16-byte PSHUFB mask. In memory order its bytes are
      // 02 03 00 01 | 06 07 04 05 | 0A 0B 08 09 | 0E 0F 0C 0D, so within every
      // 32-bit lane the two 16-bit halves are swapped, i.e. the lane is rotated
      // by 16 bits. hashBlocksSSSE3 passes it as the c16 operand of ROUND_SSSE3.
  21  DATA rol16<>+0x00(SB)/8, $0x0504070601000302
  22  DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
  23  GLOBL rol16<>(SB), (NOPTR+RODATA), $16
  24  
      // rol8 is a 16-byte PSHUFB mask. In memory order its bytes are
      // 01 02 03 00 | 05 06 07 04 | 09 0A 0B 08 | 0D 0E 0F 0C, so within every
      // 32-bit lane each destination byte takes the next-higher source byte
      // (wrapping around). Despite the name, that rotates the lane RIGHT by
      // 8 bits, which equals a left rotation by 24 bits: the same result as
      // ROTL_SSE2(24, ...) in ROUND_SSE2. hashBlocksSSSE3 passes it as the c8
      // operand of ROUND_SSSE3.
  25  DATA rol8<>+0x00(SB)/8, $0x0407060500030201
  26  DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
  27  GLOBL rol8<>(SB), (NOPTR+RODATA), $16
  28  
      // counter is the per-block increment for the byte counter: a 128-bit
      // constant whose low 64-bit lane is 0x40 (64, the block size in bytes) and
      // whose high 64-bit lane is zero. The hashBlocks routines add it with PADDQ
      // to the {c[0], c[1], flag, 0} vector, so the 64-bit value c[1]:c[0] grows
      // by 64 per block while the lane holding flag is left unchanged.
  29  DATA counter<>+0x00(SB)/8, $0x40
  30  DATA counter<>+0x08(SB)/8, $0x0
  31  GLOBL counter<>(SB), (NOPTR+RODATA), $16
  32  
      // ROTL_SSE2(n, t, v) rotates every 32-bit lane of XMM register v left by
      // n bits using only SSE2 instructions:
      //
      //	t = v << n
      //	v = v >> (32-n)
      //	v = v ^ t
      //
      // n is emitted as an immediate shift count. t is a scratch XMM register
      // and is overwritten. The two shifted values have no bits in common, so
      // the XOR merges them exactly like an OR would.
  33  #define ROTL_SSE2(n, t, v) \
  34  	MOVO  v, t;       \
  35  	PSLLL $n, t;      \
  36  	PSRLL $(32-n), v; \
  37  	PXOR  t, v
  38  
      // ROTL_SSSE3(c, v) permutes the bytes of XMM register v in place according
      // to the PSHUFB mask c. With the rol16 or rol8 mask defined in this file
      // the permutation is a rotation of every 32-bit lane by a whole number of
      // bytes, done in one instruction and without a scratch register.
  39  #define ROTL_SSSE3(c, v) \
  40  	PSHUFB c, v
  41  
      // ROUND_SSE2 performs one complete BLAKE2s round using SSE2 only.
      //
      // The 16-word working state is held as four rows v0..v3, one XMM register
      // each, so every instruction applies the same step of the mixing function
      // to four 32-bit lanes at once.
      //
      //	v0..v3  state rows (updated in place)
      //	m0, m1  message-word vectors added in the first (column) half
      //	m2, m3  message-word vectors added in the second (diagonal) half
      //	t       scratch XMM register, overwritten by ROTL_SSE2
      //
      // Each half executes, lane-wise:
      //
      //	v0 += m + v1;  v3 ^= v0;  v3 = rotl(v3, 16)
      //	v2 += v3;      v1 ^= v2;  v1 = rotl(v1, 20)
      //	v0 += m' + v1; v3 ^= v0;  v3 = rotl(v3, 24)
      //	v2 += v3;      v1 ^= v2;  v1 = rotl(v1, 25)
      //
      // Left rotations by 16, 20, 24 and 25 equal right rotations by 16, 12, 8
      // and 7.
      //
      // Between the halves the PSHUFL instructions rotate the lanes of v1, v2
      // and v3 by one, two and three positions, so that the diagonals of the
      // state line up as columns. The final three PSHUFL instructions apply the
      // opposite lane rotations ($0x39 and $0x93 swap roles between v1 and v3)
      // and restore the original lane order.
  42  #define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \
  43  	PADDL  m0, v0;        \
  44  	PADDL  v1, v0;        \
  45  	PXOR   v0, v3;        \
  46  	ROTL_SSE2(16, t, v3); \
  47  	PADDL  v3, v2;        \
  48  	PXOR   v2, v1;        \
  49  	ROTL_SSE2(20, t, v1); \
  50  	PADDL  m1, v0;        \
  51  	PADDL  v1, v0;        \
  52  	PXOR   v0, v3;        \
  53  	ROTL_SSE2(24, t, v3); \
  54  	PADDL  v3, v2;        \
  55  	PXOR   v2, v1;        \
  56  	ROTL_SSE2(25, t, v1); \
  57  	PSHUFL $0x39, v1, v1; \
  58  	PSHUFL $0x4E, v2, v2; \
  59  	PSHUFL $0x93, v3, v3; \
  60  	PADDL  m2, v0;        \
  61  	PADDL  v1, v0;        \
  62  	PXOR   v0, v3;        \
  63  	ROTL_SSE2(16, t, v3); \
  64  	PADDL  v3, v2;        \
  65  	PXOR   v2, v1;        \
  66  	ROTL_SSE2(20, t, v1); \
  67  	PADDL  m3, v0;        \
  68  	PADDL  v1, v0;        \
  69  	PXOR   v0, v3;        \
  70  	ROTL_SSE2(24, t, v3); \
  71  	PADDL  v3, v2;        \
  72  	PXOR   v2, v1;        \
  73  	ROTL_SSE2(25, t, v1); \
  74  	PSHUFL $0x39, v3, v3; \
  75  	PSHUFL $0x4E, v2, v2; \
  76  	PSHUFL $0x93, v1, v1
  77  
      // ROUND_SSSE3 performs one complete BLAKE2s round. It is the same
      // instruction sequence as ROUND_SSE2 except that the two rotations of v3
      // (left by 16 and left by 24) are done with a single PSHUFB each instead
      // of the three-instruction shift/shift/xor sequence. The rotations of v1
      // (left by 20 and 25) are not whole-byte rotations and still use
      // ROTL_SSE2.
      //
      //	v0..v3  state rows, one XMM register each (updated in place)
      //	m0, m1  message-word vectors added in the first (column) half
      //	m2, m3  message-word vectors added in the second (diagonal) half
      //	t       scratch XMM register, overwritten by ROTL_SSE2
      //	c16     PSHUFB mask that rotates each 32-bit lane by 16 bits (rol16)
      //	c8      PSHUFB mask that rotates each 32-bit lane right by 8 bits,
      //	        i.e. left by 24 (rol8)
      //
      // c16 and c8 are only read. The PSHUFL lane rotations between and after
      // the two halves are identical to those in ROUND_SSE2.
  78  #define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \
  79  	PADDL  m0, v0;        \
  80  	PADDL  v1, v0;        \
  81  	PXOR   v0, v3;        \
  82  	ROTL_SSSE3(c16, v3);  \
  83  	PADDL  v3, v2;        \
  84  	PXOR   v2, v1;        \
  85  	ROTL_SSE2(20, t, v1); \
  86  	PADDL  m1, v0;        \
  87  	PADDL  v1, v0;        \
  88  	PXOR   v0, v3;        \
  89  	ROTL_SSSE3(c8, v3);   \
  90  	PADDL  v3, v2;        \
  91  	PXOR   v2, v1;        \
  92  	ROTL_SSE2(25, t, v1); \
  93  	PSHUFL $0x39, v1, v1; \
  94  	PSHUFL $0x4E, v2, v2; \
  95  	PSHUFL $0x93, v3, v3; \
  96  	PADDL  m2, v0;        \
  97  	PADDL  v1, v0;        \
  98  	PXOR   v0, v3;        \
  99  	ROTL_SSSE3(c16, v3);  \
 100  	PADDL  v3, v2;        \
 101  	PXOR   v2, v1;        \
 102  	ROTL_SSE2(20, t, v1); \
 103  	PADDL  m3, v0;        \
 104  	PADDL  v1, v0;        \
 105  	PXOR   v0, v3;        \
 106  	ROTL_SSSE3(c8, v3);   \
 107  	PADDL  v3, v2;        \
 108  	PXOR   v2, v1;        \
 109  	ROTL_SSE2(25, t, v1); \
 110  	PSHUFL $0x39, v3, v3; \
 111  	PSHUFL $0x4E, v2, v2; \
 112  	PSHUFL $0x93, v1, v1
 113  
      // PRECOMPUTE expands one 64-byte message block into ten pre-permuted
      // copies, one per round, so that each ROUND_* invocation can take its four
      // message operands as plain 16-byte memory operands.
      //
      //	dst  base register of the destination buffer
      //	off  constant byte offset added to every destination address
      //	src  register pointing at the 16 message words (64 bytes are read)
      //	t    32-bit scratch register, overwritten
      //
      // For each of the 16 source words the macro loads the word once and stores
      // it ten times: once into each 64-byte copy at off+0, off+64, ...,
      // off+576. In total it writes the 640 bytes dst+off .. dst+off+639.
      //
      // Within the copy for round r, the four consecutive 16-byte vectors are
      // the m0, m1, m2 and m3 operands of that round. The store offsets encode
      // the BLAKE2s message schedule (sigma) for the round, regrouped so that
      // the first-input words of the column step come first, then its
      // second-input words, then the same two groups for the diagonal step.
      // For round 0 (identity schedule) the copy is therefore:
      //
      //	w0 w2 w4 w6 | w1 w3 w5 w7 | w8 w10 w12 w14 | w9 w11 w13 w15
      //
      // The macro body is one backslash-continued line sequence, so it carries
      // no interior comments.
 114  #define PRECOMPUTE(dst, off, src, t) \
 115  	MOVL 0*4(src), t;          \
 116  	MOVL t, 0*4+off+0(dst);    \
 117  	MOVL t, 9*4+off+64(dst);   \
 118  	MOVL t, 5*4+off+128(dst);  \
 119  	MOVL t, 14*4+off+192(dst); \
 120  	MOVL t, 4*4+off+256(dst);  \
 121  	MOVL t, 2*4+off+320(dst);  \
 122  	MOVL t, 8*4+off+384(dst);  \
 123  	MOVL t, 12*4+off+448(dst); \
 124  	MOVL t, 3*4+off+512(dst);  \
 125  	MOVL t, 15*4+off+576(dst); \
 126  	MOVL 1*4(src), t;          \
 127  	MOVL t, 4*4+off+0(dst);    \
 128  	MOVL t, 8*4+off+64(dst);   \
 129  	MOVL t, 14*4+off+128(dst); \
 130  	MOVL t, 5*4+off+192(dst);  \
 131  	MOVL t, 12*4+off+256(dst); \
 132  	MOVL t, 11*4+off+320(dst); \
 133  	MOVL t, 1*4+off+384(dst);  \
 134  	MOVL t, 6*4+off+448(dst);  \
 135  	MOVL t, 10*4+off+512(dst); \
 136  	MOVL t, 3*4+off+576(dst);  \
 137  	MOVL 2*4(src), t;          \
 138  	MOVL t, 1*4+off+0(dst);    \
 139  	MOVL t, 13*4+off+64(dst);  \
 140  	MOVL t, 6*4+off+128(dst);  \
 141  	MOVL t, 8*4+off+192(dst);  \
 142  	MOVL t, 2*4+off+256(dst);  \
 143  	MOVL t, 0*4+off+320(dst);  \
 144  	MOVL t, 14*4+off+384(dst); \
 145  	MOVL t, 11*4+off+448(dst); \
 146  	MOVL t, 12*4+off+512(dst); \
 147  	MOVL t, 4*4+off+576(dst);  \
 148  	MOVL 3*4(src), t;          \
 149  	MOVL t, 5*4+off+0(dst);    \
 150  	MOVL t, 15*4+off+64(dst);  \
 151  	MOVL t, 9*4+off+128(dst);  \
 152  	MOVL t, 1*4+off+192(dst);  \
 153  	MOVL t, 11*4+off+256(dst); \
 154  	MOVL t, 7*4+off+320(dst);  \
 155  	MOVL t, 13*4+off+384(dst); \
 156  	MOVL t, 3*4+off+448(dst);  \
 157  	MOVL t, 6*4+off+512(dst);  \
 158  	MOVL t, 10*4+off+576(dst); \
 159  	MOVL 4*4(src), t;          \
 160  	MOVL t, 2*4+off+0(dst);    \
 161  	MOVL t, 1*4+off+64(dst);   \
 162  	MOVL t, 15*4+off+128(dst); \
 163  	MOVL t, 10*4+off+192(dst); \
 164  	MOVL t, 6*4+off+256(dst);  \
 165  	MOVL t, 8*4+off+320(dst);  \
 166  	MOVL t, 3*4+off+384(dst);  \
 167  	MOVL t, 13*4+off+448(dst); \
 168  	MOVL t, 14*4+off+512(dst); \
 169  	MOVL t, 5*4+off+576(dst);  \
 170  	MOVL 5*4(src), t;          \
 171  	MOVL t, 6*4+off+0(dst);    \
 172  	MOVL t, 11*4+off+64(dst);  \
 173  	MOVL t, 2*4+off+128(dst);  \
 174  	MOVL t, 9*4+off+192(dst);  \
 175  	MOVL t, 1*4+off+256(dst);  \
 176  	MOVL t, 13*4+off+320(dst); \
 177  	MOVL t, 4*4+off+384(dst);  \
 178  	MOVL t, 8*4+off+448(dst);  \
 179  	MOVL t, 15*4+off+512(dst); \
 180  	MOVL t, 7*4+off+576(dst);  \
 181  	MOVL 6*4(src), t;          \
 182  	MOVL t, 3*4+off+0(dst);    \
 183  	MOVL t, 7*4+off+64(dst);   \
 184  	MOVL t, 13*4+off+128(dst); \
 185  	MOVL t, 12*4+off+192(dst); \
 186  	MOVL t, 10*4+off+256(dst); \
 187  	MOVL t, 1*4+off+320(dst);  \
 188  	MOVL t, 9*4+off+384(dst);  \
 189  	MOVL t, 14*4+off+448(dst); \
 190  	MOVL t, 0*4+off+512(dst);  \
 191  	MOVL t, 6*4+off+576(dst);  \
 192  	MOVL 7*4(src), t;          \
 193  	MOVL t, 7*4+off+0(dst);    \
 194  	MOVL t, 14*4+off+64(dst);  \
 195  	MOVL t, 10*4+off+128(dst); \
 196  	MOVL t, 0*4+off+192(dst);  \
 197  	MOVL t, 5*4+off+256(dst);  \
 198  	MOVL t, 9*4+off+320(dst);  \
 199  	MOVL t, 12*4+off+384(dst); \
 200  	MOVL t, 1*4+off+448(dst);  \
 201  	MOVL t, 13*4+off+512(dst); \
 202  	MOVL t, 2*4+off+576(dst);  \
 203  	MOVL 8*4(src), t;          \
 204  	MOVL t, 8*4+off+0(dst);    \
 205  	MOVL t, 5*4+off+64(dst);   \
 206  	MOVL t, 4*4+off+128(dst);  \
 207  	MOVL t, 15*4+off+192(dst); \
 208  	MOVL t, 14*4+off+256(dst); \
 209  	MOVL t, 3*4+off+320(dst);  \
 210  	MOVL t, 11*4+off+384(dst); \
 211  	MOVL t, 10*4+off+448(dst); \
 212  	MOVL t, 7*4+off+512(dst);  \
 213  	MOVL t, 1*4+off+576(dst);  \
 214  	MOVL 9*4(src), t;          \
 215  	MOVL t, 12*4+off+0(dst);   \
 216  	MOVL t, 2*4+off+64(dst);   \
 217  	MOVL t, 11*4+off+128(dst); \
 218  	MOVL t, 4*4+off+192(dst);  \
 219  	MOVL t, 0*4+off+256(dst);  \
 220  	MOVL t, 15*4+off+320(dst); \
 221  	MOVL t, 10*4+off+384(dst); \
 222  	MOVL t, 7*4+off+448(dst);  \
 223  	MOVL t, 5*4+off+512(dst);  \
 224  	MOVL t, 9*4+off+576(dst);  \
 225  	MOVL 10*4(src), t;         \
 226  	MOVL t, 9*4+off+0(dst);    \
 227  	MOVL t, 4*4+off+64(dst);   \
 228  	MOVL t, 8*4+off+128(dst);  \
 229  	MOVL t, 13*4+off+192(dst); \
 230  	MOVL t, 3*4+off+256(dst);  \
 231  	MOVL t, 5*4+off+320(dst);  \
 232  	MOVL t, 7*4+off+384(dst);  \
 233  	MOVL t, 15*4+off+448(dst); \
 234  	MOVL t, 11*4+off+512(dst); \
 235  	MOVL t, 0*4+off+576(dst);  \
 236  	MOVL 11*4(src), t;         \
 237  	MOVL t, 13*4+off+0(dst);   \
 238  	MOVL t, 10*4+off+64(dst);  \
 239  	MOVL t, 0*4+off+128(dst);  \
 240  	MOVL t, 3*4+off+192(dst);  \
 241  	MOVL t, 9*4+off+256(dst);  \
 242  	MOVL t, 6*4+off+320(dst);  \
 243  	MOVL t, 15*4+off+384(dst); \
 244  	MOVL t, 4*4+off+448(dst);  \
 245  	MOVL t, 2*4+off+512(dst);  \
 246  	MOVL t, 12*4+off+576(dst); \
 247  	MOVL 12*4(src), t;         \
 248  	MOVL t, 10*4+off+0(dst);   \
 249  	MOVL t, 12*4+off+64(dst);  \
 250  	MOVL t, 1*4+off+128(dst);  \
 251  	MOVL t, 6*4+off+192(dst);  \
 252  	MOVL t, 13*4+off+256(dst); \
 253  	MOVL t, 4*4+off+320(dst);  \
 254  	MOVL t, 0*4+off+384(dst);  \
 255  	MOVL t, 2*4+off+448(dst);  \
 256  	MOVL t, 8*4+off+512(dst);  \
 257  	MOVL t, 14*4+off+576(dst); \
 258  	MOVL 13*4(src), t;         \
 259  	MOVL t, 14*4+off+0(dst);   \
 260  	MOVL t, 3*4+off+64(dst);   \
 261  	MOVL t, 7*4+off+128(dst);  \
 262  	MOVL t, 2*4+off+192(dst);  \
 263  	MOVL t, 15*4+off+256(dst); \
 264  	MOVL t, 12*4+off+320(dst); \
 265  	MOVL t, 6*4+off+384(dst);  \
 266  	MOVL t, 0*4+off+448(dst);  \
 267  	MOVL t, 9*4+off+512(dst);  \
 268  	MOVL t, 11*4+off+576(dst); \
 269  	MOVL 14*4(src), t;         \
 270  	MOVL t, 11*4+off+0(dst);   \
 271  	MOVL t, 0*4+off+64(dst);   \
 272  	MOVL t, 12*4+off+128(dst); \
 273  	MOVL t, 7*4+off+192(dst);  \
 274  	MOVL t, 8*4+off+256(dst);  \
 275  	MOVL t, 14*4+off+320(dst); \
 276  	MOVL t, 2*4+off+384(dst);  \
 277  	MOVL t, 5*4+off+448(dst);  \
 278  	MOVL t, 1*4+off+512(dst);  \
 279  	MOVL t, 13*4+off+576(dst); \
 280  	MOVL 15*4(src), t;         \
 281  	MOVL t, 15*4+off+0(dst);   \
 282  	MOVL t, 6*4+off+64(dst);   \
 283  	MOVL t, 3*4+off+128(dst);  \
 284  	MOVL t, 11*4+off+192(dst); \
 285  	MOVL t, 7*4+off+256(dst);  \
 286  	MOVL t, 10*4+off+320(dst); \
 287  	MOVL t, 5*4+off+384(dst);  \
 288  	MOVL t, 9*4+off+448(dst);  \
 289  	MOVL t, 4*4+off+512(dst);  \
 290  	MOVL t, 8*4+off+576(dst)
 291  
 292  // func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
      //
      // hashBlocksSSE2 applies the BLAKE2s compression function to every 64-byte
      // block of blocks, using SSE2 instructions only. It updates the chaining
      // value h and the byte counter c in place.
      //
      // Arguments:
      //	h       chaining value, eight 32-bit words; read on entry, written on exit
      //	c       byte counter, two 32-bit words (c[0] low, c[1] high); 64 is added
      //	        to it before each block is compressed, and it is written on exit
      //	flag    word XORed (unchanged for every block) into word 2 of the iv1 row
      //	blocks  input data; only the base pointer and length are used
      //
      // Precondition (not checked here): the loop body runs before the remaining
      // length is tested, and the loop exits only when that length reaches
      // exactly zero after subtracting 64. blocks_len must therefore be a
      // non-zero multiple of 64.
      //
      // Register use:
      //	AX = h, BX = c, SI = current block, DX = bytes remaining
      //	CX = scratch, DI = 16-byte-aligned scratch area on the stack
      //	X0, X1 = h[0:4], h[4:8]   X2 = counter increment   X3 = scratch
      //	X4..X7 = the four rows of the working state
      //
      // Stack scratch area (relative to DI):
      //	  0..15   {c[0], c[1], flag, 0}
      //	 16..655  ten pre-permuted copies of the message block (PRECOMPUTE)
      // The frame is 16 bytes larger than that so DI can be rounded up to a
      // 16-byte boundary; MOVO and the memory operands of PADDL need aligned
      // addresses.
 293  TEXT ·hashBlocksSSE2(SB), 0, $672-24 // frame = 656 + 16 byte alignment
 294  	MOVL h+0(FP), AX
 295  	MOVL c+4(FP), BX
 296  	MOVL flag+8(FP), CX
 297  	MOVL blocks_base+12(FP), SI
 298  	MOVL blocks_len+16(FP), DX
 299  
      	// DI = SP rounded up to the next multiple of 16.
 300  	MOVL SP, DI
 301  	ADDL $15, DI
 302  	ANDL $~15, DI
 303  
      	// Build the vector {c[0], c[1], flag, 0} at 0(DI).
 304  	MOVL CX, 8(DI)
 305  	MOVL 0(BX), CX
 306  	MOVL CX, 0(DI)
 307  	MOVL 4(BX), CX
 308  	MOVL CX, 4(DI)
 309  	XORL CX, CX
 310  	MOVL CX, 12(DI)
 311  
      	// h may be unaligned, hence MOVOU.
 312  	MOVOU 0(AX), X0
 313  	MOVOU 16(AX), X1
 314  	MOVOU counter<>(SB), X2
 315  
 316  loop:
      	// Working state rows: X4, X5 = h; X6, X7 = IV.
 317  	MOVO  X0, X4
 318  	MOVO  X1, X5
 319  	MOVOU iv0<>(SB), X6
 320  	MOVOU iv1<>(SB), X7
 321  
      	// Advance the 64-bit counter by 64, fold {c[0], c[1], flag, 0} into
      	// the last row, and keep the updated counter on the stack.
 322  	MOVO  0(DI), X3
 323  	PADDQ X2, X3
 324  	PXOR  X3, X7
 325  	MOVO  X3, 0(DI)
 326  
      	// Lay out the message words for all ten rounds at 16(DI), then run
      	// the rounds; round r reads its four operands from 16+64*r(DI) onward.
 327  	PRECOMPUTE(DI, 16, SI, CX)
 328  	ROUND_SSE2(X4, X5, X6, X7, 16(DI), 32(DI), 48(DI), 64(DI), X3)
 329  	ROUND_SSE2(X4, X5, X6, X7, 16+64(DI), 32+64(DI), 48+64(DI), 64+64(DI), X3)
 330  	ROUND_SSE2(X4, X5, X6, X7, 16+128(DI), 32+128(DI), 48+128(DI), 64+128(DI), X3)
 331  	ROUND_SSE2(X4, X5, X6, X7, 16+192(DI), 32+192(DI), 48+192(DI), 64+192(DI), X3)
 332  	ROUND_SSE2(X4, X5, X6, X7, 16+256(DI), 32+256(DI), 48+256(DI), 64+256(DI), X3)
 333  	ROUND_SSE2(X4, X5, X6, X7, 16+320(DI), 32+320(DI), 48+320(DI), 64+320(DI), X3)
 334  	ROUND_SSE2(X4, X5, X6, X7, 16+384(DI), 32+384(DI), 48+384(DI), 64+384(DI), X3)
 335  	ROUND_SSE2(X4, X5, X6, X7, 16+448(DI), 32+448(DI), 48+448(DI), 64+448(DI), X3)
 336  	ROUND_SSE2(X4, X5, X6, X7, 16+512(DI), 32+512(DI), 48+512(DI), 64+512(DI), X3)
 337  	ROUND_SSE2(X4, X5, X6, X7, 16+576(DI), 32+576(DI), 48+576(DI), 64+576(DI), X3)
 338  
      	// Feed-forward: h[0:4] ^= row0 ^ row2, h[4:8] ^= row1 ^ row3.
 339  	PXOR X4, X0
 340  	PXOR X5, X1
 341  	PXOR X6, X0
 342  	PXOR X7, X1
 343  
      	// Next block; LEAL leaves the flags alone, SUBL sets them for JNE.
 344  	LEAL 64(SI), SI
 345  	SUBL $64, DX
 346  	JNE  loop
 347  
      	// Write the updated counter back to c.
 348  	MOVL 0(DI), CX
 349  	MOVL CX, 0(BX)
 350  	MOVL 4(DI), CX
 351  	MOVL CX, 4(BX)
 352  
      	// Write the updated chaining value back to h.
 353  	MOVOU X0, 0(AX)
 354  	MOVOU X1, 16(AX)
 355  
 356  	RET
 357  
 358  // func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
      //
      // hashBlocksSSSE3 applies the BLAKE2s compression function to every
      // 64-byte block of blocks, using PSHUFB (SSSE3) for the whole-byte
      // rotations. It updates the chaining value h and the byte counter c in
      // place.
      //
      // Arguments:
      //	h       chaining value, eight 32-bit words; read on entry, written on exit
      //	c       byte counter, two 32-bit words (c[0] low, c[1] high); 64 is added
      //	        to it before each block is compressed, and it is written on exit
      //	flag    word XORed (unchanged for every block) into word 2 of the iv1 row
      //	blocks  input data; only the base pointer and length are used
      //
      // Precondition (not checked here): the loop body runs before the remaining
      // length is tested, and the loop exits only when that length reaches
      // exactly zero after subtracting 64. blocks_len must therefore be a
      // non-zero multiple of 64.
      //
      // Register use:
      //	AX = h, BX = c, SI = current block, DX = bytes remaining
      //	CX = scratch, DI = 16-byte-aligned scratch area on the stack
      //	X0, X1 = h[0:4], h[4:8] between blocks; during the rounds they hold
      //	         the rol16 and rol8 shuffle masks instead
      //	X2 = counter increment   X3 = scratch
      //	X4..X7 = the four rows of the working state
      //
      // Stack scratch area (relative to DI):
      //	   0..15   {c[0], c[1], flag, 0}
      //	  16..655  ten pre-permuted copies of the message block (PRECOMPUTE)
      //	 656..687  h[0:4] and h[4:8], parked while X0/X1 hold the masks
      // The frame is 16 bytes larger than that so DI can be rounded up to a
      // 16-byte boundary; MOVO and the memory operands of PADDL need aligned
      // addresses.
 359  TEXT ·hashBlocksSSSE3(SB), 0, $704-24 // frame = 688 + 16 byte alignment
 360  	MOVL h+0(FP), AX
 361  	MOVL c+4(FP), BX
 362  	MOVL flag+8(FP), CX
 363  	MOVL blocks_base+12(FP), SI
 364  	MOVL blocks_len+16(FP), DX
 365  
      	// DI = SP rounded up to the next multiple of 16.
 366  	MOVL SP, DI
 367  	ADDL $15, DI
 368  	ANDL $~15, DI
 369  
      	// Build the vector {c[0], c[1], flag, 0} at 0(DI).
 370  	MOVL CX, 8(DI)
 371  	MOVL 0(BX), CX
 372  	MOVL CX, 0(DI)
 373  	MOVL 4(BX), CX
 374  	MOVL CX, 4(DI)
 375  	XORL CX, CX
 376  	MOVL CX, 12(DI)
 377  
      	// h may be unaligned, hence MOVOU.
 378  	MOVOU 0(AX), X0
 379  	MOVOU 16(AX), X1
 380  	MOVOU counter<>(SB), X2
 381  
 382  loop:
      	// Park h on the stack: X0 and X1 are about to be reused for the
      	// shuffle masks, and h is needed again for the feed-forward.
 383  	MOVO  X0, 656(DI)
 384  	MOVO  X1, 672(DI)
      	// Working state rows: X4, X5 = h; X6, X7 = IV.
 385  	MOVO  X0, X4
 386  	MOVO  X1, X5
 387  	MOVOU iv0<>(SB), X6
 388  	MOVOU iv1<>(SB), X7
 389  
      	// Advance the 64-bit counter by 64, fold {c[0], c[1], flag, 0} into
      	// the last row, and keep the updated counter on the stack.
 390  	MOVO  0(DI), X3
 391  	PADDQ X2, X3
 392  	PXOR  X3, X7
 393  	MOVO  X3, 0(DI)
 394  
      	// X0, X1 = PSHUFB masks used as c16 and c8 by ROUND_SSSE3.
 395  	MOVOU rol16<>(SB), X0
 396  	MOVOU rol8<>(SB), X1
 397  
      	// Lay out the message words for all ten rounds at 16(DI), then run
      	// the rounds; round r reads its four operands from 16+64*r(DI) onward.
 398  	PRECOMPUTE(DI, 16, SI, CX)
 399  	ROUND_SSSE3(X4, X5, X6, X7, 16(DI), 32(DI), 48(DI), 64(DI), X3, X0, X1)
 400  	ROUND_SSSE3(X4, X5, X6, X7, 16+64(DI), 32+64(DI), 48+64(DI), 64+64(DI), X3, X0, X1)
 401  	ROUND_SSSE3(X4, X5, X6, X7, 16+128(DI), 32+128(DI), 48+128(DI), 64+128(DI), X3, X0, X1)
 402  	ROUND_SSSE3(X4, X5, X6, X7, 16+192(DI), 32+192(DI), 48+192(DI), 64+192(DI), X3, X0, X1)
 403  	ROUND_SSSE3(X4, X5, X6, X7, 16+256(DI), 32+256(DI), 48+256(DI), 64+256(DI), X3, X0, X1)
 404  	ROUND_SSSE3(X4, X5, X6, X7, 16+320(DI), 32+320(DI), 48+320(DI), 64+320(DI), X3, X0, X1)
 405  	ROUND_SSSE3(X4, X5, X6, X7, 16+384(DI), 32+384(DI), 48+384(DI), 64+384(DI), X3, X0, X1)
 406  	ROUND_SSSE3(X4, X5, X6, X7, 16+448(DI), 32+448(DI), 48+448(DI), 64+448(DI), X3, X0, X1)
 407  	ROUND_SSSE3(X4, X5, X6, X7, 16+512(DI), 32+512(DI), 48+512(DI), 64+512(DI), X3, X0, X1)
 408  	ROUND_SSSE3(X4, X5, X6, X7, 16+576(DI), 32+576(DI), 48+576(DI), 64+576(DI), X3, X0, X1)
 409  
      	// Reload h, then feed-forward: h[0:4] ^= row0 ^ row2,
      	// h[4:8] ^= row1 ^ row3.
 410  	MOVO 656(DI), X0
 411  	MOVO 672(DI), X1
 412  	PXOR X4, X0
 413  	PXOR X5, X1
 414  	PXOR X6, X0
 415  	PXOR X7, X1
 416  
      	// Next block; LEAL leaves the flags alone, SUBL sets them for JNE.
 417  	LEAL 64(SI), SI
 418  	SUBL $64, DX
 419  	JNE  loop
 420  
      	// Write the updated counter back to c.
 421  	MOVL 0(DI), CX
 422  	MOVL CX, 0(BX)
 423  	MOVL 4(DI), CX
 424  	MOVL CX, 4(BX)
 425  
      	// Write the updated chaining value back to h.
 426  	MOVOU X0, 0(AX)
 427  	MOVOU X1, 16(AX)
 428  
 429  	RET
 430