// md5block_loong64.s

   1  // Copyright 2024 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  //
   5  // Loong64 version of md5block.go
   6  // derived from crypto/md5/md5block_amd64.s
   7  
   8  //go:build !purego
   9  
  10  #define REGTMP	R30
  11  #define REGTMP1 R12
  12  #define REGTMP2 R18
  13  
  14  #include "textflag.h"
  15  
  16  // func block(dig *digest, p []byte)
  17  TEXT	·block(SB),NOSPLIT,$0-32
  18  	MOVV	dig+0(FP), R4
  19  	MOVV	p+8(FP), R5
  20  	MOVV	p_len+16(FP), R6
  21  	AND	$~63, R6
  22  	BEQ	R6, zero
  23  
  24  	// p_len >= 64
  25  	ADDV	R5, R6, R24
  26  	MOVW	(0*4)(R4), R7
  27  	MOVW	(1*4)(R4), R8
  28  	MOVW	(2*4)(R4), R9
  29  	MOVW	(3*4)(R4), R10
  30  
  31  loop:
  32  	MOVW	R7, R14
  33  	MOVW	R8, R15
  34  	MOVW	R9, R16
  35  	MOVW	R10, R17
  36  
  37  	MOVW	(0*4)(R5), R11
  38  	MOVW	R10, REGTMP1
  39  
  40  // F = ((c ^ d) & b) ^ d
  41  #define ROUND1(a, b, c, d, index, const, shift) \
  42  	ADDV	$const, a; \
  43  	ADD	R11, a; \
  44  	MOVW	(index*4)(R5), R11; \
  45  	XOR	c, REGTMP1; \
  46  	AND	b, REGTMP1; \
  47  	XOR	d, REGTMP1; \
  48  	ADD	REGTMP1, a; \
  49  	ROTR	$(32-shift), a; \
  50  	MOVW	c, REGTMP1; \
  51  	ADD	b, a
  52  
  53  	ROUND1(R7,  R8,  R9,  R10,  1, 0xd76aa478,  7);
  54  	ROUND1(R10, R7,  R8,  R9,   2, 0xe8c7b756, 12);
  55  	ROUND1(R9,  R10, R7,  R8,   3, 0x242070db, 17);
  56  	ROUND1(R8,  R9,  R10, R7,   4, 0xc1bdceee, 22);
  57  	ROUND1(R7,  R8,  R9,  R10,  5, 0xf57c0faf,  7);
  58  	ROUND1(R10, R7,  R8,  R9,   6, 0x4787c62a, 12);
  59  	ROUND1(R9,  R10, R7,  R8,   7, 0xa8304613, 17);
  60  	ROUND1(R8,  R9,  R10, R7,   8, 0xfd469501, 22);
  61  	ROUND1(R7,  R8,  R9,  R10,  9, 0x698098d8,  7);
  62  	ROUND1(R10, R7,  R8,  R9,  10, 0x8b44f7af, 12);
  63  	ROUND1(R9,  R10, R7,  R8,  11, 0xffff5bb1, 17);
  64  	ROUND1(R8,  R9,  R10, R7,  12, 0x895cd7be, 22);
  65  	ROUND1(R7,  R8,  R9,  R10, 13, 0x6b901122,  7);
  66  	ROUND1(R10, R7,  R8,  R9,  14, 0xfd987193, 12);
  67  	ROUND1(R9,  R10, R7,  R8,  15, 0xa679438e, 17);
  68  	ROUND1(R8,  R9,  R10, R7,   1, 0x49b40821, 22);
  69  
  70  	MOVW	(1*4)(R5), R11
  71  
  72  // F = ((b ^ c) & d) ^ c
  73  #define ROUND2(a, b, c, d, index, const, shift) \
  74  	ADDV	$const, a; \
  75  	ADD	R11, a; \
  76  	MOVW	(index*4)(R5), R11; \
  77  	XOR	b, c, REGTMP; \
  78  	AND	REGTMP, d, REGTMP; \
  79  	XOR	REGTMP, c, REGTMP; \
  80  	ADD	REGTMP, a; \
  81  	ROTR	$(32-shift), a; \
  82  	ADD	b, a
  83  
  84  	ROUND2(R7,  R8,  R9,  R10,  6, 0xf61e2562,  5);
  85  	ROUND2(R10, R7,  R8,  R9,  11, 0xc040b340,  9);
  86  	ROUND2(R9,  R10, R7,  R8,   0, 0x265e5a51, 14);
  87  	ROUND2(R8,  R9,  R10, R7,   5, 0xe9b6c7aa, 20);
  88  	ROUND2(R7,  R8,  R9,  R10, 10, 0xd62f105d,  5);
  89  	ROUND2(R10, R7,  R8,  R9,  15,  0x2441453,  9);
  90  	ROUND2(R9,  R10, R7,  R8,   4, 0xd8a1e681, 14);
  91  	ROUND2(R8,  R9,  R10, R7,   9, 0xe7d3fbc8, 20);
  92  	ROUND2(R7,  R8,  R9,  R10, 14, 0x21e1cde6,  5);
  93  	ROUND2(R10, R7,  R8,  R9,   3, 0xc33707d6,  9);
  94  	ROUND2(R9,  R10, R7,  R8,   8, 0xf4d50d87, 14);
  95  	ROUND2(R8,  R9,  R10, R7,  13, 0x455a14ed, 20);
  96  	ROUND2(R7,  R8,  R9,  R10,  2, 0xa9e3e905,  5);
  97  	ROUND2(R10, R7,  R8,  R9,   7, 0xfcefa3f8,  9);
  98  	ROUND2(R9,  R10, R7,  R8,  12, 0x676f02d9, 14);
  99  	ROUND2(R8,  R9,  R10, R7,   5, 0x8d2a4c8a, 20);
 100  
 101  	MOVW	(5*4)(R5), R11
 102  	MOVW	R9, REGTMP1
 103  
 104  // F = b ^ c ^ d
 105  #define ROUND3(a, b, c, d, index, const, shift) \
 106  	ADDV	$const, a; \
 107  	ADD	R11, a; \
 108  	MOVW	(index*4)(R5), R11; \
 109  	XOR	d, REGTMP1; \
 110  	XOR	b, REGTMP1; \
 111  	ADD	REGTMP1, a; \
 112  	ROTR	$(32-shift), a; \
 113  	MOVW	b, REGTMP1; \
 114  	ADD	b, a
 115  
 116  	ROUND3(R7,  R8,  R9,  R10,  8, 0xfffa3942,  4);
 117  	ROUND3(R10, R7,  R8,  R9,  11, 0x8771f681, 11);
 118  	ROUND3(R9,  R10, R7,  R8,  14, 0x6d9d6122, 16);
 119  	ROUND3(R8,  R9,  R10, R7,   1, 0xfde5380c, 23);
 120  	ROUND3(R7,  R8,  R9,  R10,  4, 0xa4beea44,  4);
 121  	ROUND3(R10, R7,  R8,  R9,   7, 0x4bdecfa9, 11);
 122  	ROUND3(R9,  R10, R7,  R8,  10, 0xf6bb4b60, 16);
 123  	ROUND3(R8,  R9,  R10, R7,  13, 0xbebfbc70, 23);
 124  	ROUND3(R7,  R8,  R9,  R10,  0, 0x289b7ec6,  4);
 125  	ROUND3(R10, R7,  R8,  R9,   3, 0xeaa127fa, 11);
 126  	ROUND3(R9,  R10, R7,  R8,   6, 0xd4ef3085, 16);
 127  	ROUND3(R8,  R9,  R10, R7,   9,  0x4881d05, 23);
 128  	ROUND3(R7,  R8,  R9,  R10, 12, 0xd9d4d039,  4);
 129  	ROUND3(R10, R7,  R8,  R9,  15, 0xe6db99e5, 11);
 130  	ROUND3(R9,  R10, R7,  R8,   2, 0x1fa27cf8, 16);
 131  	ROUND3(R8,  R9,  R10, R7,   0, 0xc4ac5665, 23);
 132  
 133  	MOVW	(0*4)(R5), R11
 134  	MOVV	$0xffffffff, REGTMP2
 135  	XOR	R10, REGTMP2, REGTMP1	// REGTMP1 = ~d
 136  
 137  // F = c ^ (b | (~d))
 138  #define ROUND4(a, b, c, d, index, const, shift) \
 139  	ADDV	$const, a; \
 140  	ADD	R11, a; \
 141  	MOVW	(index*4)(R5), R11; \
 142  	OR	b, REGTMP1; \
 143  	XOR	c, REGTMP1; \
 144  	ADD	REGTMP1, a; \
 145  	ROTR	$(32-shift), a; \
 146  	MOVV	$0xffffffff, REGTMP2; \
 147  	XOR	c, REGTMP2, REGTMP1; \
 148  	ADD	b, a
 149  
 150  	ROUND4(R7,  R8,  R9,  R10,  7, 0xf4292244,  6);
 151  	ROUND4(R10, R7,  R8,  R9,  14, 0x432aff97, 10);
 152  	ROUND4(R9,  R10, R7,  R8,   5, 0xab9423a7, 15);
 153  	ROUND4(R8,  R9,  R10, R7,  12, 0xfc93a039, 21);
 154  	ROUND4(R7,  R8,  R9,  R10,  3, 0x655b59c3,  6);
 155  	ROUND4(R10, R7,  R8,  R9,  10, 0x8f0ccc92, 10);
 156  	ROUND4(R9,  R10, R7,  R8,   1, 0xffeff47d, 15);
 157  	ROUND4(R8,  R9,  R10, R7,   8, 0x85845dd1, 21);
 158  	ROUND4(R7,  R8,  R9,  R10, 15, 0x6fa87e4f,  6);
 159  	ROUND4(R10, R7,  R8,  R9,   6, 0xfe2ce6e0, 10);
 160  	ROUND4(R9,  R10, R7,  R8,  13, 0xa3014314, 15);
 161  	ROUND4(R8,  R9,  R10, R7,   4, 0x4e0811a1, 21);
 162  	ROUND4(R7,  R8,  R9,  R10, 11, 0xf7537e82,  6);
 163  	ROUND4(R10, R7,  R8,  R9,   2, 0xbd3af235, 10);
 164  	ROUND4(R9,  R10, R7,  R8,   9, 0x2ad7d2bb, 15);
 165  	ROUND4(R8,  R9,  R10, R7,   0, 0xeb86d391, 21);
 166  
 167  	ADD	R14, R7
 168  	ADD	R15, R8
 169  	ADD	R16, R9
 170  	ADD	R17, R10
 171  
 172  	ADDV	$64, R5
 173  	BNE	R5, R24, loop
 174  
 175  	MOVW	R7, (0*4)(R4)
 176  	MOVW	R8, (1*4)(R4)
 177  	MOVW	R9, (2*4)(R4)
 178  	MOVW	R10, (3*4)(R4)
 179  zero:
 180  	RET
 181