md5block_riscv64.s

// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
// RISCV64 version of md5block.go
// derived from crypto/md5/md5block_arm64.s and crypto/md5/md5block.go

//go:build !purego

#include "textflag.h"

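// LOAD32U loads a little-endian 32 bit value from unaligned memory into
// dest, assembling it from four individual byte loads.  tmp is clobbered
// and must be a different register from dest.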
#define LOAD32U(base, offset, tmp, dest) \
	MOVBU	(offset+0*1)(base), dest; \
	MOVBU	(offset+1*1)(base), tmp; \
	SLL	$8, tmp; \
	OR	tmp, dest; \
	MOVBU	(offset+2*1)(base), tmp; \
	SLL	$16, tmp; \
	OR	tmp, dest; \
	MOVBU	(offset+3*1)(base), tmp; \
	SLL	$24, tmp; \
	OR	tmp, dest

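// LOAD64U loads a little-endian 64 bit value from unaligned memory as two
// 32 bit halves, placing the second half in the upper 32 bits of dst.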
#define LOAD64U(base, offset, tmp1, tmp2, dst) \
	LOAD32U(base, offset, tmp1, dst); \
	LOAD32U(base, offset+4, tmp1, tmp2); \
	SLL	$32, tmp2; \
	OR	tmp2, dst

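// Each ROUNDn macro computes one MD5 step:
//
//	a += x + const
//	a += f(b, c, d)
//	a = (a <<< shift) + b
//
// where f varies per round.  RORIW $(32-shift) implements the 32 bit
// rotate left by shift.  The EVN variants take their data word from the
// low 32 bits of x, the ODD variants from the high 32 bits.
//
// Round 1 uses F(b, c, d) = (b AND c) OR (NOT b AND d), computed here as
// ((c XOR d) AND b) XOR d to save an instruction.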
#define ROUND1EVN(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	x, a; \
	ADDW	X23, a; \
	XOR	c, d, X23; \
	AND	b, X23; \
	XOR	d, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

#define ROUND1ODD(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	X23, a; \
	SRL	$32, x, X23; \
	ADDW	X23, a; \
	XOR	c, d, X23; \
	AND	b, X23; \
	XOR	d, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

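// Round 2 uses G(b, c, d) = (b AND d) OR (c AND NOT d), computed here as
// ((b XOR c) AND d) XOR c.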
#define ROUND2EVN(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	x, a; \
	ADDW	X23, a; \
	XOR	b, c, X23; \
	AND	d, X23; \
	XOR	c, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

#define ROUND2ODD(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	X23, a; \
	SRL	$32, x, X23; \
	ADDW	X23, a; \
	XOR	b, c, X23; \
	AND	d, X23; \
	XOR	c, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

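// Round 3 uses H(b, c, d) = b XOR c XOR d.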
#define ROUND3EVN(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	x, a; \
	ADDW	X23, a; \
	XOR	c, d, X23; \
	XOR	b, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

#define ROUND3ODD(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	X23, a; \
	SRL	$32, x, X23; \
	ADDW	X23, a; \
	XOR	c, d, X23; \
	XOR	b, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

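// Round 4 uses I(b, c, d) = c XOR (b OR NOT d); ORN folds the NOT into
// the OR.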
#define ROUND4EVN(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	x, a; \
	ADDW	X23, a; \
	ORN	d, b, X23; \
	XOR	c, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

#define ROUND4ODD(a, b, c, d, x, const, shift) \
	MOV	$const, X23; \
	ADDW	X23, a; \
	SRL	$32, x, X23; \
	ADDW	X23, a; \
	ORN	d, b, X23; \
	XOR	c, X23; \
	ADDW	X23, a; \
	RORIW	$(32-shift), a; \
	ADDW	b, a

// Register use for the block function
//
// X5 - X12	: contain the sixteen 32-bit data items in the block we're
//		  processing.  Odd-numbered values, e.g., x1 and x3, are
//		  stored in the upper 32 bits of a register.
// X13 - X16	: a, b, c, d
// X17 - X20	: used to store the old values of a, b, c, d, i.e., aa, bb,
//		  cc, dd.  X17 and X18 are also used as temporary registers
//		  when loading unaligned data.
// X22		: pointer to dig.s
// X23		: temporary register
// X28		: pointer to the first byte beyond the end of p
// X29		: pointer to the current 64-byte block of data, initially
//		  set to &p[0]
// X30		: temporary register

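// func block(dig *digest, p []byte)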
TEXT	·block(SB),NOSPLIT,$0-32
	MOV	p+8(FP), X29
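	// Round the length down to a multiple of the 64 byte block size
	// and skip the whole function if there is no full block to process.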
	MOV	p_len+16(FP), X30
	SRL	$6, X30
	SLL	$6, X30
	BEQZ	X30, zero

	ADD	X29, X30, X28

	MOV	dig+0(FP), X22
	MOVWU	(0*4)(X22), X13	// a = s[0]
	MOVWU	(1*4)(X22), X14	// b = s[1]
	MOVWU	(2*4)(X22), X15	// c = s[2]
	MOVWU	(3*4)(X22), X16	// d = s[3]

loop:

	// Load the 64 bytes of data x0-x15 into the eight 64-bit registers
	// X5-X12.  Different paths are taken to load the values depending on
	// whether the buffer is 8-byte aligned or not.  We load all the
	// values up front here at the start of the loop to avoid multiple
	// alignment checks and to reduce code size.  It takes 10 instructions
	// to load an unaligned 32-bit value and this value will be used 4
	// times in the main body of the loop below.

	AND	$7, X29, X30
	BEQZ	X30, aligned

	LOAD64U(X29,0, X17, X18, X5)
	LOAD64U(X29,8, X17, X18, X6)
	LOAD64U(X29,16, X17, X18, X7)
	LOAD64U(X29,24, X17, X18, X8)
	LOAD64U(X29,32, X17, X18, X9)
	LOAD64U(X29,40, X17, X18, X10)
	LOAD64U(X29,48, X17, X18, X11)
	LOAD64U(X29,56, X17, X18, X12)
	JMP	block_loaded

aligned:
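	// The buffer is 8-byte aligned, so the block can be loaded with
	// eight 64-bit loads.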
	MOV	(0*8)(X29), X5
	MOV	(1*8)(X29), X6
	MOV	(2*8)(X29), X7
	MOV	(3*8)(X29), X8
	MOV	(4*8)(X29), X9
	MOV	(5*8)(X29), X10
	MOV	(6*8)(X29), X11
	MOV	(7*8)(X29), X12

block_loaded:
	MOV	X13, X17
	MOV	X14, X18
	MOV	X15, X19
	MOV	X16, X20

	// Some of the hex constants below are too large to fit into a
	// signed 32-bit value.  The assembler will handle these
	// constants in a special way to ensure that they are
	// zero extended.  Our algorithm is only interested in the
	// bottom 32 bits and doesn't care whether constants are
	// sign or zero extended when moved into 64-bit registers.
	// So we use signed constants instead of hex when bit 31 is
	// set so all constants can be loaded by lui+addi.
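	// For example, 0xd76aa478 (3614090360) is written as its signed
	// 32-bit equivalent -680876936, since both have the same bottom
	// 32 bits.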

	ROUND1EVN(X13,X14,X15,X16,X5,  -680876936, 7); // 0xd76aa478
	ROUND1ODD(X16,X13,X14,X15,X5,  -389564586,12); // 0xe8c7b756
	ROUND1EVN(X15,X16,X13,X14,X6,  0x242070db,17); // 0x242070db
	ROUND1ODD(X14,X15,X16,X13,X6, -1044525330,22); // 0xc1bdceee
	ROUND1EVN(X13,X14,X15,X16,X7,  -176418897, 7); // 0xf57c0faf
	ROUND1ODD(X16,X13,X14,X15,X7,  0x4787c62a,12); // 0x4787c62a
	ROUND1EVN(X15,X16,X13,X14,X8, -1473231341,17); // 0xa8304613
	ROUND1ODD(X14,X15,X16,X13,X8,   -45705983,22); // 0xfd469501
	ROUND1EVN(X13,X14,X15,X16,X9,  0x698098d8, 7); // 0x698098d8
	ROUND1ODD(X16,X13,X14,X15,X9, -1958414417,12); // 0x8b44f7af
	ROUND1EVN(X15,X16,X13,X14,X10,     -42063,17); // 0xffff5bb1
	ROUND1ODD(X14,X15,X16,X13,X10,-1990404162,22); // 0x895cd7be
	ROUND1EVN(X13,X14,X15,X16,X11, 0x6b901122, 7); // 0x6b901122
	ROUND1ODD(X16,X13,X14,X15,X11,  -40341101,12); // 0xfd987193
	ROUND1EVN(X15,X16,X13,X14,X12,-1502002290,17); // 0xa679438e
	ROUND1ODD(X14,X15,X16,X13,X12, 0x49b40821,22); // 0x49b40821

	ROUND2ODD(X13,X14,X15,X16,X5,  -165796510, 5); // 0xf61e2562
	ROUND2EVN(X16,X13,X14,X15,X8, -1069501632, 9); // 0xc040b340
	ROUND2ODD(X15,X16,X13,X14,X10, 0x265e5a51,14); // 0x265e5a51
	ROUND2EVN(X14,X15,X16,X13,X5,  -373897302,20); // 0xe9b6c7aa
	ROUND2ODD(X13,X14,X15,X16,X7,  -701558691, 5); // 0xd62f105d
	ROUND2EVN(X16,X13,X14,X15,X10,  0x2441453, 9); // 0x2441453
	ROUND2ODD(X15,X16,X13,X14,X12, -660478335,14); // 0xd8a1e681
	ROUND2EVN(X14,X15,X16,X13,X7,  -405537848,20); // 0xe7d3fbc8
	ROUND2ODD(X13,X14,X15,X16,X9,  0x21e1cde6, 5); // 0x21e1cde6
	ROUND2EVN(X16,X13,X14,X15,X12,-1019803690, 9); // 0xc33707d6
	ROUND2ODD(X15,X16,X13,X14,X6,  -187363961,14); // 0xf4d50d87
	ROUND2EVN(X14,X15,X16,X13,X9,  0x455a14ed,20); // 0x455a14ed
	ROUND2ODD(X13,X14,X15,X16,X11,-1444681467, 5); // 0xa9e3e905
	ROUND2EVN(X16,X13,X14,X15,X6,   -51403784, 9); // 0xfcefa3f8
	ROUND2ODD(X15,X16,X13,X14,X8,  0x676f02d9,14); // 0x676f02d9
	ROUND2EVN(X14,X15,X16,X13,X11,-1926607734,20); // 0x8d2a4c8a

	ROUND3ODD(X13,X14,X15,X16,X7,     -378558, 4); // 0xfffa3942
	ROUND3EVN(X16,X13,X14,X15,X9, -2022574463,11); // 0x8771f681
	ROUND3ODD(X15,X16,X13,X14,X10, 0x6d9d6122,16); // 0x6d9d6122
	ROUND3EVN(X14,X15,X16,X13,X12,  -35309556,23); // 0xfde5380c
	ROUND3ODD(X13,X14,X15,X16,X5, -1530992060, 4); // 0xa4beea44
	ROUND3EVN(X16,X13,X14,X15,X7,  0x4bdecfa9,11); // 0x4bdecfa9
	ROUND3ODD(X15,X16,X13,X14,X8,  -155497632,16); // 0xf6bb4b60
	ROUND3EVN(X14,X15,X16,X13,X10,-1094730640,23); // 0xbebfbc70
	ROUND3ODD(X13,X14,X15,X16,X11, 0x289b7ec6, 4); // 0x289b7ec6
	ROUND3EVN(X16,X13,X14,X15,X5,  -358537222,11); // 0xeaa127fa
	ROUND3ODD(X15,X16,X13,X14,X6,  -722521979,16); // 0xd4ef3085
	ROUND3EVN(X14,X15,X16,X13,X8,   0x4881d05,23); // 0x4881d05
	ROUND3ODD(X13,X14,X15,X16,X9,  -640364487, 4); // 0xd9d4d039
	ROUND3EVN(X16,X13,X14,X15,X11, -421815835,11); // 0xe6db99e5
	ROUND3ODD(X15,X16,X13,X14,X12, 0x1fa27cf8,16); // 0x1fa27cf8
	ROUND3EVN(X14,X15,X16,X13,X6,  -995338651,23); // 0xc4ac5665

	ROUND4EVN(X13,X14,X15,X16,X5,  -198630844, 6); // 0xf4292244
	ROUND4ODD(X16,X13,X14,X15,X8,  0x432aff97,10); // 0x432aff97
	ROUND4EVN(X15,X16,X13,X14,X12,-1416354905,15); // 0xab9423a7
	ROUND4ODD(X14,X15,X16,X13,X7,   -57434055,21); // 0xfc93a039
	ROUND4EVN(X13,X14,X15,X16,X11, 0x655b59c3, 6); // 0x655b59c3
	ROUND4ODD(X16,X13,X14,X15,X6, -1894986606,10); // 0x8f0ccc92
	ROUND4EVN(X15,X16,X13,X14,X10,   -1051523,15); // 0xffeff47d
	ROUND4ODD(X14,X15,X16,X13,X5, -2054922799,21); // 0x85845dd1
	ROUND4EVN(X13,X14,X15,X16,X9,  0x6fa87e4f, 6); // 0x6fa87e4f
	ROUND4ODD(X16,X13,X14,X15,X12,  -30611744,10); // 0xfe2ce6e0
	ROUND4EVN(X15,X16,X13,X14,X8, -1560198380,15); // 0xa3014314
	ROUND4ODD(X14,X15,X16,X13,X11, 0x4e0811a1,21); // 0x4e0811a1
	ROUND4EVN(X13,X14,X15,X16,X7,  -145523070, 6); // 0xf7537e82
	ROUND4ODD(X16,X13,X14,X15,X10,-1120210379,10); // 0xbd3af235
	ROUND4EVN(X15,X16,X13,X14,X6,  0x2ad7d2bb,15); // 0x2ad7d2bb
	ROUND4ODD(X14,X15,X16,X13,X9,  -343485551,21); // 0xeb86d391

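	// Add this block's result into the running state aa, bb, cc, dd.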
	ADDW	X17, X13
	ADDW	X18, X14
	ADDW	X19, X15
	ADDW	X20, X16

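	// Advance to the next 64-byte block, looping until we reach the
	// end of the data.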
	ADD	$64, X29
	BNE	X28, X29, loop

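	// Write the updated state back to dig.s.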
	MOVW	X13, (0*4)(X22)
	MOVW	X14, (1*4)(X22)
	MOVW	X15, (2*4)(X22)
	MOVW	X16, (3*4)(X22)

zero:
	RET