/* memcpy.S raw */

   1  /*
   2   * memcpy - copy memory area
   3   *
   4   * Copyright (c) 2012-2020, Arm Limited.
   5   * SPDX-License-Identifier: MIT
   6   */
   7  
   8  /* Assumptions:
   9   *
  10   * ARMv8-a, AArch64, unaligned accesses.
  11   *
  12   */
  13  
/* Argument registers (AAPCS64: x0-x2 carry the C arguments).  */
#define dstin   x0      /* destination as passed in; never written, so it
                           doubles as the return value */
#define src     x1      /* source pointer (advanced in the large-copy loop) */
#define count   x2      /* byte count (rebiased in the large-copy path) */
/* Scratch pointers -- all caller-saved under AAPCS64.  */
#define dst     x3      /* destination rounded down to 16 for the main loop */
#define srcend  x4      /* src + count: one past the last source byte */
#define dstend  x5      /* dstin + count: one past the last dest byte */
/* Data registers: each 16-byte chunk travels as a low/high pair.
   The w-suffixed names are the 32-bit views of the same registers.  */
#define A_l     x6
#define A_lw    w6
#define A_h     x7
#define B_l     x8
#define B_lw    w8
#define B_h     x9
#define C_l     x10
#define C_lw    w10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     x14
#define E_h     x15
#define F_l     x16
#define F_h     x17
/* G and H reuse count/dst/src/srcend.  They are only used in the
   97..128-byte path, after those four values are no longer needed.  */
#define G_l     count
#define G_h     dst
#define H_l     src
#define H_h     srcend
#define tmp1    x14     /* scratch; shares x14 with E_l -- tmp1 is dead
                           before E_l is ever loaded */
  40  
  41  /* This implementation of memcpy uses unaligned accesses and branchless
  42     sequences to keep the code small, simple and improve performance.
  43  
  44     Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  (NOTE(review): this memcpy
   performs no overlap check; the overlap check mentioned here presumably
   refers to a companion memmove in the full source file -- confirm.)
  47  
  48     Large copies use a software pipelined loop processing 64 bytes per iteration.
  49     The destination pointer is 16-byte aligned to minimize unaligned accesses.
  50     The loop tail is handled by always copying 64 bytes from the end.
  51  */
  52  
/*-----------------------------------------------------------------------
 * void *memcpy (void *dstin, const void *src, size_t count)
 *
 * In:   x0 = dstin, x1 = src, x2 = count
 * Out:  x0 = dstin (x0 is never written, so it returns unchanged)
 * Uses: x1-x17 and flags only -- all caller-saved under AAPCS64;
 *       no stack use, no callee-saved registers, no SIMD.
 * Requires unaligned loads/stores (see assumptions in the file header).
 *---------------------------------------------------------------------*/
.global memcpy
.type memcpy,%function
memcpy:
	add     srcend, src, count      /* one past the last source byte */
	add     dstend, dstin, count    /* one past the last dest byte */
	cmp     count, 128
	b.hi    .Lcopy_long
	cmp     count, 32
	b.hi    .Lcopy32_128

	/* Small copies: 0..32 bytes.  */
	cmp     count, 16
	b.lo    .Lcopy16
	/* 16..32 bytes: one 16-byte copy from the start and one ending at
	   the last byte; the two may overlap, which is harmless here.  */
	ldp     A_l, A_h, [src]
	ldp     D_l, D_h, [srcend, -16]
	stp     A_l, A_h, [dstin]
	stp     D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes: bit 3 set means count >= 8, so two possibly
	   overlapping 8-byte copies (from each end) cover the range.  */
.Lcopy16:
	tbz     count, 3, .Lcopy8
	ldr     A_l, [src]
	ldr     A_h, [srcend, -8]
	str     A_l, [dstin]
	str     A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes: same dual-ended trick with 4-byte accesses.  */
.Lcopy8:
	tbz     count, 2, .Lcopy4
	ldr     A_lw, [src]
	ldr     B_lw, [srcend, -4]
	str     A_lw, [dstin]
	str     B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence: first byte, last
	   byte, and the byte at offset count/2.  For count 1 or 2 the
	   writes overlap but still produce the correct result.  */
.Lcopy4:
	cbz     count, .Lcopy0
	lsr     tmp1, count, 1          /* tmp1 = count / 2 */
	ldrb    A_lw, [src]
	ldrb    C_lw, [srcend, -1]
	ldrb    B_lw, [src, tmp1]
	strb    A_lw, [dstin]
	strb    B_lw, [dstin, tmp1]
	strb    C_lw, [dstend, -1]
.Lcopy0:
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  Load the first 32 and the last 32
	   bytes before storing anything, so the stores may overlap.  */
.Lcopy32_128:
	ldp     A_l, A_h, [src]
	ldp     B_l, B_h, [src, 16]
	ldp     C_l, C_h, [srcend, -32]
	ldp     D_l, D_h, [srcend, -16]
	cmp     count, 64
	b.hi    .Lcopy128
	/* 33..64 bytes: the four pairs already loaded cover everything.  */
	stp     A_l, A_h, [dstin]
	stp     B_l, B_h, [dstin, 16]
	stp     C_l, C_h, [dstend, -32]
	stp     D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  A/B hold bytes 0..31, C/D the last 32;
	   E/F pick up bytes 32..63.  */
.Lcopy128:
	ldp     E_l, E_h, [src, 32]
	ldp     F_l, F_h, [src, 48]
	cmp     count, 96
	b.ls    .Lcopy96
	/* 97..128 bytes: also copy the 32 bytes ending 32 before the end.
	   G/H alias count/dst/src/srcend, which are all dead by now.  */
	ldp     G_l, G_h, [srcend, -64]
	ldp     H_l, H_h, [srcend, -48]
	stp     G_l, G_h, [dstend, -64]
	stp     H_l, H_h, [dstend, -48]
.Lcopy96:
	stp     A_l, A_h, [dstin]
	stp     B_l, B_h, [dstin, 16]
	stp     E_l, E_h, [dstin, 32]
	stp     F_l, F_h, [dstin, 48]
	stp     C_l, C_h, [dstend, -32]
	stp     D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
.Lcopy_long:

	/* Copy 16 bytes and then align dst to 16-byte alignment.  Biasing
	   src down by the same misalignment keeps src and dst in step, so
	   the first aligned chunk sits at [src, 16] / [dst, 16].  */

	ldp     D_l, D_h, [src]
	and     tmp1, dstin, 15         /* tmp1 = misalignment of dstin */
	bic     dst, dstin, 15          /* dst = dstin rounded down to 16 */
	sub     src, src, tmp1
	add     count, count, tmp1      /* Count is now 16 too large.  */
	ldp     A_l, A_h, [src, 16]     /* prime the software pipeline */
	stp     D_l, D_h, [dstin]       /* unaligned head store */
	ldp     B_l, B_h, [src, 32]
	ldp     C_l, C_h, [src, 48]
	ldp     D_l, D_h, [src, 64]!
	subs    count, count, 128 + 16  /* Test and readjust count.  */
	b.ls    .Lcopy64_from_end       /* no full loop iteration needed */

	/* Main loop: store the 64 bytes loaded on the previous iteration
	   while loading the next 64.  Loop while more than 64 bytes remain
	   beyond the final 64-byte tail.  */
.Lloop64:
	stp     A_l, A_h, [dst, 16]
	ldp     A_l, A_h, [src, 16]
	stp     B_l, B_h, [dst, 32]
	ldp     B_l, B_h, [src, 32]
	stp     C_l, C_h, [dst, 48]
	ldp     C_l, C_h, [src, 48]
	stp     D_l, D_h, [dst, 64]!
	ldp     D_l, D_h, [src, 64]!
	subs    count, count, 64
	b.hi    .Lloop64

	/* Write the last iteration and copy 64 bytes from the end.  The
	   final four stores are addressed from dstend, so they overlap the
	   loop's output exactly as needed to cover any remainder.  */
.Lcopy64_from_end:
	ldp     E_l, E_h, [srcend, -64]
	stp     A_l, A_h, [dst, 16]
	ldp     A_l, A_h, [srcend, -48]
	stp     B_l, B_h, [dst, 32]
	ldp     B_l, B_h, [srcend, -32]
	stp     C_l, C_h, [dst, 48]
	ldp     C_l, C_h, [srcend, -16]
	stp     D_l, D_h, [dst, 64]
	stp     E_l, E_h, [dstend, -64]
	stp     A_l, A_h, [dstend, -48]
	stp     B_l, B_h, [dstend, -32]
	stp     C_l, C_h, [dstend, -16]
	ret

.size memcpy,.-memcpy
 187