/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD.
 * MTE compatible.
 */

#include "../asmdefs.h"

/* Register roles.  Some names share a register (len/synd on x4,
   tmp/wtmp/shift on x5, result/dstin on x0, dataq2/vhas_nul on v1); each
   alias is only used once the previous value is no longer needed.  */

#define dstin		x0
#define srcin		x1
#define result		x0

#define src		x2
#define dst		x3
#define len		x4
#define synd		x4
#define	tmp		x5
#define wtmp		w5
#define shift		x5
#define data1		x6
#define dataw1		w6
#define data2		x7
#define dataw2		w7

#define dataq		q0
#define vdata		v0
#define vhas_nul	v1
#define vrepmask	v2
#define vend		v3
#define dend		d3
#define dataq2		q1

/* IFSTPCPY (insn) emits insn only in the stpcpy build.  The macro splits its
   argument at the first comma and re-joins it, so a full three-operand
   instruction can be passed as a single macro invocation.  */
#ifdef BUILD_STPCPY
# define STRCPY __stpcpy_aarch64_mte
# define IFSTPCPY(X,...) X,__VA_ARGS__
#else
# define STRCPY __strcpy_aarch64_mte
# define IFSTPCPY(X,...)
#endif

/* Core algorithm:

   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
   per byte.  For even bytes, bits 0-3 are set if the byte is NUL and bits 4-7
   are zero.  For odd bytes, bits 4-7 are set likewise and bits 0-3 are zero,
   so that adjacent bytes can be merged with a pairwise add.  Since the bits
   in the syndrome reflect the order in which things occur in the original
   string, counting trailing zeros identifies exactly which byte matched.  */

/* STRCPY (dstin, srcin)

   In:    x0 = dstin  destination buffer
          x1 = srcin  NUL-terminated source string
   Out:   x0 = dstin unchanged (strcpy build), or the address of the NUL
          written to the destination (BUILD_STPCPY build).
   Uses:  x2-x7, v0-v3 and the condition flags.  The stack is not touched.

   Every copy is done with a pair of fixed-size loads and stores, where the
   second pair ends exactly on the NUL and may overlap the first.  Strings
   whose NUL is not found in the first two aligned source chunks are copied
   16 bytes at a time using 16-byte-aligned source loads.  */

ENTRY (STRCPY)
	PTR_ARG (0)
	PTR_ARG (1)

	/* Check the aligned 16-byte chunk that contains srcin.  Loading from
	   the rounded-down address keeps the access inside a single aligned
	   chunk even though the string may start in the middle of it.  */
	bic	src, srcin, 15
	mov	wtmp, 0xf00f
	ld1	{vdata.16b}, [src]
	dup	vrepmask.8h, wtmp	/* 0x0f in even bytes, 0xf0 in odd bytes */
	cmeq	vhas_nul.16b, vdata.16b, 0
	lsl	shift, srcin, 2		/* 4 syndrome bits per source byte */
	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* 128->64 */
	fmov	synd, dend
	/* Discard the syndrome bits of bytes that precede srcin.  A register
	   shift uses the amount modulo 64, i.e. (srcin & 15) * 4.  */
	lsr	synd, synd, shift
	cbnz	synd, L(tail)

	/* No NUL between srcin and the end of the first chunk: check the
	   next aligned chunk (src is advanced by the pre-index writeback).  */
	ldr	dataq, [src, 16]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b	/* 128->64 */
	fmov	synd, dend
	cbz	synd, L(start_loop)

	/* NUL is in the second chunk.  */
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	sub	tmp, src, srcin		/* bytes before this chunk: 1..16 */
	clz	len, synd
	add	len, tmp, len, lsr 2	/* len = index of the NUL: 1..31 */
	tbz	len, 4, L(less16)

	/* 16 <= len <= 31: copy the first 16 bytes and the 16 bytes that end
	   on the NUL.  */
	sub	tmp, len, 15
	ldr	dataq, [srcin]
	ldr	dataq2, [srcin, tmp]
	str	dataq, [dstin]
	str	dataq2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4,,8
L(tail):
	/* NUL found in the first chunk.  synd has already been shifted so
	   that its lowest nibble corresponds to srcin[0].
	   NOTE(review): RBIT is unconditional here, unlike the LDR Q paths,
	   presumably because LD1 {.16b} keeps bytes in memory order on both
	   endiannesses - confirm against the architecture manual.  */
	rbit	synd, synd
	clz	len, synd
	lsr	len, len, 2		/* len = index of the NUL: 0..15 */

	.p2align 4
L(less16):
	/* len < 16 on entry.  */
	tbz	len, 3, L(less8)

	/* 8 <= len <= 15: copy the first 8 bytes and the 8 bytes that end on
	   the NUL.  */
	sub	tmp, len, 7
	ldr	data1, [srcin]
	ldr	data2, [srcin, tmp]
	str	data1, [dstin]
	str	data2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
L(less8):
	/* len < 8 on entry.  The subtraction both computes the offset of the
	   second word and sets the flags for the len < 3 test.  */
	subs	tmp, len, 3
	b.lo	L(less4)

	/* 3 <= len <= 7: copy the first 4 bytes and the 4 bytes that end on
	   the NUL.  */
	ldr	dataw1, [srcin]
	ldr	dataw2, [srcin, tmp]
	str	dataw1, [dstin]
	str	dataw2, [dstin, tmp]
	IFSTPCPY (add result, dstin, len)
	ret

L(less4):
	/* len <= 2.  An empty string only needs the terminator.  */
	cbz	len, L(zerobyte)
	/* len is 1 or 2: copy two bytes, then terminate at dstin[len].  */
	ldrh	dataw1, [srcin]
	strh	dataw1, [dstin]
L(zerobyte):
	strb	wzr, [dstin, len]
	IFSTPCPY (add result, dstin, len)
	ret

	.p2align 4
L(start_loop):
	/* Neither of the first two chunks has a NUL, so the 16 bytes at
	   srcin are all string data: copy them unaligned.  dst is set to the
	   destination address that corresponds to the aligned src, so the
	   loop below can copy chunk by chunk.  */
	sub	len, src, srcin
	ldr	dataq2, [srcin]
	add	dst, dstin, len
	str	dataq2, [dstin]

	.p2align 5
L(loop):
	/* Store the chunk already known to be NUL-free, then load and test
	   the next one.  Only "is any byte NUL" is needed here; the exact
	   position is worked out after the loop.  */
	str	dataq, [dst], 16
	ldr	dataq, [src, 16]!
	cmeq	vhas_nul.16b, vdata.16b, 0
	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
	fmov	synd, dend
	cbz	synd, L(loop)

	/* The chunk at src contains the NUL: build the 4-bit-per-byte
	   syndrome to locate it.  */
	and	vhas_nul.16b, vhas_nul.16b, vrepmask.16b
	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
	fmov	synd, dend
#ifndef __AARCH64EB__
	rbit	synd, synd
#endif
	clz	len, synd
	lsr	len, len, 2		/* len = index of the NUL in chunk: 0..15 */
	/* Copy the 16 bytes that end on the NUL; tmp is <= 0, so this may
	   overlap bytes that were already copied.  */
	sub	tmp, len, 15
	ldr	dataq, [src, tmp]
	str	dataq, [dst, tmp]
	IFSTPCPY (add result, dst, len)
	ret

END (STRCPY)