/*
 * Copyright (c) 2023 Institute of Parallel And Distributed Systems (IPADS), Shanghai Jiao Tong University (SJTU)
 * Licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *     http://license.coscl.org.cn/MulanPSL2
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
 * PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include <common/asm.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses
 */

/* Parameters and result.  */
#define dstin   x0
#define src     x1
#define count   x2
#define srcend  x3
#define dstend  x4
#define tmp1    x5
#define A_l     x6
#define A_h     x7
#define B_l     x8
#define B_h     x9
#define C_l     x10
#define C_h     x11
#define D_l     x12
#define D_h     x13
#define E_l     count
#define E_h     tmp1

/* All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
   Larger backwards copies are also handled by memcpy.  The only remaining
   case is forward large copies.  The destination is aligned, and an
   unrolled loop processes 64 bytes per iteration.  */

BEGIN_FUNC(memmove)
        sub     tmp1, dstin, src
        cmp     count, 96
        ccmp    tmp1, count, 2, hi
        b.hs    memcpy

        cbz     tmp1, 3f
        add     dstend, dstin, count
        add     srcend, src, count

        /* Align dstend to 16 byte alignment so that we don't cross cache line
           boundaries on both loads and stores.  There are at least 96 bytes
           to copy, so copy 16 bytes unaligned and then align.  The loop
           copies 64 bytes per iteration and prefetches one iteration ahead.  */

        and     tmp1, dstend, 15
        ldp     D_l, D_h, [srcend, -16]
        sub     srcend, srcend, tmp1
        sub     count, count, tmp1
        ldp     A_l, A_h, [srcend, -16]
        stp     D_l, D_h, [dstend, -16]
        ldp     B_l, B_h, [srcend, -32]
        ldp     C_l, C_h, [srcend, -48]
        ldp     D_l, D_h, [srcend, -64]!
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    2f
        nop
1:
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [srcend, -16]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [srcend, -32]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [srcend, -48]
        stp     D_l, D_h, [dstend, -64]!
        ldp     D_l, D_h, [srcend, -64]!
        subs    count, count, 64
        b.hi    1b

        /* Write the last full set of 64 bytes.  The remainder is at most 64
           bytes, so it is safe to always copy 64 bytes from the start even if
           there is just 1 byte left.  */
2:
        ldp     E_l, E_h, [src, 48]
        stp     A_l, A_h, [dstend, -16]
        ldp     A_l, A_h, [src, 32]
        stp     B_l, B_h, [dstend, -32]
        ldp     B_l, B_h, [src, 16]
        stp     C_l, C_h, [dstend, -48]
        ldp     C_l, C_h, [src]
        stp     D_l, D_h, [dstend, -64]
        stp     E_l, E_h, [dstin, 48]
        stp     A_l, A_h, [dstin, 32]
        stp     B_l, B_h, [dstin, 16]
        stp     C_l, C_h, [dstin]
3:      ret
END_FUNC(memmove)
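
/* Implementation note (documentation only, no code): how the entry sequence
   decides which cases can be handed to memcpy, as stated in the header
   comment above.

       sub   tmp1, dstin, src      // tmp1 = dst - src (mod 2^64)
       cmp   count, 96
       ccmp  tmp1, count, 2, hi    // if count > 96: compare tmp1 with count
                                   // else: write NZCV = 0b0010, i.e. C = 1
       b.hs  memcpy

   The branch to memcpy is taken in two situations:

   - count <= 96: the "hi" condition of ccmp fails, so ccmp writes NZCV = 2,
     which sets the carry flag and forces "hs" to be true.  Per the header
     comment, memcpy supports overlap for these small sizes.

   - count > 96 and unsigned (dst - src) >= count: either the regions do not
     overlap at all (dst >= src + count), or dst < src and the subtraction
     wrapped around, i.e. a backwards overlap, which the header comment notes
     is also handled by memcpy.

   Example: dst = 0x1010, src = 0x1000, count = 0x100 gives tmp1 = 0x10,
   which is below count, so neither case applies and the overlap is handled
   here by copying from the end of the buffers towards the start.  The
   dst == src case is caught separately by "cbz tmp1, 3f", which returns
   without copying.  */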