/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */
13
14#define dstin   x0
15#define src     x1
16#define count   x2
17#define dst     x3
18#define srcend  x4
19#define dstend  x5
20#define A_l     x6
21#define A_lw    w6
22#define A_h     x7
23#define B_l     x8
24#define B_lw    w8
25#define B_h     x9
26#define C_l     x10
27#define C_lw    w10
28#define C_h     x11
29#define D_l     x12
30#define D_h     x13
31#define E_l     x14
32#define E_h     x15
33#define F_l     x16
34#define F_h     x17
35#define G_l     count
36#define G_h     dst
37#define H_l     src
38#define H_h     srcend
39#define tmp1    x14
40
/* This implementation of memcpy uses unaligned accesses and branchless
   sequences to keep the code small and simple, and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software-pipelined loop processing 64 bytes per
   iteration.  The destination pointer is 16-byte aligned to minimize
   unaligned accesses.  The loop tail is handled by always copying 64 bytes
   from the end.  */

53.global memcpy
54.type memcpy,%function
55memcpy:
56	add     srcend, src, count
57	add     dstend, dstin, count
58	cmp     count, 128
59	b.hi    .Lcopy_long
60	cmp     count, 32
61	b.hi    .Lcopy32_128
62
63	/* Small copies: 0..32 bytes.  */
64	cmp     count, 16
65	b.lo    .Lcopy16
66	ldp     A_l, A_h, [src]
67	ldp     D_l, D_h, [srcend, -16]
68	stp     A_l, A_h, [dstin]
69	stp     D_l, D_h, [dstend, -16]
70	ret
71
72	/* Copy 8-15 bytes.  */
73.Lcopy16:
74	tbz     count, 3, .Lcopy8
75	ldr     A_l, [src]
76	ldr     A_h, [srcend, -8]
77	str     A_l, [dstin]
78	str     A_h, [dstend, -8]
79	ret
80
81	.p2align 3
82	/* Copy 4-7 bytes.  */
83.Lcopy8:
84	tbz     count, 2, .Lcopy4
85	ldr     A_lw, [src]
86	ldr     B_lw, [srcend, -4]
87	str     A_lw, [dstin]
88	str     B_lw, [dstend, -4]
89	ret
90
91	/* Copy 0..3 bytes using a branchless sequence.  */
92.Lcopy4:
93	cbz     count, .Lcopy0
94	lsr     tmp1, count, 1
95	ldrb    A_lw, [src]
96	ldrb    C_lw, [srcend, -1]
97	ldrb    B_lw, [src, tmp1]
98	strb    A_lw, [dstin]
99	strb    B_lw, [dstin, tmp1]
100	strb    C_lw, [dstend, -1]
101.Lcopy0:
102	ret
103
104	.p2align 4
105	/* Medium copies: 33..128 bytes.  */
106.Lcopy32_128:
107	ldp     A_l, A_h, [src]
108	ldp     B_l, B_h, [src, 16]
109	ldp     C_l, C_h, [srcend, -32]
110	ldp     D_l, D_h, [srcend, -16]
111	cmp     count, 64
112	b.hi    .Lcopy128
113	stp     A_l, A_h, [dstin]
114	stp     B_l, B_h, [dstin, 16]
115	stp     C_l, C_h, [dstend, -32]
116	stp     D_l, D_h, [dstend, -16]
117	ret
118
119	.p2align 4
120	/* Copy 65..128 bytes.  */
121.Lcopy128:
122	ldp     E_l, E_h, [src, 32]
123	ldp     F_l, F_h, [src, 48]
124	cmp     count, 96
125	b.ls    .Lcopy96
126	ldp     G_l, G_h, [srcend, -64]
127	ldp     H_l, H_h, [srcend, -48]
128	stp     G_l, G_h, [dstend, -64]
129	stp     H_l, H_h, [dstend, -48]
130.Lcopy96:
131	stp     A_l, A_h, [dstin]
132	stp     B_l, B_h, [dstin, 16]
133	stp     E_l, E_h, [dstin, 32]
134	stp     F_l, F_h, [dstin, 48]
135	stp     C_l, C_h, [dstend, -32]
136	stp     D_l, D_h, [dstend, -16]
137	ret
138
139	.p2align 4
140	/* Copy more than 128 bytes.  */
141.Lcopy_long:
142
143	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
144
145	ldp     D_l, D_h, [src]
146	and     tmp1, dstin, 15
147	bic     dst, dstin, 15
148	sub     src, src, tmp1
149	add     count, count, tmp1      /* Count is now 16 too large.  */
150	ldp     A_l, A_h, [src, 16]
151	stp     D_l, D_h, [dstin]
152	ldp     B_l, B_h, [src, 32]
153	ldp     C_l, C_h, [src, 48]
154	ldp     D_l, D_h, [src, 64]!
155	subs    count, count, 128 + 16  /* Test and readjust count.  */
156	b.ls    .Lcopy64_from_end
157
158.Lloop64:
159	stp     A_l, A_h, [dst, 16]
160	ldp     A_l, A_h, [src, 16]
161	stp     B_l, B_h, [dst, 32]
162	ldp     B_l, B_h, [src, 32]
163	stp     C_l, C_h, [dst, 48]
164	ldp     C_l, C_h, [src, 48]
165	stp     D_l, D_h, [dst, 64]!
166	ldp     D_l, D_h, [src, 64]!
167	subs    count, count, 64
168	b.hi    .Lloop64
169
170	/* Write the last iteration and copy 64 bytes from the end.  */
171.Lcopy64_from_end:
172	ldp     E_l, E_h, [srcend, -64]
173	stp     A_l, A_h, [dst, 16]
174	ldp     A_l, A_h, [srcend, -48]
175	stp     B_l, B_h, [dst, 32]
176	ldp     B_l, B_h, [srcend, -32]
177	stp     C_l, C_h, [dst, 48]
178	ldp     C_l, C_h, [srcend, -16]
179	stp     D_l, D_h, [dst, 64]
180	stp     E_l, E_h, [dstend, -64]
181	stp     A_l, A_h, [dstend, -48]
182	stp     B_l, B_h, [dstend, -32]
183	stp     C_l, C_h, [dstend, -16]
184	ret
185
186.size memcpy,.-memcpy
187