/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <private/bionic_asm.h>

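/* Register aliases.  Per the AAPCS64 calling convention, on entry
 * x0 = destination, x1 = source, x2 = byte count.  memcpy must return
 * the destination pointer unchanged, so the copy advances through
 * dst (x6) and leaves dstin (x0) intact for the return value.  */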
#define dstin	x0
#define src	x1
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

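/* 128-bit NEON registers: each ldp/stp on a pair of q-registers moves
 * 32 bytes, so the bulk-copy loop transfers 64 bytes per iteration.  */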
#define QA_l q0
#define QA_h q1
#define QB_l q2
#define QB_h q3

ENTRY(memcpy)
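	/* Dispatch on size: counts of 64 bytes and up take the bulk path
	 * at .Lcpy_not_short, 16-63 bytes fall through to .Ltail63, and
	 * 0-15 bytes go to .Ltail15tiny.  */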

	mov	dst, dstin
	cmp	count, #64
	b.ge	.Lcpy_not_short
	cmp	count, #15
	b.le	.Ltail15tiny

	/* Deal with small copies quickly by dropping straight into the
	 * exit block.  */
.Ltail63:
	/* Copy up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
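	/* tmp1 is 48, 32, or 16: copy 48 bytes by falling through, 32 by
	 * entering at 1, or 16 by entering at 2.  The negative offsets work
	 * because src and dst were already advanced past the block above.  */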
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]

.Ltail15:
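	/* Copy the final 1-15 bytes with a single overlapping 16-byte
	 * load/store of the end of the buffer.  This path is only reached
	 * after at least 16 bytes have already been copied, so re-copying
	 * a few bytes is safe.  */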
	ands	count, count, #15
	b.eq	1f
	add	src, src, count
	ldp	A_l, A_h, [src, #-16]
	add	dst, dst, count
	stp	A_l, A_h, [dst, #-16]
1:
	ret

.Ltail15tiny:
	/* Copy up to 15 bytes of data.  Does not assume additional data
	   being copied.  */
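	/* Each tbz tests one bit of count and copies the corresponding
	 * power-of-two chunk: 8, 4, 2, then 1 byte.  */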
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	ret

.Lcpy_not_short:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Copy more data than needed; it's faster than jumping
	 * around copying sub-Quadword quantities.  We know that
	 * it can't overrun.  */
	ldp	A_l, A_h, [src]
	add	src, src, tmp2
	stp	A_l, A_h, [dst]
	add	dst, dst, tmp2
	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63
2:
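	/* Bias count by -128: the main loop keeps 64 bytes of loads in
	 * flight and stores another 64 when draining, so it needs at
	 * least 128 bytes to be worthwhile.  */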
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/* Less than 128 bytes to copy, so handle 64 here and then jump
	 * to the tail.  */
	ldp	QA_l, QA_h, [src]
	ldp	QB_l, QB_h, [src, #32]
	stp	QA_l, QA_h, [dst]
	stp	QB_l, QB_h, [dst, #32]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	.Ltail63
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
.Lcpy_body_large:
	/* There are at least 128 bytes to copy.  */
	ldp	QA_l, QA_h, [src, #0]
	sub	dst, dst, #32		/* Pre-bias.  */
	ldp	QB_l, QB_h, [src, #32]!	/* src += 32: 64 bytes consumed minus the 32-byte pre-bias.  */
1:
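	/* Software-pipelined: store the 64 bytes loaded on the previous
	 * iteration while fetching the next 64, hiding load latency.  */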
	stp	QA_l, QA_h, [dst, #32]
	ldp	QA_l, QA_h, [src, #32]
	stp	QB_l, QB_h, [dst, #64]!
	ldp	QB_l, QB_h, [src, #64]!

	subs	count, count, #64
	b.ge	1b

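	/* Drain the pipeline: store the final 64 bytes still held in
	 * QA/QB, then undo the pre-bias on src and dst before handling
	 * any remaining tail bytes.  */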
	stp	QA_l, QA_h, [dst, #32]
	stp	QB_l, QB_h, [dst, #64]
	add	src, src, #32
	add	dst, dst, #64 + 32
	tst	count, #0x3f
	b.ne	.Ltail63
	ret
END(memcpy)