/***************************************************************************
 Copyright (c) 2009-2013 The Linux Foundation. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
     * Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
     * Neither the name of The Linux Foundation nor the names of its contributors may
       be used to endorse or promote products derived from this software
       without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
  ***************************************************************************/

/* Assumes neon instructions and a cache line size of 64 bytes. */

/*
 * Tuning constants for the copy routine that follows.
 *
 * PLDOFFS   - prefetch distance, counted in PLDSIZE-byte lines. The routine
 *             uses PLDOFFS*PLDSIZE as a byte offset from the source pointer.
 * PLDTHRESH - copies of at most this many 64-byte blocks use the loop that
 *             does no prefetching at all.
 * BBTHRESH  - copies of at most this many 64-byte blocks (4096 bytes / 64)
 *             skip the look-ahead distance computation and use the fixed
 *             PLDOFFS*PLDSIZE distance instead.
 * PLDSIZE   - bytes per prefetched line; matches the 64-byte cache line
 *             assumption stated above.
 */
#define PLDOFFS	(10)
#define PLDTHRESH (PLDOFFS)
#define BBTHRESH (4096/64)
#define PLDSIZE (64)

/* Build-time sanity checks on the tuning constants. */
#if (PLDOFFS < 1)
#error Routine does not support offsets less than 1
#endif

#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif

	.text
	.syntax unified
	.fpu    neon

	// To avoid warning about deprecated instructions, add an explicit
	// arch. The code generated is exactly the same.
	.arch armv7-a

/*
 * .L_memcpy_base - NEON copy body.
 *
 * Register usage, as read from the code below:
 *   r0       destination pointer, advanced by every store
 *   r1       source pointer, advanced by every load
 *   r2       number of bytes to copy
 *   r3       scratch
 *   r12      number of 64-byte blocks left for the loop currently running
 *   lr       scratch; on the prefetching paths it holds the number of bytes
 *            held back for the final loop that does no prefetching
 *   r9, r10  saved on the stack while in use: r9 is a loop counter, r10 a
 *            64-byte aligned look-ahead address inside the source
 *   q0-q3, q8-q11  data registers
 *
 * The routine leaves through "pop {r0, pc}", so two words must already be
 * on top of the stack at entry (presumably r0 and lr pushed by the code
 * that enters here - TODO confirm against the caller). No .cfi_startproc
 * appears in this section either; the .cfi_* directives below presumably
 * rely on one supplied by the enclosing file.
 *
 * All size comparisons are signed (blt/ble).
 */
.L_memcpy_base:
	// Dispatch on size: anything shorter than 64 bytes goes directly
	// to the matching entry point of the tail code.
	cmp	r2, #4
	blt	.L_neon_lt4
	cmp	r2, #16
	blt	.L_neon_lt16
	// When this branch is taken the compare has left N set, which the
	// bpl at .L_neon_16 depends on to perform the 16-byte copy.
	cmp	r2, #32
	blt	.L_neon_16
	cmp	r2, #64
	blt	.L_neon_copy_32_a

	// r12 = number of whole 64-byte blocks. Small copies skip all
	// prefetching.
	mov	r12, r2, lsr #6
	cmp	r12, #PLDTHRESH
	ble	.L_neon_copy_64_loop_nopld

	push	{r9, r10}
	.cfi_adjust_cfa_offset 8
	.cfi_rel_offset r9, 0
	.cfi_rel_offset r10, 4

	cmp	r12, #BBTHRESH
	ble	.L_neon_prime_pump

	// lr = (((r0 + 0x400) - (r1 + PLDOFFS*PLDSIZE)) mod 2048)
	//      + PLDOFFS*PLDSIZE
	// The lsl/lsr pair keeps only the low 11 bits of the difference.
	add	lr, r0, #0x400
	add	r9, r1, #(PLDOFFS*PLDSIZE)
	sub	lr, lr, r9
	lsl	lr, lr, #21
	lsr	lr, lr, #21
	add	lr, lr, #(PLDOFFS*PLDSIZE)
	// Not enough blocks to cover that distance: use the fixed distance.
	cmp	r12, lr, lsr #6
	ble	.L_neon_prime_pump

	// GT always holds here because the ble above fell through.
	// r9 = lr/64 - PLDOFFS; fall back to the fixed distance when that
	// is not positive.
	itt	gt
	movgt	r9, #(PLDOFFS)
	rsbsgt	r9, r9, lr, lsr #6
	ble	.L_neon_prime_pump

	// r10 = look-ahead address, lr bytes past the source pointer and
	// rounded down to a 64-byte boundary.
	add	r10, r1, lr
	bic	r10, #0x3F

	// Hold lr/64 blocks back for the final no-prefetch loop.
	sub	r12, r12, lr, lsr #6

	// Split what is left: r9 = min(r9, r12) blocks for the double
	// prefetch loop, r12 = the remainder (possibly zero).
	cmp	r9, r12
	itee	le
	suble	r12, r12, r9
	movgt	r9, r12
	movgt	r12, #0

	pld	[r1, #((PLDOFFS-1)*PLDSIZE)]
	// Copy r9 blocks (r9 is non-zero on entry). Each pass issues a pld
	// PLDOFFS lines ahead of r1 and also loads a word from r10; the
	// loaded value is never used, so the load apparently serves only
	// to pull in the line at r10.
.L_neon_copy_64_loop_outer_doublepld:
	pld	[r1, #((PLDOFFS)*PLDSIZE)]
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]!
	ldr	r3, [r10]
	subs	r9, r9, #1
	vst1.32	{q0, q1}, [r0]!
	vst1.32	{q2, q3}, [r0]!
	add	r10, #64
	bne	.L_neon_copy_64_loop_outer_doublepld
	cmp	r12, #0
	beq	.L_neon_pop_before_nopld

	// Fewer than 512 KiB worth of blocks left: keep using the loop that
	// loads from r10. Otherwise use the variant that issues pld on r10.
	cmp	r12, #(512*1024/64)
	blt	.L_neon_copy_64_loop_outer

	// Copy r12 blocks with a pld on the look-ahead address.
.L_neon_copy_64_loop_ddr:
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]!
	pld	[r10]
	subs	r12, r12, #1
	vst1.32	{q0, q1}, [r0]!
	vst1.32	{q2, q3}, [r0]!
	add	r10, #64
	bne	.L_neon_copy_64_loop_ddr
	b	.L_neon_pop_before_nopld

	// Fixed look-ahead of PLDOFFS lines. PLDOFFS blocks are held back
	// (lr) for the no-prefetch loop; r12 stays at least 1 because this
	// path is only reached with r12 > PLDTHRESH >= PLDOFFS.
.L_neon_prime_pump:
	mov	lr, #(PLDOFFS*PLDSIZE)
	add	r10, r1, #(PLDOFFS*PLDSIZE)
	bic	r10, #0x3F
	sub	r12, r12, #PLDOFFS
	ldr	r3, [r10, #(-1*PLDSIZE)]

	// Copy r12 blocks, loading (and discarding) a word from the
	// look-ahead address on each pass.
.L_neon_copy_64_loop_outer:
	vld1.32	{q0, q1}, [r1]!
	vld1.32	{q2, q3}, [r1]!
	ldr	r3, [r10]
	subs	r12, r12, #1
	vst1.32	{q0, q1}, [r0]!
	vst1.32	{q2, q3}, [r0]!
	add	r10, #64
	bne	.L_neon_copy_64_loop_outer

	// Prefetching is finished: the blocks that were held back (lr/64)
	// go through the plain loop. r9 and r10 are no longer needed.
.L_neon_pop_before_nopld:
	mov	r12, lr, lsr #6
	pop	{r9, r10}
	.cfi_adjust_cfa_offset -8
	.cfi_restore r9
	.cfi_restore r10

	// Copy r12 blocks of 64 bytes without prefetching. r12 must be
	// non-zero on entry, which holds on every path above.
.L_neon_copy_64_loop_nopld:
	vld1.32	{q8, q9}, [r1]!
	vld1.32	{q10, q11}, [r1]!
	subs	r12, r12, #1
	vst1.32	{q8, q9}, [r0]!
	vst1.32	{q10, q11}, [r0]!
	bne	.L_neon_copy_64_loop_nopld
	// Only the low six bits of the count are still to be copied.
	ands	r2, r2, #0x3f
	beq	.L_neon_exit

	// Tail: fewer than 64 bytes remain. The shift by 27 moves bit 5 of
	// the count into C and bit 4 into N; the NEON loads and stores do
	// not touch the flags, so N is still valid at .L_neon_16.
.L_neon_copy_32_a:
	movs	r3, r2, lsl #27
	bcc	.L_neon_16
	vld1.32	{q0,q1}, [r1]!
	vst1.32	{q0,q1}, [r0]!

	// Copy 16 bytes when N is set (bit 4 of the count, or the result of
	// the "cmp r2, #32" in the size dispatch).
.L_neon_16:
	bpl	.L_neon_lt16
	vld1.32	{q8}, [r1]!
	vst1.32	{q8}, [r0]!
	ands	r2, r2, #0x0f
	beq	.L_neon_exit

	// Fewer than 16 bytes remain. The shift by 29 moves bit 3 of the
	// count into C and bit 2 into N.
.L_neon_lt16:
	movs	r3, r2, lsl #29
	bcc	1f
	vld1.8	{d0}, [r1]!
	vst1.8	{d0}, [r0]!
1:
	// MOVS with a shifted operand leaves V alone, so GE (N == V) only
	// acts as "bit 2 clear" while V is still 0 from the earlier
	// compare or subtract, as it is on the paths above for counts
	// below 2 GiB.
	bge	.L_neon_lt4
	// Four bytes, moved through lane 0 of d0-d3.
	vld4.8	{d0[0], d1[0], d2[0], d3[0]}, [r1]!
	vst4.8	{d0[0], d1[0], d2[0], d3[0]}, [r0]!

	// Fewer than 4 bytes remain. The shift by 31 moves bit 1 of the
	// count into C (copy a halfword) and bit 0 into N (copy a byte).
.L_neon_lt4:
	movs	r2, r2, lsl #31
	itt	cs
	ldrhcs	r3, [r1], #2
	strhcs	r3, [r0], #2
	itt	mi
	ldrbmi	r3, [r1]
	strbmi	r3, [r0]

	// Pop the two words that were on the stack at entry into r0 and pc.
.L_neon_exit:
	pop	{r0, pc}
