• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16/*
17 * Contributed by: Intel Corporation
18 */
19
20#include "cache.h"
21
/* L(foo) expands to .Lfoo, an assembler-local (non-exported) label.  */
#ifndef L
# define L(label)	.L##label
#endif

/* ALIGN(n) emits .p2align n, i.e. pads to a 2^n-byte boundary.  */
#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

/* CFI wrappers; a build may predefine these to override them.  */
#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

/* ENTRY(name) opens a global function symbol aligned to 16 bytes and
   starts its CFI region.  */
#ifndef ENTRY
# define ENTRY(name) \
	.globl name; \
	.type name, @function; \
	.p2align 4; \
name: \
	cfi_startproc
#endif

/* END(name) closes the CFI region opened by ENTRY and records the
   symbol size.  */
#ifndef END
# define END(name) \
	cfi_endproc; \
	.size name, .-name
#endif
52
/* One jump-table slot: the distance from the table base B to the
   target label I.  */
#define JMPTBL(I, B)	I - B

/* Jump through a table of base-relative offsets.
   TABLE is the label of the table, INDEX is a 64-bit register that
   holds the entry number, and SCALE is the addressing scale applied
   to INDEX.  The selected 32-bit offset is sign-extended and added to
   the table address to form the branch target.
   Clobbers %r11 and INDEX.  */
#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
	lea    TABLE(%rip), %r11; \
	movslq (%r11, INDEX, SCALE), INDEX; \
	lea    (%r11, INDEX), INDEX; \
	jmp    *INDEX
63
64	.section .text.sse2,"ax",@progbits
65	ALIGN (4)
66ENTRY (android_memset32)	// Address in rdi
67	shr    $2, %rdx			// Count in rdx
68	movl   %esi, %ecx		// Pattern in ecx
69
70	cmp    $16, %rdx
71	jae    L(16dbwordsormore)
72
73L(write_less16dbwords):
74	lea    (%rdi, %rdx, 4), %rdi
75	BRANCH_TO_JMPTBL_ENTRY (L(table_less16dbwords), %rdx, 4)
76
77	.pushsection .rodata.sse2,"a",@progbits
78	ALIGN (2)
79L(table_less16dbwords):
80	.int	JMPTBL (L(write_0dbwords), L(table_less16dbwords))
81	.int	JMPTBL (L(write_1dbwords), L(table_less16dbwords))
82	.int	JMPTBL (L(write_2dbwords), L(table_less16dbwords))
83	.int	JMPTBL (L(write_3dbwords), L(table_less16dbwords))
84	.int	JMPTBL (L(write_4dbwords), L(table_less16dbwords))
85	.int	JMPTBL (L(write_5dbwords), L(table_less16dbwords))
86	.int	JMPTBL (L(write_6dbwords), L(table_less16dbwords))
87	.int	JMPTBL (L(write_7dbwords), L(table_less16dbwords))
88	.int	JMPTBL (L(write_8dbwords), L(table_less16dbwords))
89	.int	JMPTBL (L(write_9dbwords), L(table_less16dbwords))
90	.int	JMPTBL (L(write_10dbwords), L(table_less16dbwords))
91	.int	JMPTBL (L(write_11dbwords), L(table_less16dbwords))
92	.int	JMPTBL (L(write_12dbwords), L(table_less16dbwords))
93	.int	JMPTBL (L(write_13dbwords), L(table_less16dbwords))
94	.int	JMPTBL (L(write_14dbwords), L(table_less16dbwords))
95	.int	JMPTBL (L(write_15dbwords), L(table_less16dbwords))
96	.popsection
97
98	ALIGN (4)
99L(write_15dbwords):
100	movl   %ecx, -60(%rdi)
101L(write_14dbwords):
102	movl   %ecx, -56(%rdi)
103L(write_13dbwords):
104	movl   %ecx, -52(%rdi)
105L(write_12dbwords):
106	movl   %ecx, -48(%rdi)
107L(write_11dbwords):
108	movl   %ecx, -44(%rdi)
109L(write_10dbwords):
110	movl   %ecx, -40(%rdi)
111L(write_9dbwords):
112	movl   %ecx, -36(%rdi)
113L(write_8dbwords):
114	movl   %ecx, -32(%rdi)
115L(write_7dbwords):
116	movl   %ecx, -28(%rdi)
117L(write_6dbwords):
118	movl   %ecx, -24(%rdi)
119L(write_5dbwords):
120	movl   %ecx, -20(%rdi)
121L(write_4dbwords):
122	movl   %ecx, -16(%rdi)
123L(write_3dbwords):
124	movl   %ecx, -12(%rdi)
125L(write_2dbwords):
126	movl   %ecx, -8(%rdi)
127L(write_1dbwords):
128	movl   %ecx, -4(%rdi)
129L(write_0dbwords):
130	ret
131
132	ALIGN (4)
133L(16dbwordsormore):
134	test   $3, %edi
135	jz     L(aligned4bytes)
136	mov    %ecx, (%rdi)
137	mov    %ecx, -4(%rdi, %rdx, 4)
138	sub    $1, %rdx
139	rol    $24, %ecx
140	add    $1, %rdi
141	test   $3, %edi
142	jz     L(aligned4bytes)
143	ror    $8, %ecx
144	add    $1, %rdi
145	test   $3, %edi
146	jz     L(aligned4bytes)
147	ror    $8, %ecx
148	add    $1, %rdi
149L(aligned4bytes):
150	shl    $2, %rdx
151
152	/* Fill xmm0 with the pattern.  */
153	movd   %ecx, %xmm0
154	pshufd $0, %xmm0, %xmm0
155
156	testl  $0xf, %edi
157	jz     L(aligned_16)
158/* RDX > 32 and RDI is not 16 byte aligned.  */
159	movdqu %xmm0, (%rdi)
160	mov    %rdi, %rsi
161	and    $-16, %rdi
162	add    $16, %rdi
163	sub    %rdi, %rsi
164	add    %rsi, %rdx
165
166	ALIGN (4)
167L(aligned_16):
168	cmp    $128, %rdx
169	jge    L(128bytesormore)
170
171L(aligned_16_less128bytes):
172	add    %rdx, %rdi
173	shr    $2, %rdx
174	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)
175
176	ALIGN (4)
177L(128bytesormore):
178	cmp    $SHARED_CACHE_SIZE, %rdx
179	jg     L(128bytesormore_nt)
180
181L(128bytesormore_normal):
182	sub    $128, %rdx
183	movdqa %xmm0, (%rdi)
184	movdqa %xmm0, 0x10(%rdi)
185	movdqa %xmm0, 0x20(%rdi)
186	movdqa %xmm0, 0x30(%rdi)
187	movdqa %xmm0, 0x40(%rdi)
188	movdqa %xmm0, 0x50(%rdi)
189	movdqa %xmm0, 0x60(%rdi)
190	movdqa %xmm0, 0x70(%rdi)
191	lea    128(%rdi), %rdi
192	cmp    $128, %rdx
193	jl     L(128bytesless_normal)
194
195	sub    $128, %rdx
196	movdqa %xmm0, (%rdi)
197	movdqa %xmm0, 0x10(%rdi)
198	movdqa %xmm0, 0x20(%rdi)
199	movdqa %xmm0, 0x30(%rdi)
200	movdqa %xmm0, 0x40(%rdi)
201	movdqa %xmm0, 0x50(%rdi)
202	movdqa %xmm0, 0x60(%rdi)
203	movdqa %xmm0, 0x70(%rdi)
204	lea    128(%rdi), %rdi
205	cmp    $128, %rdx
206	jl     L(128bytesless_normal)
207
208	sub    $128, %rdx
209	movdqa %xmm0, (%rdi)
210	movdqa %xmm0, 0x10(%rdi)
211	movdqa %xmm0, 0x20(%rdi)
212	movdqa %xmm0, 0x30(%rdi)
213	movdqa %xmm0, 0x40(%rdi)
214	movdqa %xmm0, 0x50(%rdi)
215	movdqa %xmm0, 0x60(%rdi)
216	movdqa %xmm0, 0x70(%rdi)
217	lea    128(%rdi), %rdi
218	cmp    $128, %rdx
219	jl     L(128bytesless_normal)
220
221	sub    $128, %rdx
222	movdqa %xmm0, (%rdi)
223	movdqa %xmm0, 0x10(%rdi)
224	movdqa %xmm0, 0x20(%rdi)
225	movdqa %xmm0, 0x30(%rdi)
226	movdqa %xmm0, 0x40(%rdi)
227	movdqa %xmm0, 0x50(%rdi)
228	movdqa %xmm0, 0x60(%rdi)
229	movdqa %xmm0, 0x70(%rdi)
230	lea    128(%rdi), %rdi
231	cmp    $128, %rdx
232	jge    L(128bytesormore_normal)
233
234L(128bytesless_normal):
235	add    %rdx, %rdi
236	shr    $2, %rdx
237	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)
238
239	ALIGN (4)
240L(128bytesormore_nt):
241	sub    $128, %rdx
242	movntdq %xmm0, (%rdi)
243	movntdq %xmm0, 0x10(%rdi)
244	movntdq %xmm0, 0x20(%rdi)
245	movntdq %xmm0, 0x30(%rdi)
246	movntdq %xmm0, 0x40(%rdi)
247	movntdq %xmm0, 0x50(%rdi)
248	movntdq %xmm0, 0x60(%rdi)
249	movntdq %xmm0, 0x70(%rdi)
250	lea    128(%rdi), %rdi
251	cmp    $128, %rdx
252	jge    L(128bytesormore_nt)
253
254	sfence
255	add    %rdx, %rdi
256	shr    $2, %rdx
257	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)
258
259	.pushsection .rodata.sse2,"a",@progbits
260	ALIGN (2)
261L(table_16_128bytes):
262	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
263	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
264	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
265	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
266	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
267	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
268	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
269	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
270	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
271	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
272	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
273	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
274	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
275	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
276	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
277	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
278	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
279	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
280	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
281	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
282	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
283	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
284	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
285	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
286	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
287	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
288	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
289	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
290	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
291	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
292	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
293	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
294	.popsection
295
296	ALIGN (4)
297L(aligned_16_112bytes):
298	movdqa	%xmm0, -112(%rdi)
299L(aligned_16_96bytes):
300	movdqa	%xmm0, -96(%rdi)
301L(aligned_16_80bytes):
302	movdqa	%xmm0, -80(%rdi)
303L(aligned_16_64bytes):
304	movdqa	%xmm0, -64(%rdi)
305L(aligned_16_48bytes):
306	movdqa	%xmm0, -48(%rdi)
307L(aligned_16_32bytes):
308	movdqa	%xmm0, -32(%rdi)
309L(aligned_16_16bytes):
310	movdqa	%xmm0, -16(%rdi)
311L(aligned_16_0bytes):
312	ret
313
314	ALIGN (4)
315L(aligned_16_116bytes):
316	movdqa	%xmm0, -116(%rdi)
317L(aligned_16_100bytes):
318	movdqa	%xmm0, -100(%rdi)
319L(aligned_16_84bytes):
320	movdqa	%xmm0, -84(%rdi)
321L(aligned_16_68bytes):
322	movdqa	%xmm0, -68(%rdi)
323L(aligned_16_52bytes):
324	movdqa	%xmm0, -52(%rdi)
325L(aligned_16_36bytes):
326	movdqa	%xmm0, -36(%rdi)
327L(aligned_16_20bytes):
328	movdqa	%xmm0, -20(%rdi)
329L(aligned_16_4bytes):
330	movl	%ecx, -4(%rdi)
331	ret
332
333	ALIGN (4)
334L(aligned_16_120bytes):
335	movdqa	%xmm0, -120(%rdi)
336L(aligned_16_104bytes):
337	movdqa	%xmm0, -104(%rdi)
338L(aligned_16_88bytes):
339	movdqa	%xmm0, -88(%rdi)
340L(aligned_16_72bytes):
341	movdqa	%xmm0, -72(%rdi)
342L(aligned_16_56bytes):
343	movdqa	%xmm0, -56(%rdi)
344L(aligned_16_40bytes):
345	movdqa	%xmm0, -40(%rdi)
346L(aligned_16_24bytes):
347	movdqa	%xmm0, -24(%rdi)
348L(aligned_16_8bytes):
349	movq	%xmm0, -8(%rdi)
350	ret
351
352	ALIGN (4)
353L(aligned_16_124bytes):
354	movdqa	%xmm0, -124(%rdi)
355L(aligned_16_108bytes):
356	movdqa	%xmm0, -108(%rdi)
357L(aligned_16_92bytes):
358	movdqa	%xmm0, -92(%rdi)
359L(aligned_16_76bytes):
360	movdqa	%xmm0, -76(%rdi)
361L(aligned_16_60bytes):
362	movdqa	%xmm0, -60(%rdi)
363L(aligned_16_44bytes):
364	movdqa	%xmm0, -44(%rdi)
365L(aligned_16_28bytes):
366	movdqa	%xmm0, -28(%rdi)
367L(aligned_16_12bytes):
368	movq	%xmm0, -12(%rdi)
369	movl	%ecx, -4(%rdi)
370	ret
371
372END (android_memset32)
373