/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/


#ifndef MEMMOVE
# define MEMMOVE		memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original) \
	.globl alias; \
	.equ alias, original
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#define CFI_PUSH(REG)		\
	cfi_adjust_cfa_offset (8);		\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
	cfi_adjust_cfa_offset (-8);		\
	cfi_restore (REG)

#define PUSH(REG)	push REG;
#define POP(REG)	pop REG;

#define ENTRANCE	\
    PUSH (%rbx);    \
    CFI_PUSH (%rbx);
#define RETURN_END	\
    POP (%rbx);     \
    CFI_POP (%rbx); \
    ret
#define RETURN		RETURN_END;

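/* MEMMOVE arguments (SysV AMD64): %rdi = dst, %rsi = src, %rdx = len;
	the original dst pointer is returned in %rax.  %rbx is callee-saved,
	so ENTRANCE/RETURN push and pop it around its use as a scratch
	register below.  */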
	.section .text.sse2,"ax",@progbits
ENTRY (__memcpy_chk)
	cmp	%rcx, %rdx
	ja	__memcpy_chk_fail
/* Fall through to memcpy/memmove. */
END (__memcpy_chk)
ENTRY (MEMMOVE)
	ENTRANCE
	mov	%rdi, %rax

/* Check whether we should copy backward or forward.  */
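/* Equal pointers need no copy.  If dst is above src the buffers may
	overlap such that a forward copy would clobber source bytes before
	they are read, so take the backward path; otherwise copy forward.  */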
	cmp	%rsi, %rdi
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
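/* Each short case loads the leading and trailing chunks of the source
	before any store, so the partially overlapping head/tail stores are
	always correct.  */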
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_forward)

/* Copy [0..32] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_forward)

/* Copy [0..64] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_forward)

/* Copy [0..128] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
/* Align the destination address to a 64-byte boundary.
	First save the leading (possibly unaligned) 64 bytes of the source.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3

	lea	64(%rdi), %r8
	and	$-64, %r8  /* r8 now aligned to next 64 byte boundary */
	sub	%rdi, %rsi /* rsi = src - dst = diff */
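/* From here on %rsi holds src - dst, so (%r8, %rsi) addresses the
	source byte that corresponds to destination address %r8.  */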

	movdqu	(%r8, %rsi), %xmm4
	movdqu	16(%r8, %rsi), %xmm5
	movdqu	32(%r8, %rsi), %xmm6
	movdqu	48(%r8, %rsi), %xmm7

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqa	%xmm4, (%r8)
	movaps	%xmm5, 16(%r8)
	movaps	%xmm6, 32(%r8)
	movaps	%xmm7, 48(%r8)
	add	$64, %r8

	lea	(%rdi, %rdx), %rbx
	and	$-64, %rbx
	cmp	%r8, %rbx
	jbe	L(mm_copy_remaining_forward)

	cmp	__x86_shared_cache_size_half(%rip), %rdx

	ja      L(mm_overlapping_check_forward)
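/* Copies larger than half of the shared cache are candidates for the
	non-temporal path below; everything else falls through to the
	ordinary 64-bytes-per-iteration loop.  */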

	.p2align 4
L(mm_main_loop_forward):
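/* Copy 64 bytes per iteration from the unaligned source (%r8 + %rsi)
	to the 64-byte-aligned destination %r8 until %r8 reaches %rbx, the
	aligned end of the destination.  */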

	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movaps	%xmm1, 16(%r8)
	movaps	%xmm2, 32(%r8)
	movaps	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	add	%rdi, %rdx
	sub	%r8, %rdx
/* Everything up to %r8 in the destination has been copied.
	%rdx now holds the number of bytes left to copy.
	Compute the matching source position in %r9.  */
	lea	(%r8, %rsi), %r9

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %rdx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %rdx
	ja	L(mm_remaining_17_32_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)

	cmpb	$8, %dl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %dl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %dl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%r9,%rdx), %esi
	movzbl	(%r9), %ebx
	movb	%sil, -1(%r8,%rdx)
	movb	%bl, (%r8)
	jmp	L(mm_return)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	16(%r9), %xmm1
	movdqu	-32(%r9, %rdx), %xmm2
	movdqu	-16(%r9, %rdx), %xmm3
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, 16(%r8)
	movdqu	%xmm2, -32(%r8, %rdx)
	movdqu	%xmm3, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	-16(%r9, %rdx), %xmm1
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_5_8_bytes_forward):
	movl	(%r9), %esi
	movl	-4(%r9,%rdx), %ebx
	movl	%esi, (%r8)
	movl	%ebx, -4(%r8,%rdx)
	jmp	L(mm_return)

L(mm_remaining_9_16_bytes_forward):
	mov	(%r9), %rsi
	mov	-8(%r9, %rdx), %rbx
	mov	%rsi, (%r8)
	mov	%rbx, -8(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%r9,%rdx), %esi
	movzwl	(%r9), %ebx
	movw	%si, -2(%r8,%rdx)
	movw	%bx, (%r8)
	jmp	L(mm_return)

L(mm_len_0_16_bytes_forward):
	testb	$24, %dl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %esi
	movb	%bl, -1(%rdi,%rdx)
	movb	%sil, (%rdi)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %esi
	movw	%bx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %esi
	movl	%ebx, (%rdi)
	movl	%esi, -4(%rdi,%rdx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	mov	(%rsi), %rbx
	mov	-8(%rsi, %rdx), %rsi
	mov	%rbx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
	the main loop stops.  */
	mov 	%rbx, %rdx
	sub 	%rdi, %rdx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_backward)

/* Copy [0..32] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_backward)

/* Copy [0..64] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_backward)

/* Copy [0..128] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
/* Align the destination address. We need to save the last
	64 bytes of the source so they are not overwritten.  */
	movdqu	-16(%rsi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3

	lea	(%rdi, %rdx), %r9
	and	$-64, %r9 /* r9 = aligned dst */

	mov	%rsi, %r8
	sub	%rdi, %r8 /* r8 = src - dst, diff */
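/* %r8 now holds src - dst, so negative offsets from (%r9, %r8) address
	the source bytes that correspond to the aligned destination %r9.  */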

	movdqu	-16(%r9, %r8), %xmm4
	movdqu	-32(%r9, %r8), %xmm5
	movdqu	-48(%r9, %r8), %xmm6
	movdqu	-64(%r9, %r8), %xmm7

	movdqu	%xmm0, -16(%rdi, %rdx)
	movdqu	%xmm1, -32(%rdi, %rdx)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	movdqa	%xmm4, -16(%r9)
	movaps	%xmm5, -32(%r9)
	movaps	%xmm6, -48(%r9)
	movaps	%xmm7, -64(%r9)
	lea	-64(%r9), %r9

	lea	64(%rdi), %rbx
	and	$-64, %rbx

	cmp	%r9, %rbx
	jae	L(mm_recalc_len)

	cmp	__x86_shared_cache_size_half(%rip), %rdx

	ja	L(mm_overlapping_check_backward)
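/* Same test as in the forward path: only copies larger than half of
	the shared cache consider the non-temporal backward path below.  */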

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%r9, %r8)

	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movdqa	%xmm0, -64(%r9)
	movaps	%xmm1, -48(%r9)
	movaps	%xmm2, -32(%r9)
	movaps	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_main_loop_backward)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %dl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %dl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %ecx
	movb	%bl, -1(%rdi,%rdx)
	movb	%cl, (%rdi)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %ecx
	movw	%bx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
	movl	-4(%rsi,%rdx), %ebx
	movl	-8(%rsi,%rdx), %ecx
	movl	%ebx, -4(%rdi,%rdx)
	movl	%ecx, -8(%rdi,%rdx)
	sub	$8, %rdx
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %ecx
	movl	%ebx, (%rdi)
	movl	%ecx, -4(%rdi,%rdx)

L(mm_return):
	RETURN

/* Forward copy path for large lengths.  */

	.p2align 4

L(mm_overlapping_check_forward):
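/* %r9 = (src - dst) + len.  If that span still fits within the shared
	cache, stay on the cached main loop; otherwise stream the copy with
	non-temporal stores.  */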
	mov	%rsi, %r9
	add	%rdx, %r9
	cmp	__x86_shared_cache_size(%rip), %r9
	jbe	L(mm_main_loop_forward)

L(mm_large_page_loop_forward):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	lea 	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_large_page_loop_forward)
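/* Non-temporal stores are weakly ordered; sfence makes them globally
	visible before the function returns.  */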
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Backward copy path for large lengths.  */
	.p2align 4

L(mm_overlapping_check_backward):
	mov	%rdi, %r11
	sub	%rsi, %r11 /* r11 = dst - src, diff */
	add	%rdx, %r11
	cmp	__x86_shared_cache_size(%rip), %r11
	jbe	L(mm_main_loop_backward)

L(mm_large_page_loop_backward):
	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movntdq	%xmm0, -64(%r9)
	movntdq	%xmm1, -48(%r9)
	movntdq	%xmm2, -32(%r9)
	movntdq	%xmm3, -16(%r9)
	lea 	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_large_page_loop_backward)
	sfence
	jmp	L(mm_recalc_len)

END (MEMMOVE)

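/* The copy handles overlap in both directions, so memcpy can simply be
	an alias for memmove.  */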
ALIAS_SYMBOL(memcpy, MEMMOVE)
