/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

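/* cache.h is expected to provide SHARED_CACHE_SIZE_HALF, which is used
	below to decide when to switch to non-temporal stores.  */
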
#ifndef MEMMOVE
# define MEMMOVE		memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

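/* L() gives branch targets the local ".L" prefix so they are not emitted
	into the symbol table.  The macros below are fallbacks; a build may
	provide its own definitions before including this file.  */
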
#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#define CFI_PUSH(REG)		\
	cfi_adjust_cfa_offset (4);		\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
	cfi_adjust_cfa_offset (-4);		\
	cfi_restore (REG)

#define PUSH(REG)	push REG;
#define POP(REG)	pop REG;

#define ENTRANCE	PUSH (%rbx);
#define RETURN_END	POP (%rbx); ret
#define RETURN		RETURN_END;

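/* %rbx is callee-saved in the x86-64 SysV ABI and the copy code below
	uses it as scratch, so ENTRANCE pushes it and every RETURN pops it
	before returning.  */
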
	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	mov	%rdi, %rax

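/* SysV AMD64 arguments: %rdi = dst, %rsi = src, %rdx = length.
	memmove returns dst, so it is saved in %rax right away and %rax is
	not touched again.  */
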
/* Check whether we should copy backward or forward.  */
	cmp	%rsi, %rdi
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)

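/* If dst == src there is nothing to do.  If dst > src, a forward copy
	could overwrite source bytes that have not been read yet when the
	regions overlap, so the backward path is taken; otherwise copying
	forward is safe.  */
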
/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_forward)

/* Copy [16..32] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_forward)

/* Copy [32..64] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_forward)

/* Copy [64..128] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
/* Align the destination address.  First save the leading (possibly
	unaligned) 64 bytes of the source so they cannot be overwritten.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3

	lea	64(%rdi), %r8
	and	$-64, %r8  /* r8 now aligned to next 64 byte boundary */
	sub	%rdi, %rsi /* rsi = src - dst = diff */

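/* From here on %r8 walks the destination in 64-byte aligned steps and
	(%r8, %rsi) is the matching source address, since %rsi now holds the
	constant difference src - dst.  */
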
	movdqu	(%r8, %rsi), %xmm4
	movdqu	16(%r8, %rsi), %xmm5
	movdqu	32(%r8, %rsi), %xmm6
	movdqu	48(%r8, %rsi), %xmm7

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqa	%xmm4, (%r8)
	movaps	%xmm5, 16(%r8)
	movaps	%xmm6, 32(%r8)
	movaps	%xmm7, 48(%r8)
	add	$64, %r8

	lea	(%rdi, %rdx), %rbx
	and	$-64, %rbx
	cmp	%r8, %rbx
	jbe	L(mm_copy_remaining_forward)

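/* %rbx is the 64-byte aligned end of the destination, so the main loop
	below copies whole aligned 64-byte blocks while %r8 < %rbx.  For very
	large copies (at least half the shared cache) the non-temporal
	variant is used instead, presumably to avoid displacing useful data
	from the cache.  */
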
	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_forward)

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movaps	%xmm1, 16(%r8)
	movaps	%xmm2, 32(%r8)
	movaps	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	add	%rdi, %rdx
	sub	%r8, %rdx
/* Everything up to %r8 in the destination has been copied;
	%rdx now holds how many bytes are left to copy.
	Recover the matching source address in %r9 (%rsi holds src - dst).  */
	lea	(%r8, %rsi), %r9

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %rdx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %rdx
	ja	L(mm_remaining_17_32_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)

	cmpb	$8, %dl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %dl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %dl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%r9,%rdx), %esi
	movzbl	(%r9), %ebx
	movb	%sil, -1(%r8,%rdx)
	movb	%bl, (%r8)
	jmp	L(mm_return)

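/* The remainder paths below copy from both ends of the block with loads
	that may overlap in the middle, so each one covers a whole range of
	lengths without a byte-exact dispatch.  */
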
L(mm_remaining_33_64_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	16(%r9), %xmm1
	movdqu	-32(%r9, %rdx), %xmm2
	movdqu	-16(%r9, %rdx), %xmm3
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, 16(%r8)
	movdqu	%xmm2, -32(%r8, %rdx)
	movdqu	%xmm3, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	-16(%r9, %rdx), %xmm1
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_5_8_bytes_forward):
	movl	(%r9), %esi
	movl	-4(%r9,%rdx), %ebx
	movl	%esi, (%r8)
	movl	%ebx, -4(%r8,%rdx)
	jmp	L(mm_return)

L(mm_remaining_9_16_bytes_forward):
	mov	(%r9), %rsi
	mov	-8(%r9, %rdx), %rbx
	mov	%rsi, (%r8)
	mov	%rbx, -8(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%r9,%rdx), %esi
	movzwl	(%r9), %ebx
	movw	%si, -2(%r8,%rdx)
	movw	%bx, (%r8)
	jmp	L(mm_return)

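/* Short forward copies dispatch on the set bits of the length: bit 3 or
	bit 4 set means at least 8 bytes, bit 2 at least 4, bit 1 at least 2.
	Each case copies the first and last chunk, which may overlap.  */
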
L(mm_len_0_16_bytes_forward):
	testb	$24, %dl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %esi
	movb	%bl, -1(%rdi,%rdx)
	movb	%sil, (%rdi)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %esi
	movw	%bx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %esi
	movl	%ebx, (%rdi)
	movl	%esi, -4(%rdi,%rdx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	mov	(%rsi), %rbx
	mov	-8(%rsi, %rdx), %rsi
	mov	%rbx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
	the main loop stops.  */
	mov	%rbx, %rdx
	sub	%rdi, %rdx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

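/* Entered either straight from the top, with the original arguments in
	%rdi/%rsi/%rdx, or from mm_recalc_len with %rdx reset to the bytes at
	the start of the buffer that the aligned backward loop did not cover.  */
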
/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_backward)

/* Copy [16..32] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_backward)

/* Copy [32..64] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_backward)

/* Copy [64..128] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
/* Align the destination address.  Save the trailing 64 bytes of the
	source first so the aligned stores below cannot overwrite them.  */
	movdqu	-16(%rsi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3

	lea	(%rdi, %rdx), %r9
	and	$-64, %r9 /* r9 = aligned dst */

	mov	%rsi, %r8
	sub	%rdi, %r8 /* r8 = src - dst, diff */

	movdqu	-16(%r9, %r8), %xmm4
	movdqu	-32(%r9, %r8), %xmm5
	movdqu	-48(%r9, %r8), %xmm6
	movdqu	-64(%r9, %r8), %xmm7

	movdqu	%xmm0, -16(%rdi, %rdx)
	movdqu	%xmm1, -32(%rdi, %rdx)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	movdqa	%xmm4, -16(%r9)
	movaps	%xmm5, -32(%r9)
	movaps	%xmm6, -48(%r9)
	movaps	%xmm7, -64(%r9)
	lea	-64(%r9), %r9

	lea	64(%rdi), %rbx
	and	$-64, %rbx

	cmp	%r9, %rbx
	jae	L(mm_recalc_len)

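/* %rbx is the first 64-byte boundary strictly above the start of dst.
	The aligned backward loop keeps going while %r9 is above it; whatever
	is left below is finished through mm_recalc_len.  */
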
	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%r9, %r8)

	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movdqa	%xmm0, -64(%r9)
	movaps	%xmm1, -48(%r9)
	movaps	%xmm2, -32(%r9)
	movaps	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_main_loop_backward)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %dl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %dl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %ecx
	movb	%bl, -1(%rdi,%rdx)
	movb	%cl, (%rdi)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %ecx
	movw	%bx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
	movl	-4(%rsi,%rdx), %ebx
	movl	-8(%rsi,%rdx), %ecx
	movl	%ebx, -4(%rdi,%rdx)
	movl	%ecx, -8(%rdi,%rdx)
	sub	$8, %rdx
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %ecx
	movl	%ebx, (%rdi)
	movl	%ecx, -4(%rdi,%rdx)

L(mm_return):
	RETURN

/* Big length copy forward part.  */
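/* movntdq stores are weakly ordered and bypass the cache; the sfence
	after each loop orders them with respect to any stores that follow.  */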

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movntdq	%xmm0, -64(%r9)
	movntdq	%xmm1, -48(%r9)
	movntdq	%xmm2, -32(%r9)
	movntdq	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_large_page_loop_backward)
	sfence
	jmp	L(mm_recalc_len)

END (MEMMOVE)