1/*
2Copyright (c) 2010, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
31#ifndef MEMCPY
32# define MEMCPY         ssse3_memcpy5
33#endif
34
35#ifndef L
36# define L(label)	.L##label
37#endif
38
39#ifndef ALIGN
40# define ALIGN(n)	.p2align n
41#endif
42
43#ifndef cfi_startproc
44# define cfi_startproc			.cfi_startproc
45#endif
46
47#ifndef cfi_endproc
48# define cfi_endproc			.cfi_endproc
49#endif
50
51#ifndef cfi_rel_offset
52# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
53#endif
54
55#ifndef cfi_restore
56# define cfi_restore(reg)		.cfi_restore (reg)
57#endif
58
59#ifndef cfi_adjust_cfa_offset
60# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
61#endif
62
63#ifndef ENTRY
64# define ENTRY(name)			\
65	.type name,  @function; 	\
66	.globl name;			\
67	.p2align 4;			\
68name:					\
69	cfi_startproc
70#endif
71
72#ifndef END
73# define END(name)			\
74	cfi_endproc;			\
75	.size name, .-name
76#endif
77
78#ifdef USE_AS_BCOPY
79# define SRC		PARMS
80# define DEST		SRC+4
81# define LEN		DEST+4
82#else
83# define DEST		PARMS
84# define SRC		DEST+4
85# define LEN		SRC+4
86#endif
87
88#define CFI_PUSH(REG)						\
89  cfi_adjust_cfa_offset (4);					\
90  cfi_rel_offset (REG, 0)
91
92#define CFI_POP(REG)						\
93  cfi_adjust_cfa_offset (-4);					\
94  cfi_restore (REG)
95
96#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
97#define POP(REG)	popl REG; CFI_POP (REG)
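/* PUSH/POP pair each stack operation with the matching CFI note so the
   unwind information keeps tracking the CFA offset and the saved-register
   location across the pushes and pops of EBX, ESI and EDI below.  */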
98
99#ifdef SHARED
100# define PARMS		8		/* Preserve EBX.  */
101# define ENTRANCE	PUSH (%ebx);
102# define RETURN_END	POP (%ebx); ret
103# define RETURN		RETURN_END; CFI_PUSH (%ebx)
104# define JMPTBL(I, B)	I - B
105
106/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
107   jump table with relative offsets.  INDEX is a register containing the
108   index into the jump table.  SCALE is the scale of INDEX.  */
109# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
110    /* We first load PC into EBX.  */				\
111    call	__i686.get_pc_thunk.bx;				\
112    /* Get the address of the jump table.  */			\
113    addl	$(TABLE - .), %ebx;				\
114    /* Get the entry and convert the relative offset to the	\
115       absolute address.  */					\
116    addl	(%ebx,INDEX,SCALE), %ebx;			\
117    /* We loaded the jump table.  Go.  */			\
118    jmp		*%ebx
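/* How this PIC dispatch works: the thunk returns with EBX holding the
   address of the following addl, which is exactly the value of "." in
   $(TABLE - .), so after the addl EBX holds the absolute address of TABLE.
   Each table entry stores (target - TABLE) -- see JMPTBL above -- so adding
   the entry yields the absolute target address without requiring any
   dynamic relocations in the shared object.  */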
119
120# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
121    addl	$(TABLE - .), %ebx
122
123# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
124    addl	(%ebx,INDEX,SCALE), %ebx;			\
125    /* We loaded the jump table.  Go.  */			\
126    jmp		*%ebx
127
128	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
129	.globl	__i686.get_pc_thunk.bx
130	.hidden	__i686.get_pc_thunk.bx
131	ALIGN (4)
132	.type	__i686.get_pc_thunk.bx,@function
133__i686.get_pc_thunk.bx:
134	movl	(%esp), %ebx
135	ret
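/* The thunk simply loads its return address -- the instruction after the
   call -- into EBX, which is how position-independent ia32 code obtains
   the PC.  The .gnu.linkonce section lets each object carry its own copy
   while the linker keeps only one.  */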
136#else
137# define PARMS		4
138# define ENTRANCE
139# define RETURN_END	ret
140# define RETURN		RETURN_END
141# define JMPTBL(I, B)	I
142
143/* Branch to an entry in a jump table.  TABLE is a jump table with
144   absolute offsets.  INDEX is a register containing the index into the
145   jump table.  SCALE is the scale of INDEX.  */
146# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
147    jmp		*TABLE(,INDEX,SCALE)
148
149# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
150
151# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
152    jmp		*TABLE(,INDEX,SCALE)
153#endif
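/* In non-PIC builds the tables hold absolute addresses (JMPTBL(I, B) is
   simply I), so dispatch is a single indirect jmp through the table and
   BRANCH_TO_JMPTBL_ENTRY_VALUE has nothing to preload.  */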
154
155	.section .text.ssse3,"ax",@progbits
156ENTRY (MEMCPY)
157	ENTRANCE
158	movl	LEN(%esp), %ecx
159	movl	SRC(%esp), %eax
160	movl	DEST(%esp), %edx
161
162#ifdef USE_AS_MEMMOVE
163	cmp	%eax, %edx
164	jb	L(copy_forward)
165	je	L(fwd_write_0bytes)
166	cmp	$32, %ecx
167	jae	L(memmove_bwd)
168	jmp	L(bk_write_less32bytes_2)
169L(memmove_bwd):
170	add	%ecx, %eax
171	cmp	%eax, %edx
172	movl	SRC(%esp), %eax
173	jb	L(copy_backward)
174
175L(copy_forward):
176#endif
177	cmp	$48, %ecx
178	jae	L(48bytesormore)
179
180L(fwd_write_less32bytes):
181#ifndef USE_AS_MEMMOVE
182	cmp	%dl, %al
183	jb	L(bk_write)
184#endif
185	add	%ecx, %edx
186	add	%ecx, %eax
187	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
188#ifndef USE_AS_MEMMOVE
189L(bk_write):
190	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
191#endif
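/* Tail-dispatch convention used throughout: ECX (the 0..47 bytes still to
   copy) is added to both pointers before jumping through
   L(table_48bytes_fwd), and each L(fwd_write_Nbytes) target then copies
   the final N bytes using negative offsets from those advanced pointers.  */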
192
193	ALIGN (4)
194/* ECX > 32 and EDX is 4 byte aligned.  */
195L(48bytesormore):
196	movdqu	(%eax), %xmm0
197	PUSH (%edi)
198	movl	%edx, %edi
199	and	$-16, %edx
200	PUSH (%esi)
201	add	$16, %edx
202	movl	%edi, %esi
203	sub	%edx, %edi
204	add	%edi, %ecx
205	sub	%edi, %eax
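/* Head alignment: XMM0 holds the first 16 source bytes and is stored
   unaligned to the original destination (kept in ESI) by every path below.
   EDX has been rounded up to the next 16-byte boundary, and ECX/EAX were
   adjusted by the number of head bytes skipped, so the main loops can use
   aligned 16-byte stores.  */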
206
207#ifdef SHARED_CACHE_SIZE_HALF
208	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
209#else
210# ifdef SHARED
211	call	__i686.get_pc_thunk.bx
212	add	$_GLOBAL_OFFSET_TABLE_, %ebx
213	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
214# else
215	cmp	__x86_shared_cache_size_half, %ecx
216# endif
217#endif
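/* The adjusted length is compared against half the shared cache size
   (a build-time constant, or the runtime __x86_shared_cache_size_half
   value addressed relative to the GOT base in shared builds); at or above
   that threshold the copy takes L(large_page), which streams with
   non-temporal stores instead of going through the cache.  */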
218
219	mov	%eax, %edi
220	jae	L(large_page)
221	and	$0xf, %edi
222	jz	L(shl_0)
223
224	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
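/* EDI is the low 4 bits of the adjusted source address, i.e. the source's
   misalignment relative to the now 16-byte-aligned destination.  shl_0
   handles the mutually aligned case; shl_1 .. shl_15 recombine aligned
   loads with palignr using that byte count.  */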
225
226	ALIGN (4)
227L(shl_0):
228	movdqu	%xmm0, (%esi)
229	xor	%edi, %edi
230	POP (%esi)
231	cmp	$127, %ecx
232	ja	L(shl_0_gobble)
233	lea	-32(%ecx), %ecx
234L(shl_0_loop):
235	movdqa	(%eax, %edi), %xmm0
236	movdqa	16(%eax, %edi), %xmm1
237	sub	$32, %ecx
238	movdqa	%xmm0, (%edx, %edi)
239	movdqa	%xmm1, 16(%edx, %edi)
240	lea	32(%edi), %edi
241	jb	L(shl_0_end)
242
243	movdqa	(%eax, %edi), %xmm0
244	movdqa	16(%eax, %edi), %xmm1
245	sub	$32, %ecx
246	movdqa	%xmm0, (%edx, %edi)
247	movdqa	%xmm1, 16(%edx, %edi)
248	lea	32(%edi), %edi
249	jb	L(shl_0_end)
250
251	movdqa	(%eax, %edi), %xmm0
252	movdqa	16(%eax, %edi), %xmm1
253	sub	$32, %ecx
254	movdqa	%xmm0, (%edx, %edi)
255	movdqa	%xmm1, 16(%edx, %edi)
256	lea	32(%edi), %edi
257	jb	L(shl_0_end)
258
259	movdqa	(%eax, %edi), %xmm0
260	movdqa	16(%eax, %edi), %xmm1
261	sub	$32, %ecx
262	movdqa	%xmm0, (%edx, %edi)
263	movdqa	%xmm1, 16(%edx, %edi)
264	lea	32(%edi), %edi
265L(shl_0_end):
266	lea	32(%ecx), %ecx
267	add	%ecx, %edi
268	add	%edi, %edx
269	add	%edi, %eax
270	POP (%edi)
271	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
272
273L(shl_0_gobble):
274
275#ifdef DATA_CACHE_SIZE_HALF
276	cmp	$DATA_CACHE_SIZE_HALF, %ecx
277#else
278# ifdef SHARED
279	call	__i686.get_pc_thunk.bx
280	add	$_GLOBAL_OFFSET_TABLE_, %ebx
281	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
282# else
283	cmp	__x86_data_cache_size_half, %ecx
284# endif
285#endif
286
287	POP (%edi)
288	lea	-128(%ecx), %ecx
289	jae	L(shl_0_gobble_mem_loop)
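/* Below half the data cache size, stay in the plain 128-byte loop; at or
   above it, use the prefetching loop, since the data presumably will not
   remain resident in the cache anyway.  */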
290L(shl_0_gobble_cache_loop):
291	movdqa	(%eax), %xmm0
292	movdqa	0x10(%eax), %xmm1
293	movdqa	0x20(%eax), %xmm2
294	movdqa	0x30(%eax), %xmm3
295	movdqa	0x40(%eax), %xmm4
296	movdqa	0x50(%eax), %xmm5
297	movdqa	0x60(%eax), %xmm6
298	movdqa	0x70(%eax), %xmm7
299	lea	0x80(%eax), %eax
300	sub	$128, %ecx
301	movdqa	%xmm0, (%edx)
302	movdqa	%xmm1, 0x10(%edx)
303	movdqa	%xmm2, 0x20(%edx)
304	movdqa	%xmm3, 0x30(%edx)
305	movdqa	%xmm4, 0x40(%edx)
306	movdqa	%xmm5, 0x50(%edx)
307	movdqa	%xmm6, 0x60(%edx)
308	movdqa	%xmm7, 0x70(%edx)
309	lea	0x80(%edx), %edx
310
311	jae	L(shl_0_gobble_cache_loop)
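/* ECX went negative in the loop above (it was biased by -128), so the
   following cmp/lea pair restores the true remainder (0..127 bytes) while
   deciding whether one more 64-byte block is needed.  */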
312	cmp	$-0x40, %ecx
313	lea	0x80(%ecx), %ecx
314	jl	L(shl_0_cache_less_64bytes)
315
316	movdqa	(%eax), %xmm0
317	sub	$0x40, %ecx
318	movdqa	0x10(%eax), %xmm1
319
320	movdqa	%xmm0, (%edx)
321	movdqa	%xmm1, 0x10(%edx)
322
323	movdqa	0x20(%eax), %xmm0
324	movdqa	0x30(%eax), %xmm1
325	add	$0x40, %eax
326
327	movdqa	%xmm0, 0x20(%edx)
328	movdqa	%xmm1, 0x30(%edx)
329	add	$0x40, %edx
330L(shl_0_cache_less_64bytes):
331	cmp	$0x20, %ecx
332	jb	L(shl_0_cache_less_32bytes)
333	movdqa	(%eax), %xmm0
334	sub	$0x20, %ecx
335	movdqa	0x10(%eax), %xmm1
336	add	$0x20, %eax
337	movdqa	%xmm0, (%edx)
338	movdqa	%xmm1, 0x10(%edx)
339	add	$0x20, %edx
340L(shl_0_cache_less_32bytes):
341	cmp	$0x10, %ecx
342	jb	L(shl_0_cache_less_16bytes)
343	sub	$0x10, %ecx
344	movdqa	(%eax), %xmm0
345	add	$0x10, %eax
346	movdqa	%xmm0, (%edx)
347	add	$0x10, %edx
348L(shl_0_cache_less_16bytes):
349	add	%ecx, %edx
350	add	%ecx, %eax
351	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
352
353
354	ALIGN (4)
355L(shl_0_gobble_mem_loop):
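/* Memory-bound variant: prefetch the source 0x1c0 (448) and 0x280 (640)
   bytes ahead, and the destination 448 bytes ahead, before copying each
   128-byte chunk.  */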
356	prefetcht0 0x1c0(%eax)
357	prefetcht0 0x280(%eax)
358	prefetcht0 0x1c0(%edx)
359
360	movdqa	(%eax), %xmm0
361	movdqa	0x10(%eax), %xmm1
362	movdqa	0x20(%eax), %xmm2
363	movdqa	0x30(%eax), %xmm3
364	movdqa	0x40(%eax), %xmm4
365	movdqa	0x50(%eax), %xmm5
366	movdqa	0x60(%eax), %xmm6
367	movdqa	0x70(%eax), %xmm7
368	lea	0x80(%eax), %eax
369	sub	$0x80, %ecx
370	movdqa	%xmm0, (%edx)
371	movdqa	%xmm1, 0x10(%edx)
372	movdqa	%xmm2, 0x20(%edx)
373	movdqa	%xmm3, 0x30(%edx)
374	movdqa	%xmm4, 0x40(%edx)
375	movdqa	%xmm5, 0x50(%edx)
376	movdqa	%xmm6, 0x60(%edx)
377	movdqa	%xmm7, 0x70(%edx)
378	lea	0x80(%edx), %edx
379
380	jae	L(shl_0_gobble_mem_loop)
381	cmp	$-0x40, %ecx
382	lea	0x80(%ecx), %ecx
383	jl	L(shl_0_mem_less_64bytes)
384
385	movdqa	(%eax), %xmm0
386	sub	$0x40, %ecx
387	movdqa	0x10(%eax), %xmm1
388
389	movdqa	%xmm0, (%edx)
390	movdqa	%xmm1, 0x10(%edx)
391
392	movdqa	0x20(%eax), %xmm0
393	movdqa	0x30(%eax), %xmm1
394	add	$0x40, %eax
395
396	movdqa	%xmm0, 0x20(%edx)
397	movdqa	%xmm1, 0x30(%edx)
398	add	$0x40, %edx
399L(shl_0_mem_less_64bytes):
400	cmp	$0x20, %ecx
401	jb	L(shl_0_mem_less_32bytes)
402	movdqa	(%eax), %xmm0
403	sub	$0x20, %ecx
404	movdqa	0x10(%eax), %xmm1
405	add	$0x20, %eax
406	movdqa	%xmm0, (%edx)
407	movdqa	%xmm1, 0x10(%edx)
408	add	$0x20, %edx
409L(shl_0_mem_less_32bytes):
410	cmp	$0x10, %ecx
411	jb	L(shl_0_mem_less_16bytes)
412	sub	$0x10, %ecx
413	movdqa	(%eax), %xmm0
414	add	$0x10, %eax
415	movdqa	%xmm0, (%edx)
416	add	$0x10, %edx
417L(shl_0_mem_less_16bytes):
418	add	%ecx, %edx
419	add	%ecx, %eax
420	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
421
422
423	ALIGN (4)
424L(shl_1):
425	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
426	lea	-1(%eax), %eax
427	movaps	(%eax), %xmm1
428	xor	%edi, %edi
429	lea	-32(%ecx), %ecx
430	movdqu	%xmm0, (%esi)
431	POP (%esi)
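/* shl_1 .. shl_15 all share this pattern: EAX was rounded down to a
   16-byte boundary (here by 1 byte), so each iteration loads two aligned
   16-byte blocks and uses palignr to shift the valid bytes into place,
   carrying the last loaded block across iterations in XMM1/XMM4.  The
   BRANCH_TO_JMPTBL_ENTRY_VALUE above preloads EBX with the table base in
   PIC builds so the tail can dispatch without another PC-thunk call.  */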
432L(shl_1_loop):
433
434	movdqa	16(%eax, %edi), %xmm2
435	sub	$32, %ecx
436	movdqa	32(%eax, %edi), %xmm3
437	movdqa	%xmm3, %xmm4
438	palignr	$1, %xmm2, %xmm3
439	palignr	$1, %xmm1, %xmm2
440	lea	32(%edi), %edi
441	movdqa	%xmm2, -32(%edx, %edi)
442	movdqa	%xmm3, -16(%edx, %edi)
443
444	jb	L(shl_1_end)
445
446	movdqa	16(%eax, %edi), %xmm2
447	sub	$32, %ecx
448	movdqa	32(%eax, %edi), %xmm3
449	movdqa	%xmm3, %xmm1
450	palignr	$1, %xmm2, %xmm3
451	palignr	$1, %xmm4, %xmm2
452	lea	32(%edi), %edi
453	movdqa	%xmm2, -32(%edx, %edi)
454	movdqa	%xmm3, -16(%edx, %edi)
455
456	jae	L(shl_1_loop)
457
458L(shl_1_end):
459	lea	32(%ecx), %ecx
460	add	%ecx, %edi
461	add	%edi, %edx
462	lea	1(%edi, %eax), %eax
463	POP (%edi)
464	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
465
466	ALIGN (4)
467L(shl_2):
468	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
469	lea	-2(%eax), %eax
470	movaps	(%eax), %xmm1
471	xor	%edi, %edi
472	lea	-32(%ecx), %ecx
473	movdqu	%xmm0, (%esi)
474	POP (%esi)
475L(shl_2_loop):
476
477	movdqa	16(%eax, %edi), %xmm2
478	sub	$32, %ecx
479	movdqa	32(%eax, %edi), %xmm3
480	movdqa	%xmm3, %xmm4
481	palignr	$2, %xmm2, %xmm3
482	palignr	$2, %xmm1, %xmm2
483	lea	32(%edi), %edi
484	movdqa	%xmm2, -32(%edx, %edi)
485	movdqa	%xmm3, -16(%edx, %edi)
486
487	jb	L(shl_2_end)
488
489	movdqa	16(%eax, %edi), %xmm2
490	sub	$32, %ecx
491	movdqa	32(%eax, %edi), %xmm3
492	movdqa	%xmm3, %xmm1
493	palignr	$2, %xmm2, %xmm3
494	palignr	$2, %xmm4, %xmm2
495	lea	32(%edi), %edi
496	movdqa	%xmm2, -32(%edx, %edi)
497	movdqa	%xmm3, -16(%edx, %edi)
498
499	jae	L(shl_2_loop)
500
501L(shl_2_end):
502	lea	32(%ecx), %ecx
503	add	%ecx, %edi
504	add	%edi, %edx
505	lea	2(%edi, %eax), %eax
506	POP (%edi)
507	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
508
509	ALIGN (4)
510L(shl_3):
511	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
512	lea	-3(%eax), %eax
513	movaps	(%eax), %xmm1
514	xor	%edi, %edi
515	lea	-32(%ecx), %ecx
516	movdqu	%xmm0, (%esi)
517	POP (%esi)
518L(shl_3_loop):
519
520	movdqa	16(%eax, %edi), %xmm2
521	sub	$32, %ecx
522	movdqa	32(%eax, %edi), %xmm3
523	movdqa	%xmm3, %xmm4
524	palignr	$3, %xmm2, %xmm3
525	palignr	$3, %xmm1, %xmm2
526	lea	32(%edi), %edi
527	movdqa	%xmm2, -32(%edx, %edi)
528	movdqa	%xmm3, -16(%edx, %edi)
529
530	jb	L(shl_3_end)
531
532	movdqa	16(%eax, %edi), %xmm2
533	sub	$32, %ecx
534	movdqa	32(%eax, %edi), %xmm3
535	movdqa	%xmm3, %xmm1
536	palignr	$3, %xmm2, %xmm3
537	palignr	$3, %xmm4, %xmm2
538	lea	32(%edi), %edi
539	movdqa	%xmm2, -32(%edx, %edi)
540	movdqa	%xmm3, -16(%edx, %edi)
541
542	jae	L(shl_3_loop)
543
544L(shl_3_end):
545	lea	32(%ecx), %ecx
546	add	%ecx, %edi
547	add	%edi, %edx
548	lea	3(%edi, %eax), %eax
549	POP (%edi)
550	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
551
552	ALIGN (4)
553L(shl_4):
554	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
555	lea	-4(%eax), %eax
556	movaps	(%eax), %xmm1
557	xor	%edi, %edi
558	lea	-32(%ecx), %ecx
559	movdqu	%xmm0, (%esi)
560	POP (%esi)
561L(shl_4_loop):
562
563	movdqa	16(%eax, %edi), %xmm2
564	sub	$32, %ecx
565	movdqa	32(%eax, %edi), %xmm3
566	movdqa	%xmm3, %xmm4
567	palignr	$4, %xmm2, %xmm3
568	palignr	$4, %xmm1, %xmm2
569	lea	32(%edi), %edi
570	movdqa	%xmm2, -32(%edx, %edi)
571	movdqa	%xmm3, -16(%edx, %edi)
572
573	jb	L(shl_4_end)
574
575	movdqa	16(%eax, %edi), %xmm2
576	sub	$32, %ecx
577	movdqa	32(%eax, %edi), %xmm3
578	movdqa	%xmm3, %xmm1
579	palignr	$4, %xmm2, %xmm3
580	palignr	$4, %xmm4, %xmm2
581	lea	32(%edi), %edi
582	movdqa	%xmm2, -32(%edx, %edi)
583	movdqa	%xmm3, -16(%edx, %edi)
584
585	jae	L(shl_4_loop)
586
587L(shl_4_end):
588	lea	32(%ecx), %ecx
589	add	%ecx, %edi
590	add	%edi, %edx
591	lea	4(%edi, %eax), %eax
592	POP (%edi)
593	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
594
595	ALIGN (4)
596L(shl_5):
597	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
598	lea	-5(%eax), %eax
599	movaps	(%eax), %xmm1
600	xor	%edi, %edi
601	lea	-32(%ecx), %ecx
602	movdqu	%xmm0, (%esi)
603	POP (%esi)
604L(shl_5_loop):
605
606	movdqa	16(%eax, %edi), %xmm2
607	sub	$32, %ecx
608	movdqa	32(%eax, %edi), %xmm3
609	movdqa	%xmm3, %xmm4
610	palignr	$5, %xmm2, %xmm3
611	palignr	$5, %xmm1, %xmm2
612	lea	32(%edi), %edi
613	movdqa	%xmm2, -32(%edx, %edi)
614	movdqa	%xmm3, -16(%edx, %edi)
615
616	jb	L(shl_5_end)
617
618	movdqa	16(%eax, %edi), %xmm2
619	sub	$32, %ecx
620	movdqa	32(%eax, %edi), %xmm3
621	movdqa	%xmm3, %xmm1
622	palignr	$5, %xmm2, %xmm3
623	palignr	$5, %xmm4, %xmm2
624	lea	32(%edi), %edi
625	movdqa	%xmm2, -32(%edx, %edi)
626	movdqa	%xmm3, -16(%edx, %edi)
627
628	jae	L(shl_5_loop)
629
630L(shl_5_end):
631	lea	32(%ecx), %ecx
632	add	%ecx, %edi
633	add	%edi, %edx
634	lea	5(%edi, %eax), %eax
635	POP (%edi)
636	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
637
638
639	ALIGN (4)
640L(shl_6):
641	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
642	lea	-6(%eax), %eax
643	movaps	(%eax), %xmm1
644	xor	%edi, %edi
645	lea	-32(%ecx), %ecx
646	movdqu	%xmm0, (%esi)
647	POP (%esi)
648L(shl_6_loop):
649
650	movdqa	16(%eax, %edi), %xmm2
651	sub	$32, %ecx
652	movdqa	32(%eax, %edi), %xmm3
653	movdqa	%xmm3, %xmm4
654	palignr	$6, %xmm2, %xmm3
655	palignr	$6, %xmm1, %xmm2
656	lea	32(%edi), %edi
657	movdqa	%xmm2, -32(%edx, %edi)
658	movdqa	%xmm3, -16(%edx, %edi)
659
660	jb	L(shl_6_end)
661
662	movdqa	16(%eax, %edi), %xmm2
663	sub	$32, %ecx
664	movdqa	32(%eax, %edi), %xmm3
665	movdqa	%xmm3, %xmm1
666	palignr	$6, %xmm2, %xmm3
667	palignr	$6, %xmm4, %xmm2
668	lea	32(%edi), %edi
669	movdqa	%xmm2, -32(%edx, %edi)
670	movdqa	%xmm3, -16(%edx, %edi)
671
672	jae	L(shl_6_loop)
673
674L(shl_6_end):
675	lea	32(%ecx), %ecx
676	add	%ecx, %edi
677	add	%edi, %edx
678	lea	6(%edi, %eax), %eax
679	POP (%edi)
680	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
681
682	ALIGN (4)
683L(shl_7):
684	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
685	lea	-7(%eax), %eax
686	movaps	(%eax), %xmm1
687	xor	%edi, %edi
688	lea	-32(%ecx), %ecx
689	movdqu	%xmm0, (%esi)
690	POP (%esi)
691L(shl_7_loop):
692
693	movdqa	16(%eax, %edi), %xmm2
694	sub	$32, %ecx
695	movdqa	32(%eax, %edi), %xmm3
696	movdqa	%xmm3, %xmm4
697	palignr	$7, %xmm2, %xmm3
698	palignr	$7, %xmm1, %xmm2
699	lea	32(%edi), %edi
700	movdqa	%xmm2, -32(%edx, %edi)
701	movdqa	%xmm3, -16(%edx, %edi)
702
703	jb	L(shl_7_end)
704
705	movdqa	16(%eax, %edi), %xmm2
706	sub	$32, %ecx
707	movdqa	32(%eax, %edi), %xmm3
708	movdqa	%xmm3, %xmm1
709	palignr	$7, %xmm2, %xmm3
710	palignr	$7, %xmm4, %xmm2
711	lea	32(%edi), %edi
712	movdqa	%xmm2, -32(%edx, %edi)
713	movdqa	%xmm3, -16(%edx, %edi)
714
715	jae	L(shl_7_loop)
716
717L(shl_7_end):
718	lea	32(%ecx), %ecx
719	add	%ecx, %edi
720	add	%edi, %edx
721	lea	7(%edi, %eax), %eax
722	POP (%edi)
723	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
724
725	ALIGN (4)
726L(shl_8):
727	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
728	lea	-8(%eax), %eax
729	movaps	(%eax), %xmm1
730	xor	%edi, %edi
731	lea	-32(%ecx), %ecx
732	movdqu	%xmm0, (%esi)
733	POP (%esi)
734L(shl_8_loop):
735
736	movdqa	16(%eax, %edi), %xmm2
737	sub	$32, %ecx
738	movdqa	32(%eax, %edi), %xmm3
739	movdqa	%xmm3, %xmm4
740	palignr	$8, %xmm2, %xmm3
741	palignr	$8, %xmm1, %xmm2
742	lea	32(%edi), %edi
743	movdqa	%xmm2, -32(%edx, %edi)
744	movdqa	%xmm3, -16(%edx, %edi)
745
746	jb	L(shl_8_end)
747
748	movdqa	16(%eax, %edi), %xmm2
749	sub	$32, %ecx
750	movdqa	32(%eax, %edi), %xmm3
751	movdqa	%xmm3, %xmm1
752	palignr	$8, %xmm2, %xmm3
753	palignr	$8, %xmm4, %xmm2
754	lea	32(%edi), %edi
755	movdqa	%xmm2, -32(%edx, %edi)
756	movdqa	%xmm3, -16(%edx, %edi)
757
758	jae	L(shl_8_loop)
759
760L(shl_8_end):
761	lea	32(%ecx), %ecx
762	add	%ecx, %edi
763	add	%edi, %edx
764	lea	8(%edi, %eax), %eax
765	POP (%edi)
766	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
767
768	ALIGN (4)
769L(shl_9):
770	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
771	lea	-9(%eax), %eax
772	movaps	(%eax), %xmm1
773	xor	%edi, %edi
774	lea	-32(%ecx), %ecx
775	movdqu	%xmm0, (%esi)
776	POP (%esi)
777L(shl_9_loop):
778
779	movdqa	16(%eax, %edi), %xmm2
780	sub	$32, %ecx
781	movdqa	32(%eax, %edi), %xmm3
782	movdqa	%xmm3, %xmm4
783	palignr	$9, %xmm2, %xmm3
784	palignr	$9, %xmm1, %xmm2
785	lea	32(%edi), %edi
786	movdqa	%xmm2, -32(%edx, %edi)
787	movdqa	%xmm3, -16(%edx, %edi)
788
789	jb	L(shl_9_end)
790
791	movdqa	16(%eax, %edi), %xmm2
792	sub	$32, %ecx
793	movdqa	32(%eax, %edi), %xmm3
794	movdqa	%xmm3, %xmm1
795	palignr	$9, %xmm2, %xmm3
796	palignr	$9, %xmm4, %xmm2
797	lea	32(%edi), %edi
798	movdqa	%xmm2, -32(%edx, %edi)
799	movdqa	%xmm3, -16(%edx, %edi)
800
801	jae	L(shl_9_loop)
802
803L(shl_9_end):
804	lea	32(%ecx), %ecx
805	add	%ecx, %edi
806	add	%edi, %edx
807	lea	9(%edi, %eax), %eax
808	POP (%edi)
809	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
810
811	ALIGN (4)
812L(shl_10):
813	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
814	lea	-10(%eax), %eax
815	movaps	(%eax), %xmm1
816	xor	%edi, %edi
817	lea	-32(%ecx), %ecx
818	movdqu	%xmm0, (%esi)
819	POP (%esi)
820L(shl_10_loop):
821
822	movdqa	16(%eax, %edi), %xmm2
823	sub	$32, %ecx
824	movdqa	32(%eax, %edi), %xmm3
825	movdqa	%xmm3, %xmm4
826	palignr	$10, %xmm2, %xmm3
827	palignr	$10, %xmm1, %xmm2
828	lea	32(%edi), %edi
829	movdqa	%xmm2, -32(%edx, %edi)
830	movdqa	%xmm3, -16(%edx, %edi)
831
832	jb	L(shl_10_end)
833
834	movdqa	16(%eax, %edi), %xmm2
835	sub	$32, %ecx
836	movdqa	32(%eax, %edi), %xmm3
837	movdqa	%xmm3, %xmm1
838	palignr	$10, %xmm2, %xmm3
839	palignr	$10, %xmm4, %xmm2
840	lea	32(%edi), %edi
841	movdqa	%xmm2, -32(%edx, %edi)
842	movdqa	%xmm3, -16(%edx, %edi)
843
844	jae	L(shl_10_loop)
845
846L(shl_10_end):
847	lea	32(%ecx), %ecx
848	add	%ecx, %edi
849	add	%edi, %edx
850	lea	10(%edi, %eax), %eax
851	POP (%edi)
852	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
853
854	ALIGN (4)
855L(shl_11):
856	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
857	lea	-11(%eax), %eax
858	movaps	(%eax), %xmm1
859	xor	%edi, %edi
860	lea	-32(%ecx), %ecx
861	movdqu	%xmm0, (%esi)
862	POP (%esi)
863L(shl_11_loop):
864
865	movdqa	16(%eax, %edi), %xmm2
866	sub	$32, %ecx
867	movdqa	32(%eax, %edi), %xmm3
868	movdqa	%xmm3, %xmm4
869	palignr	$11, %xmm2, %xmm3
870	palignr	$11, %xmm1, %xmm2
871	lea	32(%edi), %edi
872	movdqa	%xmm2, -32(%edx, %edi)
873	movdqa	%xmm3, -16(%edx, %edi)
874
875	jb	L(shl_11_end)
876
877	movdqa	16(%eax, %edi), %xmm2
878	sub	$32, %ecx
879	movdqa	32(%eax, %edi), %xmm3
880	movdqa	%xmm3, %xmm1
881	palignr	$11, %xmm2, %xmm3
882	palignr	$11, %xmm4, %xmm2
883	lea	32(%edi), %edi
884	movdqa	%xmm2, -32(%edx, %edi)
885	movdqa	%xmm3, -16(%edx, %edi)
886
887	jae	L(shl_11_loop)
888
889L(shl_11_end):
890	lea	32(%ecx), %ecx
891	add	%ecx, %edi
892	add	%edi, %edx
893	lea	11(%edi, %eax), %eax
894	POP (%edi)
895	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
896
897	ALIGN (4)
898L(shl_12):
899	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
900	lea	-12(%eax), %eax
901	movaps	(%eax), %xmm1
902	xor	%edi, %edi
903	lea	-32(%ecx), %ecx
904	movdqu	%xmm0, (%esi)
905	POP (%esi)
906L(shl_12_loop):
907
908	movdqa	16(%eax, %edi), %xmm2
909	sub	$32, %ecx
910	movdqa	32(%eax, %edi), %xmm3
911	movdqa	%xmm3, %xmm4
912	palignr	$12, %xmm2, %xmm3
913	palignr	$12, %xmm1, %xmm2
914	lea	32(%edi), %edi
915	movdqa	%xmm2, -32(%edx, %edi)
916	movdqa	%xmm3, -16(%edx, %edi)
917
918	jb	L(shl_12_end)
919
920	movdqa	16(%eax, %edi), %xmm2
921	sub	$32, %ecx
922	movdqa	32(%eax, %edi), %xmm3
923	movdqa	%xmm3, %xmm1
924	palignr	$12, %xmm2, %xmm3
925	palignr	$12, %xmm4, %xmm2
926	lea	32(%edi), %edi
927	movdqa	%xmm2, -32(%edx, %edi)
928	movdqa	%xmm3, -16(%edx, %edi)
929
930	jae	L(shl_12_loop)
931
932L(shl_12_end):
933	lea	32(%ecx), %ecx
934	add	%ecx, %edi
935	add	%edi, %edx
936	lea	12(%edi, %eax), %eax
937	POP (%edi)
938	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
939
940	ALIGN (4)
941L(shl_13):
942	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
943	lea	-13(%eax), %eax
944	movaps	(%eax), %xmm1
945	xor	%edi, %edi
946	lea	-32(%ecx), %ecx
947	movdqu	%xmm0, (%esi)
948	POP (%esi)
949L(shl_13_loop):
950
951	movdqa	16(%eax, %edi), %xmm2
952	sub	$32, %ecx
953	movdqa	32(%eax, %edi), %xmm3
954	movdqa	%xmm3, %xmm4
955	palignr	$13, %xmm2, %xmm3
956	palignr	$13, %xmm1, %xmm2
957	lea	32(%edi), %edi
958	movdqa	%xmm2, -32(%edx, %edi)
959	movdqa	%xmm3, -16(%edx, %edi)
960
961	jb	L(shl_13_end)
962
963	movdqa	16(%eax, %edi), %xmm2
964	sub	$32, %ecx
965	movdqa	32(%eax, %edi), %xmm3
966	movdqa	%xmm3, %xmm1
967	palignr	$13, %xmm2, %xmm3
968	palignr	$13, %xmm4, %xmm2
969	lea	32(%edi), %edi
970	movdqa	%xmm2, -32(%edx, %edi)
971	movdqa	%xmm3, -16(%edx, %edi)
972
973	jae	L(shl_13_loop)
974
975L(shl_13_end):
976	lea	32(%ecx), %ecx
977	add	%ecx, %edi
978	add	%edi, %edx
979	lea	13(%edi, %eax), %eax
980	POP (%edi)
981	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
982
983	ALIGN (4)
984L(shl_14):
985	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
986	lea	-14(%eax), %eax
987	movaps	(%eax), %xmm1
988	xor	%edi, %edi
989	lea	-32(%ecx), %ecx
990	movdqu	%xmm0, (%esi)
991	POP (%esi)
992L(shl_14_loop):
993
994	movdqa	16(%eax, %edi), %xmm2
995	sub	$32, %ecx
996	movdqa	32(%eax, %edi), %xmm3
997	movdqa	%xmm3, %xmm4
998	palignr	$14, %xmm2, %xmm3
999	palignr	$14, %xmm1, %xmm2
1000	lea	32(%edi), %edi
1001	movdqa	%xmm2, -32(%edx, %edi)
1002	movdqa	%xmm3, -16(%edx, %edi)
1003
1004	jb	L(shl_14_end)
1005
1006	movdqa	16(%eax, %edi), %xmm2
1007	sub	$32, %ecx
1008	movdqa	32(%eax, %edi), %xmm3
1009	movdqa	%xmm3, %xmm1
1010	palignr	$14, %xmm2, %xmm3
1011	palignr	$14, %xmm4, %xmm2
1012	lea	32(%edi), %edi
1013	movdqa	%xmm2, -32(%edx, %edi)
1014	movdqa	%xmm3, -16(%edx, %edi)
1015
1016	jae	L(shl_14_loop)
1017
1018L(shl_14_end):
1019	lea	32(%ecx), %ecx
1020	add	%ecx, %edi
1021	add	%edi, %edx
1022	lea	14(%edi, %eax), %eax
1023	POP (%edi)
1024	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1025
1026
1027	ALIGN (4)
1028L(shl_15):
1029	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
1030	lea	-15(%eax), %eax
1031	movaps	(%eax), %xmm1
1032	xor	%edi, %edi
1033	lea	-32(%ecx), %ecx
1034	movdqu	%xmm0, (%esi)
1035	POP (%esi)
1036L(shl_15_loop):
1037
1038	movdqa	16(%eax, %edi), %xmm2
1039	sub	$32, %ecx
1040	movdqa	32(%eax, %edi), %xmm3
1041	movdqa	%xmm3, %xmm4
1042	palignr	$15, %xmm2, %xmm3
1043	palignr	$15, %xmm1, %xmm2
1044	lea	32(%edi), %edi
1045	movdqa	%xmm2, -32(%edx, %edi)
1046	movdqa	%xmm3, -16(%edx, %edi)
1047
1048	jb	L(shl_15_end)
1049
1050	movdqa	16(%eax, %edi), %xmm2
1051	sub	$32, %ecx
1052	movdqa	32(%eax, %edi), %xmm3
1053	movdqa	%xmm3, %xmm1
1054	palignr	$15, %xmm2, %xmm3
1055	palignr	$15, %xmm4, %xmm2
1056	lea	32(%edi), %edi
1057	movdqa	%xmm2, -32(%edx, %edi)
1058	movdqa	%xmm3, -16(%edx, %edi)
1059
1060	jae	L(shl_15_loop)
1061
1062L(shl_15_end):
1063	lea	32(%ecx), %ecx
1064	add	%ecx, %edi
1065	add	%edi, %edx
1066	lea	15(%edi, %eax), %eax
1067	POP (%edi)
1068	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1069
1070
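/* Forward tail targets: entry N of L(table_48bytes_fwd) copies the last N
   bytes.  Entries sharing N mod 4 fall through a chain of 4-byte moves at
   negative offsets and finish with a 0-, 1-, 2- or 3-byte tail; for memcpy
   the return value is reloaded from DEST, for mempcpy it is EDX.  */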
1071	ALIGN (4)
1072L(fwd_write_44bytes):
1073	movl	-44(%eax), %ecx
1074	movl	%ecx, -44(%edx)
1075L(fwd_write_40bytes):
1076	movl	-40(%eax), %ecx
1077	movl	%ecx, -40(%edx)
1078L(fwd_write_36bytes):
1079	movl	-36(%eax), %ecx
1080	movl	%ecx, -36(%edx)
1081L(fwd_write_32bytes):
1082	movl	-32(%eax), %ecx
1083	movl	%ecx, -32(%edx)
1084L(fwd_write_28bytes):
1085	movl	-28(%eax), %ecx
1086	movl	%ecx, -28(%edx)
1087L(fwd_write_24bytes):
1088	movl	-24(%eax), %ecx
1089	movl	%ecx, -24(%edx)
1090L(fwd_write_20bytes):
1091	movl	-20(%eax), %ecx
1092	movl	%ecx, -20(%edx)
1093L(fwd_write_16bytes):
1094	movl	-16(%eax), %ecx
1095	movl	%ecx, -16(%edx)
1096L(fwd_write_12bytes):
1097	movl	-12(%eax), %ecx
1098	movl	%ecx, -12(%edx)
1099L(fwd_write_8bytes):
1100	movl	-8(%eax), %ecx
1101	movl	%ecx, -8(%edx)
1102L(fwd_write_4bytes):
1103	movl	-4(%eax), %ecx
1104	movl	%ecx, -4(%edx)
1105L(fwd_write_0bytes):
1106#ifndef USE_AS_BCOPY
1107# ifdef USE_AS_MEMPCPY
1108	movl	%edx, %eax
1109# else
1110	movl	DEST(%esp), %eax
1111# endif
1112#endif
1113	RETURN
1114
1115	ALIGN (4)
1116L(fwd_write_5bytes):
1117	movl	-5(%eax), %ecx
1118	movl	-4(%eax), %eax
1119	movl	%ecx, -5(%edx)
1120	movl	%eax, -4(%edx)
1121#ifndef USE_AS_BCOPY
1122# ifdef USE_AS_MEMPCPY
1123	movl	%edx, %eax
1124# else
1125	movl	DEST(%esp), %eax
1126# endif
1127#endif
1128	RETURN
1129
1130	ALIGN (4)
1131L(fwd_write_45bytes):
1132	movl	-45(%eax), %ecx
1133	movl	%ecx, -45(%edx)
1134L(fwd_write_41bytes):
1135	movl	-41(%eax), %ecx
1136	movl	%ecx, -41(%edx)
1137L(fwd_write_37bytes):
1138	movl	-37(%eax), %ecx
1139	movl	%ecx, -37(%edx)
1140L(fwd_write_33bytes):
1141	movl	-33(%eax), %ecx
1142	movl	%ecx, -33(%edx)
1143L(fwd_write_29bytes):
1144	movl	-29(%eax), %ecx
1145	movl	%ecx, -29(%edx)
1146L(fwd_write_25bytes):
1147	movl	-25(%eax), %ecx
1148	movl	%ecx, -25(%edx)
1149L(fwd_write_21bytes):
1150	movl	-21(%eax), %ecx
1151	movl	%ecx, -21(%edx)
1152L(fwd_write_17bytes):
1153	movl	-17(%eax), %ecx
1154	movl	%ecx, -17(%edx)
1155L(fwd_write_13bytes):
1156	movl	-13(%eax), %ecx
1157	movl	%ecx, -13(%edx)
1158L(fwd_write_9bytes):
1159	movl	-9(%eax), %ecx
1160	movl	%ecx, -9(%edx)
1161	movl	-5(%eax), %ecx
1162	movl	%ecx, -5(%edx)
1163L(fwd_write_1bytes):
1164	movzbl	-1(%eax), %ecx
1165	movb	%cl, -1(%edx)
1166#ifndef USE_AS_BCOPY
1167# ifdef USE_AS_MEMPCPY
1168	movl	%edx, %eax
1169# else
1170	movl	DEST(%esp), %eax
1171# endif
1172#endif
1173	RETURN
1174
1175	ALIGN (4)
1176L(fwd_write_46bytes):
1177	movl	-46(%eax), %ecx
1178	movl	%ecx, -46(%edx)
1179L(fwd_write_42bytes):
1180	movl	-42(%eax), %ecx
1181	movl	%ecx, -42(%edx)
1182L(fwd_write_38bytes):
1183	movl	-38(%eax), %ecx
1184	movl	%ecx, -38(%edx)
1185L(fwd_write_34bytes):
1186	movl	-34(%eax), %ecx
1187	movl	%ecx, -34(%edx)
1188L(fwd_write_30bytes):
1189	movl	-30(%eax), %ecx
1190	movl	%ecx, -30(%edx)
1191L(fwd_write_26bytes):
1192	movl	-26(%eax), %ecx
1193	movl	%ecx, -26(%edx)
1194L(fwd_write_22bytes):
1195	movl	-22(%eax), %ecx
1196	movl	%ecx, -22(%edx)
1197L(fwd_write_18bytes):
1198	movl	-18(%eax), %ecx
1199	movl	%ecx, -18(%edx)
1200L(fwd_write_14bytes):
1201	movl	-14(%eax), %ecx
1202	movl	%ecx, -14(%edx)
1203L(fwd_write_10bytes):
1204	movl	-10(%eax), %ecx
1205	movl	%ecx, -10(%edx)
1206L(fwd_write_6bytes):
1207	movl	-6(%eax), %ecx
1208	movl	%ecx, -6(%edx)
1209L(fwd_write_2bytes):
1210	movzwl	-2(%eax), %ecx
1211	movw	%cx, -2(%edx)
1212#ifndef USE_AS_BCOPY
1213# ifdef USE_AS_MEMPCPY
1214	movl	%edx, %eax
1215# else
1216	movl	DEST(%esp), %eax
1217# endif
1218#endif
1219	RETURN
1220
1221	ALIGN (4)
1222L(fwd_write_47bytes):
1223	movl	-47(%eax), %ecx
1224	movl	%ecx, -47(%edx)
1225L(fwd_write_43bytes):
1226	movl	-43(%eax), %ecx
1227	movl	%ecx, -43(%edx)
1228L(fwd_write_39bytes):
1229	movl	-39(%eax), %ecx
1230	movl	%ecx, -39(%edx)
1231L(fwd_write_35bytes):
1232	movl	-35(%eax), %ecx
1233	movl	%ecx, -35(%edx)
1234L(fwd_write_31bytes):
1235	movl	-31(%eax), %ecx
1236	movl	%ecx, -31(%edx)
1237L(fwd_write_27bytes):
1238	movl	-27(%eax), %ecx
1239	movl	%ecx, -27(%edx)
1240L(fwd_write_23bytes):
1241	movl	-23(%eax), %ecx
1242	movl	%ecx, -23(%edx)
1243L(fwd_write_19bytes):
1244	movl	-19(%eax), %ecx
1245	movl	%ecx, -19(%edx)
1246L(fwd_write_15bytes):
1247	movl	-15(%eax), %ecx
1248	movl	%ecx, -15(%edx)
1249L(fwd_write_11bytes):
1250	movl	-11(%eax), %ecx
1251	movl	%ecx, -11(%edx)
1252L(fwd_write_7bytes):
1253	movl	-7(%eax), %ecx
1254	movl	%ecx, -7(%edx)
1255L(fwd_write_3bytes):
1256	movzwl	-3(%eax), %ecx
1257	movzbl	-1(%eax), %eax
1258	movw	%cx, -3(%edx)
1259	movb	%al, -1(%edx)
1260#ifndef USE_AS_BCOPY
1261# ifdef USE_AS_MEMPCPY
1262	movl	%edx, %eax
1263# else
1264	movl	DEST(%esp), %eax
1265# endif
1266#endif
1267	RETURN
1268
1269	ALIGN (4)
1270L(large_page):
1271	movdqu	(%eax), %xmm1
1272	lea	16(%eax), %eax
1273	movdqu	%xmm0, (%esi)
1274	movntdq	%xmm1, (%edx)
1275	lea	16(%edx), %edx
1276	POP (%esi)
1277	lea	-0x90(%ecx), %ecx
1278	POP (%edi)
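/* Non-temporal path for very large copies: the unaligned head (XMM0) and
   one 16-byte block have already been written, ECX was biased by -0x90,
   and the loop below streams 128 bytes per iteration with movntdq; the
   sfence at the end orders the weakly-ordered stores before the tail
   dispatch.  */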
1279L(large_page_loop):
1280	movdqu	(%eax), %xmm0
1281	movdqu	0x10(%eax), %xmm1
1282	movdqu	0x20(%eax), %xmm2
1283	movdqu	0x30(%eax), %xmm3
1284	movdqu	0x40(%eax), %xmm4
1285	movdqu	0x50(%eax), %xmm5
1286	movdqu	0x60(%eax), %xmm6
1287	movdqu	0x70(%eax), %xmm7
1288	lea	0x80(%eax), %eax
1289
1290	sub	$0x80, %ecx
1291	movntdq	%xmm0, (%edx)
1292	movntdq	%xmm1, 0x10(%edx)
1293	movntdq	%xmm2, 0x20(%edx)
1294	movntdq	%xmm3, 0x30(%edx)
1295	movntdq	%xmm4, 0x40(%edx)
1296	movntdq	%xmm5, 0x50(%edx)
1297	movntdq	%xmm6, 0x60(%edx)
1298	movntdq	%xmm7, 0x70(%edx)
1299	lea	0x80(%edx), %edx
1300	jae	L(large_page_loop)
1301	cmp	$-0x40, %ecx
1302	lea	0x80(%ecx), %ecx
1303	jl	L(large_page_less_64bytes)
1304
1305	movdqu	(%eax), %xmm0
1306	movdqu	0x10(%eax), %xmm1
1307	movdqu	0x20(%eax), %xmm2
1308	movdqu	0x30(%eax), %xmm3
1309	lea	0x40(%eax), %eax
1310
1311	movntdq	%xmm0, (%edx)
1312	movntdq	%xmm1, 0x10(%edx)
1313	movntdq	%xmm2, 0x20(%edx)
1314	movntdq	%xmm3, 0x30(%edx)
1315	lea	0x40(%edx), %edx
1316	sub	$0x40, %ecx
1317L(large_page_less_64bytes):
1318	cmp	$32, %ecx
1319	jb	L(large_page_less_32bytes)
1320	movdqu	(%eax), %xmm0
1321	movdqu	0x10(%eax), %xmm1
1322	lea	0x20(%eax), %eax
1323	movntdq	%xmm0, (%edx)
1324	movntdq	%xmm1, 0x10(%edx)
1325	lea	0x20(%edx), %edx
1326	sub	$0x20, %ecx
1327L(large_page_less_32bytes):
1328	add	%ecx, %edx
1329	add	%ecx, %eax
1330	sfence
1331	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
1332
1333
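/* Backward tails: mirror image of the forward tables.  Both pointers were
   moved back by ECX, so entry N copies the N remaining bytes at positive
   offsets, working from the highest word down.  */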
1334	ALIGN (4)
1335L(bk_write_44bytes):
1336	movl	40(%eax), %ecx
1337	movl	%ecx, 40(%edx)
1338L(bk_write_40bytes):
1339	movl	36(%eax), %ecx
1340	movl	%ecx, 36(%edx)
1341L(bk_write_36bytes):
1342	movl	32(%eax), %ecx
1343	movl	%ecx, 32(%edx)
1344L(bk_write_32bytes):
1345	movl	28(%eax), %ecx
1346	movl	%ecx, 28(%edx)
1347L(bk_write_28bytes):
1348	movl	24(%eax), %ecx
1349	movl	%ecx, 24(%edx)
1350L(bk_write_24bytes):
1351	movl	20(%eax), %ecx
1352	movl	%ecx, 20(%edx)
1353L(bk_write_20bytes):
1354	movl	16(%eax), %ecx
1355	movl	%ecx, 16(%edx)
1356L(bk_write_16bytes):
1357	movl	12(%eax), %ecx
1358	movl	%ecx, 12(%edx)
1359L(bk_write_12bytes):
1360	movl	8(%eax), %ecx
1361	movl	%ecx, 8(%edx)
1362L(bk_write_8bytes):
1363	movl	4(%eax), %ecx
1364	movl	%ecx, 4(%edx)
1365L(bk_write_4bytes):
1366	movl	(%eax), %ecx
1367	movl	%ecx, (%edx)
1368L(bk_write_0bytes):
1369#ifndef USE_AS_BCOPY
1370	movl	DEST(%esp), %eax
1371# ifdef USE_AS_MEMPCPY
1372	movl	LEN(%esp), %ecx
1373	add	%ecx, %eax
1374# endif
1375#endif
1376	RETURN
1377
1378	ALIGN (4)
1379L(bk_write_45bytes):
1380	movl	41(%eax), %ecx
1381	movl	%ecx, 41(%edx)
1382L(bk_write_41bytes):
1383	movl	37(%eax), %ecx
1384	movl	%ecx, 37(%edx)
1385L(bk_write_37bytes):
1386	movl	33(%eax), %ecx
1387	movl	%ecx, 33(%edx)
1388L(bk_write_33bytes):
1389	movl	29(%eax), %ecx
1390	movl	%ecx, 29(%edx)
1391L(bk_write_29bytes):
1392	movl	25(%eax), %ecx
1393	movl	%ecx, 25(%edx)
1394L(bk_write_25bytes):
1395	movl	21(%eax), %ecx
1396	movl	%ecx, 21(%edx)
1397L(bk_write_21bytes):
1398	movl	17(%eax), %ecx
1399	movl	%ecx, 17(%edx)
1400L(bk_write_17bytes):
1401	movl	13(%eax), %ecx
1402	movl	%ecx, 13(%edx)
1403L(bk_write_13bytes):
1404	movl	9(%eax), %ecx
1405	movl	%ecx, 9(%edx)
1406L(bk_write_9bytes):
1407	movl	5(%eax), %ecx
1408	movl	%ecx, 5(%edx)
1409L(bk_write_5bytes):
1410	movl	1(%eax), %ecx
1411	movl	%ecx, 1(%edx)
1412L(bk_write_1bytes):
1413	movzbl	(%eax), %ecx
1414	movb	%cl, (%edx)
1415#ifndef USE_AS_BCOPY
1416	movl	DEST(%esp), %eax
1417# ifdef USE_AS_MEMPCPY
1418	movl	LEN(%esp), %ecx
1419	add	%ecx, %eax
1420# endif
1421#endif
1422	RETURN
1423
1424	ALIGN (4)
1425L(bk_write_46bytes):
1426	movl	42(%eax), %ecx
1427	movl	%ecx, 42(%edx)
1428L(bk_write_42bytes):
1429	movl	38(%eax), %ecx
1430	movl	%ecx, 38(%edx)
1431L(bk_write_38bytes):
1432	movl	34(%eax), %ecx
1433	movl	%ecx, 34(%edx)
1434L(bk_write_34bytes):
1435	movl	30(%eax), %ecx
1436	movl	%ecx, 30(%edx)
1437L(bk_write_30bytes):
1438	movl	26(%eax), %ecx
1439	movl	%ecx, 26(%edx)
1440L(bk_write_26bytes):
1441	movl	22(%eax), %ecx
1442	movl	%ecx, 22(%edx)
1443L(bk_write_22bytes):
1444	movl	18(%eax), %ecx
1445	movl	%ecx, 18(%edx)
1446L(bk_write_18bytes):
1447	movl	14(%eax), %ecx
1448	movl	%ecx, 14(%edx)
1449L(bk_write_14bytes):
1450	movl	10(%eax), %ecx
1451	movl	%ecx, 10(%edx)
1452L(bk_write_10bytes):
1453	movl	6(%eax), %ecx
1454	movl	%ecx, 6(%edx)
1455L(bk_write_6bytes):
1456	movl	2(%eax), %ecx
1457	movl	%ecx, 2(%edx)
1458L(bk_write_2bytes):
1459	movzwl	(%eax), %ecx
1460	movw	%cx, (%edx)
1461#ifndef USE_AS_BCOPY
1462	movl	DEST(%esp), %eax
1463# ifdef USE_AS_MEMPCPY
1464	movl	LEN(%esp), %ecx
1465	add	%ecx, %eax
1466# endif
1467#endif
1468	RETURN
1469
1470	ALIGN (4)
1471L(bk_write_47bytes):
1472	movl	43(%eax), %ecx
1473	movl	%ecx, 43(%edx)
1474L(bk_write_43bytes):
1475	movl	39(%eax), %ecx
1476	movl	%ecx, 39(%edx)
1477L(bk_write_39bytes):
1478	movl	35(%eax), %ecx
1479	movl	%ecx, 35(%edx)
1480L(bk_write_35bytes):
1481	movl	31(%eax), %ecx
1482	movl	%ecx, 31(%edx)
1483L(bk_write_31bytes):
1484	movl	27(%eax), %ecx
1485	movl	%ecx, 27(%edx)
1486L(bk_write_27bytes):
1487	movl	23(%eax), %ecx
1488	movl	%ecx, 23(%edx)
1489L(bk_write_23bytes):
1490	movl	19(%eax), %ecx
1491	movl	%ecx, 19(%edx)
1492L(bk_write_19bytes):
1493	movl	15(%eax), %ecx
1494	movl	%ecx, 15(%edx)
1495L(bk_write_15bytes):
1496	movl	11(%eax), %ecx
1497	movl	%ecx, 11(%edx)
1498L(bk_write_11bytes):
1499	movl	7(%eax), %ecx
1500	movl	%ecx, 7(%edx)
1501L(bk_write_7bytes):
1502	movl	3(%eax), %ecx
1503	movl	%ecx, 3(%edx)
1504L(bk_write_3bytes):
1505	movzwl	1(%eax), %ecx
1506	movw	%cx, 1(%edx)
1507	movzbl	(%eax), %eax
1508	movb	%al, (%edx)
1509#ifndef USE_AS_BCOPY
1510	movl	DEST(%esp), %eax
1511# ifdef USE_AS_MEMPCPY
1512	movl	LEN(%esp), %ecx
1513	add	%ecx, %eax
1514# endif
1515#endif
1516	RETURN_END
1517
1518
1519	.pushsection .rodata.ssse3,"a",@progbits
1520	ALIGN (2)
1521L(table_48bytes_fwd):
1522	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
1523	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
1524	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
1525	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
1526	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
1527	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
1528	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
1529	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
1530	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
1531	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
1532	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
1533	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
1534	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
1535	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
1536	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
1537	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
1538	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
1539	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
1540	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
1541	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
1542	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
1543	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
1544	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
1545	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
1546	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
1547	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
1548	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
1549	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
1550	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
1551	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
1552	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
1553	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
1554	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
1555	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
1556	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
1557	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
1558	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
1559	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
1560	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
1561	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
1562	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
1563	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
1564	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
1565	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
1566	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
1567	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
1568	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
1569	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
1570
1571	ALIGN (2)
1572L(shl_table):
1573	.int	JMPTBL (L(shl_0), L(shl_table))
1574	.int	JMPTBL (L(shl_1), L(shl_table))
1575	.int	JMPTBL (L(shl_2), L(shl_table))
1576	.int	JMPTBL (L(shl_3), L(shl_table))
1577	.int	JMPTBL (L(shl_4), L(shl_table))
1578	.int	JMPTBL (L(shl_5), L(shl_table))
1579	.int	JMPTBL (L(shl_6), L(shl_table))
1580	.int	JMPTBL (L(shl_7), L(shl_table))
1581	.int	JMPTBL (L(shl_8), L(shl_table))
1582	.int	JMPTBL (L(shl_9), L(shl_table))
1583	.int	JMPTBL (L(shl_10), L(shl_table))
1584	.int	JMPTBL (L(shl_11), L(shl_table))
1585	.int	JMPTBL (L(shl_12), L(shl_table))
1586	.int	JMPTBL (L(shl_13), L(shl_table))
1587	.int	JMPTBL (L(shl_14), L(shl_table))
1588	.int	JMPTBL (L(shl_15), L(shl_table))
1589
1590	ALIGN (2)
1591L(table_48_bytes_bwd):
1592	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
1593	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
1594	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
1595	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
1596	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
1597	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
1598	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
1599	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
1600	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
1601	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
1602	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
1603	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
1604	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
1605	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
1606	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
1607	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
1608	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
1609	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
1610	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
1611	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
1612	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
1613	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
1614	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
1615	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
1616	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
1617	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
1618	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
1619	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
1620	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
1621	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
1622	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
1623	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
1624	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
1625	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
1626	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
1627	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
1628	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
1629	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
1630	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
1631	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
1632	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
1633	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
1634	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
1635	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
1636	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
1637	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
1638	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
1639	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
1640
1641	.popsection
1642
1643#ifdef USE_AS_MEMMOVE
1644	ALIGN (4)
1645L(copy_backward):
1646	PUSH (%esi)
1647	movl	%eax, %esi
1648	lea	(%ecx,%edx,1),%edx
1649	lea	(%ecx,%esi,1),%esi
1650	testl	$0x3, %edx
1651	jnz	L(bk_align)
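/* memmove with dest > src: EDX and ESI now point one past the end of the
   destination and source, and the copy walks backwards.  L(bk_align)
   copies 1-3 bytes to make the end pointer 4-byte aligned, then
   L(bk_ssse3_align) reaches 16-byte alignment before the 64-byte
   L(bk_ssse3_cpy) loop (movdqu loads, movdqa stores); the backward jump
   table finishes the last bytes.  */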
1652
1653L(bk_aligned_4):
1654	cmp	$64, %ecx
1655	jae	L(bk_write_more64bytes)
1656
1657L(bk_write_64bytesless):
1658	cmp	$32, %ecx
1659	jb	L(bk_write_less32bytes)
1660
1661L(bk_write_more32bytes):
1662	/* Copy 32 bytes at a time.  */
1663	sub	$32, %ecx
1664	movl	-4(%esi), %eax
1665	movl	%eax, -4(%edx)
1666	movl	-8(%esi), %eax
1667	movl	%eax, -8(%edx)
1668	movl	-12(%esi), %eax
1669	movl	%eax, -12(%edx)
1670	movl	-16(%esi), %eax
1671	movl	%eax, -16(%edx)
1672	movl	-20(%esi), %eax
1673	movl	%eax, -20(%edx)
1674	movl	-24(%esi), %eax
1675	movl	%eax, -24(%edx)
1676	movl	-28(%esi), %eax
1677	movl	%eax, -28(%edx)
1678	movl	-32(%esi), %eax
1679	movl	%eax, -32(%edx)
1680	sub	$32, %edx
1681	sub	$32, %esi
1682
1683L(bk_write_less32bytes):
1684	movl	%esi, %eax
1685	sub	%ecx, %edx
1686	sub	%ecx, %eax
1687	POP (%esi)
1688L(bk_write_less32bytes_2):
1689	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
1690
1691	ALIGN (4)
1692L(bk_align):
1693	cmp	$8, %ecx
1694	jbe	L(bk_write_less32bytes)
1695	testl	$1, %edx
1696	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0,
1697	   then (EDX & 2) must be != 0.  */
1698	jz	L(bk_got2)
1699	sub	$1, %esi
1700	sub	$1, %ecx
1701	sub	$1, %edx
1702	movzbl	(%esi), %eax
1703	movb	%al, (%edx)
1704
1705	testl	$2, %edx
1706	jz	L(bk_aligned_4)
1707
1708L(bk_got2):
1709	sub	$2, %esi
1710	sub	$2, %ecx
1711	sub	$2, %edx
1712	movzwl	(%esi), %eax
1713	movw	%ax, (%edx)
1714	jmp	L(bk_aligned_4)
1715
1716	ALIGN (4)
1717L(bk_write_more64bytes):
1718	/* Check alignment of the last byte.  */
1719	testl	$15, %edx
1720	jz	L(bk_ssse3_cpy_pre)
1721
1722/* EDX is aligned 4 bytes, but not 16 bytes.  */
1723L(bk_ssse3_align):
1724	sub	$4, %esi
1725	sub	$4, %ecx
1726	sub	$4, %edx
1727	movl	(%esi), %eax
1728	movl	%eax, (%edx)
1729
1730	testl	$15, %edx
1731	jz	L(bk_ssse3_cpy_pre)
1732
1733	sub	$4, %esi
1734	sub	$4, %ecx
1735	sub	$4, %edx
1736	movl	(%esi), %eax
1737	movl	%eax, (%edx)
1738
1739	testl	$15, %edx
1740	jz	L(bk_ssse3_cpy_pre)
1741
1742	sub	$4, %esi
1743	sub	$4, %ecx
1744	sub	$4, %edx
1745	movl	(%esi), %eax
1746	movl	%eax, (%edx)
1747
1748L(bk_ssse3_cpy_pre):
1749	cmp	$64, %ecx
1750	jb	L(bk_write_more32bytes)
1751
1752L(bk_ssse3_cpy):
1753	sub	$64, %esi
1754	sub	$64, %ecx
1755	sub	$64, %edx
1756	movdqu	0x30(%esi), %xmm3
1757	movdqa	%xmm3, 0x30(%edx)
1758	movdqu	0x20(%esi), %xmm2
1759	movdqa	%xmm2, 0x20(%edx)
1760	movdqu	0x10(%esi), %xmm1
1761	movdqa	%xmm1, 0x10(%edx)
1762	movdqu	(%esi), %xmm0
1763	movdqa	%xmm0, (%edx)
1764	cmp	$64, %ecx
1765	jae	L(bk_ssse3_cpy)
1766	jmp	L(bk_write_64bytesless)
1767
1768#endif
1769
1770END (MEMCPY)
1771