• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2Copyright (c) 2010, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
31#ifndef MEMCPY
32# define MEMCPY         ssse3_memcpy5
33#endif
34
35#ifndef L
36# define L(label)	.L##label
37#endif
38
39#ifndef ALIGN
40# define ALIGN(n)	.p2align n
41#endif
42
43#ifndef cfi_startproc
44# define cfi_startproc			.cfi_startproc
45#endif
46
47#ifndef cfi_endproc
48# define cfi_endproc			.cfi_endproc
49#endif
50
51#ifndef cfi_rel_offset
52# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
53#endif
54
55#ifndef cfi_restore
56# define cfi_restore(reg)		.cfi_restore reg
57#endif
58
59#ifndef cfi_adjust_cfa_offset
60# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
61#endif
62
63#ifndef cfi_remember_state
64# define cfi_remember_state		.cfi_remember_state
65#endif
66
67#ifndef cfi_restore_state
68# define cfi_restore_state		.cfi_restore_state
69#endif
70
71#ifndef ENTRY
/* Begin a global function NAME: give the symbol function type, export
   it, align it with .p2align 4 (16 bytes), place the label and open
   its CFI region.  Only defined when the includer has not already
   supplied ENTRY.  */
72# define ENTRY(name)			\
73	.type name,  @function; 	\
74	.globl name;			\
75	.p2align 4;			\
76name:					\
77	cfi_startproc
78#endif
79
80#ifndef END
/* Finish function NAME: close the CFI region opened by ENTRY and set
   the symbol size to the distance from NAME to the current location.  */
81# define END(name)			\
82	cfi_endproc;			\
83	.size name, .-name
84#endif
85
/* Offsets of the three stack arguments, used as SRC(%esp), DEST(%esp)
   and LEN(%esp).  PARMS (defined further down) is the offset of the
   first argument; each following argument is 4 bytes higher.  With
   USE_AS_BCOPY the argument order is (src, dest, len); otherwise it is
   (dest, src, len).  The definitions are unparenthesized sums, so they
   are only meant to be used directly as a displacement.  */
86#ifdef USE_AS_BCOPY
87# define SRC		PARMS
88# define DEST		SRC+4
89# define LEN		DEST+4
90#else
91# define DEST		PARMS
92# define SRC		DEST+4
93# define LEN		SRC+4
94#endif
95
/* Unwind annotation for a 4-byte push of REG: the CFA offset grows by 4
   and REG is recorded as saved at offset 0 from the stack pointer.
   Emits no machine instruction.  */
96#define CFI_PUSH(REG)						\
97  cfi_adjust_cfa_offset (4);					\
98  cfi_rel_offset (REG, 0)
99
/* Unwind annotation for a 4-byte pop of REG: the CFA offset shrinks by 4
   and REG is marked as restored.  Emits no machine instruction.  */
100#define CFI_POP(REG)						\
101  cfi_adjust_cfa_offset (-4);					\
102  cfi_restore (REG)
103
/* Push / pop REG and emit the matching unwind annotation.  */
104#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
105#define POP(REG)	popl REG; CFI_POP (REG)
106
107#ifdef SHARED
/* SHARED (position-independent) configuration.  EBX is used by the
   jump-table macros below, so ENTRANCE saves it and every return path
   restores it.  PARMS is 8 because the saved EBX and the return address
   both sit below the first argument.
   RETURN_END pops EBX and returns.  RETURN does the same and then emits
   CFI_PUSH (%ebx), an annotation only, so that the code assembled after
   the "ret" is again described as having EBX on the stack.
   JMPTBL (I, B) is defined as I - B, i.e. the difference between label
   I and label B.  */
108# define PARMS		8		/* Preserve EBX.  */
109# define ENTRANCE	PUSH (%ebx);
110# define RETURN_END	POP (%ebx); ret
111# define RETURN		RETURN_END; CFI_PUSH (%ebx)
112# define JMPTBL(I, B)	I - B
113
114/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
115   jump table with relative offsets.  INDEX is a register that contains
116   the index into the jump table.  SCALE is the scale of INDEX.
   The thunk returns the address of the instruction after the call,
   which is the first addl, so adding (TABLE - .) there leaves the
   address of TABLE in EBX.  Clobbers EBX; on arrival at the target EBX
   holds the target's own address.  */
117# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
118    /* We first load PC into EBX.  */				\
119    call	__i686.get_pc_thunk.bx;				\
120    /* Get the address of the jump table.  */			\
121    addl	$(TABLE - .), %ebx;				\
122    /* Get the entry and convert the relative offset to the	\
123       absolute address.  */					\
124    addl	(%ebx,INDEX,SCALE), %ebx;			\
125    /* We loaded the jump table.  Go.  */			\
126    jmp		*%ebx
127
/* Two-part form of BRANCH_TO_JMPTBL_ENTRY that avoids a second call to
   the PC thunk.
   BRANCH_TO_JMPTBL_ENTRY_VALUE adds (TABLE - .) to EBX.  This yields the
   address of TABLE only if EBX already equals the address of this very
   addl instruction.  The shl_N blocks use it as their first
   instruction; presumably they are reached only through
   BRANCH_TO_JMPTBL_ENTRY, whose "jmp *%ebx" leaves the target address
   in EBX -- confirm against the shl_table contents.
   BRANCH_TO_JMPTBL_ENTRY_TAIL later adds the selected relative entry to
   EBX and jumps; EBX must not be modified in between.  */
128# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
129    addl	$(TABLE - .), %ebx
130
131# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
132    addl	(%ebx,INDEX,SCALE), %ebx;			\
133    /* We loaded the jump table.  Go.  */			\
134    jmp		*%ebx
135
/* PC thunk: copies the caller's return address (the word at the top of
   the stack) into EBX and returns, so the caller obtains the address
   of the instruction following its "call".  Emitted as a hidden global
   symbol in its own .gnu.linkonce section.  */
136	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
137	.globl	__i686.get_pc_thunk.bx
138	.hidden	__i686.get_pc_thunk.bx
139	ALIGN (4)
140	.type	__i686.get_pc_thunk.bx,@function
141__i686.get_pc_thunk.bx:
142	movl	(%esp), %ebx
143	ret
144#else
/* Non-SHARED configuration.  Nothing is pushed on entry, so the first
   argument is at 4(%esp), just above the return address; ENTRANCE is
   empty and both return macros are a plain "ret".  JMPTBL (I, B)
   expands to I alone, ignoring the base B.  */
145# define PARMS		4
146# define ENTRANCE
147# define RETURN_END	ret
148# define RETURN		RETURN_END
149# define JMPTBL(I, B)	I
150
151/* Branch to an entry in a jump table.  TABLE is a jump table with
152   absolute addresses.  INDEX is a register that contains the index
153   into the jump table.  SCALE is the scale of INDEX.  No register is
   clobbered: the jump goes indirectly through the table entry.  */
154# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
155    jmp		*TABLE(,INDEX,SCALE)
156
/* Without SHARED there is no table base to precompute, so the VALUE
   half expands to nothing.  */
157# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
158
/* Without SHARED the TAIL half is identical to BRANCH_TO_JMPTBL_ENTRY.  */
159# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
160    jmp		*TABLE(,INDEX,SCALE)
161#endif
162
/* Entry point.  Loads the three stack arguments and picks a strategy.
   Register roles from here on:
     ECX = byte count, EAX = source pointer, EDX = destination pointer.
   With SHARED, ENTRANCE has pushed EBX first.  */
163	.section .text.ssse3,"ax",@progbits
164ENTRY (MEMCPY)
165	ENTRANCE
166	movl	LEN(%esp), %ecx
167	movl	SRC(%esp), %eax
168	movl	DEST(%esp), %edx
169
170#ifdef USE_AS_MEMMOVE
	/* dest < src (unsigned): copy forward.  dest == src: go straight
	   to the return sequence at fwd_write_0bytes.  */
171	cmp	%eax, %edx
172	jb	L(copy_forward)
173	je	L(fwd_write_0bytes)
	/* dest > src.  Counts below 32 go to bk_write_less32bytes_2 (not in
	   this part of the file).  */
174	cmp	$32, %ecx
175	jae	L(memmove_bwd)
176	jmp	L(bk_write_less32bytes_2)
177L(memmove_bwd):
	/* Compare dest with src + len.  The movl that reloads EAX does not
	   change the flags, so the jb below still tests that compare:
	   dest < src + len goes to copy_backward, anything else falls
	   through to the forward copy.  */
178	add	%ecx, %eax
179	cmp	%eax, %edx
180	movl	SRC(%esp), %eax
181	jb	L(copy_backward)
182
183L(copy_forward):
184#endif
	/* 48 bytes or more take the SSE path.  */
185	cmp	$48, %ecx
186	jae	L(48bytesormore)
187
	/* Here ECX < 48.  NOTE(review): the label says "less32bytes" but the
	   only visible branch around it is the cmp $48 above.  */
188L(fwd_write_less32bytes):
189#ifndef USE_AS_MEMMOVE
	/* Only the low bytes of src (AL) and dest (DL) are compared; when
	   AL < DL (unsigned) the bk_write dispatch below is used.
	   NOTE(review): presumably just a cheap choice of copy direction --
	   confirm the intent.  */
190	cmp	%dl, %al
191	jb	L(bk_write)
192#endif
	/* Advance both pointers to the end of the buffers and dispatch on
	   the byte count; the fwd_write_Nbytes targets address their data
	   with negative displacements from EAX/EDX.  */
193	add	%ecx, %edx
194	add	%ecx, %eax
195	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
196#ifndef USE_AS_MEMMOVE
197L(bk_write):
	/* Dispatch on the byte count with EAX/EDX still pointing at the
	   start of the buffers.  */
198	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
199#endif
200
201	ALIGN (4)
202/* Reached from the entry sequence when ECX >= 48.  The destination in
   EDX may have any alignment; this block rounds it up to a 16-byte
   boundary.  On exit:
     XMM0 = first 16 source bytes, not yet stored,
     ESI  = original destination (where XMM0 must be stored),
     EDX  = first 16-byte boundary strictly above the original dest,
     EAX  = source advanced by the same number of bytes (1..16),
     ECX  = bytes left to copy starting at EDX.
   EDI and ESI are pushed here and popped by the blocks branched to.  */
203L(48bytesormore):
204	movdqu	(%eax), %xmm0
205	PUSH (%edi)
206	movl	%edx, %edi
207	and	$-16, %edx
208	PUSH (%esi)
209	cfi_remember_state
210	add	$16, %edx
211	movl	%edi, %esi
	/* EDI = original dest - aligned dest, a value in [-16, -1].  */
212	sub	%edx, %edi
213	add	%edi, %ecx
214	sub	%edi, %eax
215
	/* Compare the remaining count with the shared-cache threshold, taken
	   from a build-time constant if defined, else from the variable
	   __x86_shared_cache_size_half (via the GOT base when SHARED).  */
216#ifdef SHARED_CACHE_SIZE_HALF
217	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
218#else
219# ifdef SHARED
220	call	__i686.get_pc_thunk.bx
221	add	$_GLOBAL_OFFSET_TABLE_, %ebx
222	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
223# else
224	cmp	__x86_shared_cache_size_half, %ecx
225# endif
226#endif
227
	/* mov leaves the flags alone, so jae still tests the compare above:
	   counts at or above the threshold take the large_page path.  */
228	mov	%eax, %edi
229	jae	L(large_page)
	/* EDI = source address modulo 16.  Zero means source and destination
	   are now both 16-byte aligned (shl_0); otherwise dispatch on the
	   misalignment through shl_table.  */
230	and	$0xf, %edi
231	jz	L(shl_0)
232
233	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
234
/* shl_0: source (EAX) and destination (EDX) are both 16-byte aligned.
   Stores the pending head in XMM0 to the original destination (ESI),
   then copies with aligned 16-byte moves.  Counts above 127 go to
   shl_0_gobble; otherwise up to four 32-byte steps are done inline,
   with EDI as the running byte offset.  */
235	cfi_restore_state
236	cfi_remember_state
237	ALIGN (4)
238L(shl_0):
239	movdqu	%xmm0, (%esi)
240	xor	%edi, %edi
241	POP (%esi)
242	cmp	$127, %ecx
243	ja	L(shl_0_gobble)
	/* Bias ECX by -32 so that the borrow from each "sub $32" below tells
	   whether another full 32-byte step remains.  */
244	lea	-32(%ecx), %ecx
245L(shl_0_loop):
246	movdqa	(%eax, %edi), %xmm0
247	movdqa	16(%eax, %edi), %xmm1
248	sub	$32, %ecx
249	movdqa	%xmm0, (%edx, %edi)
250	movdqa	%xmm1, 16(%edx, %edi)
	/* lea does not touch the flags; jb tests the borrow of the sub.  */
251	lea	32(%edi), %edi
252	jb	L(shl_0_end)
253
254	movdqa	(%eax, %edi), %xmm0
255	movdqa	16(%eax, %edi), %xmm1
256	sub	$32, %ecx
257	movdqa	%xmm0, (%edx, %edi)
258	movdqa	%xmm1, 16(%edx, %edi)
259	lea	32(%edi), %edi
260	jb	L(shl_0_end)
261
262	movdqa	(%eax, %edi), %xmm0
263	movdqa	16(%eax, %edi), %xmm1
264	sub	$32, %ecx
265	movdqa	%xmm0, (%edx, %edi)
266	movdqa	%xmm1, 16(%edx, %edi)
267	lea	32(%edi), %edi
268	jb	L(shl_0_end)
269
	/* Fourth and last step: ECX was at most 127 on entry, so no further
	   test is needed and control falls into shl_0_end.  */
270	movdqa	(%eax, %edi), %xmm0
271	movdqa	16(%eax, %edi), %xmm1
272	sub	$32, %ecx
273	movdqa	%xmm0, (%edx, %edi)
274	movdqa	%xmm1, 16(%edx, %edi)
275	lea	32(%edi), %edi
276L(shl_0_end):
	/* Undo the bias: ECX = number of tail bytes.  Move EAX/EDX to the
	   end of the buffers and let the forward table copy the tail.  */
277	lea	32(%ecx), %ecx
278	add	%ecx, %edi
279	add	%edi, %edx
280	add	%edi, %eax
281	POP (%edi)
282	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
283
284	CFI_PUSH (%edi)
285L(shl_0_gobble):
286
287#ifdef DATA_CACHE_SIZE_HALF
288	cmp	$DATA_CACHE_SIZE_HALF, %ecx
289#else
290# ifdef SHARED
291	call	__i686.get_pc_thunk.bx
292	add	$_GLOBAL_OFFSET_TABLE_, %ebx
293	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
294# else
295	cmp	__x86_data_cache_size_half, %ecx
296# endif
297#endif
298
299	POP (%edi)
300	lea	-128(%ecx), %ecx
301	jae	L(shl_0_gobble_mem_loop)
302L(shl_0_gobble_cache_loop):
303	movdqa	(%eax), %xmm0
304	movdqa	0x10(%eax), %xmm1
305	movdqa	0x20(%eax), %xmm2
306	movdqa	0x30(%eax), %xmm3
307	movdqa	0x40(%eax), %xmm4
308	movdqa	0x50(%eax), %xmm5
309	movdqa	0x60(%eax), %xmm6
310	movdqa	0x70(%eax), %xmm7
311	lea	0x80(%eax), %eax
312	sub	$128, %ecx
313	movdqa	%xmm0, (%edx)
314	movdqa	%xmm1, 0x10(%edx)
315	movdqa	%xmm2, 0x20(%edx)
316	movdqa	%xmm3, 0x30(%edx)
317	movdqa	%xmm4, 0x40(%edx)
318	movdqa	%xmm5, 0x50(%edx)
319	movdqa	%xmm6, 0x60(%edx)
320	movdqa	%xmm7, 0x70(%edx)
321	lea	0x80(%edx), %edx
322
323	jae	L(shl_0_gobble_cache_loop)
324	cmp	$-0x40, %ecx
325	lea	0x80(%ecx), %ecx
326	jl	L(shl_0_cache_less_64bytes)
327
328	movdqa	(%eax), %xmm0
329	sub	$0x40, %ecx
330	movdqa	0x10(%eax), %xmm1
331
332	movdqa	%xmm0, (%edx)
333	movdqa	%xmm1, 0x10(%edx)
334
335	movdqa	0x20(%eax), %xmm0
336	movdqa	0x30(%eax), %xmm1
337	add	$0x40, %eax
338
339	movdqa	%xmm0, 0x20(%edx)
340	movdqa	%xmm1, 0x30(%edx)
341	add	$0x40, %edx
342L(shl_0_cache_less_64bytes):
343	cmp	$0x20, %ecx
344	jb	L(shl_0_cache_less_32bytes)
345	movdqa	(%eax), %xmm0
346	sub	$0x20, %ecx
347	movdqa	0x10(%eax), %xmm1
348	add	$0x20, %eax
349	movdqa	%xmm0, (%edx)
350	movdqa	%xmm1, 0x10(%edx)
351	add	$0x20, %edx
352L(shl_0_cache_less_32bytes):
353	cmp	$0x10, %ecx
354	jb	L(shl_0_cache_less_16bytes)
355	sub	$0x10, %ecx
356	movdqa	(%eax), %xmm0
357	add	$0x10, %eax
358	movdqa	%xmm0, (%edx)
359	add	$0x10, %edx
360L(shl_0_cache_less_16bytes):
361	add	%ecx, %edx
362	add	%ecx, %eax
363	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
364
365
366	ALIGN (4)
367L(shl_0_gobble_mem_loop):
368	prefetcht0 0x1c0(%eax)
369	prefetcht0 0x280(%eax)
370	prefetcht0 0x1c0(%edx)
371
372	movdqa	(%eax), %xmm0
373	movdqa	0x10(%eax), %xmm1
374	movdqa	0x20(%eax), %xmm2
375	movdqa	0x30(%eax), %xmm3
376	movdqa	0x40(%eax), %xmm4
377	movdqa	0x50(%eax), %xmm5
378	movdqa	0x60(%eax), %xmm6
379	movdqa	0x70(%eax), %xmm7
380	lea	0x80(%eax), %eax
381	sub	$0x80, %ecx
382	movdqa	%xmm0, (%edx)
383	movdqa	%xmm1, 0x10(%edx)
384	movdqa	%xmm2, 0x20(%edx)
385	movdqa	%xmm3, 0x30(%edx)
386	movdqa	%xmm4, 0x40(%edx)
387	movdqa	%xmm5, 0x50(%edx)
388	movdqa	%xmm6, 0x60(%edx)
389	movdqa	%xmm7, 0x70(%edx)
390	lea	0x80(%edx), %edx
391
392	jae	L(shl_0_gobble_mem_loop)
393	cmp	$-0x40, %ecx
394	lea	0x80(%ecx), %ecx
395	jl	L(shl_0_mem_less_64bytes)
396
397	movdqa	(%eax), %xmm0
398	sub	$0x40, %ecx
399	movdqa	0x10(%eax), %xmm1
400
401	movdqa	%xmm0, (%edx)
402	movdqa	%xmm1, 0x10(%edx)
403
404	movdqa	0x20(%eax), %xmm0
405	movdqa	0x30(%eax), %xmm1
406	add	$0x40, %eax
407
408	movdqa	%xmm0, 0x20(%edx)
409	movdqa	%xmm1, 0x30(%edx)
410	add	$0x40, %edx
411L(shl_0_mem_less_64bytes):
412	cmp	$0x20, %ecx
413	jb	L(shl_0_mem_less_32bytes)
414	movdqa	(%eax), %xmm0
415	sub	$0x20, %ecx
416	movdqa	0x10(%eax), %xmm1
417	add	$0x20, %eax
418	movdqa	%xmm0, (%edx)
419	movdqa	%xmm1, 0x10(%edx)
420	add	$0x20, %edx
421L(shl_0_mem_less_32bytes):
422	cmp	$0x10, %ecx
423	jb	L(shl_0_mem_less_16bytes)
424	sub	$0x10, %ecx
425	movdqa	(%eax), %xmm0
426	add	$0x10, %eax
427	movdqa	%xmm0, (%edx)
428	add	$0x10, %edx
429L(shl_0_mem_less_16bytes):
430	add	%ecx, %edx
431	add	%ecx, %eax
432	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
433
/* shl_1: copy loop for a source that is 1 byte past a 16-byte boundary
   while the destination (EDX) is 16-byte aligned.  Presumably reached
   only through shl_table with (source & 15) == 1 -- confirm against
   the table.  EAX is rounded down by 1 so that all loads are aligned;
   each output block is built with "palignr $1" from two neighbouring
   aligned source blocks.  EDI is the running byte offset, ECX the
   remaining count biased by -32.  With SHARED, EBX is set up once by
   the VALUE macro and consumed by the TAIL macro at the end, so it
   must stay untouched in between.  */
434	cfi_restore_state
435	cfi_remember_state
436	ALIGN (4)
437L(shl_1):
438	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
439	lea	-1(%eax), %eax
	/* XMM1 = aligned block that contains the first source byte.  */
440	movaps	(%eax), %xmm1
441	xor	%edi, %edi
442	lea	-32(%ecx), %ecx
	/* Store the pending head (loaded in 48bytesormore) to the original
	   destination, then drop the saved ESI.  */
443	movdqu	%xmm0, (%esi)
444	POP (%esi)
445L(shl_1_loop):
446
	/* First half: carry-in block is XMM1; the newest block is saved in
	   XMM4 as carry for the second half.  */
447	movdqa	16(%eax, %edi), %xmm2
448	sub	$32, %ecx
449	movdqa	32(%eax, %edi), %xmm3
450	movdqa	%xmm3, %xmm4
451	palignr	$1, %xmm2, %xmm3
452	palignr	$1, %xmm1, %xmm2
453	lea	32(%edi), %edi
454	movdqa	%xmm2, -32(%edx, %edi)
455	movdqa	%xmm3, -16(%edx, %edi)
456
	/* None of the instructions after the sub alter the flags, so this
	   tests its borrow.  */
457	jb	L(shl_1_end)
458
	/* Second half: carry-in block is XMM4; the newest block is saved in
	   XMM1 for the next iteration.  */
459	movdqa	16(%eax, %edi), %xmm2
460	sub	$32, %ecx
461	movdqa	32(%eax, %edi), %xmm3
462	movdqa	%xmm3, %xmm1
463	palignr	$1, %xmm2, %xmm3
464	palignr	$1, %xmm4, %xmm2
465	lea	32(%edi), %edi
466	movdqa	%xmm2, -32(%edx, %edi)
467	movdqa	%xmm3, -16(%edx, %edi)
468
469	jae	L(shl_1_loop)
470
471L(shl_1_end):
	/* Undo the bias: ECX = number of tail bytes.  Move EDX to the end of
	   the destination and EAX to the end of the source; the "1" in the
	   lea restores the byte subtracted at the top.  */
472	lea	32(%ecx), %ecx
473	add	%ecx, %edi
474	add	%edi, %edx
475	lea	1(%edi, %eax), %eax
476	POP (%edi)
477	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
478
479	cfi_restore_state
480	cfi_remember_state
481	ALIGN (4)
482L(shl_2):
483	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
484	lea	-2(%eax), %eax
485	movaps	(%eax), %xmm1
486	xor	%edi, %edi
487	lea	-32(%ecx), %ecx
488	movdqu	%xmm0, (%esi)
489	POP (%esi)
490L(shl_2_loop):
491
492	movdqa	16(%eax, %edi), %xmm2
493	sub	$32, %ecx
494	movdqa	32(%eax, %edi), %xmm3
495	movdqa	%xmm3, %xmm4
496	palignr	$2, %xmm2, %xmm3
497	palignr	$2, %xmm1, %xmm2
498	lea	32(%edi), %edi
499	movdqa	%xmm2, -32(%edx, %edi)
500	movdqa	%xmm3, -16(%edx, %edi)
501
502	jb	L(shl_2_end)
503
504	movdqa	16(%eax, %edi), %xmm2
505	sub	$32, %ecx
506	movdqa	32(%eax, %edi), %xmm3
507	movdqa	%xmm3, %xmm1
508	palignr	$2, %xmm2, %xmm3
509	palignr	$2, %xmm4, %xmm2
510	lea	32(%edi), %edi
511	movdqa	%xmm2, -32(%edx, %edi)
512	movdqa	%xmm3, -16(%edx, %edi)
513
514	jae	L(shl_2_loop)
515
516L(shl_2_end):
517	lea	32(%ecx), %ecx
518	add	%ecx, %edi
519	add	%edi, %edx
520	lea	2(%edi, %eax), %eax
521	POP (%edi)
522	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
523
524	cfi_restore_state
525	cfi_remember_state
526	ALIGN (4)
527L(shl_3):
528	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
529	lea	-3(%eax), %eax
530	movaps	(%eax), %xmm1
531	xor	%edi, %edi
532	lea	-32(%ecx), %ecx
533	movdqu	%xmm0, (%esi)
534	POP (%esi)
535L(shl_3_loop):
536
537	movdqa	16(%eax, %edi), %xmm2
538	sub	$32, %ecx
539	movdqa	32(%eax, %edi), %xmm3
540	movdqa	%xmm3, %xmm4
541	palignr	$3, %xmm2, %xmm3
542	palignr	$3, %xmm1, %xmm2
543	lea	32(%edi), %edi
544	movdqa	%xmm2, -32(%edx, %edi)
545	movdqa	%xmm3, -16(%edx, %edi)
546
547	jb	L(shl_3_end)
548
549	movdqa	16(%eax, %edi), %xmm2
550	sub	$32, %ecx
551	movdqa	32(%eax, %edi), %xmm3
552	movdqa	%xmm3, %xmm1
553	palignr	$3, %xmm2, %xmm3
554	palignr	$3, %xmm4, %xmm2
555	lea	32(%edi), %edi
556	movdqa	%xmm2, -32(%edx, %edi)
557	movdqa	%xmm3, -16(%edx, %edi)
558
559	jae	L(shl_3_loop)
560
561L(shl_3_end):
562	lea	32(%ecx), %ecx
563	add	%ecx, %edi
564	add	%edi, %edx
565	lea	3(%edi, %eax), %eax
566	POP (%edi)
567	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
568
569	cfi_restore_state
570	cfi_remember_state
571	ALIGN (4)
572L(shl_4):
573	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
574	lea	-4(%eax), %eax
575	movaps	(%eax), %xmm1
576	xor	%edi, %edi
577	lea	-32(%ecx), %ecx
578	movdqu	%xmm0, (%esi)
579	POP (%esi)
580L(shl_4_loop):
581
582	movdqa	16(%eax, %edi), %xmm2
583	sub	$32, %ecx
584	movdqa	32(%eax, %edi), %xmm3
585	movdqa	%xmm3, %xmm4
586	palignr	$4, %xmm2, %xmm3
587	palignr	$4, %xmm1, %xmm2
588	lea	32(%edi), %edi
589	movdqa	%xmm2, -32(%edx, %edi)
590	movdqa	%xmm3, -16(%edx, %edi)
591
592	jb	L(shl_4_end)
593
594	movdqa	16(%eax, %edi), %xmm2
595	sub	$32, %ecx
596	movdqa	32(%eax, %edi), %xmm3
597	movdqa	%xmm3, %xmm1
598	palignr	$4, %xmm2, %xmm3
599	palignr	$4, %xmm4, %xmm2
600	lea	32(%edi), %edi
601	movdqa	%xmm2, -32(%edx, %edi)
602	movdqa	%xmm3, -16(%edx, %edi)
603
604	jae	L(shl_4_loop)
605
606L(shl_4_end):
607	lea	32(%ecx), %ecx
608	add	%ecx, %edi
609	add	%edi, %edx
610	lea	4(%edi, %eax), %eax
611	POP (%edi)
612	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
613
614	cfi_restore_state
615	cfi_remember_state
616	ALIGN (4)
617L(shl_5):
618	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
619	lea	-5(%eax), %eax
620	movaps	(%eax), %xmm1
621	xor	%edi, %edi
622	lea	-32(%ecx), %ecx
623	movdqu	%xmm0, (%esi)
624	POP (%esi)
625L(shl_5_loop):
626
627	movdqa	16(%eax, %edi), %xmm2
628	sub	$32, %ecx
629	movdqa	32(%eax, %edi), %xmm3
630	movdqa	%xmm3, %xmm4
631	palignr	$5, %xmm2, %xmm3
632	palignr	$5, %xmm1, %xmm2
633	lea	32(%edi), %edi
634	movdqa	%xmm2, -32(%edx, %edi)
635	movdqa	%xmm3, -16(%edx, %edi)
636
637	jb	L(shl_5_end)
638
639	movdqa	16(%eax, %edi), %xmm2
640	sub	$32, %ecx
641	movdqa	32(%eax, %edi), %xmm3
642	movdqa	%xmm3, %xmm1
643	palignr	$5, %xmm2, %xmm3
644	palignr	$5, %xmm4, %xmm2
645	lea	32(%edi), %edi
646	movdqa	%xmm2, -32(%edx, %edi)
647	movdqa	%xmm3, -16(%edx, %edi)
648
649	jae	L(shl_5_loop)
650
651L(shl_5_end):
652	lea	32(%ecx), %ecx
653	add	%ecx, %edi
654	add	%edi, %edx
655	lea	5(%edi, %eax), %eax
656	POP (%edi)
657	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
658
659	cfi_restore_state
660	cfi_remember_state
661	ALIGN (4)
662L(shl_6):
663	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
664	lea	-6(%eax), %eax
665	movaps	(%eax), %xmm1
666	xor	%edi, %edi
667	lea	-32(%ecx), %ecx
668	movdqu	%xmm0, (%esi)
669	POP (%esi)
670L(shl_6_loop):
671
672	movdqa	16(%eax, %edi), %xmm2
673	sub	$32, %ecx
674	movdqa	32(%eax, %edi), %xmm3
675	movdqa	%xmm3, %xmm4
676	palignr	$6, %xmm2, %xmm3
677	palignr	$6, %xmm1, %xmm2
678	lea	32(%edi), %edi
679	movdqa	%xmm2, -32(%edx, %edi)
680	movdqa	%xmm3, -16(%edx, %edi)
681
682	jb	L(shl_6_end)
683
684	movdqa	16(%eax, %edi), %xmm2
685	sub	$32, %ecx
686	movdqa	32(%eax, %edi), %xmm3
687	movdqa	%xmm3, %xmm1
688	palignr	$6, %xmm2, %xmm3
689	palignr	$6, %xmm4, %xmm2
690	lea	32(%edi), %edi
691	movdqa	%xmm2, -32(%edx, %edi)
692	movdqa	%xmm3, -16(%edx, %edi)
693
694	jae	L(shl_6_loop)
695
696L(shl_6_end):
697	lea	32(%ecx), %ecx
698	add	%ecx, %edi
699	add	%edi, %edx
700	lea	6(%edi, %eax), %eax
701	POP (%edi)
702	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
703
704	cfi_restore_state
705	cfi_remember_state
706	ALIGN (4)
707L(shl_7):
708	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
709	lea	-7(%eax), %eax
710	movaps	(%eax), %xmm1
711	xor	%edi, %edi
712	lea	-32(%ecx), %ecx
713	movdqu	%xmm0, (%esi)
714	POP (%esi)
715L(shl_7_loop):
716
717	movdqa	16(%eax, %edi), %xmm2
718	sub	$32, %ecx
719	movdqa	32(%eax, %edi), %xmm3
720	movdqa	%xmm3, %xmm4
721	palignr	$7, %xmm2, %xmm3
722	palignr	$7, %xmm1, %xmm2
723	lea	32(%edi), %edi
724	movdqa	%xmm2, -32(%edx, %edi)
725	movdqa	%xmm3, -16(%edx, %edi)
726
727	jb	L(shl_7_end)
728
729	movdqa	16(%eax, %edi), %xmm2
730	sub	$32, %ecx
731	movdqa	32(%eax, %edi), %xmm3
732	movdqa	%xmm3, %xmm1
733	palignr	$7, %xmm2, %xmm3
734	palignr	$7, %xmm4, %xmm2
735	lea	32(%edi), %edi
736	movdqa	%xmm2, -32(%edx, %edi)
737	movdqa	%xmm3, -16(%edx, %edi)
738
739	jae	L(shl_7_loop)
740
741L(shl_7_end):
742	lea	32(%ecx), %ecx
743	add	%ecx, %edi
744	add	%edi, %edx
745	lea	7(%edi, %eax), %eax
746	POP (%edi)
747	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
748
749	cfi_restore_state
750	cfi_remember_state
751	ALIGN (4)
752L(shl_8):
753	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
754	lea	-8(%eax), %eax
755	movaps	(%eax), %xmm1
756	xor	%edi, %edi
757	lea	-32(%ecx), %ecx
758	movdqu	%xmm0, (%esi)
759	POP (%esi)
760L(shl_8_loop):
761
762	movdqa	16(%eax, %edi), %xmm2
763	sub	$32, %ecx
764	movdqa	32(%eax, %edi), %xmm3
765	movdqa	%xmm3, %xmm4
766	palignr	$8, %xmm2, %xmm3
767	palignr	$8, %xmm1, %xmm2
768	lea	32(%edi), %edi
769	movdqa	%xmm2, -32(%edx, %edi)
770	movdqa	%xmm3, -16(%edx, %edi)
771
772	jb	L(shl_8_end)
773
774	movdqa	16(%eax, %edi), %xmm2
775	sub	$32, %ecx
776	movdqa	32(%eax, %edi), %xmm3
777	movdqa	%xmm3, %xmm1
778	palignr	$8, %xmm2, %xmm3
779	palignr	$8, %xmm4, %xmm2
780	lea	32(%edi), %edi
781	movdqa	%xmm2, -32(%edx, %edi)
782	movdqa	%xmm3, -16(%edx, %edi)
783
784	jae	L(shl_8_loop)
785
786L(shl_8_end):
787	lea	32(%ecx), %ecx
788	add	%ecx, %edi
789	add	%edi, %edx
790	lea	8(%edi, %eax), %eax
791	POP (%edi)
792	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
793
794	cfi_restore_state
795	cfi_remember_state
796	ALIGN (4)
797L(shl_9):
798	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
799	lea	-9(%eax), %eax
800	movaps	(%eax), %xmm1
801	xor	%edi, %edi
802	lea	-32(%ecx), %ecx
803	movdqu	%xmm0, (%esi)
804	POP (%esi)
805L(shl_9_loop):
806
807	movdqa	16(%eax, %edi), %xmm2
808	sub	$32, %ecx
809	movdqa	32(%eax, %edi), %xmm3
810	movdqa	%xmm3, %xmm4
811	palignr	$9, %xmm2, %xmm3
812	palignr	$9, %xmm1, %xmm2
813	lea	32(%edi), %edi
814	movdqa	%xmm2, -32(%edx, %edi)
815	movdqa	%xmm3, -16(%edx, %edi)
816
817	jb	L(shl_9_end)
818
819	movdqa	16(%eax, %edi), %xmm2
820	sub	$32, %ecx
821	movdqa	32(%eax, %edi), %xmm3
822	movdqa	%xmm3, %xmm1
823	palignr	$9, %xmm2, %xmm3
824	palignr	$9, %xmm4, %xmm2
825	lea	32(%edi), %edi
826	movdqa	%xmm2, -32(%edx, %edi)
827	movdqa	%xmm3, -16(%edx, %edi)
828
829	jae	L(shl_9_loop)
830
831L(shl_9_end):
832	lea	32(%ecx), %ecx
833	add	%ecx, %edi
834	add	%edi, %edx
835	lea	9(%edi, %eax), %eax
836	POP (%edi)
837	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
838
839	cfi_restore_state
840	cfi_remember_state
841	ALIGN (4)
842L(shl_10):
843	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
844	lea	-10(%eax), %eax
845	movaps	(%eax), %xmm1
846	xor	%edi, %edi
847	lea	-32(%ecx), %ecx
848	movdqu	%xmm0, (%esi)
849	POP (%esi)
850L(shl_10_loop):
851
852	movdqa	16(%eax, %edi), %xmm2
853	sub	$32, %ecx
854	movdqa	32(%eax, %edi), %xmm3
855	movdqa	%xmm3, %xmm4
856	palignr	$10, %xmm2, %xmm3
857	palignr	$10, %xmm1, %xmm2
858	lea	32(%edi), %edi
859	movdqa	%xmm2, -32(%edx, %edi)
860	movdqa	%xmm3, -16(%edx, %edi)
861
862	jb	L(shl_10_end)
863
864	movdqa	16(%eax, %edi), %xmm2
865	sub	$32, %ecx
866	movdqa	32(%eax, %edi), %xmm3
867	movdqa	%xmm3, %xmm1
868	palignr	$10, %xmm2, %xmm3
869	palignr	$10, %xmm4, %xmm2
870	lea	32(%edi), %edi
871	movdqa	%xmm2, -32(%edx, %edi)
872	movdqa	%xmm3, -16(%edx, %edi)
873
874	jae	L(shl_10_loop)
875
876L(shl_10_end):
877	lea	32(%ecx), %ecx
878	add	%ecx, %edi
879	add	%edi, %edx
880	lea	10(%edi, %eax), %eax
881	POP (%edi)
882	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
883
884	cfi_restore_state
885	cfi_remember_state
886	ALIGN (4)
887L(shl_11):
888	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
889	lea	-11(%eax), %eax
890	movaps	(%eax), %xmm1
891	xor	%edi, %edi
892	lea	-32(%ecx), %ecx
893	movdqu	%xmm0, (%esi)
894	POP (%esi)
895L(shl_11_loop):
896
897	movdqa	16(%eax, %edi), %xmm2
898	sub	$32, %ecx
899	movdqa	32(%eax, %edi), %xmm3
900	movdqa	%xmm3, %xmm4
901	palignr	$11, %xmm2, %xmm3
902	palignr	$11, %xmm1, %xmm2
903	lea	32(%edi), %edi
904	movdqa	%xmm2, -32(%edx, %edi)
905	movdqa	%xmm3, -16(%edx, %edi)
906
907	jb	L(shl_11_end)
908
909	movdqa	16(%eax, %edi), %xmm2
910	sub	$32, %ecx
911	movdqa	32(%eax, %edi), %xmm3
912	movdqa	%xmm3, %xmm1
913	palignr	$11, %xmm2, %xmm3
914	palignr	$11, %xmm4, %xmm2
915	lea	32(%edi), %edi
916	movdqa	%xmm2, -32(%edx, %edi)
917	movdqa	%xmm3, -16(%edx, %edi)
918
919	jae	L(shl_11_loop)
920
921L(shl_11_end):
922	lea	32(%ecx), %ecx
923	add	%ecx, %edi
924	add	%edi, %edx
925	lea	11(%edi, %eax), %eax
926	POP (%edi)
927	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
928
929	cfi_restore_state
930	cfi_remember_state
931	ALIGN (4)
932L(shl_12):
933	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
934	lea	-12(%eax), %eax
935	movaps	(%eax), %xmm1
936	xor	%edi, %edi
937	lea	-32(%ecx), %ecx
938	movdqu	%xmm0, (%esi)
939	POP (%esi)
940L(shl_12_loop):
941
942	movdqa	16(%eax, %edi), %xmm2
943	sub	$32, %ecx
944	movdqa	32(%eax, %edi), %xmm3
945	movdqa	%xmm3, %xmm4
946	palignr	$12, %xmm2, %xmm3
947	palignr	$12, %xmm1, %xmm2
948	lea	32(%edi), %edi
949	movdqa	%xmm2, -32(%edx, %edi)
950	movdqa	%xmm3, -16(%edx, %edi)
951
952	jb	L(shl_12_end)
953
954	movdqa	16(%eax, %edi), %xmm2
955	sub	$32, %ecx
956	movdqa	32(%eax, %edi), %xmm3
957	movdqa	%xmm3, %xmm1
958	palignr	$12, %xmm2, %xmm3
959	palignr	$12, %xmm4, %xmm2
960	lea	32(%edi), %edi
961	movdqa	%xmm2, -32(%edx, %edi)
962	movdqa	%xmm3, -16(%edx, %edi)
963
964	jae	L(shl_12_loop)
965
966L(shl_12_end):
967	lea	32(%ecx), %ecx
968	add	%ecx, %edi
969	add	%edi, %edx
970	lea	12(%edi, %eax), %eax
971	POP (%edi)
972	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
973
974	cfi_restore_state
975	cfi_remember_state
976	ALIGN (4)
977L(shl_13):
978	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
979	lea	-13(%eax), %eax
980	movaps	(%eax), %xmm1
981	xor	%edi, %edi
982	lea	-32(%ecx), %ecx
983	movdqu	%xmm0, (%esi)
984	POP (%esi)
985L(shl_13_loop):
986
987	movdqa	16(%eax, %edi), %xmm2
988	sub	$32, %ecx
989	movdqa	32(%eax, %edi), %xmm3
990	movdqa	%xmm3, %xmm4
991	palignr	$13, %xmm2, %xmm3
992	palignr	$13, %xmm1, %xmm2
993	lea	32(%edi), %edi
994	movdqa	%xmm2, -32(%edx, %edi)
995	movdqa	%xmm3, -16(%edx, %edi)
996
997	jb	L(shl_13_end)
998
999	movdqa	16(%eax, %edi), %xmm2
1000	sub	$32, %ecx
1001	movdqa	32(%eax, %edi), %xmm3
1002	movdqa	%xmm3, %xmm1
1003	palignr	$13, %xmm2, %xmm3
1004	palignr	$13, %xmm4, %xmm2
1005	lea	32(%edi), %edi
1006	movdqa	%xmm2, -32(%edx, %edi)
1007	movdqa	%xmm3, -16(%edx, %edi)
1008
1009	jae	L(shl_13_loop)
1010
1011L(shl_13_end):
1012	lea	32(%ecx), %ecx
1013	add	%ecx, %edi
1014	add	%edi, %edx
1015	lea	13(%edi, %eax), %eax
1016	POP (%edi)
1017	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1018
1019	cfi_restore_state
1020	cfi_remember_state
1021	ALIGN (4)
1022L(shl_14):
1023	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
1024	lea	-14(%eax), %eax
1025	movaps	(%eax), %xmm1
1026	xor	%edi, %edi
1027	lea	-32(%ecx), %ecx
1028	movdqu	%xmm0, (%esi)
1029	POP (%esi)
1030L(shl_14_loop):
1031
1032	movdqa	16(%eax, %edi), %xmm2
1033	sub	$32, %ecx
1034	movdqa	32(%eax, %edi), %xmm3
1035	movdqa	%xmm3, %xmm4
1036	palignr	$14, %xmm2, %xmm3
1037	palignr	$14, %xmm1, %xmm2
1038	lea	32(%edi), %edi
1039	movdqa	%xmm2, -32(%edx, %edi)
1040	movdqa	%xmm3, -16(%edx, %edi)
1041
1042	jb	L(shl_14_end)
1043
1044	movdqa	16(%eax, %edi), %xmm2
1045	sub	$32, %ecx
1046	movdqa	32(%eax, %edi), %xmm3
1047	movdqa	%xmm3, %xmm1
1048	palignr	$14, %xmm2, %xmm3
1049	palignr	$14, %xmm4, %xmm2
1050	lea	32(%edi), %edi
1051	movdqa	%xmm2, -32(%edx, %edi)
1052	movdqa	%xmm3, -16(%edx, %edi)
1053
1054	jae	L(shl_14_loop)
1055
1056L(shl_14_end):
1057	lea	32(%ecx), %ecx
1058	add	%ecx, %edi
1059	add	%edi, %edx
1060	lea	14(%edi, %eax), %eax
1061	POP (%edi)
1062	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1063
1064	cfi_restore_state
1065	cfi_remember_state
1066	ALIGN (4)
1067L(shl_15):
1068	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
1069	lea	-15(%eax), %eax
1070	movaps	(%eax), %xmm1
1071	xor	%edi, %edi
1072	lea	-32(%ecx), %ecx
1073	movdqu	%xmm0, (%esi)
1074	POP (%esi)
1075L(shl_15_loop):
1076
1077	movdqa	16(%eax, %edi), %xmm2
1078	sub	$32, %ecx
1079	movdqa	32(%eax, %edi), %xmm3
1080	movdqa	%xmm3, %xmm4
1081	palignr	$15, %xmm2, %xmm3
1082	palignr	$15, %xmm1, %xmm2
1083	lea	32(%edi), %edi
1084	movdqa	%xmm2, -32(%edx, %edi)
1085	movdqa	%xmm3, -16(%edx, %edi)
1086
1087	jb	L(shl_15_end)
1088
1089	movdqa	16(%eax, %edi), %xmm2
1090	sub	$32, %ecx
1091	movdqa	32(%eax, %edi), %xmm3
1092	movdqa	%xmm3, %xmm1
1093	palignr	$15, %xmm2, %xmm3
1094	palignr	$15, %xmm4, %xmm2
1095	lea	32(%edi), %edi
1096	movdqa	%xmm2, -32(%edx, %edi)
1097	movdqa	%xmm3, -16(%edx, %edi)
1098
1099	jae	L(shl_15_loop)
1100
1101L(shl_15_end):
1102	lea	32(%ecx), %ecx
1103	add	%ecx, %edi
1104	add	%edi, %edx
1105	lea	15(%edi, %eax), %eax
1106	POP (%edi)
1107	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1108
1109
1110	ALIGN (4)
1111L(fwd_write_44bytes):
1112	movl	-44(%eax), %ecx
1113	movl	%ecx, -44(%edx)
1114L(fwd_write_40bytes):
1115	movl	-40(%eax), %ecx
1116	movl	%ecx, -40(%edx)
1117L(fwd_write_36bytes):
1118	movl	-36(%eax), %ecx
1119	movl	%ecx, -36(%edx)
1120L(fwd_write_32bytes):
1121	movl	-32(%eax), %ecx
1122	movl	%ecx, -32(%edx)
1123L(fwd_write_28bytes):
1124	movl	-28(%eax), %ecx
1125	movl	%ecx, -28(%edx)
1126L(fwd_write_24bytes):
1127	movl	-24(%eax), %ecx
1128	movl	%ecx, -24(%edx)
1129L(fwd_write_20bytes):
1130	movl	-20(%eax), %ecx
1131	movl	%ecx, -20(%edx)
1132L(fwd_write_16bytes):
1133	movl	-16(%eax), %ecx
1134	movl	%ecx, -16(%edx)
1135L(fwd_write_12bytes):
1136	movl	-12(%eax), %ecx
1137	movl	%ecx, -12(%edx)
1138L(fwd_write_8bytes):
1139	movl	-8(%eax), %ecx
1140	movl	%ecx, -8(%edx)
1141L(fwd_write_4bytes):
1142	movl	-4(%eax), %ecx
1143	movl	%ecx, -4(%edx)
1144L(fwd_write_0bytes):
1145#ifndef USE_AS_BCOPY
1146# ifdef USE_AS_MEMPCPY
1147	movl	%edx, %eax
1148# else
1149	movl	DEST(%esp), %eax
1150# endif
1151#endif
1152	RETURN
1153
1154	ALIGN (4)
1155L(fwd_write_5bytes):
1156	movl	-5(%eax), %ecx
1157	movl	-4(%eax), %eax
1158	movl	%ecx, -5(%edx)
1159	movl	%eax, -4(%edx)
1160#ifndef USE_AS_BCOPY
1161# ifdef USE_AS_MEMPCPY
1162	movl	%edx, %eax
1163# else
1164	movl	DEST(%esp), %eax
1165# endif
1166#endif
1167	RETURN
1168
1169	ALIGN (4)
1170L(fwd_write_45bytes):
1171	movl	-45(%eax), %ecx
1172	movl	%ecx, -45(%edx)
1173L(fwd_write_41bytes):
1174	movl	-41(%eax), %ecx
1175	movl	%ecx, -41(%edx)
1176L(fwd_write_37bytes):
1177	movl	-37(%eax), %ecx
1178	movl	%ecx, -37(%edx)
1179L(fwd_write_33bytes):
1180	movl	-33(%eax), %ecx
1181	movl	%ecx, -33(%edx)
1182L(fwd_write_29bytes):
1183	movl	-29(%eax), %ecx
1184	movl	%ecx, -29(%edx)
1185L(fwd_write_25bytes):
1186	movl	-25(%eax), %ecx
1187	movl	%ecx, -25(%edx)
1188L(fwd_write_21bytes):
1189	movl	-21(%eax), %ecx
1190	movl	%ecx, -21(%edx)
1191L(fwd_write_17bytes):
1192	movl	-17(%eax), %ecx
1193	movl	%ecx, -17(%edx)
1194L(fwd_write_13bytes):
1195	movl	-13(%eax), %ecx
1196	movl	%ecx, -13(%edx)
1197L(fwd_write_9bytes):
1198	movl	-9(%eax), %ecx
1199	movl	%ecx, -9(%edx)
1200	movl	-5(%eax), %ecx
1201	movl	%ecx, -5(%edx)
1202L(fwd_write_1bytes):
1203	movzbl	-1(%eax), %ecx
1204	movb	%cl, -1(%edx)
1205#ifndef USE_AS_BCOPY
1206# ifdef USE_AS_MEMPCPY
1207	movl	%edx, %eax
1208# else
1209	movl	DEST(%esp), %eax
1210# endif
1211#endif
1212	RETURN
1213
1214	ALIGN (4)
1215L(fwd_write_46bytes):
1216	movl	-46(%eax), %ecx
1217	movl	%ecx, -46(%edx)
1218L(fwd_write_42bytes):
1219	movl	-42(%eax), %ecx
1220	movl	%ecx, -42(%edx)
1221L(fwd_write_38bytes):
1222	movl	-38(%eax), %ecx
1223	movl	%ecx, -38(%edx)
1224L(fwd_write_34bytes):
1225	movl	-34(%eax), %ecx
1226	movl	%ecx, -34(%edx)
1227L(fwd_write_30bytes):
1228	movl	-30(%eax), %ecx
1229	movl	%ecx, -30(%edx)
1230L(fwd_write_26bytes):
1231	movl	-26(%eax), %ecx
1232	movl	%ecx, -26(%edx)
1233L(fwd_write_22bytes):
1234	movl	-22(%eax), %ecx
1235	movl	%ecx, -22(%edx)
1236L(fwd_write_18bytes):
1237	movl	-18(%eax), %ecx
1238	movl	%ecx, -18(%edx)
1239L(fwd_write_14bytes):
1240	movl	-14(%eax), %ecx
1241	movl	%ecx, -14(%edx)
1242L(fwd_write_10bytes):
1243	movl	-10(%eax), %ecx
1244	movl	%ecx, -10(%edx)
1245L(fwd_write_6bytes):
1246	movl	-6(%eax), %ecx
1247	movl	%ecx, -6(%edx)
1248L(fwd_write_2bytes):
1249	movzwl	-2(%eax), %ecx
1250	movw	%cx, -2(%edx)
1251#ifndef USE_AS_BCOPY
1252# ifdef USE_AS_MEMPCPY
1253	movl	%edx, %eax
1254# else
1255	movl	DEST(%esp), %eax
1256# endif
1257#endif
1258	RETURN
1259
1260	ALIGN (4)
1261L(fwd_write_47bytes):
1262	movl	-47(%eax), %ecx
1263	movl	%ecx, -47(%edx)
1264L(fwd_write_43bytes):
1265	movl	-43(%eax), %ecx
1266	movl	%ecx, -43(%edx)
1267L(fwd_write_39bytes):
1268	movl	-39(%eax), %ecx
1269	movl	%ecx, -39(%edx)
1270L(fwd_write_35bytes):
1271	movl	-35(%eax), %ecx
1272	movl	%ecx, -35(%edx)
1273L(fwd_write_31bytes):
1274	movl	-31(%eax), %ecx
1275	movl	%ecx, -31(%edx)
1276L(fwd_write_27bytes):
1277	movl	-27(%eax), %ecx
1278	movl	%ecx, -27(%edx)
1279L(fwd_write_23bytes):
1280	movl	-23(%eax), %ecx
1281	movl	%ecx, -23(%edx)
1282L(fwd_write_19bytes):
1283	movl	-19(%eax), %ecx
1284	movl	%ecx, -19(%edx)
1285L(fwd_write_15bytes):
1286	movl	-15(%eax), %ecx
1287	movl	%ecx, -15(%edx)
1288L(fwd_write_11bytes):
1289	movl	-11(%eax), %ecx
1290	movl	%ecx, -11(%edx)
1291L(fwd_write_7bytes):
1292	movl	-7(%eax), %ecx
1293	movl	%ecx, -7(%edx)
1294L(fwd_write_3bytes):
1295	movzwl	-3(%eax), %ecx
1296	movzbl	-1(%eax), %eax
1297	movw	%cx, -3(%edx)
1298	movb	%al, -1(%edx)
1299#ifndef USE_AS_BCOPY
1300# ifdef USE_AS_MEMPCPY
1301	movl	%edx, %eax
1302# else
1303	movl	DEST(%esp), %eax
1304# endif
1305#endif
1306	RETURN_END
1307
/* large_page: path taken from 48bytesormore when the remaining count
   is at or above the shared-cache threshold.  The source is read with
   unaligned loads (movdqu) and the 16-byte aligned destination is
   written with non-temporal stores (movntdq), 128 bytes per loop
   iteration.  An sfence is issued before the final tail dispatch.  */
1308	cfi_restore_state
1309	cfi_remember_state
1310	ALIGN (4)
1311L(large_page):
	/* Copy one 16-byte block first and store the pending head (XMM0) to
	   the original destination in ESI.  */
1312	movdqu	(%eax), %xmm1
1313	lea	16(%eax), %eax
1314	movdqu	%xmm0, (%esi)
1315	movntdq	%xmm1, (%edx)
1316	lea	16(%edx), %edx
1317	POP (%esi)
	/* 0x90 = the 0x10 bytes just copied plus a bias of 0x80, so that the
	   borrow from "sub $0x80" in the loop signals that fewer than 128
	   bytes remain.  NOTE(review): assumes the threshold that led here
	   is at least 0x90 -- confirm.  */
1318	lea	-0x90(%ecx), %ecx
1319	POP (%edi)
1320L(large_page_loop):
1321	movdqu	(%eax), %xmm0
1322	movdqu	0x10(%eax), %xmm1
1323	movdqu	0x20(%eax), %xmm2
1324	movdqu	0x30(%eax), %xmm3
1325	movdqu	0x40(%eax), %xmm4
1326	movdqu	0x50(%eax), %xmm5
1327	movdqu	0x60(%eax), %xmm6
1328	movdqu	0x70(%eax), %xmm7
1329	lea	0x80(%eax), %eax
1330
1331	sub	$0x80, %ecx
1332	movntdq	%xmm0, (%edx)
1333	movntdq	%xmm1, 0x10(%edx)
1334	movntdq	%xmm2, 0x20(%edx)
1335	movntdq	%xmm3, 0x30(%edx)
1336	movntdq	%xmm4, 0x40(%edx)
1337	movntdq	%xmm5, 0x50(%edx)
1338	movntdq	%xmm6, 0x60(%edx)
1339	movntdq	%xmm7, 0x70(%edx)
	/* The stores and the lea leave the flags from the sub intact.  */
1340	lea	0x80(%edx), %edx
1341	jae	L(large_page_loop)
	/* ECX is now negative (biased).  The signed compare with -0x40 is
	   made before the bias is removed; lea keeps its flags for the jl,
	   which is taken when fewer than 64 bytes remain.  */
1342	cmp	$-0x40, %ecx
1343	lea	0x80(%ecx), %ecx
1344	jl	L(large_page_less_64bytes)
1345
1346	movdqu	(%eax), %xmm0
1347	movdqu	0x10(%eax), %xmm1
1348	movdqu	0x20(%eax), %xmm2
1349	movdqu	0x30(%eax), %xmm3
1350	lea	0x40(%eax), %eax
1351
1352	movntdq	%xmm0, (%edx)
1353	movntdq	%xmm1, 0x10(%edx)
1354	movntdq	%xmm2, 0x20(%edx)
1355	movntdq	%xmm3, 0x30(%edx)
1356	lea	0x40(%edx), %edx
1357	sub	$0x40, %ecx
1358L(large_page_less_64bytes):
1359	cmp	$32, %ecx
1360	jb	L(large_page_less_32bytes)
1361	movdqu	(%eax), %xmm0
1362	movdqu	0x10(%eax), %xmm1
1363	lea	0x20(%eax), %eax
1364	movntdq	%xmm0, (%edx)
1365	movntdq	%xmm1, 0x10(%edx)
1366	lea	0x20(%edx), %edx
1367	sub	$0x20, %ecx
1368L(large_page_less_32bytes):
	/* Fewer than 32 bytes left: move both pointers to the end of the
	   buffers, fence the non-temporal stores, and let the forward table
	   copy the tail.  */
1369	add	%ecx, %edx
1370	add	%ecx, %eax
1371	sfence
1372	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
1373
1374
/* Backward tail copy for residual lengths 44, 40, ..., 4, 0 (N % 4 == 0).
   All accesses use non-negative displacements, so %eax (source) and %edx
   (destination) point at the first byte of the remaining region.  Each
   label copies the highest outstanding dword through %ecx and falls
   through to the next lower label, so the bytes are written from high
   addresses to low ones.  The labels are the targets of the matching
   entries of L(table_48_bytes_bwd).  */
1375	ALIGN (4)
1376L(bk_write_44bytes):
1377	movl	40(%eax), %ecx
1378	movl	%ecx, 40(%edx)
1379L(bk_write_40bytes):
1380	movl	36(%eax), %ecx
1381	movl	%ecx, 36(%edx)
1382L(bk_write_36bytes):
1383	movl	32(%eax), %ecx
1384	movl	%ecx, 32(%edx)
1385L(bk_write_32bytes):
1386	movl	28(%eax), %ecx
1387	movl	%ecx, 28(%edx)
1388L(bk_write_28bytes):
1389	movl	24(%eax), %ecx
1390	movl	%ecx, 24(%edx)
1391L(bk_write_24bytes):
1392	movl	20(%eax), %ecx
1393	movl	%ecx, 20(%edx)
1394L(bk_write_20bytes):
1395	movl	16(%eax), %ecx
1396	movl	%ecx, 16(%edx)
1397L(bk_write_16bytes):
1398	movl	12(%eax), %ecx
1399	movl	%ecx, 12(%edx)
1400L(bk_write_12bytes):
1401	movl	8(%eax), %ecx
1402	movl	%ecx, 8(%edx)
1403L(bk_write_8bytes):
1404	movl	4(%eax), %ecx
1405	movl	%ecx, 4(%edx)
1406L(bk_write_4bytes):
1407	movl	(%eax), %ecx
1408	movl	%ecx, (%edx)
	/* Nothing left to copy.  Unless built as bcopy, load the return
	   value from DEST(%esp); the mempcpy variant additionally adds
	   LEN(%esp) to it (DEST and LEN are defined outside this excerpt,
	   presumably the caller's destination and length arguments).  */
1409L(bk_write_0bytes):
1410#ifndef USE_AS_BCOPY
1411	movl	DEST(%esp), %eax
1412# ifdef USE_AS_MEMPCPY
1413	movl	LEN(%esp), %ecx
1414	add	%ecx, %eax
1415# endif
1416#endif
1417	RETURN
1418
/* Backward tail copy for residual lengths 45, 41, ..., 5, 1 (N % 4 == 1).
   %eax (source) and %edx (destination) point at the first byte of the
   remaining region.  Each label copies the highest outstanding dword
   through %ecx and falls through to the next lower label; the final
   label copies the single byte at offset 0.  The labels are the targets
   of the matching entries of L(table_48_bytes_bwd).  */
1419	ALIGN (4)
1420L(bk_write_45bytes):
1421	movl	41(%eax), %ecx
1422	movl	%ecx, 41(%edx)
1423L(bk_write_41bytes):
1424	movl	37(%eax), %ecx
1425	movl	%ecx, 37(%edx)
1426L(bk_write_37bytes):
1427	movl	33(%eax), %ecx
1428	movl	%ecx, 33(%edx)
1429L(bk_write_33bytes):
1430	movl	29(%eax), %ecx
1431	movl	%ecx, 29(%edx)
1432L(bk_write_29bytes):
1433	movl	25(%eax), %ecx
1434	movl	%ecx, 25(%edx)
1435L(bk_write_25bytes):
1436	movl	21(%eax), %ecx
1437	movl	%ecx, 21(%edx)
1438L(bk_write_21bytes):
1439	movl	17(%eax), %ecx
1440	movl	%ecx, 17(%edx)
1441L(bk_write_17bytes):
1442	movl	13(%eax), %ecx
1443	movl	%ecx, 13(%edx)
1444L(bk_write_13bytes):
1445	movl	9(%eax), %ecx
1446	movl	%ecx, 9(%edx)
1447L(bk_write_9bytes):
1448	movl	5(%eax), %ecx
1449	movl	%ecx, 5(%edx)
1450L(bk_write_5bytes):
1451	movl	1(%eax), %ecx
1452	movl	%ecx, 1(%edx)
	/* Final byte at offset 0.  */
1453L(bk_write_1bytes):
1454	movzbl	(%eax), %ecx
1455	movb	%cl, (%edx)
	/* Unless built as bcopy, load the return value from DEST(%esp); the
	   mempcpy variant additionally adds LEN(%esp) to it (DEST and LEN
	   are defined outside this excerpt).  */
1456#ifndef USE_AS_BCOPY
1457	movl	DEST(%esp), %eax
1458# ifdef USE_AS_MEMPCPY
1459	movl	LEN(%esp), %ecx
1460	add	%ecx, %eax
1461# endif
1462#endif
1463	RETURN
1464
/* Backward tail copy for residual lengths 46, 42, ..., 6, 2 (N % 4 == 2).
   %eax (source) and %edx (destination) point at the first byte of the
   remaining region.  Each label copies the highest outstanding dword
   through %ecx and falls through to the next lower label; the final
   label copies the 16-bit word at offset 0.  The labels are the targets
   of the matching entries of L(table_48_bytes_bwd).  */
1465	ALIGN (4)
1466L(bk_write_46bytes):
1467	movl	42(%eax), %ecx
1468	movl	%ecx, 42(%edx)
1469L(bk_write_42bytes):
1470	movl	38(%eax), %ecx
1471	movl	%ecx, 38(%edx)
1472L(bk_write_38bytes):
1473	movl	34(%eax), %ecx
1474	movl	%ecx, 34(%edx)
1475L(bk_write_34bytes):
1476	movl	30(%eax), %ecx
1477	movl	%ecx, 30(%edx)
1478L(bk_write_30bytes):
1479	movl	26(%eax), %ecx
1480	movl	%ecx, 26(%edx)
1481L(bk_write_26bytes):
1482	movl	22(%eax), %ecx
1483	movl	%ecx, 22(%edx)
1484L(bk_write_22bytes):
1485	movl	18(%eax), %ecx
1486	movl	%ecx, 18(%edx)
1487L(bk_write_18bytes):
1488	movl	14(%eax), %ecx
1489	movl	%ecx, 14(%edx)
1490L(bk_write_14bytes):
1491	movl	10(%eax), %ecx
1492	movl	%ecx, 10(%edx)
1493L(bk_write_10bytes):
1494	movl	6(%eax), %ecx
1495	movl	%ecx, 6(%edx)
1496L(bk_write_6bytes):
1497	movl	2(%eax), %ecx
1498	movl	%ecx, 2(%edx)
	/* Final 16-bit word at offset 0.  */
1499L(bk_write_2bytes):
1500	movzwl	(%eax), %ecx
1501	movw	%cx, (%edx)
	/* Unless built as bcopy, load the return value from DEST(%esp); the
	   mempcpy variant additionally adds LEN(%esp) to it (DEST and LEN
	   are defined outside this excerpt).  */
1502#ifndef USE_AS_BCOPY
1503	movl	DEST(%esp), %eax
1504# ifdef USE_AS_MEMPCPY
1505	movl	LEN(%esp), %ecx
1506	add	%ecx, %eax
1507# endif
1508#endif
1509	RETURN
1510
/* Backward tail copy for residual lengths 47, 43, ..., 7, 3 (N % 4 == 3).
   %eax (source) and %edx (destination) point at the first byte of the
   remaining region.  Each label copies the highest outstanding dword
   through %ecx and falls through to the next lower label; the final
   label copies the word at offset 1 and then the byte at offset 0.  The
   labels are the targets of the matching entries of
   L(table_48_bytes_bwd).  */
1511	ALIGN (4)
1512L(bk_write_47bytes):
1513	movl	43(%eax), %ecx
1514	movl	%ecx, 43(%edx)
1515L(bk_write_43bytes):
1516	movl	39(%eax), %ecx
1517	movl	%ecx, 39(%edx)
1518L(bk_write_39bytes):
1519	movl	35(%eax), %ecx
1520	movl	%ecx, 35(%edx)
1521L(bk_write_35bytes):
1522	movl	31(%eax), %ecx
1523	movl	%ecx, 31(%edx)
1524L(bk_write_31bytes):
1525	movl	27(%eax), %ecx
1526	movl	%ecx, 27(%edx)
1527L(bk_write_27bytes):
1528	movl	23(%eax), %ecx
1529	movl	%ecx, 23(%edx)
1530L(bk_write_23bytes):
1531	movl	19(%eax), %ecx
1532	movl	%ecx, 19(%edx)
1533L(bk_write_19bytes):
1534	movl	15(%eax), %ecx
1535	movl	%ecx, 15(%edx)
1536L(bk_write_15bytes):
1537	movl	11(%eax), %ecx
1538	movl	%ecx, 11(%edx)
1539L(bk_write_11bytes):
1540	movl	7(%eax), %ecx
1541	movl	%ecx, 7(%edx)
1542L(bk_write_7bytes):
1543	movl	3(%eax), %ecx
1544	movl	%ecx, 3(%edx)
	/* Last three bytes, high part first: word at offset 1, then the
	   byte at offset 0.  The byte load overwrites %eax; the source
	   pointer is not used again on this path.  */
1545L(bk_write_3bytes):
1546	movzwl	1(%eax), %ecx
1547	movw	%cx, 1(%edx)
1548	movzbl	(%eax), %eax
1549	movb	%al, (%edx)
	/* Unless built as bcopy, load the return value from DEST(%esp); the
	   mempcpy variant additionally adds LEN(%esp) to it (DEST and LEN
	   are defined outside this excerpt).  */
1550#ifndef USE_AS_BCOPY
1551	movl	DEST(%esp), %eax
1552# ifdef USE_AS_MEMPCPY
1553	movl	LEN(%esp), %ecx
1554	add	%ecx, %eax
1555# endif
1556#endif
	/* NOTE(review): RETURN_END is a macro defined outside this excerpt;
	   it is used here instead of RETURN for the last return before the
	   section switch below -- confirm against its definition.  */
1557	RETURN_END
1558
1559
/* Jump tables live in their own allocatable data section
   (.rodata.ssse3, flags "a", @progbits); the matching .popsection
   follows the last table.

   L(table_48bytes_fwd): forward tail dispatch table.  Entry N (N = 0..47)
   refers to L(fwd_write_Nbytes) and is expressed through JMPTBL relative
   to the table label (JMPTBL is defined outside this excerpt; presumably
   it emits either an absolute address or a table-relative offset
   depending on PIC -- confirm).  Entries are 4 bytes (.int), which
   matches the scale of 4 passed to BRANCH_TO_JMPTBL_ENTRY by the users
   of this table.  ALIGN (2) aligns the table to 4 bytes when ALIGN keeps
   its default ".p2align n" definition.  */
1560	.pushsection .rodata.ssse3,"a",@progbits
1561	ALIGN (2)
1562L(table_48bytes_fwd):
1563	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
1564	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
1565	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
1566	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
1567	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
1568	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
1569	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
1570	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
1571	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
1572	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
1573	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
1574	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
1575	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
1576	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
1577	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
1578	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
1579	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
1580	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
1581	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
1582	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
1583	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
1584	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
1585	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
1586	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
1587	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
1588	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
1589	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
1590	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
1591	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
1592	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
1593	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
1594	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
1595	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
1596	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
1597	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
1598	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
1599	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
1600	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
1601	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
1602	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
1603	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
1604	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
1605	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
1606	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
1607	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
1608	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
1609	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
1610	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
1611
/* L(shl_table): 16-entry dispatch table; entry K (K = 0..15) refers to
   L(shl_K), expressed through JMPTBL relative to the table label.  The
   L(shl_K) routines and the code that indexes this table are outside
   this excerpt (presumably K is the source misalignment within a
   16-byte block -- TODO confirm against the dispatch site).  Entries are
   4 bytes (.int); ALIGN (2) aligns the table to 4 bytes when ALIGN keeps
   its default ".p2align n" definition.  */
1612	ALIGN (2)
1613L(shl_table):
1614	.int	JMPTBL (L(shl_0), L(shl_table))
1615	.int	JMPTBL (L(shl_1), L(shl_table))
1616	.int	JMPTBL (L(shl_2), L(shl_table))
1617	.int	JMPTBL (L(shl_3), L(shl_table))
1618	.int	JMPTBL (L(shl_4), L(shl_table))
1619	.int	JMPTBL (L(shl_5), L(shl_table))
1620	.int	JMPTBL (L(shl_6), L(shl_table))
1621	.int	JMPTBL (L(shl_7), L(shl_table))
1622	.int	JMPTBL (L(shl_8), L(shl_table))
1623	.int	JMPTBL (L(shl_9), L(shl_table))
1624	.int	JMPTBL (L(shl_10), L(shl_table))
1625	.int	JMPTBL (L(shl_11), L(shl_table))
1626	.int	JMPTBL (L(shl_12), L(shl_table))
1627	.int	JMPTBL (L(shl_13), L(shl_table))
1628	.int	JMPTBL (L(shl_14), L(shl_table))
1629	.int	JMPTBL (L(shl_15), L(shl_table))
1630
/* L(table_48_bytes_bwd): backward tail dispatch table.  Entry N
   (N = 0..47) refers to L(bk_write_Nbytes) and is expressed through
   JMPTBL relative to the table label (JMPTBL is defined outside this
   excerpt).  Entries are 4 bytes (.int), matching the scale of 4 used
   when the table is indexed with BRANCH_TO_JMPTBL_ENTRY.  ALIGN (2)
   aligns the table to 4 bytes when ALIGN keeps its default
   ".p2align n" definition.  The trailing .popsection closes the
   .rodata.ssse3 section opened before the first table.  */
1631	ALIGN (2)
1632L(table_48_bytes_bwd):
1633	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
1634	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
1635	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
1636	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
1637	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
1638	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
1639	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
1640	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
1641	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
1642	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
1643	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
1644	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
1645	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
1646	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
1647	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
1648	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
1649	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
1650	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
1651	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
1652	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
1653	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
1654	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
1655	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
1656	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
1657	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
1658	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
1659	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
1660	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
1661	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
1662	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
1663	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
1664	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
1665	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
1666	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
1667	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
1668	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
1669	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
1670	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
1671	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
1672	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
1673	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
1674	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
1675	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
1676	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
1677	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
1678	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
1679	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
1680	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
1681
1682	.popsection
1683
1684#ifdef USE_AS_MEMMOVE
/* L(copy_backward): memmove path that copies from the end of the buffers
   towards the start.

   Register roles as used below: on entry %eax = source, %edx =
   destination, %ecx = byte count.  %esi is saved and takes over as the
   source cursor; %eax becomes scratch.  Both cursors are first advanced
   by the count so that they point just past the last byte, and every
   copy step below uses negative displacements or pre-decrements.  */
1685	ALIGN (4)
1686L(copy_backward):
1687	PUSH (%esi)
1688	movl	%eax, %esi
1689	lea	(%ecx,%edx,1),%edx
1690	lea	(%ecx,%esi,1),%esi
	/* If the destination end is not 4-byte aligned, align it first.  */
1691	testl	$0x3, %edx
1692	jnz	L(bk_align)
1693
	/* Destination end is 4-byte aligned (or too few bytes remain to
	   bother): pick the copy strategy by remaining size.  */
1694L(bk_aligned_4):
1695	cmp	$64, %ecx
1696	jae	L(bk_write_more64bytes)
1697
1698L(bk_write_64bytesless):
1699	cmp	$32, %ecx
1700	jb	L(bk_write_less32bytes)
1701
1702L(bk_write_more32bytes):
1703	/* Copy 32 bytes at a time.  */
1704	sub	$32, %ecx
1705	movl	-4(%esi), %eax
1706	movl	%eax, -4(%edx)
1707	movl	-8(%esi), %eax
1708	movl	%eax, -8(%edx)
1709	movl	-12(%esi), %eax
1710	movl	%eax, -12(%edx)
1711	movl	-16(%esi), %eax
1712	movl	%eax, -16(%edx)
1713	movl	-20(%esi), %eax
1714	movl	%eax, -20(%edx)
1715	movl	-24(%esi), %eax
1716	movl	%eax, -24(%edx)
1717	movl	-28(%esi), %eax
1718	movl	%eax, -28(%edx)
1719	movl	-32(%esi), %eax
1720	movl	%eax, -32(%edx)
1721	sub	$32, %edx
1722	sub	$32, %esi
1723
	/* Tail: convert the end cursors into start-of-remainder pointers
	   (%eax = source, %edx = destination), which is what the bk_write_*
	   handlers expect, restore %esi, and dispatch on the residual
	   length in %ecx through L(table_48_bytes_bwd)
	   (BRANCH_TO_JMPTBL_ENTRY is defined outside this excerpt).  */
1724L(bk_write_less32bytes):
1725	movl	%esi, %eax
1726	sub	%ecx, %edx
1727	sub	%ecx, %eax
1728	POP (%esi)
1729L(bk_write_less32bytes_2):
1730	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
1731
/* L(bk_align): reached from L(copy_backward) when the destination end
   (%edx) is not 4-byte aligned.  %esi = source end, %edx = destination
   end, %ecx = remaining bytes, %eax = scratch; %esi is still saved on
   the stack.

   CFI_PUSH (%esi) precedes the label because the code above it ended
   with POP (%esi) on a different path (macro defined outside this
   excerpt; presumably it only emits unwind annotations recording that
   %esi is pushed here -- confirm).  */
1732	CFI_PUSH (%esi)
1733	ALIGN (4)
1734L(bk_align):
	/* With 8 bytes or fewer left, skip alignment and go straight to the
	   jump-table tail.  */
1735	cmp	$8, %ecx
1736	jbe	L(bk_write_less32bytes)
1737	testl	$1, %edx
1738	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
1739	   then (EDX & 2) must be != 0.  */
1740	jz	L(bk_got2)
	/* Odd destination end: copy one byte to make it 2-byte aligned.  */
1741	sub	$1, %esi
1742	sub	$1, %ecx
1743	sub	$1, %edx
1744	movzbl	(%esi), %eax
1745	movb	%al, (%edx)
1746
	/* If bit 1 is clear the destination end is now 4-byte aligned.  */
1747	testl	$2, %edx
1748	jz	L(bk_aligned_4)
1749
	/* Copy one 16-bit word to reach 4-byte alignment.  */
1750L(bk_got2):
1751	sub	$2, %esi
1752	sub	$2, %ecx
1753	sub	$2, %edx
1754	movzwl	(%esi), %eax
1755	movw	%ax, (%edx)
1756	jmp	L(bk_aligned_4)
1757
/* L(bk_write_more64bytes): backward copy of 64 or more bytes.  Reached
   from L(bk_aligned_4), so the destination end (%edx) is 4-byte aligned.
   %esi = source end, %edx = destination end, %ecx = remaining bytes,
   %eax = scratch.

   First the destination end is brought to 16-byte alignment by copying
   up to three dwords, then 64-byte blocks are copied with unaligned
   loads (movdqu) and aligned stores (movdqa), highest block first.  */
1758	ALIGN (4)
1759L(bk_write_more64bytes):
1760	/* Check alignment of last byte.  */
1761	testl	$15, %edx
1762	jz	L(bk_ssse3_cpy_pre)
1763
1764/* EDX is aligned 4 bytes, but not 16 bytes.  */
1765L(bk_ssse3_align):
1766	sub	$4, %esi
1767	sub	$4, %ecx
1768	sub	$4, %edx
1769	movl	(%esi), %eax
1770	movl	%eax, (%edx)
1771
1772	testl	$15, %edx
1773	jz	L(bk_ssse3_cpy_pre)
1774
1775	sub	$4, %esi
1776	sub	$4, %ecx
1777	sub	$4, %edx
1778	movl	(%esi), %eax
1779	movl	%eax, (%edx)
1780
1781	testl	$15, %edx
1782	jz	L(bk_ssse3_cpy_pre)
1783
	/* Third dword: a 4-byte aligned address is at most 12 bytes above a
	   16-byte boundary, so no further test is needed after this step.  */
1784	sub	$4, %esi
1785	sub	$4, %ecx
1786	sub	$4, %edx
1787	movl	(%esi), %eax
1788	movl	%eax, (%edx)
1789
	/* The alignment steps consumed up to 12 bytes, so the count may have
	   dropped below 64; in that case use the 32-byte dword copy.  */
1790L(bk_ssse3_cpy_pre):
1791	cmp	$64, %ecx
1792	jb	L(bk_write_more32bytes)
1793
	/* 64 bytes per iteration.  %edx stays 16-byte aligned because it is
	   only ever decremented by 64, which is what movdqa requires; the
	   source may be unaligned, hence movdqu for the loads.  */
1794L(bk_ssse3_cpy):
1795	sub	$64, %esi
1796	sub	$64, %ecx
1797	sub	$64, %edx
1798	movdqu	0x30(%esi), %xmm3
1799	movdqa	%xmm3, 0x30(%edx)
1800	movdqu	0x20(%esi), %xmm2
1801	movdqa	%xmm2, 0x20(%edx)
1802	movdqu	0x10(%esi), %xmm1
1803	movdqa	%xmm1, 0x10(%edx)
1804	movdqu	(%esi), %xmm0
1805	movdqa	%xmm0, (%edx)
1806	cmp	$64, %ecx
1807	jae	L(bk_ssse3_cpy)
	/* Fewer than 64 bytes left: finish with the 32-byte / jump-table
	   tail code.  */
1808	jmp	L(bk_write_64bytesless)
1809
1810#endif
1811
1812END (MEMCPY)
1813