• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2Copyright (c) 2010, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8    * Redistributions of source code must retain the above copyright notice,
9    * this list of conditions and the following disclaimer.
10
11    * Redistributions in binary form must reproduce the above copyright notice,
12    * this list of conditions and the following disclaimer in the documentation
13    * and/or other materials provided with the distribution.
14
15    * Neither the name of Intel Corporation nor the names of its contributors
16    * may be used to endorse or promote products derived from this software
17    * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
/* L(label): form a local label name by pasting LABEL onto the ".L"
   prefix.  */
31#ifndef L
32# define L(label)	.L##label
33#endif
34
/* ALIGN(n): emit ".p2align n".  */
35#ifndef ALIGN
36# define ALIGN(n)	.p2align n
37#endif
38
/* Thin wrappers over the assembler's .cfi_* directives.  Each one is
   defined here only if the including environment has not already
   provided it.  */
39#ifndef cfi_startproc
40# define cfi_startproc			.cfi_startproc
41#endif
42
43#ifndef cfi_endproc
44# define cfi_endproc			.cfi_endproc
45#endif
46
47#ifndef cfi_rel_offset
48# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
49#endif
50
51#ifndef cfi_restore
52# define cfi_restore(reg)		.cfi_restore reg
53#endif
54
55#ifndef cfi_adjust_cfa_offset
56# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
57#endif
58
/* ENTRY(name): declare NAME as a global symbol of type @function, align
   it with ".p2align 4", define the label and open a CFI region.  */
59#ifndef ENTRY
60# define ENTRY(name)			\
61	.type name,  @function; 	\
62	.globl name;			\
63	.p2align 4;			\
64name:					\
65	cfi_startproc
66#endif
67
/* END(name): close the CFI region and set the symbol size to the
   distance from NAME to the current location.  */
68#ifndef END
69# define END(name)			\
70	cfi_endproc;			\
71	.size name, .-name
72#endif
73
/* CFI_PUSH(REG): unwind bookkeeping for a 4-byte push of REG -- the CFA
   offset grows by 4 and REG is recorded as saved at offset 0.  */
74#define CFI_PUSH(REG)						\
75  cfi_adjust_cfa_offset (4);					\
76  cfi_rel_offset (REG, 0)
77
/* CFI_POP(REG): the inverse of CFI_PUSH -- the CFA offset shrinks by 4
   and REG is marked as restored.  */
78#define CFI_POP(REG)						\
79  cfi_adjust_cfa_offset (-4);					\
80  cfi_restore (REG)
81
/* PUSH/POP: pushl/popl of REG followed by the matching CFI update.  */
82#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
83#define POP(REG)	popl REG; CFI_POP (REG)
84
/* Offsets of the arguments, used below as DEST(%esp), CHR(%esp) and
   LEN(%esp).  They are laid out as consecutive 4-byte slots starting
   at PARMS.
   - USE_AS_BZERO: there is no CHR slot (LEN directly follows DEST) and
     SETRTNVAL expands to nothing.
   - otherwise: the order is DEST, CHR, LEN and SETRTNVAL reloads the
     DEST argument into %eax.  */
85#ifdef USE_AS_BZERO
86# define DEST		PARMS
87# define LEN		DEST+4
88# define SETRTNVAL
89#else
90# define DEST		PARMS
91# define CHR		DEST+4
92# define LEN		CHR+4
93# define SETRTNVAL	movl DEST(%esp), %eax
94#endif
95
/* SHARED build: %ebx is used to compute jump-table addresses, so it is
   pushed on entry (ENTRANCE) and popped before every ret.
   - RETURN_END: pop %ebx and return.
   - RETURN: same, followed by CFI_PUSH (%ebx), which re-applies for the
     code that follows the ret the CFI state that POP just undid.
   - PARMS is 8 because the saved %ebx sits between %esp and the return
     address.
   - JMPTBL (I, B) stores each table entry as the offset of I from the
     table base B.  */
96#ifdef SHARED
97# define ENTRANCE	PUSH (%ebx);
98# define RETURN_END	POP (%ebx); ret
99# define RETURN		RETURN_END; CFI_PUSH (%ebx)
100# define PARMS		8		/* Preserve EBX.  */
101# define JMPTBL(I, B)	I - B
102
103/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
104   jump table with relative offsets.  ECX is the table index (4-byte
   entries).  Before jumping, ECX is also added to EDX.  Clobbers EBX.  */
105# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
106    /* We first load PC into EBX.  */				\
107    call	__i686.get_pc_thunk.bx;				\
108    /* Get the address of the jump table.  */			\
109    add		$(TABLE - .), %ebx;				\
110    /* Get the entry and convert the relative offset to the	\
111       absolute address.  */					\
112    add		(%ebx,%ecx,4), %ebx;				\
113    add		%ecx, %edx;					\
114    /* We loaded the jump table and adjusted EDX. Go.  */	\
115    jmp		*%ebx
116
/* PC thunk: copies the caller's return address from the top of the
   stack into %ebx and returns.  Emitted in its own link-once section
   as a hidden global symbol.  */
117	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
118	.globl	__i686.get_pc_thunk.bx
119	.hidden	__i686.get_pc_thunk.bx
120	ALIGN (4)
121	.type	__i686.get_pc_thunk.bx,@function
122__i686.get_pc_thunk.bx:
123	movl	(%esp), %ebx
124	ret
/* Non-SHARED build: nothing is pushed on entry, so PARMS is 4 (just the
   return address), RETURN and RETURN_END are a plain ret, and JMPTBL
   stores the absolute address of each target.  */
125#else
126# define ENTRANCE
127# define RETURN_END	ret
128# define RETURN		RETURN_END
129# define PARMS		4
130# define JMPTBL(I, B)	I
131
132/* Branch to an entry in a jump table.  TABLE is a jump table with
133   absolute offsets.  ECX is the table index (4-byte entries); it is
   added to EDX before the indirect jump.  */
134# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
135    add		%ecx, %edx;					\
136    jmp		*TABLE(,%ecx,4)
137#endif
138
139	.section .text.sse2,"ax",@progbits
140	ALIGN (4)
/* sse2_memset5_atom: fill LEN bytes starting at DEST.
   Arguments are read from the stack at DEST(%esp), CHR(%esp) and
   LEN(%esp).  With USE_AS_BZERO the fill value is zero and there is no
   CHR argument.  Every exit path runs SETRTNVAL before returning,
   which (when not USE_AS_BZERO) loads the DEST argument into %eax.

   Register roles established here:
     ECX = byte count
     EAX = fill byte replicated into all four bytes (zero for bzero)
     EDX = destination pointer  */
141ENTRY (sse2_memset5_atom)
142	ENTRANCE
143
144	movl	LEN(%esp), %ecx
145#ifdef USE_AS_BZERO
146	xor	%eax, %eax
147#else
	/* EAX = zero-extended low byte of the CHR argument; copying AL
	   into AH makes the low 16 bits hold the byte twice.  */
148	movzbl	CHR(%esp), %eax
149	movb	%al, %ah
150	/* Fill the whole EAX with pattern.  */
	/* EDX is only a scratch copy here; it is loaded with DEST below.  */
151	movl	%eax, %edx
152	shl	$16, %eax
153	or	%edx, %eax
154#endif
155	movl	DEST(%esp), %edx
	/* Unsigned compare: counts of 32 or more take the SSE2 path.  */
156	cmp	$32, %ecx
157	jae	L(32bytesormore)
158
/* ECX is 0..31 here.  The macro adds ECX to EDX (so EDX points just past
   the end of the region) and jumps to L(write_<ECX>bytes).  */
159L(write_less32bytes):
160	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
161
162
/* Jump table for byte counts 0..31, placed in .rodata.sse2.  Entry N is
   a 4-byte value produced by JMPTBL for L(write_<N>bytes); it is indexed
   by ECX in BRANCH_TO_JMPTBL_ENTRY.  The order of the entries must
   match the byte count.  */
163	.pushsection .rodata.sse2,"a",@progbits
164	ALIGN (2)
165L(table_less_32bytes):
166	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
167	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
168	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
169	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
170	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
171	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
172	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
173	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
174	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
175	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
176	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
177	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
178	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
179	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
180	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
181	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
182	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
183	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
184	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
185	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
186	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
187	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
188	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
189	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
190	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
191	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
192	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
193	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
194	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
195	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
196	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
197	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
198	.popsection
199
/* Tail handlers for counts 0..31, reached through
   L(table_less_32bytes).  On entry EAX holds the 4-byte fill pattern
   and EDX points just past the last byte to write, so all stores use
   negative offsets from EDX.  There is one fall-through chain per value
   of (count mod 4): entering at L(write_<N>bytes) stores 4 bytes at
   -N(%edx) and falls into the handler for N-4, until the final 0..3
   bytes are written with byte/word stores.  */

/* Counts that are a multiple of 4.  */
200	ALIGN (4)
201L(write_28bytes):
202	movl	%eax, -28(%edx)
203L(write_24bytes):
204	movl	%eax, -24(%edx)
205L(write_20bytes):
206	movl	%eax, -20(%edx)
207L(write_16bytes):
208	movl	%eax, -16(%edx)
209L(write_12bytes):
210	movl	%eax, -12(%edx)
211L(write_8bytes):
212	movl	%eax, -8(%edx)
213L(write_4bytes):
214	movl	%eax, -4(%edx)
215L(write_0bytes):
216	SETRTNVAL
217	RETURN
218
/* Counts of the form 4k+1: the last byte is a single byte store.  */
219	ALIGN (4)
220L(write_29bytes):
221	movl	%eax, -29(%edx)
222L(write_25bytes):
223	movl	%eax, -25(%edx)
224L(write_21bytes):
225	movl	%eax, -21(%edx)
226L(write_17bytes):
227	movl	%eax, -17(%edx)
228L(write_13bytes):
229	movl	%eax, -13(%edx)
230L(write_9bytes):
231	movl	%eax, -9(%edx)
232L(write_5bytes):
233	movl	%eax, -5(%edx)
234L(write_1bytes):
235	movb	%al, -1(%edx)
236	SETRTNVAL
237	RETURN
238
/* Counts of the form 4k+2: the last two bytes are one word store.  */
239	ALIGN (4)
240L(write_30bytes):
241	movl	%eax, -30(%edx)
242L(write_26bytes):
243	movl	%eax, -26(%edx)
244L(write_22bytes):
245	movl	%eax, -22(%edx)
246L(write_18bytes):
247	movl	%eax, -18(%edx)
248L(write_14bytes):
249	movl	%eax, -14(%edx)
250L(write_10bytes):
251	movl	%eax, -10(%edx)
252L(write_6bytes):
253	movl	%eax, -6(%edx)
254L(write_2bytes):
255	movw	%ax, -2(%edx)
256	SETRTNVAL
257	RETURN
258
/* Counts of the form 4k+3: the last three bytes are a word store
   followed by a byte store.  */
259	ALIGN (4)
260L(write_31bytes):
261	movl	%eax, -31(%edx)
262L(write_27bytes):
263	movl	%eax, -27(%edx)
264L(write_23bytes):
265	movl	%eax, -23(%edx)
266L(write_19bytes):
267	movl	%eax, -19(%edx)
268L(write_15bytes):
269	movl	%eax, -15(%edx)
270L(write_11bytes):
271	movl	%eax, -11(%edx)
272L(write_7bytes):
273	movl	%eax, -7(%edx)
274L(write_3bytes):
275	movw	%ax, -3(%edx)
276	movb	%al, -1(%edx)
277	SETRTNVAL
278	RETURN
279
280	ALIGN (4)
281/* ECX >= 32 (reached via jae).  The alignment of EDX is not known yet;
   it is tested below.  */
282L(32bytesormore):
283	/* Fill xmm0 with the pattern.  */
284#ifdef USE_AS_BZERO
285	pxor	%xmm0, %xmm0
286#else
	/* Move EAX into the low dword of XMM0, then broadcast that dword
	   to all four dword lanes.  */
287	movd	%eax, %xmm0
288	pshufd	$0, %xmm0, %xmm0
289#endif
	/* Low four address bits clear means EDX is already 16-byte
	   aligned.  */
290	testl	$0xf, %edx
291	jz	L(aligned_16)
292/* ECX >= 32 and EDX is not 16 byte aligned.  */
293L(not_aligned_16):
	/* Write the first 16 bytes with an unaligned store, then round EDX
	   up to the next 16-byte boundary.  EAX is used as scratch to
	   compute (old EDX - new EDX), a negative value that is added to
	   ECX so the count reflects the bytes skipped by the rounding.  */
294	movdqu	%xmm0, (%edx)
295	movl	%edx, %eax
296	and	$-16, %edx
297	add	$16, %edx
298	sub	%edx, %eax
299	add	%eax, %ecx
	/* Reload the fill pattern into EAX after its use as scratch.  */
300	movd	%xmm0, %eax
301
/* EDX is 16-byte aligned from here on.  */
302	ALIGN (4)
303L(aligned_16):
304	cmp	$128, %ecx
305	jae	L(128bytesormore)
306
/* ECX < 128: dispatch on the exact remaining count.  The macro adds ECX
   to EDX before jumping.  */
307L(aligned_16_less128bytes):
308	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
309
310	ALIGN (4)
/* ECX >= 128 and EDX is 16-byte aligned.  Load the shared cache size
   into EBX (a build-time constant if SHARED_CACHE_SIZE is defined,
   otherwise the variable __x86_shared_cache_size, addressed through
   the GOT in SHARED builds) and compare the count against it.
   NOTE(review): the PUSH of %ebx here is selected by SHARED_CACHE_SIZE /
   SHARED, while the matching POP further down is selected by
   DATA_CACHE_SIZE / SHARED.  In a SHARED build with only one of the two
   *_CACHE_SIZE macros defined the push and pop would not pair up --
   confirm that both are always defined together.  */
311L(128bytesormore):
312#ifdef SHARED_CACHE_SIZE
313	PUSH (%ebx)
314	mov	$SHARED_CACHE_SIZE, %ebx
315#else
316# ifdef SHARED
317	call	__i686.get_pc_thunk.bx
318	add	$_GLOBAL_OFFSET_TABLE_, %ebx
319	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
320# else
321	PUSH (%ebx)
322	mov	__x86_shared_cache_size, %ebx
323# endif
324#endif
	/* Count >= shared cache size: take the path that ends with
	   non-temporal stores.  EBX stays live on that path.  */
325	cmp	%ebx, %ecx
326	jae	L(128bytesormore_nt_start)
327
328
/* Count < shared cache size.  EBX is no longer needed as the cache
   size; compare the count with the data cache size.
   RESTORE_EBX_STATE is defined here for use just before
   L(128bytesormore_nt_start): it is CFI_PUSH (%ebx) in the
   configurations that POP here, and empty otherwise.  */
329#ifdef DATA_CACHE_SIZE
330	POP (%ebx)
331# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
332	cmp	$DATA_CACHE_SIZE, %ecx
333#else
334# ifdef SHARED
335#  define RESTORE_EBX_STATE
336	call	__i686.get_pc_thunk.bx
337	add	$_GLOBAL_OFFSET_TABLE_, %ebx
338	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
339# else
340	POP (%ebx)
341#  define RESTORE_EBX_STATE CFI_PUSH (%ebx)
342	cmp	__x86_data_cache_size, %ecx
343# endif
344#endif
345
	/* Count >= data cache size: use the loop with prefetches.  */
346	jae	L(128bytes_L2_normal)
	/* Bias ECX by -128 so that the "sub $128" at the top of each
	   128-byte chunk borrows exactly when fewer than 128 bytes will
	   remain after that chunk.  */
347	subl	$128, %ecx
/* Loop body unrolled twice: each half stores 128 bytes with aligned
   stores.  EDX is advanced with lea rather than add so that the flags
   from "sub" are still intact for the jb / jae that follows.  */
348L(128bytesormore_normal):
349	sub	$128, %ecx
350	movdqa	%xmm0, (%edx)
351	movdqa	%xmm0, 0x10(%edx)
352	movdqa	%xmm0, 0x20(%edx)
353	movdqa	%xmm0, 0x30(%edx)
354	movdqa	%xmm0, 0x40(%edx)
355	movdqa	%xmm0, 0x50(%edx)
356	movdqa	%xmm0, 0x60(%edx)
357	movdqa	%xmm0, 0x70(%edx)
358	lea	128(%edx), %edx
359	jb	L(128bytesless_normal)
360
361
362	sub	$128, %ecx
363	movdqa	%xmm0, (%edx)
364	movdqa	%xmm0, 0x10(%edx)
365	movdqa	%xmm0, 0x20(%edx)
366	movdqa	%xmm0, 0x30(%edx)
367	movdqa	%xmm0, 0x40(%edx)
368	movdqa	%xmm0, 0x50(%edx)
369	movdqa	%xmm0, 0x60(%edx)
370	movdqa	%xmm0, 0x70(%edx)
371	lea	128(%edx), %edx
372	jae	L(128bytesormore_normal)
373
/* The last "sub" borrowed: adding 128 back gives the true remaining
   count (0..127) in ECX, which is then dispatched on.  */
374L(128bytesless_normal):
375	add	$128, %ecx
376	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
377
378	ALIGN (4)
/* Taken when the count is at least the data cache size but below the
   shared cache size.  Each iteration issues two prefetcht0 hints at
   EDX+0x380 and EDX+0x3c0, stores 128 bytes with aligned stores and
   advances EDX; the loop repeats while at least 128 bytes remain.
   EDX is 16-byte aligned on entry.  */
379L(128bytes_L2_normal):
380	prefetcht0	0x380(%edx)
381	prefetcht0	0x3c0(%edx)
382	sub	$128, %ecx
383	movdqa	%xmm0, (%edx)
384	movaps	%xmm0, 0x10(%edx)
385	movaps	%xmm0, 0x20(%edx)
386	movaps	%xmm0, 0x30(%edx)
387	movaps	%xmm0, 0x40(%edx)
388	movaps	%xmm0, 0x50(%edx)
389	movaps	%xmm0, 0x60(%edx)
390	movaps	%xmm0, 0x70(%edx)
391	add	$128, %edx
392	cmp	$128, %ecx
393	jae	L(128bytes_L2_normal)
394
/* ECX < 128: dispatch on the remaining count.  */
395L(128bytesless_L2_normal):
396	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
397
/* The code above ends in an unconditional jump, so this point is only
   reached through the label below.  RESTORE_EBX_STATE emits CFI only
   (or nothing) so that the unwind state matches what holds at the
   "jae L(128bytesormore_nt_start)" branch.  */
398	RESTORE_EBX_STATE
/* Count >= shared cache size.  On entry EBX holds the shared cache size,
   ECX the count and EDX the 16-byte-aligned destination.
   The first loop below writes EBX rounded down to a multiple of 128
   bytes with ordinary stores, so ECX is set up front to the count that
   will remain after it: ECX - EBX + (EBX & 0x7f).  EAX is used as
   scratch for that and then reloaded with the fill pattern.  */
399L(128bytesormore_nt_start):
400	sub	%ebx, %ecx
401	mov	%ebx, %eax
402	and	$0x7f, %eax
403	add	%eax, %ecx
404	movd	%xmm0, %eax
405	ALIGN (4)
/* Ordinary aligned stores, 128 bytes per iteration, with EBX as the
   down-counter.  The body runs before EBX is tested, so this assumes
   EBX >= 0x80 on entry -- TODO confirm for the configured cache size.  */
406L(128bytesormore_shared_cache_loop):
407	prefetcht0	0x3c0(%edx)
408	prefetcht0	0x380(%edx)
409	sub	$0x80, %ebx
410	movdqa	%xmm0, (%edx)
411	movdqa	%xmm0, 0x10(%edx)
412	movdqa	%xmm0, 0x20(%edx)
413	movdqa	%xmm0, 0x30(%edx)
414	movdqa	%xmm0, 0x40(%edx)
415	movdqa	%xmm0, 0x50(%edx)
416	movdqa	%xmm0, 0x60(%edx)
417	movdqa	%xmm0, 0x70(%edx)
418	add	$0x80, %edx
419	cmp	$0x80, %ebx
420	jae	L(128bytesormore_shared_cache_loop)
	/* Skip the non-temporal loop if fewer than 128 bytes remain.  */
421	cmp	$0x80, %ecx
422	jb	L(shared_cache_loop_end)
423	ALIGN (4)
/* Remaining bytes, 128 per iteration, written with non-temporal stores
   (movntdq); an sfence follows the loop.  */
424L(128bytesormore_nt):
425	sub	$0x80, %ecx
426	movntdq	%xmm0, (%edx)
427	movntdq	%xmm0, 0x10(%edx)
428	movntdq	%xmm0, 0x20(%edx)
429	movntdq	%xmm0, 0x30(%edx)
430	movntdq	%xmm0, 0x40(%edx)
431	movntdq	%xmm0, 0x50(%edx)
432	movntdq	%xmm0, 0x60(%edx)
433	movntdq	%xmm0, 0x70(%edx)
434	add	$0x80, %edx
435	cmp	$0x80, %ecx
436	jae	L(128bytesormore_nt)
437	sfence
/* ECX < 128.  Pop the %ebx pushed at L(128bytesormore), then dispatch on
   the remaining count.
   NOTE(review): this POP is selected by DATA_CACHE_SIZE, whereas the
   PUSH it pairs with is selected by SHARED_CACHE_SIZE -- confirm both
   macros are always defined together in SHARED builds.  */
438L(shared_cache_loop_end):
439#if defined DATA_CACHE_SIZE || !defined SHARED
440	POP (%ebx)
441#endif
442	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
443
444
/* Jump table for remaining counts 0..127 once EDX is 16-byte aligned,
   placed in .rodata.sse2.  Entry N is a 4-byte value produced by JMPTBL
   for L(aligned_16_<N>bytes); it is indexed by ECX in
   BRANCH_TO_JMPTBL_ENTRY.  The order of the entries must match the
   byte count.  */
445	.pushsection .rodata.sse2,"a",@progbits
446	ALIGN (2)
447L(table_16_128bytes):
448	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
449	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
450	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
451	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
452	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
453	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
454	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
455	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
456	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
457	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
458	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
459	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
460	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
461	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
462	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
463	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
464	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
465	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
466	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
467	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
468	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
469	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
470	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
471	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
472	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
473	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
474	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
475	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
476	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
477	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
478	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
479	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
480	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
481	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
482	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
483	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
484	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
485	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
486	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
487	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
488	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
489	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
490	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
491	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
492	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
493	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
494	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
495	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
496	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
497	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
498	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
499	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
500	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
501	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
502	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
503	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
504	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
505	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
506	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
507	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
508	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
509	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
510	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
511	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
512	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
513	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
514	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
515	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
516	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
517	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
518	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
519	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
520	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
521	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
522	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
523	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
524	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
525	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
526	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
527	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
528	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
529	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
530	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
531	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
532	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
533	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
534	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
535	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
536	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
537	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
538	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
539	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
540	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
541	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
542	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
543	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
544	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
545	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
546	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
547	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
548	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
549	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
550	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
551	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
552	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
553	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
554	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
555	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
556	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
557	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
558	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
559	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
560	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
561	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
562	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
563	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
564	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
565	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
566	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
567	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
568	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
569	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
570	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
571	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
572	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
573	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
574	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
575	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
576	.popsection
577
/* Tail handlers for remaining counts 0..127, reached through
   L(table_16_128bytes).  On entry XMM0 holds the 16-byte fill pattern,
   EAX its low 4 bytes, and EDX points just past the last byte to write
   (the dispatch macro added ECX to a 16-byte-aligned EDX), so every
   movdqa target -N(%edx) in a chain is itself 16-byte aligned.
   There is one fall-through chain per value of (count mod 16):
   entering at L(aligned_16_<N>bytes) stores 16 bytes at -N(%edx) and
   falls into the handler for N-16; the final 0..15 bytes are written
   with narrower stores.
   This group covers counts with (count mod 16) of 0, 1, 2 and 3.  */

/* count mod 16 == 0: nothing left after the 16-byte stores.  */
578	ALIGN (4)
579L(aligned_16_112bytes):
580	movdqa	%xmm0, -112(%edx)
581L(aligned_16_96bytes):
582	movdqa	%xmm0, -96(%edx)
583L(aligned_16_80bytes):
584	movdqa	%xmm0, -80(%edx)
585L(aligned_16_64bytes):
586	movdqa	%xmm0, -64(%edx)
587L(aligned_16_48bytes):
588	movdqa	%xmm0, -48(%edx)
589L(aligned_16_32bytes):
590	movdqa	%xmm0, -32(%edx)
591L(aligned_16_16bytes):
592	movdqa	%xmm0, -16(%edx)
593L(aligned_16_0bytes):
594	SETRTNVAL
595	RETURN
596
/* count mod 16 == 1: final byte store.  */
597	ALIGN (4)
598L(aligned_16_113bytes):
599	movdqa	%xmm0, -113(%edx)
600L(aligned_16_97bytes):
601	movdqa	%xmm0, -97(%edx)
602L(aligned_16_81bytes):
603	movdqa	%xmm0, -81(%edx)
604L(aligned_16_65bytes):
605	movdqa	%xmm0, -65(%edx)
606L(aligned_16_49bytes):
607	movdqa	%xmm0, -49(%edx)
608L(aligned_16_33bytes):
609	movdqa	%xmm0, -33(%edx)
610L(aligned_16_17bytes):
611	movdqa	%xmm0, -17(%edx)
612L(aligned_16_1bytes):
613	movb	%al, -1(%edx)
614	SETRTNVAL
615	RETURN
616
/* count mod 16 == 2: final word store.  */
617	ALIGN (4)
618L(aligned_16_114bytes):
619	movdqa	%xmm0, -114(%edx)
620L(aligned_16_98bytes):
621	movdqa	%xmm0, -98(%edx)
622L(aligned_16_82bytes):
623	movdqa	%xmm0, -82(%edx)
624L(aligned_16_66bytes):
625	movdqa	%xmm0, -66(%edx)
626L(aligned_16_50bytes):
627	movdqa	%xmm0, -50(%edx)
628L(aligned_16_34bytes):
629	movdqa	%xmm0, -34(%edx)
630L(aligned_16_18bytes):
631	movdqa	%xmm0, -18(%edx)
632L(aligned_16_2bytes):
633	movw	%ax, -2(%edx)
634	SETRTNVAL
635	RETURN
636
/* count mod 16 == 3: final word + byte stores.  */
637	ALIGN (4)
638L(aligned_16_115bytes):
639	movdqa	%xmm0, -115(%edx)
640L(aligned_16_99bytes):
641	movdqa	%xmm0, -99(%edx)
642L(aligned_16_83bytes):
643	movdqa	%xmm0, -83(%edx)
644L(aligned_16_67bytes):
645	movdqa	%xmm0, -67(%edx)
646L(aligned_16_51bytes):
647	movdqa	%xmm0, -51(%edx)
648L(aligned_16_35bytes):
649	movdqa	%xmm0, -35(%edx)
650L(aligned_16_19bytes):
651	movdqa	%xmm0, -19(%edx)
652L(aligned_16_3bytes):
653	movw	%ax, -3(%edx)
654	movb	%al, -1(%edx)
655	SETRTNVAL
656	RETURN
657
/* Tail chains for remaining counts with (count mod 16) of 4, 5, 6 and
   7, reached through L(table_16_128bytes).  XMM0 holds the 16-byte
   fill pattern, EAX its low 4 bytes, and EDX points just past the last
   byte to write.  Each label stores 16 bytes at its own negative
   offset and falls through to the label 16 bytes smaller; the final
   4..7 bytes are written with a dword store plus word/byte stores as
   needed.  */

/* count mod 16 == 4.  */
658	ALIGN (4)
659L(aligned_16_116bytes):
660	movdqa	%xmm0, -116(%edx)
661L(aligned_16_100bytes):
662	movdqa	%xmm0, -100(%edx)
663L(aligned_16_84bytes):
664	movdqa	%xmm0, -84(%edx)
665L(aligned_16_68bytes):
666	movdqa	%xmm0, -68(%edx)
667L(aligned_16_52bytes):
668	movdqa	%xmm0, -52(%edx)
669L(aligned_16_36bytes):
670	movdqa	%xmm0, -36(%edx)
671L(aligned_16_20bytes):
672	movdqa	%xmm0, -20(%edx)
673L(aligned_16_4bytes):
674	movl	%eax, -4(%edx)
675	SETRTNVAL
676	RETURN
677
/* count mod 16 == 5.  */
678	ALIGN (4)
679L(aligned_16_117bytes):
680	movdqa	%xmm0, -117(%edx)
681L(aligned_16_101bytes):
682	movdqa	%xmm0, -101(%edx)
683L(aligned_16_85bytes):
684	movdqa	%xmm0, -85(%edx)
685L(aligned_16_69bytes):
686	movdqa	%xmm0, -69(%edx)
687L(aligned_16_53bytes):
688	movdqa	%xmm0, -53(%edx)
689L(aligned_16_37bytes):
690	movdqa	%xmm0, -37(%edx)
691L(aligned_16_21bytes):
692	movdqa	%xmm0, -21(%edx)
693L(aligned_16_5bytes):
694	movl	%eax, -5(%edx)
695	movb	%al, -1(%edx)
696	SETRTNVAL
697	RETURN
698
/* count mod 16 == 6.  */
699	ALIGN (4)
700L(aligned_16_118bytes):
701	movdqa	%xmm0, -118(%edx)
702L(aligned_16_102bytes):
703	movdqa	%xmm0, -102(%edx)
704L(aligned_16_86bytes):
705	movdqa	%xmm0, -86(%edx)
706L(aligned_16_70bytes):
707	movdqa	%xmm0, -70(%edx)
708L(aligned_16_54bytes):
709	movdqa	%xmm0, -54(%edx)
710L(aligned_16_38bytes):
711	movdqa	%xmm0, -38(%edx)
712L(aligned_16_22bytes):
713	movdqa	%xmm0, -22(%edx)
714L(aligned_16_6bytes):
715	movl	%eax, -6(%edx)
716	movw	%ax, -2(%edx)
717	SETRTNVAL
718	RETURN
719
/* count mod 16 == 7.  */
720	ALIGN (4)
721L(aligned_16_119bytes):
722	movdqa	%xmm0, -119(%edx)
723L(aligned_16_103bytes):
724	movdqa	%xmm0, -103(%edx)
725L(aligned_16_87bytes):
726	movdqa	%xmm0, -87(%edx)
727L(aligned_16_71bytes):
728	movdqa	%xmm0, -71(%edx)
729L(aligned_16_55bytes):
730	movdqa	%xmm0, -55(%edx)
731L(aligned_16_39bytes):
732	movdqa	%xmm0, -39(%edx)
733L(aligned_16_23bytes):
734	movdqa	%xmm0, -23(%edx)
735L(aligned_16_7bytes):
736	movl	%eax, -7(%edx)
737	movw	%ax, -3(%edx)
738	movb	%al, -1(%edx)
739	SETRTNVAL
740	RETURN
741
/* Tail chains for remaining counts with (count mod 16) of 8, 9, 10 and
   11, reached through L(table_16_128bytes).  XMM0 holds the 16-byte
   fill pattern, EAX its low 4 bytes, and EDX points just past the last
   byte to write.  Each label stores 16 bytes at its own negative
   offset and falls through to the label 16 bytes smaller; the final
   8..11 bytes are written with movq (low 8 bytes of XMM0) plus
   word/byte stores as needed.  */

/* count mod 16 == 8.  */
742	ALIGN (4)
743L(aligned_16_120bytes):
744	movdqa	%xmm0, -120(%edx)
745L(aligned_16_104bytes):
746	movdqa	%xmm0, -104(%edx)
747L(aligned_16_88bytes):
748	movdqa	%xmm0, -88(%edx)
749L(aligned_16_72bytes):
750	movdqa	%xmm0, -72(%edx)
751L(aligned_16_56bytes):
752	movdqa	%xmm0, -56(%edx)
753L(aligned_16_40bytes):
754	movdqa	%xmm0, -40(%edx)
755L(aligned_16_24bytes):
756	movdqa	%xmm0, -24(%edx)
757L(aligned_16_8bytes):
758	movq	%xmm0, -8(%edx)
759	SETRTNVAL
760	RETURN
761
/* count mod 16 == 9.  */
762	ALIGN (4)
763L(aligned_16_121bytes):
764	movdqa	%xmm0, -121(%edx)
765L(aligned_16_105bytes):
766	movdqa	%xmm0, -105(%edx)
767L(aligned_16_89bytes):
768	movdqa	%xmm0, -89(%edx)
769L(aligned_16_73bytes):
770	movdqa	%xmm0, -73(%edx)
771L(aligned_16_57bytes):
772	movdqa	%xmm0, -57(%edx)
773L(aligned_16_41bytes):
774	movdqa	%xmm0, -41(%edx)
775L(aligned_16_25bytes):
776	movdqa	%xmm0, -25(%edx)
777L(aligned_16_9bytes):
778	movq	%xmm0, -9(%edx)
779	movb	%al, -1(%edx)
780	SETRTNVAL
781	RETURN
782
/* count mod 16 == 10.  */
783	ALIGN (4)
784L(aligned_16_122bytes):
785	movdqa	%xmm0, -122(%edx)
786L(aligned_16_106bytes):
787	movdqa	%xmm0, -106(%edx)
788L(aligned_16_90bytes):
789	movdqa	%xmm0, -90(%edx)
790L(aligned_16_74bytes):
791	movdqa	%xmm0, -74(%edx)
792L(aligned_16_58bytes):
793	movdqa	%xmm0, -58(%edx)
794L(aligned_16_42bytes):
795	movdqa	%xmm0, -42(%edx)
796L(aligned_16_26bytes):
797	movdqa	%xmm0, -26(%edx)
798L(aligned_16_10bytes):
799	movq	%xmm0, -10(%edx)
800	movw	%ax, -2(%edx)
801	SETRTNVAL
802	RETURN
803
/* count mod 16 == 11.  */
804	ALIGN (4)
805L(aligned_16_123bytes):
806	movdqa	%xmm0, -123(%edx)
807L(aligned_16_107bytes):
808	movdqa	%xmm0, -107(%edx)
809L(aligned_16_91bytes):
810	movdqa	%xmm0, -91(%edx)
811L(aligned_16_75bytes):
812	movdqa	%xmm0, -75(%edx)
813L(aligned_16_59bytes):
814	movdqa	%xmm0, -59(%edx)
815L(aligned_16_43bytes):
816	movdqa	%xmm0, -43(%edx)
817L(aligned_16_27bytes):
818	movdqa	%xmm0, -27(%edx)
819L(aligned_16_11bytes):
820	movq	%xmm0, -11(%edx)
821	movw	%ax, -3(%edx)
822	movb	%al, -1(%edx)
823	SETRTNVAL
824	RETURN
825
/* Tail chains for remaining counts with (count mod 16) of 12, 13, 14
   and 15, reached through L(table_16_128bytes).  XMM0 holds the 16-byte
   fill pattern, EAX its low 4 bytes, and EDX points just past the last
   byte to write.  Each label stores 16 bytes at its own negative
   offset and falls through to the label 16 bytes smaller; the final
   12..15 bytes are written with movq (low 8 bytes of XMM0), a dword
   store, and word/byte stores as needed.  */

/* count mod 16 == 12.  */
826	ALIGN (4)
827L(aligned_16_124bytes):
828	movdqa	%xmm0, -124(%edx)
829L(aligned_16_108bytes):
830	movdqa	%xmm0, -108(%edx)
831L(aligned_16_92bytes):
832	movdqa	%xmm0, -92(%edx)
833L(aligned_16_76bytes):
834	movdqa	%xmm0, -76(%edx)
835L(aligned_16_60bytes):
836	movdqa	%xmm0, -60(%edx)
837L(aligned_16_44bytes):
838	movdqa	%xmm0, -44(%edx)
839L(aligned_16_28bytes):
840	movdqa	%xmm0, -28(%edx)
841L(aligned_16_12bytes):
842	movq	%xmm0, -12(%edx)
843	movl	%eax, -4(%edx)
844	SETRTNVAL
845	RETURN
846
/* count mod 16 == 13.  */
847	ALIGN (4)
848L(aligned_16_125bytes):
849	movdqa	%xmm0, -125(%edx)
850L(aligned_16_109bytes):
851	movdqa	%xmm0, -109(%edx)
852L(aligned_16_93bytes):
853	movdqa	%xmm0, -93(%edx)
854L(aligned_16_77bytes):
855	movdqa	%xmm0, -77(%edx)
856L(aligned_16_61bytes):
857	movdqa	%xmm0, -61(%edx)
858L(aligned_16_45bytes):
859	movdqa	%xmm0, -45(%edx)
860L(aligned_16_29bytes):
861	movdqa	%xmm0, -29(%edx)
862L(aligned_16_13bytes):
863	movq	%xmm0, -13(%edx)
864	movl	%eax, -5(%edx)
865	movb	%al, -1(%edx)
866	SETRTNVAL
867	RETURN
868
/* count mod 16 == 14.  */
869	ALIGN (4)
870L(aligned_16_126bytes):
871	movdqa	%xmm0, -126(%edx)
872L(aligned_16_110bytes):
873	movdqa	%xmm0, -110(%edx)
874L(aligned_16_94bytes):
875	movdqa	%xmm0, -94(%edx)
876L(aligned_16_78bytes):
877	movdqa	%xmm0, -78(%edx)
878L(aligned_16_62bytes):
879	movdqa	%xmm0, -62(%edx)
880L(aligned_16_46bytes):
881	movdqa	%xmm0, -46(%edx)
882L(aligned_16_30bytes):
883	movdqa	%xmm0, -30(%edx)
884L(aligned_16_14bytes):
885	movq	%xmm0, -14(%edx)
886	movl	%eax, -6(%edx)
887	movw	%ax, -2(%edx)
888	SETRTNVAL
889	RETURN
890
/* count mod 16 == 15.  This is the last exit of the function, so it
   uses RETURN_END (no trailing CFI_PUSH) instead of RETURN.  */
891	ALIGN (4)
892L(aligned_16_127bytes):
893	movdqa	%xmm0, -127(%edx)
894L(aligned_16_111bytes):
895	movdqa	%xmm0, -111(%edx)
896L(aligned_16_95bytes):
897	movdqa	%xmm0, -95(%edx)
898L(aligned_16_79bytes):
899	movdqa	%xmm0, -79(%edx)
900L(aligned_16_63bytes):
901	movdqa	%xmm0, -63(%edx)
902L(aligned_16_47bytes):
903	movdqa	%xmm0, -47(%edx)
904L(aligned_16_31bytes):
905	movdqa	%xmm0, -31(%edx)
906L(aligned_16_15bytes):
907	movq	%xmm0, -15(%edx)
908	movl	%eax, -7(%edx)
909	movw	%ax, -3(%edx)
910	movb	%al, -1(%edx)
911	SETRTNVAL
912	RETURN_END
913
914END (sse2_memset5_atom)
915