/*
Copyright (c) 2010, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Syntax: GNU as, AT&T operand order, 32-bit x86, run through the C
   preprocessor.  SHARED_CACHE_SIZE and DATA_CACHE_SIZE are tested with
   #ifdef further down; presumably cache.h is what supplies them.  */
#include "cache.h"
/* Drop any predefined __i686 macro so the preprocessor cannot rewrite
   the __i686.get_pc_thunk.bx symbol name used below.  */
#undef __i686

/* L(label): spell a label with the .L prefix.  */
#ifndef L
# define L(label)	.L##label
#endif

/* ALIGN(n): align the location counter to a 2^n byte boundary.  */
#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

/* Thin wrappers over the assembler CFI directives.  Each one is only
   defined here when the includer has not already provided it.  */
#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* ENTRY(name): declare NAME as a global function symbol, align it to
   16 bytes, emit its label and open its CFI region.  */
#ifndef ENTRY
# define ENTRY(name)			\
	.type name,  @function; 	\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

/* END(name): close the CFI region opened by ENTRY and record the size
   of NAME as the distance from its label to this point.  */
#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif

/* CFI bookkeeping for a 4-byte push of REG: the CFA offset grows by 4
   and REG is recorded as saved at offset 0 from the current ESP.  */
#define CFI_PUSH(REG)						\
  cfi_adjust_cfa_offset (4);					\
  cfi_rel_offset (REG, 0)

/* CFI bookkeeping for the matching 4-byte pop: the CFA offset shrinks
   by 4 and REG is marked as holding its original value again.  */
#define CFI_POP(REG)						\
  cfi_adjust_cfa_offset (-4);					\
  cfi_restore (REG)

/* Push/pop a 32-bit register and keep the unwind information in step.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

/* Offsets of the stack arguments from ESP, usable after ENTRANCE has
   run.  PARMS (defined further down) is the offset of the first
   argument.

   USE_AS_BZERO build: arguments are DEST, LEN; SETRTNVAL is empty, so
   EAX is not loaded with a return value.
   Default build: arguments are DEST, CHR, LEN; SETRTNVAL reloads DEST
   into EAX as the return value.  */
#ifdef USE_AS_BZERO
# define DEST		PARMS
# define LEN		DEST+4
# define SETRTNVAL
#else
# define DEST		PARMS
# define CHR		DEST+4
# define LEN		CHR+4
# define SETRTNVAL	movl DEST(%esp), %eax
#endif

#if (defined SHARED || defined __PIC__)
/* Position-independent build.  EBX is used as scratch for address
   computation, so ENTRANCE saves it and every return restores it.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
/* RETURN is for exits that are followed by more code: after the ret it
   re-applies the CFI effect of the entry push for the code that
   follows.  RETURN_END is for the last exit of the function.  */
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
# define PARMS		8		/* Preserve EBX.  */
/* A table entry I is stored as its offset from the table base B.  */
# define JMPTBL(I, B)	I - B

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  ECX is the entry index; ECX is
   also added to EDX before the jump.  Clobbers EBX.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    /* We first load PC into EBX.  */				\
    call	__i686.get_pc_thunk.bx;				\
    /* Get the address of the jump table.  */			\
    add		$(TABLE - .), %ebx;				\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */					\
    add		(%ebx,%ecx,4), %ebx;				\
    add		%ecx, %edx;					\
    /* We loaded the jump table and adjusted EDX. Go.  */	\
    jmp		*%ebx

	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
	.globl	__i686.get_pc_thunk.bx
	.hidden	__i686.get_pc_thunk.bx
	ALIGN (4)
	.type	__i686.get_pc_thunk.bx,@function
/* Copy the return address (the address of the instruction that follows
   the call) into EBX and return.  */
__i686.get_pc_thunk.bx:
	movl	(%esp), %ebx
	ret
#else
/* Non-PIC build.  Nothing is pushed on entry, so the first argument
   sits right above the return address.  */
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define PARMS		4
/* A table entry is the absolute address of I.  */
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  ECX is the entry index; ECX is also added to EDX
   before the jump.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    add		%ecx, %edx;					\
    jmp		*TABLE(,%ecx,4)
#endif

/* Name of the exported function.  An includer may pre-define MEMSET to
   assemble this body under a different symbol name.  */
#ifndef MEMSET
# define MEMSET memset
#endif

146	.section .text.sse2,"ax",@progbits
147	ALIGN (4)
148ENTRY (MEMSET)
149	ENTRANCE
150
151	movl	LEN(%esp), %ecx
152#ifdef USE_AS_BZERO
153	xor	%eax, %eax
154#else
155	movzbl	CHR(%esp), %eax
156	movb	%al, %ah
157	/* Fill the whole EAX with pattern.  */
158	movl	%eax, %edx
159	shl	$16, %eax
160	or	%edx, %eax
161#endif
162	movl	DEST(%esp), %edx
163	cmp	$32, %ecx
164	jae	L(32bytesormore)
165
166L(write_less32bytes):
167	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
168
169
/* Jump table for counts 0..31.  Entry N leads to L(write_Nbytes).
   Entries are 4 bytes each (the dispatch macro scales the index by 4);
   their encoding, relative or absolute, is chosen by JMPTBL.  */
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_less_32bytes):
	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
	.popsection

/* Tails for counts 0..31, reached through L(table_less_32bytes).  The
   dispatch macro has already added the count to EDX, so EDX points one
   past the last byte to fill and all stores use negative offsets.

   There is one fall-through chain per value of (count mod 4).  Entering
   at L(write_Nbytes) stores 4 bytes at a time from EDX-N forward, then
   the chain ends with the 0..3 leftover bytes from AX/AL.  Every chain
   finishes with SETRTNVAL and RETURN.  */

	/* count mod 4 == 0 */
	ALIGN (4)
L(write_28bytes):
	movl	%eax, -28(%edx)
L(write_24bytes):
	movl	%eax, -24(%edx)
L(write_20bytes):
	movl	%eax, -20(%edx)
L(write_16bytes):
	movl	%eax, -16(%edx)
L(write_12bytes):
	movl	%eax, -12(%edx)
L(write_8bytes):
	movl	%eax, -8(%edx)
L(write_4bytes):
	movl	%eax, -4(%edx)
L(write_0bytes):
	SETRTNVAL
	RETURN

	/* count mod 4 == 1: one trailing byte */
	ALIGN (4)
L(write_29bytes):
	movl	%eax, -29(%edx)
L(write_25bytes):
	movl	%eax, -25(%edx)
L(write_21bytes):
	movl	%eax, -21(%edx)
L(write_17bytes):
	movl	%eax, -17(%edx)
L(write_13bytes):
	movl	%eax, -13(%edx)
L(write_9bytes):
	movl	%eax, -9(%edx)
L(write_5bytes):
	movl	%eax, -5(%edx)
L(write_1bytes):
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* count mod 4 == 2: one trailing 16-bit store */
	ALIGN (4)
L(write_30bytes):
	movl	%eax, -30(%edx)
L(write_26bytes):
	movl	%eax, -26(%edx)
L(write_22bytes):
	movl	%eax, -22(%edx)
L(write_18bytes):
	movl	%eax, -18(%edx)
L(write_14bytes):
	movl	%eax, -14(%edx)
L(write_10bytes):
	movl	%eax, -10(%edx)
L(write_6bytes):
	movl	%eax, -6(%edx)
L(write_2bytes):
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	/* count mod 4 == 3: trailing 16-bit store plus one byte */
	ALIGN (4)
L(write_31bytes):
	movl	%eax, -31(%edx)
L(write_27bytes):
	movl	%eax, -27(%edx)
L(write_23bytes):
	movl	%eax, -23(%edx)
L(write_19bytes):
	movl	%eax, -19(%edx)
L(write_15bytes):
	movl	%eax, -15(%edx)
L(write_11bytes):
	movl	%eax, -11(%edx)
L(write_7bytes):
	movl	%eax, -7(%edx)
L(write_3bytes):
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

287	ALIGN (4)
288/* ECX > 32 and EDX is 4 byte aligned.  */
289L(32bytesormore):
290	/* Fill xmm0 with the pattern.  */
291#ifdef USE_AS_BZERO
292	pxor	%xmm0, %xmm0
293#else
294	movd	%eax, %xmm0
295	pshufd	$0, %xmm0, %xmm0
296#endif
297	testl	$0xf, %edx
298	jz	L(aligned_16)
299/* ECX > 32 and EDX is not 16 byte aligned.  */
300L(not_aligned_16):
301	movdqu	%xmm0, (%edx)
302	movl	%edx, %eax
303	and	$-16, %edx
304	add	$16, %edx
305	sub	%edx, %eax
306	add	%eax, %ecx
307	movd	%xmm0, %eax
308
309	ALIGN (4)
310L(aligned_16):
311	cmp	$128, %ecx
312	jae	L(128bytesormore)
313
314L(aligned_16_less128bytes):
315	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
316
/* In PIC builds the PUSH of EBX below is emitted only when
   SHARED_CACHE_SIZE is defined, while the POPs that balance it (here
   and at L(shared_cache_loop_end)) are emitted only when
   DATA_CACHE_SIZE is defined.  Defining exactly one of the two would
   leave the stack unbalanced at the final ret, so reject that
   configuration at build time.  */
#if (defined SHARED || defined __PIC__) \
    && ((defined SHARED_CACHE_SIZE && !defined DATA_CACHE_SIZE) \
	|| (!defined SHARED_CACHE_SIZE && defined DATA_CACHE_SIZE))
# error "PIC builds need SHARED_CACHE_SIZE and DATA_CACHE_SIZE both defined or both undefined"
#endif

	ALIGN (4)
/* ECX >= 128 and EDX is 16 byte aligned.  Pick a store strategy by
   comparing ECX against the shared cache size, then the data cache
   size.  Each size is either a compile-time constant or read from the
   __x86_shared_cache_size / __x86_data_cache_size variables.  */
L(128bytesormore):
#ifdef SHARED_CACHE_SIZE
	PUSH (%ebx)
	mov	$SHARED_CACHE_SIZE, %ebx
#else
# if (defined SHARED || defined __PIC__)
	/* EBX = GOT base, then EBX = shared cache size.  */
	call	__i686.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
# else
	PUSH (%ebx)
	mov	__x86_shared_cache_size, %ebx
# endif
#endif
	/* ECX >= shared cache size: take the non-temporal path, which
	   expects the cache size in EBX.  */
	cmp	%ebx, %ecx
	jae	L(128bytesormore_nt_start)


	/* RESTORE_EBX_STATE is defined here and used just before
	   L(128bytesormore_nt_start).  */
#ifdef DATA_CACHE_SIZE
	POP (%ebx)
# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
	cmp	$DATA_CACHE_SIZE, %ecx
#else
# if (defined SHARED || defined __PIC__)
#  define RESTORE_EBX_STATE
	call	__i686.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
# else
	POP (%ebx)
#  define RESTORE_EBX_STATE CFI_PUSH (%ebx)
	cmp	__x86_data_cache_size, %ecx
# endif
#endif

	/* ECX >= data cache size: use the loop with prefetches.  */
	jae	L(128bytes_L2_normal)
	/* Bias ECX by -128 so that a borrow from the sub inside the loop
	   means fewer than 128 bytes remain after the block being
	   stored.  */
	subl	$128, %ecx
/* Loop unrolled twice; each half stores 128 bytes.  The pointer is
   advanced with lea, which leaves the flags from the sub intact for
   the conditional jump that follows.  */
L(128bytesormore_normal):
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jb	L(128bytesless_normal)


	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jae	L(128bytesormore_normal)

/* Undo the last sub so that ECX is the remaining count (0..127), then
   finish through the aligned tail table.  */
L(128bytesless_normal):
	add	$128, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

385	ALIGN (4)
386L(128bytes_L2_normal):
387	prefetcht0	0x380(%edx)
388	prefetcht0	0x3c0(%edx)
389	sub	$128, %ecx
390	movdqa	%xmm0, (%edx)
391	movaps	%xmm0, 0x10(%edx)
392	movaps	%xmm0, 0x20(%edx)
393	movaps	%xmm0, 0x30(%edx)
394	movaps	%xmm0, 0x40(%edx)
395	movaps	%xmm0, 0x50(%edx)
396	movaps	%xmm0, 0x60(%edx)
397	movaps	%xmm0, 0x70(%edx)
398	add	$128, %edx
399	cmp	$128, %ecx
400	jae	L(128bytes_L2_normal)
401
402L(128bytesless_L2_normal):
403	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
404
405	RESTORE_EBX_STATE
406L(128bytesormore_nt_start):
407	sub	%ebx, %ecx
408	mov	%ebx, %eax
409	and	$0x7f, %eax
410	add	%eax, %ecx
411	movd	%xmm0, %eax
412	ALIGN (4)
413L(128bytesormore_shared_cache_loop):
414	prefetcht0	0x3c0(%edx)
415	prefetcht0	0x380(%edx)
416	sub	$0x80, %ebx
417	movdqa	%xmm0, (%edx)
418	movdqa	%xmm0, 0x10(%edx)
419	movdqa	%xmm0, 0x20(%edx)
420	movdqa	%xmm0, 0x30(%edx)
421	movdqa	%xmm0, 0x40(%edx)
422	movdqa	%xmm0, 0x50(%edx)
423	movdqa	%xmm0, 0x60(%edx)
424	movdqa	%xmm0, 0x70(%edx)
425	add	$0x80, %edx
426	cmp	$0x80, %ebx
427	jae	L(128bytesormore_shared_cache_loop)
428	cmp	$0x80, %ecx
429	jb	L(shared_cache_loop_end)
430	ALIGN (4)
431L(128bytesormore_nt):
432	sub	$0x80, %ecx
433	movntdq	%xmm0, (%edx)
434	movntdq	%xmm0, 0x10(%edx)
435	movntdq	%xmm0, 0x20(%edx)
436	movntdq	%xmm0, 0x30(%edx)
437	movntdq	%xmm0, 0x40(%edx)
438	movntdq	%xmm0, 0x50(%edx)
439	movntdq	%xmm0, 0x60(%edx)
440	movntdq	%xmm0, 0x70(%edx)
441	add	$0x80, %edx
442	cmp	$0x80, %ecx
443	jae	L(128bytesormore_nt)
444	sfence
445L(shared_cache_loop_end):
446#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
447	POP (%ebx)
448#endif
449	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
450
451
/* Jump table for remaining counts 0..127 on the paths where EDX is
   16 byte aligned at dispatch time.  Entry N leads to
   L(aligned_16_Nbytes).  Entries are 4 bytes each; their encoding,
   relative or absolute, is chosen by JMPTBL.  */
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_16_128bytes):
	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
	.popsection

/* Tails for remaining counts 0..127, reached through
   L(table_16_128bytes).  EDX was 16 byte aligned before the dispatch
   macro added the count N to it, so EDX now points one past the last
   byte to fill and EDX-N is 16 byte aligned.  That is why the movdqa
   stores at EDX-N, EDX-N+16, ... are legal despite the odd-looking
   displacements.

   There is one fall-through chain per value of (N mod 16).  Each chain
   stores whole 16-byte blocks from xmm0, then the 0..15 leftover bytes
   from EAX/AX/AL, and ends with SETRTNVAL and RETURN.  This part holds
   the chains for N mod 16 = 0..7.  */

	/* N mod 16 == 0 */
	ALIGN (4)
L(aligned_16_112bytes):
	movdqa	%xmm0, -112(%edx)
L(aligned_16_96bytes):
	movdqa	%xmm0, -96(%edx)
L(aligned_16_80bytes):
	movdqa	%xmm0, -80(%edx)
L(aligned_16_64bytes):
	movdqa	%xmm0, -64(%edx)
L(aligned_16_48bytes):
	movdqa	%xmm0, -48(%edx)
L(aligned_16_32bytes):
	movdqa	%xmm0, -32(%edx)
L(aligned_16_16bytes):
	movdqa	%xmm0, -16(%edx)
L(aligned_16_0bytes):
	SETRTNVAL
	RETURN

	/* N mod 16 == 1 */
	ALIGN (4)
L(aligned_16_113bytes):
	movdqa	%xmm0, -113(%edx)
L(aligned_16_97bytes):
	movdqa	%xmm0, -97(%edx)
L(aligned_16_81bytes):
	movdqa	%xmm0, -81(%edx)
L(aligned_16_65bytes):
	movdqa	%xmm0, -65(%edx)
L(aligned_16_49bytes):
	movdqa	%xmm0, -49(%edx)
L(aligned_16_33bytes):
	movdqa	%xmm0, -33(%edx)
L(aligned_16_17bytes):
	movdqa	%xmm0, -17(%edx)
L(aligned_16_1bytes):
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 2 */
	ALIGN (4)
L(aligned_16_114bytes):
	movdqa	%xmm0, -114(%edx)
L(aligned_16_98bytes):
	movdqa	%xmm0, -98(%edx)
L(aligned_16_82bytes):
	movdqa	%xmm0, -82(%edx)
L(aligned_16_66bytes):
	movdqa	%xmm0, -66(%edx)
L(aligned_16_50bytes):
	movdqa	%xmm0, -50(%edx)
L(aligned_16_34bytes):
	movdqa	%xmm0, -34(%edx)
L(aligned_16_18bytes):
	movdqa	%xmm0, -18(%edx)
L(aligned_16_2bytes):
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 3 */
	ALIGN (4)
L(aligned_16_115bytes):
	movdqa	%xmm0, -115(%edx)
L(aligned_16_99bytes):
	movdqa	%xmm0, -99(%edx)
L(aligned_16_83bytes):
	movdqa	%xmm0, -83(%edx)
L(aligned_16_67bytes):
	movdqa	%xmm0, -67(%edx)
L(aligned_16_51bytes):
	movdqa	%xmm0, -51(%edx)
L(aligned_16_35bytes):
	movdqa	%xmm0, -35(%edx)
L(aligned_16_19bytes):
	movdqa	%xmm0, -19(%edx)
L(aligned_16_3bytes):
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 4 */
	ALIGN (4)
L(aligned_16_116bytes):
	movdqa	%xmm0, -116(%edx)
L(aligned_16_100bytes):
	movdqa	%xmm0, -100(%edx)
L(aligned_16_84bytes):
	movdqa	%xmm0, -84(%edx)
L(aligned_16_68bytes):
	movdqa	%xmm0, -68(%edx)
L(aligned_16_52bytes):
	movdqa	%xmm0, -52(%edx)
L(aligned_16_36bytes):
	movdqa	%xmm0, -36(%edx)
L(aligned_16_20bytes):
	movdqa	%xmm0, -20(%edx)
L(aligned_16_4bytes):
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 5 */
	ALIGN (4)
L(aligned_16_117bytes):
	movdqa	%xmm0, -117(%edx)
L(aligned_16_101bytes):
	movdqa	%xmm0, -101(%edx)
L(aligned_16_85bytes):
	movdqa	%xmm0, -85(%edx)
L(aligned_16_69bytes):
	movdqa	%xmm0, -69(%edx)
L(aligned_16_53bytes):
	movdqa	%xmm0, -53(%edx)
L(aligned_16_37bytes):
	movdqa	%xmm0, -37(%edx)
L(aligned_16_21bytes):
	movdqa	%xmm0, -21(%edx)
L(aligned_16_5bytes):
	movl	%eax, -5(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 6 */
	ALIGN (4)
L(aligned_16_118bytes):
	movdqa	%xmm0, -118(%edx)
L(aligned_16_102bytes):
	movdqa	%xmm0, -102(%edx)
L(aligned_16_86bytes):
	movdqa	%xmm0, -86(%edx)
L(aligned_16_70bytes):
	movdqa	%xmm0, -70(%edx)
L(aligned_16_54bytes):
	movdqa	%xmm0, -54(%edx)
L(aligned_16_38bytes):
	movdqa	%xmm0, -38(%edx)
L(aligned_16_22bytes):
	movdqa	%xmm0, -22(%edx)
L(aligned_16_6bytes):
	movl	%eax, -6(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 7 */
	ALIGN (4)
L(aligned_16_119bytes):
	movdqa	%xmm0, -119(%edx)
L(aligned_16_103bytes):
	movdqa	%xmm0, -103(%edx)
L(aligned_16_87bytes):
	movdqa	%xmm0, -87(%edx)
L(aligned_16_71bytes):
	movdqa	%xmm0, -71(%edx)
L(aligned_16_55bytes):
	movdqa	%xmm0, -55(%edx)
L(aligned_16_39bytes):
	movdqa	%xmm0, -39(%edx)
L(aligned_16_23bytes):
	movdqa	%xmm0, -23(%edx)
L(aligned_16_7bytes):
	movl	%eax, -7(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

/* Tail chains for N mod 16 = 8..15, reached through
   L(table_16_128bytes) with EDX pointing one past the last byte to
   fill and EDX-N 16 byte aligned.  Each chain stores whole 16-byte
   blocks with movdqa, then 8 bytes with movq from xmm0, then the 0..7
   leftover bytes from EAX/AX/AL.  The final chain ends with RETURN_END
   because no code follows it; END then closes the function.  */

	/* N mod 16 == 8 */
	ALIGN (4)
L(aligned_16_120bytes):
	movdqa	%xmm0, -120(%edx)
L(aligned_16_104bytes):
	movdqa	%xmm0, -104(%edx)
L(aligned_16_88bytes):
	movdqa	%xmm0, -88(%edx)
L(aligned_16_72bytes):
	movdqa	%xmm0, -72(%edx)
L(aligned_16_56bytes):
	movdqa	%xmm0, -56(%edx)
L(aligned_16_40bytes):
	movdqa	%xmm0, -40(%edx)
L(aligned_16_24bytes):
	movdqa	%xmm0, -24(%edx)
L(aligned_16_8bytes):
	movq	%xmm0, -8(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 9 */
	ALIGN (4)
L(aligned_16_121bytes):
	movdqa	%xmm0, -121(%edx)
L(aligned_16_105bytes):
	movdqa	%xmm0, -105(%edx)
L(aligned_16_89bytes):
	movdqa	%xmm0, -89(%edx)
L(aligned_16_73bytes):
	movdqa	%xmm0, -73(%edx)
L(aligned_16_57bytes):
	movdqa	%xmm0, -57(%edx)
L(aligned_16_41bytes):
	movdqa	%xmm0, -41(%edx)
L(aligned_16_25bytes):
	movdqa	%xmm0, -25(%edx)
L(aligned_16_9bytes):
	movq	%xmm0, -9(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 10 */
	ALIGN (4)
L(aligned_16_122bytes):
	movdqa	%xmm0, -122(%edx)
L(aligned_16_106bytes):
	movdqa	%xmm0, -106(%edx)
L(aligned_16_90bytes):
	movdqa	%xmm0, -90(%edx)
L(aligned_16_74bytes):
	movdqa	%xmm0, -74(%edx)
L(aligned_16_58bytes):
	movdqa	%xmm0, -58(%edx)
L(aligned_16_42bytes):
	movdqa	%xmm0, -42(%edx)
L(aligned_16_26bytes):
	movdqa	%xmm0, -26(%edx)
L(aligned_16_10bytes):
	movq	%xmm0, -10(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 11 */
	ALIGN (4)
L(aligned_16_123bytes):
	movdqa	%xmm0, -123(%edx)
L(aligned_16_107bytes):
	movdqa	%xmm0, -107(%edx)
L(aligned_16_91bytes):
	movdqa	%xmm0, -91(%edx)
L(aligned_16_75bytes):
	movdqa	%xmm0, -75(%edx)
L(aligned_16_59bytes):
	movdqa	%xmm0, -59(%edx)
L(aligned_16_43bytes):
	movdqa	%xmm0, -43(%edx)
L(aligned_16_27bytes):
	movdqa	%xmm0, -27(%edx)
L(aligned_16_11bytes):
	movq	%xmm0, -11(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 12 */
	ALIGN (4)
L(aligned_16_124bytes):
	movdqa	%xmm0, -124(%edx)
L(aligned_16_108bytes):
	movdqa	%xmm0, -108(%edx)
L(aligned_16_92bytes):
	movdqa	%xmm0, -92(%edx)
L(aligned_16_76bytes):
	movdqa	%xmm0, -76(%edx)
L(aligned_16_60bytes):
	movdqa	%xmm0, -60(%edx)
L(aligned_16_44bytes):
	movdqa	%xmm0, -44(%edx)
L(aligned_16_28bytes):
	movdqa	%xmm0, -28(%edx)
L(aligned_16_12bytes):
	movq	%xmm0, -12(%edx)
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 13 */
	ALIGN (4)
L(aligned_16_125bytes):
	movdqa	%xmm0, -125(%edx)
L(aligned_16_109bytes):
	movdqa	%xmm0, -109(%edx)
L(aligned_16_93bytes):
	movdqa	%xmm0, -93(%edx)
L(aligned_16_77bytes):
	movdqa	%xmm0, -77(%edx)
L(aligned_16_61bytes):
	movdqa	%xmm0, -61(%edx)
L(aligned_16_45bytes):
	movdqa	%xmm0, -45(%edx)
L(aligned_16_29bytes):
	movdqa	%xmm0, -29(%edx)
L(aligned_16_13bytes):
	movq	%xmm0, -13(%edx)
	movl	%eax, -5(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 14 */
	ALIGN (4)
L(aligned_16_126bytes):
	movdqa	%xmm0, -126(%edx)
L(aligned_16_110bytes):
	movdqa	%xmm0, -110(%edx)
L(aligned_16_94bytes):
	movdqa	%xmm0, -94(%edx)
L(aligned_16_78bytes):
	movdqa	%xmm0, -78(%edx)
L(aligned_16_62bytes):
	movdqa	%xmm0, -62(%edx)
L(aligned_16_46bytes):
	movdqa	%xmm0, -46(%edx)
L(aligned_16_30bytes):
	movdqa	%xmm0, -30(%edx)
L(aligned_16_14bytes):
	movq	%xmm0, -14(%edx)
	movl	%eax, -6(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 15 */
	ALIGN (4)
L(aligned_16_127bytes):
	movdqa	%xmm0, -127(%edx)
L(aligned_16_111bytes):
	movdqa	%xmm0, -111(%edx)
L(aligned_16_95bytes):
	movdqa	%xmm0, -95(%edx)
L(aligned_16_79bytes):
	movdqa	%xmm0, -79(%edx)
L(aligned_16_63bytes):
	movdqa	%xmm0, -63(%edx)
L(aligned_16_47bytes):
	movdqa	%xmm0, -47(%edx)
L(aligned_16_31bytes):
	movdqa	%xmm0, -31(%edx)
L(aligned_16_15bytes):
	movq	%xmm0, -15(%edx)
	movl	%eax, -7(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN_END

END (MEMSET)
