/*
 *	MMX 3DNow! library helper functions
 *
 *	To do:
 *	We can use MMX just for prefetch in IRQs. This may be a win.
 *		(reported so on K6-III)
 *	We should use a better code-neutral filler for the short jump
 *		leal ebx,[ebx] is apparently best for K6-2, but Cyrix ??
 *	We also want to clobber the filler register so we don't get any
 *		register forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far for any MMX solution figured.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons
 *
 */
#include <linux/hardirq.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/types.h>

#include <asm/fpu/api.h>
#include <asm/asm.h>

void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

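	/*
	 * MMX needs the FPU/MMX register state, which is not safe to touch
	 * from interrupt context here, so fall back to the plain integer
	 * __memcpy().
	 */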
	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6; /* len/64 */

	kernel_fpu_begin();

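	/*
	 * Warm the cache with the first 320 bytes of the source.  On CPUs
	 * where the 3DNow! prefetch faults, the fixup at label 3 patches the
	 * instruction at label 1 into a two-byte short jmp (0x1AEB, i.e.
	 * "jmp .+26") that skips the rest of the prefetch block from then on.
	 */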
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from));

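	/*
	 * Copy 64 bytes per iteration, prefetching 320 bytes (five chunks)
	 * ahead of the loads.  The final five chunks are left to the loop
	 * below so the prefetch never runs past the end of the source.
	 */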
	for ( ; i > 5; i--) {
		__asm__ __volatile__ (
		"1:  prefetch 320(%0)\n"
		"2:  movq (%0), %%mm0\n"
		"  movq 8(%0), %%mm1\n"
		"  movq 16(%0), %%mm2\n"
		"  movq 24(%0), %%mm3\n"
		"  movq %%mm0, (%1)\n"
		"  movq %%mm1, 8(%1)\n"
		"  movq %%mm2, 16(%1)\n"
		"  movq %%mm3, 24(%1)\n"
		"  movq 32(%0), %%mm0\n"
		"  movq 40(%0), %%mm1\n"
		"  movq 48(%0), %%mm2\n"
		"  movq 56(%0), %%mm3\n"
		"  movq %%mm0, 32(%1)\n"
		"  movq %%mm1, 40(%1)\n"
		"  movq %%mm2, 48(%1)\n"
		"  movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

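	/* Remaining chunks: same 64-byte copy, but without the prefetch. */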
	for ( ; i > 0; i--) {
		__asm__ __volatile__ (
		"  movq (%0), %%mm0\n"
		"  movq 8(%0), %%mm1\n"
		"  movq 16(%0), %%mm2\n"
		"  movq 24(%0), %%mm3\n"
		"  movq %%mm0, (%1)\n"
		"  movq %%mm1, 8(%1)\n"
		"  movq %%mm2, 16(%1)\n"
		"  movq %%mm3, 24(%1)\n"
		"  movq 32(%0), %%mm0\n"
		"  movq 40(%0), %%mm1\n"
		"  movq 48(%0), %%mm2\n"
		"  movq 56(%0), %%mm3\n"
		"  movq %%mm0, 32(%1)\n"
		"  movq %%mm1, 40(%1)\n"
		"  movq %%mm2, 48(%1)\n"
		"  movq %%mm3, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	/*
	 * Now do the tail of the block:
	 */
	__memcpy(to, from, len & 63);
	kernel_fpu_end();

	return p;
}
EXPORT_SYMBOL(_mmx_memcpy);

#ifdef CONFIG_MK7

/*
 *	The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
 *	other MMX-using processors do not.
 */

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

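	/*
	 * Stream the zeroed %mm0 out 64 bytes per iteration using
	 * non-temporal movntq stores, bypassing the cache.
	 */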
	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
		page += 64;
	}

	/*
	 * movntq stores are weakly ordered; an sfence is needed to make
	 * them globally ordered before we return:
	 */
	__asm__ __volatile__("sfence\n"::);

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/*
	 * maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__(
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

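	/*
	 * (4096-320)/64 iterations copy with a 320-byte prefetch lookahead;
	 * the last five cache lines of the page are handled by the second
	 * loop below so we never prefetch past the end of the source page.
	 */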
	for (i = 0; i < (4096-320)/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		_ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

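	/* Last 320 bytes of the page: same streaming copy, no prefetch. */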
	for (i = (4096-320)/64; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	/*
	 * movntq stores are weakly ordered; an sfence is needed to make
	 * them globally ordered before we return:
	 */
	__asm__ __volatile__("sfence\n"::);
	kernel_fpu_end();
}

#else /* CONFIG_MK7 */

/*
 *	Generic MMX implementation without K7-specific streaming
 */
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

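	/*
	 * Clear 128 bytes per iteration with ordinary (cached) movq stores;
	 * since there are no non-temporal stores here, no sfence is needed
	 * afterwards.
	 */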
	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
		"  movq %%mm0, (%0)\n"
		"  movq %%mm0, 8(%0)\n"
		"  movq %%mm0, 16(%0)\n"
		"  movq %%mm0, 24(%0)\n"
		"  movq %%mm0, 32(%0)\n"
		"  movq %%mm0, 40(%0)\n"
		"  movq %%mm0, 48(%0)\n"
		"  movq %%mm0, 56(%0)\n"
		"  movq %%mm0, 64(%0)\n"
		"  movq %%mm0, 72(%0)\n"
		"  movq %%mm0, 80(%0)\n"
		"  movq %%mm0, 88(%0)\n"
		"  movq %%mm0, 96(%0)\n"
		"  movq %%mm0, 104(%0)\n"
		"  movq %%mm0, 112(%0)\n"
		"  movq %%mm0, 120(%0)\n"
			: : "r" (page) : "memory");
		page += 128;
	}

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

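	/*
	 * Copy the page 64 bytes per iteration, prefetching 320 bytes ahead
	 * of the loads; unlike the K7 path, the plain movq stores go through
	 * the cache, so no sfence is needed at the end.
	 */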
	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	kernel_fpu_end();
}

#endif /* !CONFIG_MK7 */

/*
 * Favour MMX for page clear and copy:
 */
static void slow_zero_page(void *page)
{
	int d0, d1;

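	/* rep stosl with ECX=1024 and EAX=0: 1024 zero dwords = one 4096-byte page */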
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"

			: "=&c" (d0), "=&D" (d1)
			:"a" (0), "1" (page), "0" (1024)
			:"memory");
}

void mmx_clear_page(void *page)
{
	if (unlikely(in_interrupt()))
		slow_zero_page(page);
	else
		fast_clear_page(page);
}
EXPORT_SYMBOL(mmx_clear_page);

static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;

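	/* rep movsl with ECX=1024: copies 1024 dwords, i.e. one full page */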
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (1024), "1" ((long) to), "2" ((long) from)
		: "memory");
}

void mmx_copy_page(void *to, void *from)
{
	if (unlikely(in_interrupt()))
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}
EXPORT_SYMBOL(mmx_copy_page);
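
/*
 * Usage sketch (not part of this file; the header name and config guard
 * below are illustrative assumptions): callers normally reach these helpers
 * through the arch page primitives rather than calling them directly,
 * roughly along these lines:
 *
 *	#include <asm/mmx.h>
 *
 *	#ifdef CONFIG_X86_USE_3DNOW
 *	static inline void clear_page(void *page)
 *	{
 *		mmx_clear_page(page);
 *	}
 *
 *	static inline void copy_page(void *to, void *from)
 *	{
 *		mmx_copy_page(to, from);
 *	}
 *	#endif
 */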