// SPDX-License-Identifier: GPL-2.0
/*
 *	MMX 3DNow! library helper functions
 *
 *	To do:
 *	We can use MMX just for prefetch in IRQs. This may be a win.
 *		(reported so on K6-III)
 *	We should use a better code-neutral filler for the short jump:
 *		leal ebx,[ebx] is apparently best for K6-2, but Cyrix ??
 *	We also want to clobber the filler register so we don't get any
 *		register forwarding stalls on the filler.
 *
 *	Add *user handling. Checksums are not a win with MMX on any CPU
 *	tested so far for any MMX solution figured.
 *
 *	22/09/2000 - Arjan van de Ven
 *		Improved for non-engineering-sample Athlons
 *
 */
#include <linux/hardirq.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/types.h>

#include <asm/fpu/api.h>
#include <asm/asm.h>

void *_mmx_memcpy(void *to, const void *from, size_t len)
{
	void *p;
	int i;

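	/*
	 * MMX needs the FPU context (kernel_fpu_begin()), which we do not
	 * take from interrupt context; fall back to the plain copy there.
	 */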
	if (unlikely(in_interrupt()))
		return __memcpy(to, from, len);

	p = to;
	i = len >> 6; /* len/64 */

	kernel_fpu_begin();

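	/*
	 * Prefetch the first five cache lines.  Some CPUs fault on a
	 * prefetch of an unmapped address; the .fixup handler then patches
	 * label 1 with 0x1AEB, a two-byte short jump over the remaining 26
	 * of the 28 prefetch bytes, so the block is skipped from then on.
	 * The 0x05EB fixups below do the same for the single 7-byte
	 * "prefetch 320(%0)" inside the copy loops.
	 */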
	__asm__ __volatile__ (
		"1: prefetch (%0)\n"		/* This set is 28 bytes */
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from));

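	/*
	 * Main loop: copy 64 bytes per iteration while prefetching 320
	 * bytes (five cache lines) ahead.  The last five chunks are left
	 * to the loop below so the prefetches stay within the source.
	 */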
	for ( ; i > 5; i--) {
		__asm__ __volatile__ (
		"1:  prefetch 320(%0)\n"
		"2:  movq (%0), %%mm0\n"
		"  movq 8(%0), %%mm1\n"
		"  movq 16(%0), %%mm2\n"
		"  movq 24(%0), %%mm3\n"
		"  movq %%mm0, (%1)\n"
		"  movq %%mm1, 8(%1)\n"
		"  movq %%mm2, 16(%1)\n"
		"  movq %%mm3, 24(%1)\n"
		"  movq 32(%0), %%mm0\n"
		"  movq 40(%0), %%mm1\n"
		"  movq 48(%0), %%mm2\n"
		"  movq 56(%0), %%mm3\n"
		"  movq %%mm0, 32(%1)\n"
		"  movq %%mm1, 40(%1)\n"
		"  movq %%mm2, 48(%1)\n"
		"  movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

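	/*
	 * Copy the remaining (at most five) 64-byte chunks without
	 * prefetch:
	 */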
	for ( ; i > 0; i--) {
		__asm__ __volatile__ (
		"  movq (%0), %%mm0\n"
		"  movq 8(%0), %%mm1\n"
		"  movq 16(%0), %%mm2\n"
		"  movq 24(%0), %%mm3\n"
		"  movq %%mm0, (%1)\n"
		"  movq %%mm1, 8(%1)\n"
		"  movq %%mm2, 16(%1)\n"
		"  movq %%mm3, 24(%1)\n"
		"  movq 32(%0), %%mm0\n"
		"  movq 40(%0), %%mm1\n"
		"  movq 48(%0), %%mm2\n"
		"  movq 56(%0), %%mm3\n"
		"  movq %%mm0, 32(%1)\n"
		"  movq %%mm1, 40(%1)\n"
		"  movq %%mm2, 48(%1)\n"
		"  movq %%mm3, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	/*
	 * Now do the tail of the block:
	 */
	__memcpy(to, from, len & 63);
	kernel_fpu_end();

	return p;
}
EXPORT_SYMBOL(_mmx_memcpy);

#ifdef CONFIG_MK7

/*
 *	The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
 *	other MMX-using processors do not.
 */

static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

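	/*
	 * Stream 64 zero bytes (one cache line) per iteration; movntq
	 * stores are non-temporal and bypass the cache entirely.
	 */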
	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"  movntq %%mm0, (%0)\n"
		"  movntq %%mm0, 8(%0)\n"
		"  movntq %%mm0, 16(%0)\n"
		"  movntq %%mm0, 24(%0)\n"
		"  movntq %%mm0, 32(%0)\n"
		"  movntq %%mm0, 40(%0)\n"
		"  movntq %%mm0, 48(%0)\n"
		"  movntq %%mm0, 56(%0)\n"
		: : "r" (page) : "memory");
		page += 64;
	}

	/*
	 * Since movntq is weakly-ordered, an "sfence" is needed to become
	 * ordered again:
	 */
	__asm__ __volatile__("sfence\n"::);

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	/*
	 * maybe the prefetch stuff can go before the expensive fnsave...
	 * but that is for later. -AV
	 */
	__asm__ __volatile__(
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

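	/*
	 * Copy with a prefetch 320 bytes ahead for all but the last five
	 * cache lines of the page; the loop below handles those, keeping
	 * the prefetches inside the source page.
	 */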
	for (i = 0; i < (4096-320)/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
		_ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}

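	/*
	 * Copy the last five cache lines of the page without prefetch:
	 */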
	for (i = (4096-320)/64; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"2: movq (%0), %%mm0\n"
		"   movntq %%mm0, (%1)\n"
		"   movq 8(%0), %%mm1\n"
		"   movntq %%mm1, 8(%1)\n"
		"   movq 16(%0), %%mm2\n"
		"   movntq %%mm2, 16(%1)\n"
		"   movq 24(%0), %%mm3\n"
		"   movntq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm4\n"
		"   movntq %%mm4, 32(%1)\n"
		"   movq 40(%0), %%mm5\n"
		"   movntq %%mm5, 40(%1)\n"
		"   movq 48(%0), %%mm6\n"
		"   movntq %%mm6, 48(%1)\n"
		"   movq 56(%0), %%mm7\n"
		"   movntq %%mm7, 56(%1)\n"
			: : "r" (from), "r" (to) : "memory");
		from += 64;
		to += 64;
	}
	/*
	 * Since movntq is weakly-ordered, an "sfence" is needed to become
	 * ordered again:
	 */
	__asm__ __volatile__("sfence \n"::);
	kernel_fpu_end();
}

#else /* CONFIG_MK7 */

/*
 *	Generic MMX implementation without K7-specific streaming
 */
static void fast_clear_page(void *page)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"  pxor %%mm0, %%mm0\n" : :
	);

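	/*
	 * Clear 128 bytes per iteration with ordinary cached movq stores;
	 * pre-K7 MMX CPUs have no non-temporal stores to use here.
	 */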
	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
		"  movq %%mm0, (%0)\n"
		"  movq %%mm0, 8(%0)\n"
		"  movq %%mm0, 16(%0)\n"
		"  movq %%mm0, 24(%0)\n"
		"  movq %%mm0, 32(%0)\n"
		"  movq %%mm0, 40(%0)\n"
		"  movq %%mm0, 48(%0)\n"
		"  movq %%mm0, 56(%0)\n"
		"  movq %%mm0, 64(%0)\n"
		"  movq %%mm0, 72(%0)\n"
		"  movq %%mm0, 80(%0)\n"
		"  movq %%mm0, 88(%0)\n"
		"  movq %%mm0, 96(%0)\n"
		"  movq %%mm0, 104(%0)\n"
		"  movq %%mm0, 112(%0)\n"
		"  movq %%mm0, 120(%0)\n"
			: : "r" (page) : "memory");
		page += 128;
	}

	kernel_fpu_end();
}

static void fast_copy_page(void *to, void *from)
{
	int i;

	kernel_fpu_begin();

	__asm__ __volatile__ (
		"1: prefetch (%0)\n"
		"   prefetch 64(%0)\n"
		"   prefetch 128(%0)\n"
		"   prefetch 192(%0)\n"
		"   prefetch 256(%0)\n"
		"2:  \n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x1AEB, 1b\n"	/* jmp on 26 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b) : : "r" (from));

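	/*
	 * Here the prefetch runs 320 bytes ahead over the whole page, so
	 * the final prefetches reach past the end of the source page; if
	 * one of them faults, the .fixup code patches it into a short jump.
	 */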
	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
		"1: prefetch 320(%0)\n"
		"2: movq (%0), %%mm0\n"
		"   movq 8(%0), %%mm1\n"
		"   movq 16(%0), %%mm2\n"
		"   movq 24(%0), %%mm3\n"
		"   movq %%mm0, (%1)\n"
		"   movq %%mm1, 8(%1)\n"
		"   movq %%mm2, 16(%1)\n"
		"   movq %%mm3, 24(%1)\n"
		"   movq 32(%0), %%mm0\n"
		"   movq 40(%0), %%mm1\n"
		"   movq 48(%0), %%mm2\n"
		"   movq 56(%0), %%mm3\n"
		"   movq %%mm0, 32(%1)\n"
		"   movq %%mm1, 40(%1)\n"
		"   movq %%mm2, 48(%1)\n"
		"   movq %%mm3, 56(%1)\n"
		".section .fixup, \"ax\"\n"
		"3: movw $0x05EB, 1b\n"	/* jmp on 5 bytes */
		"   jmp 2b\n"
		".previous\n"
			_ASM_EXTABLE(1b, 3b)
			: : "r" (from), "r" (to) : "memory");

		from += 64;
		to += 64;
	}
	kernel_fpu_end();
}

#endif /* !CONFIG_MK7 */

/*
 * Favour MMX for page clear and copy:
 */
static void slow_zero_page(void *page)
{
	int d0, d1;

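	/* "rep ; stosl": 1024 dword stores of zero cover the 4096-byte page. */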
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; stosl"

			: "=&c" (d0), "=&D" (d1)
			: "a" (0), "1" (page), "0" (1024)
			: "memory");
}

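/*
 * Page clear/copy entry points: use the MMX fast path unless we are in
 * interrupt context, where we avoid taking the FPU; fall back to the
 * plain string versions there.
 */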
void mmx_clear_page(void *page)
{
	if (unlikely(in_interrupt()))
		slow_zero_page(page);
	else
		fast_clear_page(page);
}
EXPORT_SYMBOL(mmx_clear_page);

static void slow_copy_page(void *to, void *from)
{
	int d0, d1, d2;

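	/* "rep ; movsl": 1024 dword moves, i.e. one 4096-byte page. */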
	__asm__ __volatile__(
		"cld\n\t"
		"rep ; movsl"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (1024), "1" ((long) to), "2" ((long) from)
		: "memory");
}

void mmx_copy_page(void *to, void *from)
{
	if (unlikely(in_interrupt()))
		slow_copy_page(to, from);
	else
		fast_copy_page(to, from);
}
EXPORT_SYMBOL(mmx_copy_page);