1 /* SPDX-License-Identifier: GPL-2.0-or-later */
2 #ifndef _ASM_X86_XOR_32_H
3 #define _ASM_X86_XOR_32_H
4
5 /*
6 * Optimized RAID-5 checksumming functions for MMX.
7 */
8
9 /*
10 * High-speed RAID5 checksumming functions utilizing MMX instructions.
11 * Copyright (C) 1998 Ingo Molnar.
12 */
13
/*
 * Instruction templates for the pII_mmx loops below.
 *
 * x is a quadword index (it is scaled by 8 to form the byte offset) and y
 * is the number of the MMX register to use.  The %N operand numbers refer
 * to the operand list of the asm statement the macro is expanded in: in
 * every user below %1 is p1 (loaded from and stored to) and %2..%5 are
 * p2..p5 (only XORed in).
 */
#define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n"	/* mm<y> = p1[x] */
#define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n"	/* p1[x] = mm<y> */
#define XO1(x, y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"	/* mm<y> ^= p2[x] */
#define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"	/* mm<y> ^= p3[x] */
#define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"	/* mm<y> ^= p4[x] */
#define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"	/* mm<y> ^= p5[x] */
20
21 #include <asm/fpu/api.h>
22
23 static void
xor_pII_mmx_2(unsigned long bytes,unsigned long * p1,unsigned long * p2)24 xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
25 {
26 unsigned long lines = bytes >> 7;
27
28 kernel_fpu_begin();
29
30 asm volatile(
31 #undef BLOCK
32 #define BLOCK(i) \
33 LD(i, 0) \
34 LD(i + 1, 1) \
35 LD(i + 2, 2) \
36 LD(i + 3, 3) \
37 XO1(i, 0) \
38 ST(i, 0) \
39 XO1(i+1, 1) \
40 ST(i+1, 1) \
41 XO1(i + 2, 2) \
42 ST(i + 2, 2) \
43 XO1(i + 3, 3) \
44 ST(i + 3, 3)
45
46 " .align 32 ;\n"
47 " 1: ;\n"
48
49 BLOCK(0)
50 BLOCK(4)
51 BLOCK(8)
52 BLOCK(12)
53
54 " addl $128, %1 ;\n"
55 " addl $128, %2 ;\n"
56 " decl %0 ;\n"
57 " jnz 1b ;\n"
58 : "+r" (lines),
59 "+r" (p1), "+r" (p2)
60 :
61 : "memory");
62
63 kernel_fpu_end();
64 }
65
66 static void
xor_pII_mmx_3(unsigned long bytes,unsigned long * p1,unsigned long * p2,unsigned long * p3)67 xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
68 unsigned long *p3)
69 {
70 unsigned long lines = bytes >> 7;
71
72 kernel_fpu_begin();
73
74 asm volatile(
75 #undef BLOCK
76 #define BLOCK(i) \
77 LD(i, 0) \
78 LD(i + 1, 1) \
79 LD(i + 2, 2) \
80 LD(i + 3, 3) \
81 XO1(i, 0) \
82 XO1(i + 1, 1) \
83 XO1(i + 2, 2) \
84 XO1(i + 3, 3) \
85 XO2(i, 0) \
86 ST(i, 0) \
87 XO2(i + 1, 1) \
88 ST(i + 1, 1) \
89 XO2(i + 2, 2) \
90 ST(i + 2, 2) \
91 XO2(i + 3, 3) \
92 ST(i + 3, 3)
93
94 " .align 32 ;\n"
95 " 1: ;\n"
96
97 BLOCK(0)
98 BLOCK(4)
99 BLOCK(8)
100 BLOCK(12)
101
102 " addl $128, %1 ;\n"
103 " addl $128, %2 ;\n"
104 " addl $128, %3 ;\n"
105 " decl %0 ;\n"
106 " jnz 1b ;\n"
107 : "+r" (lines),
108 "+r" (p1), "+r" (p2), "+r" (p3)
109 :
110 : "memory");
111
112 kernel_fpu_end();
113 }
114
115 static void
xor_pII_mmx_4(unsigned long bytes,unsigned long * p1,unsigned long * p2,unsigned long * p3,unsigned long * p4)116 xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
117 unsigned long *p3, unsigned long *p4)
118 {
119 unsigned long lines = bytes >> 7;
120
121 kernel_fpu_begin();
122
123 asm volatile(
124 #undef BLOCK
125 #define BLOCK(i) \
126 LD(i, 0) \
127 LD(i + 1, 1) \
128 LD(i + 2, 2) \
129 LD(i + 3, 3) \
130 XO1(i, 0) \
131 XO1(i + 1, 1) \
132 XO1(i + 2, 2) \
133 XO1(i + 3, 3) \
134 XO2(i, 0) \
135 XO2(i + 1, 1) \
136 XO2(i + 2, 2) \
137 XO2(i + 3, 3) \
138 XO3(i, 0) \
139 ST(i, 0) \
140 XO3(i + 1, 1) \
141 ST(i + 1, 1) \
142 XO3(i + 2, 2) \
143 ST(i + 2, 2) \
144 XO3(i + 3, 3) \
145 ST(i + 3, 3)
146
147 " .align 32 ;\n"
148 " 1: ;\n"
149
150 BLOCK(0)
151 BLOCK(4)
152 BLOCK(8)
153 BLOCK(12)
154
155 " addl $128, %1 ;\n"
156 " addl $128, %2 ;\n"
157 " addl $128, %3 ;\n"
158 " addl $128, %4 ;\n"
159 " decl %0 ;\n"
160 " jnz 1b ;\n"
161 : "+r" (lines),
162 "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
163 :
164 : "memory");
165
166 kernel_fpu_end();
167 }
168
169
170 static void
xor_pII_mmx_5(unsigned long bytes,unsigned long * p1,unsigned long * p2,unsigned long * p3,unsigned long * p4,unsigned long * p5)171 xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
172 unsigned long *p3, unsigned long *p4, unsigned long *p5)
173 {
174 unsigned long lines = bytes >> 7;
175
176 kernel_fpu_begin();
177
178 /* Make sure GCC forgets anything it knows about p4 or p5,
179 such that it won't pass to the asm volatile below a
180 register that is shared with any other variable. That's
181 because we modify p4 and p5 there, but we can't mark them
182 as read/write, otherwise we'd overflow the 10-asm-operands
183 limit of GCC < 3.1. */
184 asm("" : "+r" (p4), "+r" (p5));
185
186 asm volatile(
187 #undef BLOCK
188 #define BLOCK(i) \
189 LD(i, 0) \
190 LD(i + 1, 1) \
191 LD(i + 2, 2) \
192 LD(i + 3, 3) \
193 XO1(i, 0) \
194 XO1(i + 1, 1) \
195 XO1(i + 2, 2) \
196 XO1(i + 3, 3) \
197 XO2(i, 0) \
198 XO2(i + 1, 1) \
199 XO2(i + 2, 2) \
200 XO2(i + 3, 3) \
201 XO3(i, 0) \
202 XO3(i + 1, 1) \
203 XO3(i + 2, 2) \
204 XO3(i + 3, 3) \
205 XO4(i, 0) \
206 ST(i, 0) \
207 XO4(i + 1, 1) \
208 ST(i + 1, 1) \
209 XO4(i + 2, 2) \
210 ST(i + 2, 2) \
211 XO4(i + 3, 3) \
212 ST(i + 3, 3)
213
214 " .align 32 ;\n"
215 " 1: ;\n"
216
217 BLOCK(0)
218 BLOCK(4)
219 BLOCK(8)
220 BLOCK(12)
221
222 " addl $128, %1 ;\n"
223 " addl $128, %2 ;\n"
224 " addl $128, %3 ;\n"
225 " addl $128, %4 ;\n"
226 " addl $128, %5 ;\n"
227 " decl %0 ;\n"
228 " jnz 1b ;\n"
229 : "+r" (lines),
230 "+r" (p1), "+r" (p2), "+r" (p3)
231 : "r" (p4), "r" (p5)
232 : "memory");
233
234 /* p4 and p5 were modified, and now the variables are dead.
235 Clobber them just to be sure nobody does something stupid
236 like assuming they have some legal value. */
237 asm("" : "=r" (p4), "=r" (p5));
238
239 kernel_fpu_end();
240 }
241
/*
 * Drop the pII_mmx helper macros; the p5_mmx routines that follow spell
 * out their instructions directly and do not use them.
 */
#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef BLOCK
249
250 static void
xor_p5_mmx_2(unsigned long bytes,unsigned long * p1,unsigned long * p2)251 xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
252 {
253 unsigned long lines = bytes >> 6;
254
255 kernel_fpu_begin();
256
257 asm volatile(
258 " .align 32 ;\n"
259 " 1: ;\n"
260 " movq (%1), %%mm0 ;\n"
261 " movq 8(%1), %%mm1 ;\n"
262 " pxor (%2), %%mm0 ;\n"
263 " movq 16(%1), %%mm2 ;\n"
264 " movq %%mm0, (%1) ;\n"
265 " pxor 8(%2), %%mm1 ;\n"
266 " movq 24(%1), %%mm3 ;\n"
267 " movq %%mm1, 8(%1) ;\n"
268 " pxor 16(%2), %%mm2 ;\n"
269 " movq 32(%1), %%mm4 ;\n"
270 " movq %%mm2, 16(%1) ;\n"
271 " pxor 24(%2), %%mm3 ;\n"
272 " movq 40(%1), %%mm5 ;\n"
273 " movq %%mm3, 24(%1) ;\n"
274 " pxor 32(%2), %%mm4 ;\n"
275 " movq 48(%1), %%mm6 ;\n"
276 " movq %%mm4, 32(%1) ;\n"
277 " pxor 40(%2), %%mm5 ;\n"
278 " movq 56(%1), %%mm7 ;\n"
279 " movq %%mm5, 40(%1) ;\n"
280 " pxor 48(%2), %%mm6 ;\n"
281 " pxor 56(%2), %%mm7 ;\n"
282 " movq %%mm6, 48(%1) ;\n"
283 " movq %%mm7, 56(%1) ;\n"
284
285 " addl $64, %1 ;\n"
286 " addl $64, %2 ;\n"
287 " decl %0 ;\n"
288 " jnz 1b ;\n"
289 : "+r" (lines),
290 "+r" (p1), "+r" (p2)
291 :
292 : "memory");
293
294 kernel_fpu_end();
295 }
296
297 static void
xor_p5_mmx_3(unsigned long bytes,unsigned long * p1,unsigned long * p2,unsigned long * p3)298 xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
299 unsigned long *p3)
300 {
301 unsigned long lines = bytes >> 6;
302
303 kernel_fpu_begin();
304
305 asm volatile(
306 " .align 32,0x90 ;\n"
307 " 1: ;\n"
308 " movq (%1), %%mm0 ;\n"
309 " movq 8(%1), %%mm1 ;\n"
310 " pxor (%2), %%mm0 ;\n"
311 " movq 16(%1), %%mm2 ;\n"
312 " pxor 8(%2), %%mm1 ;\n"
313 " pxor (%3), %%mm0 ;\n"
314 " pxor 16(%2), %%mm2 ;\n"
315 " movq %%mm0, (%1) ;\n"
316 " pxor 8(%3), %%mm1 ;\n"
317 " pxor 16(%3), %%mm2 ;\n"
318 " movq 24(%1), %%mm3 ;\n"
319 " movq %%mm1, 8(%1) ;\n"
320 " movq 32(%1), %%mm4 ;\n"
321 " movq 40(%1), %%mm5 ;\n"
322 " pxor 24(%2), %%mm3 ;\n"
323 " movq %%mm2, 16(%1) ;\n"
324 " pxor 32(%2), %%mm4 ;\n"
325 " pxor 24(%3), %%mm3 ;\n"
326 " pxor 40(%2), %%mm5 ;\n"
327 " movq %%mm3, 24(%1) ;\n"
328 " pxor 32(%3), %%mm4 ;\n"
329 " pxor 40(%3), %%mm5 ;\n"
330 " movq 48(%1), %%mm6 ;\n"
331 " movq %%mm4, 32(%1) ;\n"
332 " movq 56(%1), %%mm7 ;\n"
333 " pxor 48(%2), %%mm6 ;\n"
334 " movq %%mm5, 40(%1) ;\n"
335 " pxor 56(%2), %%mm7 ;\n"
336 " pxor 48(%3), %%mm6 ;\n"
337 " pxor 56(%3), %%mm7 ;\n"
338 " movq %%mm6, 48(%1) ;\n"
339 " movq %%mm7, 56(%1) ;\n"
340
341 " addl $64, %1 ;\n"
342 " addl $64, %2 ;\n"
343 " addl $64, %3 ;\n"
344 " decl %0 ;\n"
345 " jnz 1b ;\n"
346 : "+r" (lines),
347 "+r" (p1), "+r" (p2), "+r" (p3)
348 :
349 : "memory" );
350
351 kernel_fpu_end();
352 }
353
354 static void
xor_p5_mmx_4(unsigned long bytes,unsigned long * p1,unsigned long * p2,unsigned long * p3,unsigned long * p4)355 xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
356 unsigned long *p3, unsigned long *p4)
357 {
358 unsigned long lines = bytes >> 6;
359
360 kernel_fpu_begin();
361
362 asm volatile(
363 " .align 32,0x90 ;\n"
364 " 1: ;\n"
365 " movq (%1), %%mm0 ;\n"
366 " movq 8(%1), %%mm1 ;\n"
367 " pxor (%2), %%mm0 ;\n"
368 " movq 16(%1), %%mm2 ;\n"
369 " pxor 8(%2), %%mm1 ;\n"
370 " pxor (%3), %%mm0 ;\n"
371 " pxor 16(%2), %%mm2 ;\n"
372 " pxor 8(%3), %%mm1 ;\n"
373 " pxor (%4), %%mm0 ;\n"
374 " movq 24(%1), %%mm3 ;\n"
375 " pxor 16(%3), %%mm2 ;\n"
376 " pxor 8(%4), %%mm1 ;\n"
377 " movq %%mm0, (%1) ;\n"
378 " movq 32(%1), %%mm4 ;\n"
379 " pxor 24(%2), %%mm3 ;\n"
380 " pxor 16(%4), %%mm2 ;\n"
381 " movq %%mm1, 8(%1) ;\n"
382 " movq 40(%1), %%mm5 ;\n"
383 " pxor 32(%2), %%mm4 ;\n"
384 " pxor 24(%3), %%mm3 ;\n"
385 " movq %%mm2, 16(%1) ;\n"
386 " pxor 40(%2), %%mm5 ;\n"
387 " pxor 32(%3), %%mm4 ;\n"
388 " pxor 24(%4), %%mm3 ;\n"
389 " movq %%mm3, 24(%1) ;\n"
390 " movq 56(%1), %%mm7 ;\n"
391 " movq 48(%1), %%mm6 ;\n"
392 " pxor 40(%3), %%mm5 ;\n"
393 " pxor 32(%4), %%mm4 ;\n"
394 " pxor 48(%2), %%mm6 ;\n"
395 " movq %%mm4, 32(%1) ;\n"
396 " pxor 56(%2), %%mm7 ;\n"
397 " pxor 40(%4), %%mm5 ;\n"
398 " pxor 48(%3), %%mm6 ;\n"
399 " pxor 56(%3), %%mm7 ;\n"
400 " movq %%mm5, 40(%1) ;\n"
401 " pxor 48(%4), %%mm6 ;\n"
402 " pxor 56(%4), %%mm7 ;\n"
403 " movq %%mm6, 48(%1) ;\n"
404 " movq %%mm7, 56(%1) ;\n"
405
406 " addl $64, %1 ;\n"
407 " addl $64, %2 ;\n"
408 " addl $64, %3 ;\n"
409 " addl $64, %4 ;\n"
410 " decl %0 ;\n"
411 " jnz 1b ;\n"
412 : "+r" (lines),
413 "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
414 :
415 : "memory");
416
417 kernel_fpu_end();
418 }
419
420 static void
xor_p5_mmx_5(unsigned long bytes,unsigned long * p1,unsigned long * p2,unsigned long * p3,unsigned long * p4,unsigned long * p5)421 xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
422 unsigned long *p3, unsigned long *p4, unsigned long *p5)
423 {
424 unsigned long lines = bytes >> 6;
425
426 kernel_fpu_begin();
427
428 /* Make sure GCC forgets anything it knows about p4 or p5,
429 such that it won't pass to the asm volatile below a
430 register that is shared with any other variable. That's
431 because we modify p4 and p5 there, but we can't mark them
432 as read/write, otherwise we'd overflow the 10-asm-operands
433 limit of GCC < 3.1. */
434 asm("" : "+r" (p4), "+r" (p5));
435
436 asm volatile(
437 " .align 32,0x90 ;\n"
438 " 1: ;\n"
439 " movq (%1), %%mm0 ;\n"
440 " movq 8(%1), %%mm1 ;\n"
441 " pxor (%2), %%mm0 ;\n"
442 " pxor 8(%2), %%mm1 ;\n"
443 " movq 16(%1), %%mm2 ;\n"
444 " pxor (%3), %%mm0 ;\n"
445 " pxor 8(%3), %%mm1 ;\n"
446 " pxor 16(%2), %%mm2 ;\n"
447 " pxor (%4), %%mm0 ;\n"
448 " pxor 8(%4), %%mm1 ;\n"
449 " pxor 16(%3), %%mm2 ;\n"
450 " movq 24(%1), %%mm3 ;\n"
451 " pxor (%5), %%mm0 ;\n"
452 " pxor 8(%5), %%mm1 ;\n"
453 " movq %%mm0, (%1) ;\n"
454 " pxor 16(%4), %%mm2 ;\n"
455 " pxor 24(%2), %%mm3 ;\n"
456 " movq %%mm1, 8(%1) ;\n"
457 " pxor 16(%5), %%mm2 ;\n"
458 " pxor 24(%3), %%mm3 ;\n"
459 " movq 32(%1), %%mm4 ;\n"
460 " movq %%mm2, 16(%1) ;\n"
461 " pxor 24(%4), %%mm3 ;\n"
462 " pxor 32(%2), %%mm4 ;\n"
463 " movq 40(%1), %%mm5 ;\n"
464 " pxor 24(%5), %%mm3 ;\n"
465 " pxor 32(%3), %%mm4 ;\n"
466 " pxor 40(%2), %%mm5 ;\n"
467 " movq %%mm3, 24(%1) ;\n"
468 " pxor 32(%4), %%mm4 ;\n"
469 " pxor 40(%3), %%mm5 ;\n"
470 " movq 48(%1), %%mm6 ;\n"
471 " movq 56(%1), %%mm7 ;\n"
472 " pxor 32(%5), %%mm4 ;\n"
473 " pxor 40(%4), %%mm5 ;\n"
474 " pxor 48(%2), %%mm6 ;\n"
475 " pxor 56(%2), %%mm7 ;\n"
476 " movq %%mm4, 32(%1) ;\n"
477 " pxor 48(%3), %%mm6 ;\n"
478 " pxor 56(%3), %%mm7 ;\n"
479 " pxor 40(%5), %%mm5 ;\n"
480 " pxor 48(%4), %%mm6 ;\n"
481 " pxor 56(%4), %%mm7 ;\n"
482 " movq %%mm5, 40(%1) ;\n"
483 " pxor 48(%5), %%mm6 ;\n"
484 " pxor 56(%5), %%mm7 ;\n"
485 " movq %%mm6, 48(%1) ;\n"
486 " movq %%mm7, 56(%1) ;\n"
487
488 " addl $64, %1 ;\n"
489 " addl $64, %2 ;\n"
490 " addl $64, %3 ;\n"
491 " addl $64, %4 ;\n"
492 " addl $64, %5 ;\n"
493 " decl %0 ;\n"
494 " jnz 1b ;\n"
495 : "+r" (lines),
496 "+r" (p1), "+r" (p2), "+r" (p3)
497 : "r" (p4), "r" (p5)
498 : "memory");
499
500 /* p4 and p5 were modified, and now the variables are dead.
501 Clobber them just to be sure nobody does something stupid
502 like assuming they have some legal value. */
503 asm("" : "=r" (p4), "=r" (p5));
504
505 kernel_fpu_end();
506 }
507
508 static struct xor_block_template xor_block_pII_mmx = {
509 .name = "pII_mmx",
510 .do_2 = xor_pII_mmx_2,
511 .do_3 = xor_pII_mmx_3,
512 .do_4 = xor_pII_mmx_4,
513 .do_5 = xor_pII_mmx_5,
514 };
515
516 static struct xor_block_template xor_block_p5_mmx = {
517 .name = "p5_mmx",
518 .do_2 = xor_p5_mmx_2,
519 .do_3 = xor_p5_mmx_3,
520 .do_4 = xor_p5_mmx_4,
521 .do_5 = xor_p5_mmx_5,
522 };
523
524 static struct xor_block_template xor_block_pIII_sse = {
525 .name = "pIII_sse",
526 .do_2 = xor_sse_2,
527 .do_3 = xor_sse_3,
528 .do_4 = xor_sse_4,
529 .do_5 = xor_sse_5,
530 };
531
532 /* Also try the AVX routines */
533 #include <asm/xor_avx.h>
534
535 /* Also try the generic routines. */
536 #include <asm-generic/xor.h>
537
/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched. */
/*
 * Pick the templates handed to xor_speed() from the boot CPU's features:
 * AVX_XOR_SPEED is always expanded first, then exactly one group follows -
 * the SSE templates if X86_FEATURE_XMM is set, otherwise the MMX templates
 * if X86_FEATURE_MMX is set, otherwise the generic 8regs/32regs templates.
 *
 * NOTE(review): xor_speed(), AVX_XOR_SPEED, xor_block_sse_pf64 and the
 * xor_block_*regs* templates are not defined in this header; presumably
 * they come from the headers included above or from the including file.
 */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES				\
do {							\
	AVX_XOR_SPEED;					\
	if (boot_cpu_has(X86_FEATURE_XMM)) {		\
		xor_speed(&xor_block_pIII_sse);		\
		xor_speed(&xor_block_sse_pf64);		\
	} else if (boot_cpu_has(X86_FEATURE_MMX)) {	\
		xor_speed(&xor_block_pII_mmx);		\
		xor_speed(&xor_block_p5_mmx);		\
	} else {					\
		xor_speed(&xor_block_8regs);		\
		xor_speed(&xor_block_8regs_p);		\
		xor_speed(&xor_block_32regs);		\
		xor_speed(&xor_block_32regs_p);		\
	}						\
} while (0)
558
559 #endif /* _ASM_X86_XOR_32_H */
560