// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>
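
/*
 * Each routine below XORs the remaining source buffers into p1, one
 * 64-byte "line" (four 128-bit NEON vectors) per loop iteration.  The
 * byte count is assumed to be a non-zero multiple of 64: the do/while
 * loop always runs at least once and handles no remainder.
 */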

void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
	unsigned long *p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3,
	unsigned long *p4, unsigned long *p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}
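
/*
 * Candidate for the generic xor_blocks() template selection.  Callers
 * (e.g. the NEON wrappers in asm/xor.h) are expected to bracket these
 * routines with kernel_neon_begin()/kernel_neon_end(), since they use
 * the NEON register file.
 */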
struct xor_block_template const xor_block_inner_neon = {
	.name	= "__inner_neon__",
	.do_2	= xor_arm64_neon_2,
	.do_3	= xor_arm64_neon_3,
	.do_4	= xor_arm64_neon_4,
	.do_5	= xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);

MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");