// Auto-generated file. Do not edit!
//   Template: src/f32-spmm/neon-blocked.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/spmm.h>

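// Sparse-matrix times dense-matrix (SpMM) micro-kernel: for each of the n
// output channels, accumulates nonzero weights against rows of the dense
// input a and writes m contiguous floats per channel to c (channel stride m).
// Rows of the m dimension are processed 8 at a time (mr = 8) and output
// channels 2 at a time (nr = 2), with smaller tails for both.
//
// The packed operands are consumed as follows (layout inferred from the
// loads below; the packing itself is produced by XNNPACK's operator-setup
// code):
//   weights      - per channel block: one initial value (bias) per channel,
//                  then the block's nonzero weights, interleaved by channel;
//   nidx_nnzmap  - number of nonzero weights per output-channel block;
//   widx_dmap    - signed byte offsets that advance a from one nonzero
//                  input row to the next.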

void xnn_f32_spmm_ukernel_8x2__neonfma(
    uint32_t m,
    uint32_t n,
    const float*restrict a,
    const float*restrict weights,
    const int32_t*restrict widx_dmap,
    const uint32_t*restrict nidx_nnzmap,
    float*restrict c,
    const union xnn_f32_output_params params[restrict static 1])
{
  assert(m != 0);

  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
  size_t i = m;
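  // Main loop: process the m dimension in blocks of 8 elements (mr = 8).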
  while XNN_LIKELY(i >= 8) {
    const float*restrict w = weights;
    const int32_t* dmap = widx_dmap;
    const uint32_t* nnzmap = nidx_nnzmap;
    size_t j = n;
    while (j >= 2) {
      uint32_t nnz = *nnzmap++;
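      // Seed the accumulators of both output channels with the per-channel
      // values that lead the weight block (the bias, in XNNPACK's packing).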
      float32x4_t vacc0123c0 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567c0 = vacc0123c0;
      float32x4_t vacc0123c1 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc4567c1 = vacc0123c1;
      if XNN_LIKELY(nnz != 0) {
        do {
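          // Each dmap entry is a signed byte offset from the current
          // position in a to the input row of the next nonzero weight;
          // it is applied below with integer pointer arithmetic.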
          const intptr_t diff = *dmap++;
          const float32x4_t va0123 = vld1q_f32(a);
          const float32x4_t va4567 = vld1q_f32(a + 4);
          a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
          const float32x2_t vb = vld1_f32(w); w += 2;

          vacc0123c0 = vfmaq_lane_f32(vacc0123c0, va0123, vb, 0);
          vacc4567c0 = vfmaq_lane_f32(vacc4567c0, va4567, vb, 0);
          vacc0123c1 = vfmaq_lane_f32(vacc0123c1, va0123, vb, 1);
          vacc4567c1 = vfmaq_lane_f32(vacc4567c1, va4567, vb, 1);
        } while (--nnz != 0);
      }
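      // Clamp the accumulators to the [min, max] output range.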
      float32x4_t vout0123c0 = vminq_f32(vacc0123c0, vmax);
      float32x4_t vout4567c0 = vminq_f32(vacc4567c0, vmax);
      float32x4_t vout0123c1 = vminq_f32(vacc0123c1, vmax);
      float32x4_t vout4567c1 = vminq_f32(vacc4567c1, vmax);

      vout0123c0 = vmaxq_f32(vout0123c0, vmin);
      vout4567c0 = vmaxq_f32(vout4567c0, vmin);
      vout0123c1 = vmaxq_f32(vout0123c1, vmin);
      vout4567c1 = vmaxq_f32(vout4567c1, vmin);

      vst1q_f32(c + 0 * m + 0, vout0123c0);
      vst1q_f32(c + 0 * m + 4, vout4567c0);
      vst1q_f32(c + 1 * m + 0, vout0123c1);
      vst1q_f32(c + 1 * m + 4, vout4567c1);
      c += 2 * m;
      j -= 2;
    }

    // clean up loop, fall back to nr=1
    if XNN_UNLIKELY(j != 0) {
      do {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc4567 = vacc0123;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t va0123 = vld1q_f32(a);
            const float32x4_t va4567 = vld1q_f32(a + 4);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_dup_f32(w); w += 1;
            vacc0123 = vfmaq_f32(vacc0123, va0123, vb);
            vacc4567 = vfmaq_f32(vacc4567, va4567, vb);
          } while (--nnz != 0);
        }
        float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
        float32x4_t vout4567 = vminq_f32(vacc4567, vmax);

        vout0123 = vmaxq_f32(vout0123, vmin);
        vout4567 = vmaxq_f32(vout4567, vmin);

        vst1q_f32(c + 0, vout0123);
        vst1q_f32(c + 4, vout4567);
        c += m;
        j -= 1;
      } while (j != 0);
    }
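    // c currently points past the last output channel: rewind it to the
    // first channel, then step c and a past the 8 rows just processed.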
    c -= m * n;
    c += 8;
    a += 8;
    i -= 8;
  }
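  // Remainder: handle the final m % 8 rows in blocks of 4, 2, and 1,
  // mirroring the structure of the main loop.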
  if XNN_UNLIKELY(i != 0) {
    if (i & 4) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      while (j >= 2) {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123c0 = vld1q_dup_f32(w); w += 1;
        float32x4_t vacc0123c1 = vld1q_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t va0123 = vld1q_f32(a);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x2_t vb = vld1_f32(w); w += 2;

            vacc0123c0 = vfmaq_lane_f32(vacc0123c0, va0123, vb, 0);
            vacc0123c1 = vfmaq_lane_f32(vacc0123c1, va0123, vb, 1);
          } while (--nnz != 0);
        }
        float32x4_t vout0123c0 = vminq_f32(vacc0123c0, vmax);
        float32x4_t vout0123c1 = vminq_f32(vacc0123c1, vmax);

        vout0123c0 = vmaxq_f32(vout0123c0, vmin);
        vout0123c1 = vmaxq_f32(vout0123c1, vmin);

        vst1q_f32(c + 0 * m + 0, vout0123c0);
        vst1q_f32(c + 1 * m + 0, vout0123c1);
        c += 2 * m;
        j -= 2;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(j != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x4_t va0123 = vld1q_f32(a);
              a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
              const float32x4_t vb = vld1q_dup_f32(w); w += 1;
              vacc0123 = vfmaq_f32(vacc0123, va0123, vb);
            } while (--nnz != 0);
          }
          float32x4_t vout0123 = vminq_f32(vacc0123, vmax);

          vout0123 = vmaxq_f32(vout0123, vmin);

          vst1q_f32(c + 0, vout0123);
          c += m;
          j -= 1;
        } while (j != 0);
      }
      c -= m * n;
      c += 4;
      a += 4;
    }
    if (i & 2) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      while (j >= 2) {
        uint32_t nnz = *nnzmap++;
        float32x2_t vacc01c0 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01c1 = vld1_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x2_t va01 = vld1_f32(a);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x2_t vb = vld1_f32(w); w += 2;

            vacc01c0 = vfma_lane_f32(vacc01c0, va01, vb, 0);
            vacc01c1 = vfma_lane_f32(vacc01c1, va01, vb, 1);
          } while (--nnz != 0);
        }
        float32x2_t vout01c0 = vmin_f32(vacc01c0, vget_low_f32(vmax));
        float32x2_t vout01c1 = vmin_f32(vacc01c1, vget_low_f32(vmax));

        vout01c0 = vmax_f32(vout01c0, vget_low_f32(vmin));
        vout01c1 = vmax_f32(vout01c1, vget_low_f32(vmin));

        vst1_f32(c + 0 * m + 0, vout01c0);
        vst1_f32(c + 1 * m + 0, vout01c1);
        c += 2 * m;
        j -= 2;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(j != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x2_t vacc01 = vld1_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x2_t va01 = vld1_f32(a);
              a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
              const float32x2_t vb = vld1_dup_f32(w); w += 1;
              vacc01 = vfma_f32(vacc01, va01, vb);
            } while (--nnz != 0);
          }
          float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax));
          vout01 = vmax_f32(vout01, vget_low_f32(vmin));

          vst1_f32(c, vout01);
          c += m;
          j -= 1;
        } while (j != 0);
      }
      c -= m * n;
      c += 2;
      a += 2;
    }
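    // Single remaining row: every value is broadcast across both lanes of
    // the 64-bit vectors, so storing lane 0 yields the scalar result.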
    if (i & 1) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      while (j >= 2) {
        uint32_t nnz = *nnzmap++;
        float32x2_t vacc0c0 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0c1 = vld1_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x2_t va0 = vld1_dup_f32(a);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x2_t vb = vld1_f32(w); w += 2;

            vacc0c0 = vfma_lane_f32(vacc0c0, va0, vb, 0);
            vacc0c1 = vfma_lane_f32(vacc0c1, va0, vb, 1);
          } while (--nnz != 0);
        }
        float32x2_t vout0c0 = vmin_f32(vacc0c0, vget_low_f32(vmax));
        float32x2_t vout0c1 = vmin_f32(vacc0c1, vget_low_f32(vmax));

        vout0c0 = vmax_f32(vout0c0, vget_low_f32(vmin));
        vout0c1 = vmax_f32(vout0c1, vget_low_f32(vmin));

        vst1_lane_f32(c + 0 * m + 0, vout0c0, 0);
        vst1_lane_f32(c + 1 * m + 0, vout0c1, 0);
        c += 2 * m;
        j -= 2;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(j != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x2_t vacc0 = vld1_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x2_t va0 = vld1_dup_f32(a);
              a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
              const float32x2_t vb = vld1_dup_f32(w); w += 1;
              vacc0 = vfma_f32(vacc0, va0, vb);
            } while (--nnz != 0);
          }
          float32x2_t vout0 = vmin_f32(vacc0, vget_low_f32(vmax));
          vout0 = vmax_f32(vout0, vget_low_f32(vmin));

          vst1_lane_f32(c, vout0, 0);
          c += m;
          j -= 1;
        } while (j != 0);
      }
      c -= m * n;
      c += 1;
      a += 1;
    }
  }
}
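
/*
 * Illustrative sketch of the packed streams this kernel reads for one block
 * of two output channels with nnz nonzero rows (names are expository only;
 * the real packing is produced elsewhere in XNNPACK):
 *
 *   weights: bias[c0], bias[c1],   <- accumulator init, read once per block
 *            w0[c0], w0[c1],       <- one interleaved pair per nonzero
 *            w1[c0], w1[c1], ...      input row, nnz pairs in total
 *
 *   nidx_nnzmap: one nnz count per block;
 *   widx_dmap:   nnz signed byte deltas applied to a, one per pair.
 */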