// Auto-generated file. Do not edit!
//   Template: src/f32-spmm/neon-blocked.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/spmm.h>

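// Sparse-matrix times dense-matrix multiplication (SpMM) microkernel:
// computes c = clamp(w * a, params.min, params.max), where the weights are
// packed as nonzero values only. The input a is laid out with the batch
// dimension contiguous, the output c is written output-channel-major with
// stride m, nidx_nnzmap holds the number of nonzero weight entries per
// output-channel block, and widx_dmap holds signed byte-offset deltas that
// step a between consecutive nonzero input channels.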
void xnn_f32_spmm_ukernel_4x4__neonfma(
    uint32_t m,
    uint32_t n,
    const float*restrict a,
    const float*restrict weights,
    const int32_t*restrict widx_dmap,
    const uint32_t*restrict nidx_nnzmap,
    float*restrict c,
    const union xnn_f32_output_params params[restrict static 1])
{
  assert(m != 0);

  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
  size_t i = m;
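  // Process the batch dimension in tiles of 4 elements.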
  while XNN_LIKELY(i >= 4) {
    const float*restrict w = weights;
    const int32_t* dmap = widx_dmap;
    const uint32_t* nnzmap = nidx_nnzmap;
    size_t j = n;
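    // Blocked path: process output channels in groups of 4.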
    while (j >= 4) {
      uint32_t nnz = *nnzmap++;
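      // Seed the 4 accumulators from the packed per-channel values at the
      // head of the weight block (the per-channel biases).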
      float32x4_t vacc0123c0 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc0123c1 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc0123c2 = vld1q_dup_f32(w); w += 1;
      float32x4_t vacc0123c3 = vld1q_dup_f32(w); w += 1;
      if XNN_LIKELY(nnz != 0) {
        do {
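          // Each dmap entry is a signed offset in bytes (applied without
          // further scaling) that advances a from the current nonzero input
          // channel to the next one.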
          const intptr_t diff = *dmap++;
          const float32x4_t va0123 = vld1q_f32(a);
          a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
          const float32x4_t vb = vld1q_f32(w); w += 4;

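          // Lane-indexed FMA: multiply the 4 batch elements by each of the
          // 4 per-channel weights broadcast from vb.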
          vacc0123c0 = vfmaq_laneq_f32(vacc0123c0, va0123, vb, 0);
          vacc0123c1 = vfmaq_laneq_f32(vacc0123c1, va0123, vb, 1);
          vacc0123c2 = vfmaq_laneq_f32(vacc0123c2, va0123, vb, 2);
          vacc0123c3 = vfmaq_laneq_f32(vacc0123c3, va0123, vb, 3);
        } while (--nnz != 0);
      }
      float32x4_t vout0123c0 = vminq_f32(vacc0123c0, vmax);
      float32x4_t vout0123c1 = vminq_f32(vacc0123c1, vmax);
      float32x4_t vout0123c2 = vminq_f32(vacc0123c2, vmax);
      float32x4_t vout0123c3 = vminq_f32(vacc0123c3, vmax);

      vout0123c0 = vmaxq_f32(vout0123c0, vmin);
      vout0123c1 = vmaxq_f32(vout0123c1, vmin);
      vout0123c2 = vmaxq_f32(vout0123c2, vmin);
      vout0123c3 = vmaxq_f32(vout0123c3, vmin);

      vst1q_f32(c + 0 * m + 0, vout0123c0);
      vst1q_f32(c + 1 * m + 0, vout0123c1);
      vst1q_f32(c + 2 * m + 0, vout0123c2);
      vst1q_f32(c + 3 * m + 0, vout0123c3);
      c += 4 * m;
      j -= 4;
    }

    // clean up loop, fall back to nr=1
    if XNN_UNLIKELY(j != 0) {
      do {
        uint32_t nnz = *nnzmap++;
        float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x4_t va0123 = vld1q_f32(a);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_dup_f32(w); w += 1;
            vacc0123 = vfmaq_f32(vacc0123, va0123, vb);
          } while (--nnz != 0);
        }
        float32x4_t vout0123 = vminq_f32(vacc0123, vmax);

        vout0123 = vmaxq_f32(vout0123, vmin);

        vst1q_f32(c + 0, vout0123);
        c += m;
        j -= 1;
      } while (j != 0);
    }
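    // Rewind c from the last output channel back to channel 0, then advance
    // c and a past the 4 batch elements just processed. The dmap deltas are
    // presumably constructed so that a has already wrapped back to its
    // starting offset after a full pass over the output channels.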
    c -= m * n;
    c += 4;
    a += 4;
    i -= 4;
  }
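  // Handle the remaining 1..3 batch elements with narrower 2-element and
  // 1-element variants of the same loop structure.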
  if XNN_UNLIKELY(i != 0) {
    if (i & 2) {
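      // 2-element variant: identical blocking, but with 64-bit
      // (float32x2_t) vectors holding the 2 remaining batch elements.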
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      while (j >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x2_t vacc01c0 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01c1 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01c2 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc01c3 = vld1_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x2_t va01 = vld1_f32(a);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_f32(w); w += 4;

            vacc01c0 = vfma_laneq_f32(vacc01c0, va01, vb, 0);
            vacc01c1 = vfma_laneq_f32(vacc01c1, va01, vb, 1);
            vacc01c2 = vfma_laneq_f32(vacc01c2, va01, vb, 2);
            vacc01c3 = vfma_laneq_f32(vacc01c3, va01, vb, 3);
          } while (--nnz != 0);
        }
        float32x2_t vout01c0 = vmin_f32(vacc01c0, vget_low_f32(vmax));
        float32x2_t vout01c1 = vmin_f32(vacc01c1, vget_low_f32(vmax));
        float32x2_t vout01c2 = vmin_f32(vacc01c2, vget_low_f32(vmax));
        float32x2_t vout01c3 = vmin_f32(vacc01c3, vget_low_f32(vmax));

        vout01c0 = vmax_f32(vout01c0, vget_low_f32(vmin));
        vout01c1 = vmax_f32(vout01c1, vget_low_f32(vmin));
        vout01c2 = vmax_f32(vout01c2, vget_low_f32(vmin));
        vout01c3 = vmax_f32(vout01c3, vget_low_f32(vmin));

        vst1_f32(c + 0 * m + 0, vout01c0);
        vst1_f32(c + 1 * m + 0, vout01c1);
        vst1_f32(c + 2 * m + 0, vout01c2);
        vst1_f32(c + 3 * m + 0, vout01c3);
        c += 4 * m;
        j -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(j != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x2_t vacc01 = vld1_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x2_t va01 = vld1_f32(a);
              a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
              const float32x2_t vb = vld1_dup_f32(w); w += 1;
              vacc01 = vfma_f32(vacc01, va01, vb);
            } while (--nnz != 0);
          }
          float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax));
          vout01 = vmax_f32(vout01, vget_low_f32(vmin));

          vst1_f32(c, vout01);
          c += m;
          j -= 1;
        } while (j != 0);
      }
      c -= m * n;
      c += 2;
      a += 2;
    }
    if (i & 1) {
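      // 1-element variant: the single remaining input value is splatted
      // into both lanes and only lane 0 of each result is stored.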
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      while (j >= 4) {
        uint32_t nnz = *nnzmap++;
        float32x2_t vacc0c0 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0c1 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0c2 = vld1_dup_f32(w); w += 1;
        float32x2_t vacc0c3 = vld1_dup_f32(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float32x2_t va0 = vld1_dup_f32(a);
            a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float32x4_t vb = vld1q_f32(w); w += 4;

            vacc0c0 = vfma_laneq_f32(vacc0c0, va0, vb, 0);
            vacc0c1 = vfma_laneq_f32(vacc0c1, va0, vb, 1);
            vacc0c2 = vfma_laneq_f32(vacc0c2, va0, vb, 2);
            vacc0c3 = vfma_laneq_f32(vacc0c3, va0, vb, 3);
          } while (--nnz != 0);
        }
        float32x2_t vout0c0 = vmin_f32(vacc0c0, vget_low_f32(vmax));
        float32x2_t vout0c1 = vmin_f32(vacc0c1, vget_low_f32(vmax));
        float32x2_t vout0c2 = vmin_f32(vacc0c2, vget_low_f32(vmax));
        float32x2_t vout0c3 = vmin_f32(vacc0c3, vget_low_f32(vmax));

        vout0c0 = vmax_f32(vout0c0, vget_low_f32(vmin));
        vout0c1 = vmax_f32(vout0c1, vget_low_f32(vmin));
        vout0c2 = vmax_f32(vout0c2, vget_low_f32(vmin));
        vout0c3 = vmax_f32(vout0c3, vget_low_f32(vmin));

        vst1_lane_f32(c + 0 * m + 0, vout0c0, 0);
        vst1_lane_f32(c + 1 * m + 0, vout0c1, 0);
        vst1_lane_f32(c + 2 * m + 0, vout0c2, 0);
        vst1_lane_f32(c + 3 * m + 0, vout0c3, 0);
        c += 4 * m;
        j -= 4;
      }

      // clean up loop, fall back to nr=1
      if XNN_UNLIKELY(j != 0) {
        do {
          uint32_t nnz = *nnzmap++;
          float32x2_t vacc0 = vld1_dup_f32(w); w += 1;
          if XNN_LIKELY(nnz != 0) {
            do {
              const intptr_t diff = *dmap++;
              const float32x2_t va0 = vld1_dup_f32(a);
              a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
              const float32x2_t vb = vld1_dup_f32(w); w += 1;
              vacc0 = vfma_f32(vacc0, va0, vb);
            } while (--nnz != 0);
          }
          float32x2_t vout0 = vmin_f32(vacc0, vget_low_f32(vmax));
          vout0 = vmax_f32(vout0, vget_low_f32(vmin));

          vst1_lane_f32(c, vout0, 0);
          c += m;
          j -= 1;
        } while (j != 0);
      }
      c -= m * n;
      c += 1;
      a += 1;
    }
  }
}