// Auto-generated file. Do not edit!
// Template: src/f16-spmm/neonfp16arith.c.in
// Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/spmm.h>
15
16
// Sparse matrix (weights) times dense matrix (input) micro-kernel, fp16, MR=24, NR=1.
//
//   m            - number of output rows (dense dimension processed 24 at a time)
//   n            - number of output columns (sparse dimension)
//   input        - dense input, __fp16
//   weights      - nonzero weight values interleaved with per-column bias, __fp16
//   widx_dmap    - byte differences between successive input offsets, one per nonzero
//   nidx_nnzmap  - number of nonzeros per output column
//   output       - m x n output matrix, column stride of m __fp16 elements
//   params       - scale and min/max clamping parameters
void xnn_f16_spmm_ukernel_24x1__neonfp16arith_unroll2(
    uint32_t m,
    uint32_t n,
    const void*restrict input,
    const void*restrict weights,
    const int32_t*restrict widx_dmap,
    const uint32_t*restrict nidx_nnzmap,
    void*restrict output,
    const struct xnn_f16_output_params params[restrict static 1])
{
  assert(m != 0);

  const __fp16*restrict a = input;
  __fp16*restrict c = output;

  const float16x8_t vscale = vld1q_dup_f16((const __fp16*) &params->scale);
  const float16x8_t vmax = vld1q_dup_f16((const __fp16*) &params->max);
  const float16x8_t vmin = vld1q_dup_f16((const __fp16*) &params->min);

  size_t i = m;
  // Main loop: process 24 output rows per iteration.
  while XNN_LIKELY(i >= 24) {
    const __fp16*restrict w = weights;
    const int32_t* dmap = widx_dmap;
    const uint32_t* nnzmap = nidx_nnzmap;
    size_t j = n;
    do {
      uint32_t nnz = *nnzmap++;
      // Bias initializes one accumulator set; the unrolled second set starts at zero
      // and is summed in after the loop.
      float16x8_t vacc01234567x0 = vld1q_dup_f16(w); w += 1;
      float16x8_t vacc01234567x1 = vmovq_n_f16(0.0f);
      float16x8_t vacc89ABCDEFx0 = vacc01234567x0;
      float16x8_t vacc89ABCDEFx1 = vmovq_n_f16(0.0f);
      float16x8_t vaccGHIJKLMNx0 = vacc01234567x0;
      float16x8_t vaccGHIJKLMNx1 = vmovq_n_f16(0.0f);
      // Unrolled by 2: two nonzeros per iteration into independent accumulators.
      for (; nnz >= 2; nnz -= 2) {
        const intptr_t diff0 = dmap[0];
        const intptr_t diff1 = dmap[1];
        dmap += 2;
        const float16x8_t va01234567x0 = vld1q_f16(a);
        const float16x8_t va89ABCDEFx0 = vld1q_f16(a + 8);
        const float16x8_t vaGHIJKLMNx0 = vld1q_f16(a + 16);
        // dmap stores byte offsets between successive input rows.
        a = (const __fp16*restrict) ((uintptr_t) a + (uintptr_t) diff0);
        const float16x8_t vb0 = vld1q_dup_f16(w); w += 1;
        vacc01234567x0 = vfmaq_f16(vacc01234567x0, va01234567x0, vb0);
        vacc89ABCDEFx0 = vfmaq_f16(vacc89ABCDEFx0, va89ABCDEFx0, vb0);
        vaccGHIJKLMNx0 = vfmaq_f16(vaccGHIJKLMNx0, vaGHIJKLMNx0, vb0);
        const float16x8_t va01234567x1 = vld1q_f16(a);
        const float16x8_t va89ABCDEFx1 = vld1q_f16(a + 8);
        const float16x8_t vaGHIJKLMNx1 = vld1q_f16(a + 16);
        a = (const __fp16*restrict) ((uintptr_t) a + (uintptr_t) diff1);
        const float16x8_t vb1 = vld1q_dup_f16(w); w += 1;
        vacc01234567x1 = vfmaq_f16(vacc01234567x1, va01234567x1, vb1);
        vacc89ABCDEFx1 = vfmaq_f16(vacc89ABCDEFx1, va89ABCDEFx1, vb1);
        vaccGHIJKLMNx1 = vfmaq_f16(vaccGHIJKLMNx1, vaGHIJKLMNx1, vb1);
      }
      // Merge the two unrolled accumulator sets.
      float16x8_t vacc01234567 = vacc01234567x0;
      float16x8_t vacc89ABCDEF = vacc89ABCDEFx0;
      float16x8_t vaccGHIJKLMN = vaccGHIJKLMNx0;
      vacc01234567 = vaddq_f16(vacc01234567, vacc01234567x1);
      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vacc89ABCDEFx1);
      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vaccGHIJKLMNx1);
      // Remainder: at most one leftover nonzero.
      if XNN_LIKELY(nnz != 0) {
        do {
          const intptr_t diff = *dmap++;
          const float16x8_t va01234567 = vld1q_f16(a);
          const float16x8_t va89ABCDEF = vld1q_f16(a + 8);
          const float16x8_t vaGHIJKLMN = vld1q_f16(a + 16);
          a = (const __fp16*restrict) ((uintptr_t) a + (uintptr_t) diff);
          const float16x8_t vb = vld1q_dup_f16(w); w += 1;
          vacc01234567 = vfmaq_f16(vacc01234567, va01234567, vb);
          vacc89ABCDEF = vfmaq_f16(vacc89ABCDEF, va89ABCDEF, vb);
          vaccGHIJKLMN = vfmaq_f16(vaccGHIJKLMN, vaGHIJKLMN, vb);
        } while (--nnz != 0);
      }
      // Scale, then clamp to [min, max].
      float16x8_t vout01234567 = vmulq_f16(vacc01234567, vscale);
      float16x8_t vout89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale);
      float16x8_t voutGHIJKLMN = vmulq_f16(vaccGHIJKLMN, vscale);
      vout01234567 = vminq_f16(vout01234567, vmax);
      vout89ABCDEF = vminq_f16(vout89ABCDEF, vmax);
      voutGHIJKLMN = vminq_f16(voutGHIJKLMN, vmax);
      vout01234567 = vmaxq_f16(vout01234567, vmin);
      vout89ABCDEF = vmaxq_f16(vout89ABCDEF, vmin);
      voutGHIJKLMN = vmaxq_f16(voutGHIJKLMN, vmin);
      vst1q_f16(c, vout01234567);
      vst1q_f16(c + 8, vout89ABCDEF);
      vst1q_f16(c + 16, voutGHIJKLMN);
      c += m;  // advance to the same rows of the next output column
    } while (--j != 0);
    c -= m * n;  // rewind to column 0 ...
    c += 24;     // ... and move down 24 rows
    a += 24;
    i -= 24;
  }
  // Remainder rows, in power-of-two tiles: 16, 8, 4, 2, 1.
  // NOTE(review): unlike the 24-row path, these paths do not multiply by vscale
  // before clamping — looks like a generator inconsistency; confirm against the
  // template before relying on scale != 1.0 with m not a multiple of 24.
  if XNN_UNLIKELY(i != 0) {
    if (i & 16) {
      const __fp16*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      do {
        uint32_t nnz = *nnzmap++;
        float16x8_t vacc01234567 = vld1q_dup_f16(w); w += 1;
        float16x8_t vacc89ABCDEF = vacc01234567;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float16x8_t va01234567 = vld1q_f16(a);
            const float16x8_t va89ABCDEF = vld1q_f16(a + 8);
            a = (const __fp16*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float16x8_t vb = vld1q_dup_f16(w); w += 1;
            vacc01234567 = vfmaq_f16(vacc01234567, va01234567, vb);
            vacc89ABCDEF = vfmaq_f16(vacc89ABCDEF, va89ABCDEF, vb);
          } while (--nnz != 0);
        }
        float16x8_t vout01234567 = vminq_f16(vacc01234567, vmax);
        float16x8_t vout89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
        vout01234567 = vmaxq_f16(vout01234567, vmin);
        vout89ABCDEF = vmaxq_f16(vout89ABCDEF, vmin);
        vst1q_f16(c, vout01234567);
        vst1q_f16(c + 8, vout89ABCDEF);
        c += m;
      } while (--j != 0);
      c -= m * n;
      c += 16;
      a += 16;
    }
    if (i & 8) {
      const __fp16*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      do {
        uint32_t nnz = *nnzmap++;
        float16x8_t vacc01234567 = vld1q_dup_f16(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float16x8_t va01234567 = vld1q_f16(a);
            a = (const __fp16*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float16x8_t vb = vld1q_dup_f16(w); w += 1;
            vacc01234567 = vfmaq_f16(vacc01234567, va01234567, vb);
          } while (--nnz != 0);
        }
        float16x8_t vout01234567 = vminq_f16(vacc01234567, vmax);
        vout01234567 = vmaxq_f16(vout01234567, vmin);
        vst1q_f16(c, vout01234567);
        c += m;
      } while (--j != 0);
      c -= m * n;
      c += 8;
      a += 8;
    }
    if (i & 4) {
      const __fp16*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      do {
        uint32_t nnz = *nnzmap++;
        float16x4_t vacc0123 = vld1_dup_f16(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float16x4_t va0123 = vld1_f16(a);
            a = (const __fp16*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float16x4_t vb = vld1_dup_f16(w); w += 1;
            vacc0123 = vfma_f16(vacc0123, va0123, vb);
          } while (--nnz != 0);
        }
        float16x4_t vout0123 = vmin_f16(vacc0123, vget_low_f16(vmax));
        vout0123 = vmax_f16(vout0123, vget_low_f16(vmin));
        vst1_f16(c, vout0123);
        c += m;
      } while (--j != 0);
      c -= m * n;
      c += 4;
      a += 4;
    }
    if (i & 2) {
      const __fp16*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      do {
        uint32_t nnz = *nnzmap++;
        float16x4_t vacc01 = vld1_dup_f16(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            // Load 2 fp16 lanes as one (possibly unaligned) 32-bit load, then
            // reinterpret f32 -> f16 lanes.
            const float16x4_t va01 = vreinterpret_f16_f32(vld1_dup_f32(__builtin_assume_aligned(a, 1)));
            a = (const __fp16*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float16x4_t vb = vld1_dup_f16(w); w += 1;
            vacc01 = vfma_f16(vacc01, va01, vb);
          } while (--nnz != 0);
        }
        float16x4_t vout01 = vmin_f16(vacc01, vget_low_f16(vmax));
        vout01 = vmax_f16(vout01, vget_low_f16(vmin));
        // Store 2 fp16 lanes as one 32-bit lane store.
        vst1_lane_f32(__builtin_assume_aligned(c, 1), vreinterpret_f32_f16(vout01), 0);
        c += m;
      } while (--j != 0);
      c -= m * n;
      c += 2;
      a += 2;
    }
    if (i & 1) {
      const __fp16*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t j = n;
      do {
        uint32_t nnz = *nnzmap++;
        float16x4_t vacc0 = vld1_dup_f16(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const float16x4_t va0 = vld1_dup_f16(a);
            a = (const __fp16*restrict) ((uintptr_t) a + (uintptr_t) diff);
            const float16x4_t vb = vld1_dup_f16(w); w += 1;
            vacc0 = vfma_f16(vacc0, va0, vb);
          } while (--nnz != 0);
        }
        float16x4_t vout0 = vmin_f16(vacc0, vget_low_f16(vmax));
        vout0 = vmax_f16(vout0, vget_low_f16(vmin));
        vst1_lane_f16(c, vout0, 0);
        c += m;
      } while (--j != 0);
      c -= m * n;
      c += 1;
      a += 1;
    }
  }
}
248