1 // Auto-generated file. Do not edit!
2 // Template: src/f32-spmm/neon.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2019 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <arm_neon.h>
13
14 #include <xnnpack/spmm.h>
15
16
xnn_f32_spmm_ukernel_12x1__neonfma(uint32_t m,uint32_t n,const float * restrict a,const float * restrict weights,const int32_t * restrict widx_dmap,const uint32_t * restrict nidx_nnzmap,float * restrict c,const union xnn_f32_output_params params[restrict static1])17 void xnn_f32_spmm_ukernel_12x1__neonfma(
18 uint32_t m,
19 uint32_t n,
20 const float*restrict a,
21 const float*restrict weights,
22 const int32_t*restrict widx_dmap,
23 const uint32_t*restrict nidx_nnzmap,
24 float*restrict c,
25 const union xnn_f32_output_params params[restrict static 1])
26 {
27 assert(m != 0);
28
29 const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min);
30 const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max);
31 size_t i = m;
32 while XNN_LIKELY(i >= 12) {
33 const float*restrict w = weights;
34 const int32_t* dmap = widx_dmap;
35 const uint32_t* nnzmap = nidx_nnzmap;
36 size_t j = n;
37 do {
38 uint32_t nnz = *nnzmap++;
39 float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
40 float32x4_t vacc4567 = vacc0123;
41 float32x4_t vacc89AB = vacc0123;
42 if XNN_LIKELY(nnz != 0) {
43 do {
44 const intptr_t diff = *dmap++;
45 const float32x4_t va0123 = vld1q_f32(a);
46 const float32x4_t va4567 = vld1q_f32(a + 4);
47 const float32x4_t va89AB = vld1q_f32(a + 8);
48 a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
49 const float32x4_t vb = vld1q_dup_f32(w); w += 1;
50 vacc0123 = vfmaq_f32(vacc0123, va0123, vb);
51 vacc4567 = vfmaq_f32(vacc4567, va4567, vb);
52 vacc89AB = vfmaq_f32(vacc89AB, va89AB, vb);
53 } while (--nnz != 0);
54 }
55 float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
56 float32x4_t vout4567 = vminq_f32(vacc4567, vmax);
57 float32x4_t vout89AB = vminq_f32(vacc89AB, vmax);
58 vout0123 = vmaxq_f32(vout0123, vmin);
59 vout4567 = vmaxq_f32(vout4567, vmin);
60 vout89AB = vmaxq_f32(vout89AB, vmin);
61 vst1q_f32(c, vout0123);
62 vst1q_f32(c + 4, vout4567);
63 vst1q_f32(c + 8, vout89AB);
64 c += m;
65 } while (--j != 0);
66 c -= m * n;
67 c += 12;
68 a += 12;
69 i -= 12;
70 }
71 if XNN_UNLIKELY(i != 0) {
72 if (i & 8) {
73 const float*restrict w = weights;
74 const int32_t* dmap = widx_dmap;
75 const uint32_t* nnzmap = nidx_nnzmap;
76 size_t j = n;
77 do {
78 uint32_t nnz = *nnzmap++;
79 float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
80 float32x4_t vacc4567 = vacc0123;
81 if XNN_LIKELY(nnz != 0) {
82 do {
83 const intptr_t diff = *dmap++;
84 const float32x4_t va0123 = vld1q_f32(a);
85 const float32x4_t va4567 = vld1q_f32(a + 4);
86 a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
87 const float32x4_t vb = vld1q_dup_f32(w); w += 1;
88 vacc0123 = vfmaq_f32(vacc0123, va0123, vb);
89 vacc4567 = vfmaq_f32(vacc4567, va4567, vb);
90 } while (--nnz != 0);
91 }
92 float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
93 float32x4_t vout4567 = vminq_f32(vacc4567, vmax);
94 vout0123 = vmaxq_f32(vout0123, vmin);
95 vout4567 = vmaxq_f32(vout4567, vmin);
96 vst1q_f32(c, vout0123);
97 vst1q_f32(c + 4, vout4567);
98 c += m;
99 } while (--j != 0);
100 c -= m * n;
101 c += 8;
102 a += 8;
103 }
104 if (i & 4) {
105 const float*restrict w = weights;
106 const int32_t* dmap = widx_dmap;
107 const uint32_t* nnzmap = nidx_nnzmap;
108 size_t j = n;
109 do {
110 uint32_t nnz = *nnzmap++;
111 float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
112 if XNN_LIKELY(nnz != 0) {
113 do {
114 const intptr_t diff = *dmap++;
115 const float32x4_t va0123 = vld1q_f32(a);
116 a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
117 const float32x4_t vb = vld1q_dup_f32(w); w += 1;
118 vacc0123 = vfmaq_f32(vacc0123, va0123, vb);
119 } while (--nnz != 0);
120 }
121 float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
122 vout0123 = vmaxq_f32(vout0123, vmin);
123 vst1q_f32(c, vout0123);
124 c += m;
125 } while (--j != 0);
126 c -= m * n;
127 c += 4;
128 a += 4;
129 }
130 if (i & 2) {
131 const float*restrict w = weights;
132 const int32_t* dmap = widx_dmap;
133 const uint32_t* nnzmap = nidx_nnzmap;
134 size_t j = n;
135 do {
136 uint32_t nnz = *nnzmap++;
137 float32x2_t vacc01 = vld1_dup_f32(w); w += 1;
138 if XNN_LIKELY(nnz != 0) {
139 do {
140 const intptr_t diff = *dmap++;
141 const float32x2_t va01 = vld1_f32(a);
142 a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
143 const float32x2_t vb = vld1_dup_f32(w); w += 1;
144 vacc01 = vfma_f32(vacc01, va01, vb);
145 } while (--nnz != 0);
146 }
147 float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax));
148 vout01 = vmax_f32(vout01, vget_low_f32(vmin));
149 vst1_f32(c, vout01);
150 c += m;
151 } while (--j != 0);
152 c -= m * n;
153 c += 2;
154 a += 2;
155 }
156 if (i & 1) {
157 const float*restrict w = weights;
158 const int32_t* dmap = widx_dmap;
159 const uint32_t* nnzmap = nidx_nnzmap;
160 size_t j = n;
161 do {
162 uint32_t nnz = *nnzmap++;
163 float32x2_t vacc0 = vld1_dup_f32(w); w += 1;
164 if XNN_LIKELY(nnz != 0) {
165 do {
166 const intptr_t diff = *dmap++;
167 const float32x2_t va0 = vld1_dup_f32(a);
168 a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
169 const float32x2_t vb = vld1_dup_f32(w); w += 1;
170 vacc0 = vfma_f32(vacc0, va0, vb);
171 } while (--nnz != 0);
172 }
173 float32x2_t vout0 = vmin_f32(vacc0, vget_low_f32(vmax));
174 vout0 = vmax_f32(vout0, vget_low_f32(vmin));
175 vst1_lane_f32(c, vout0, 0);
176 c += m;
177 } while (--j != 0);
178 c -= m * n;
179 c += 1;
180 a += 1;
181 }
182 }
183 }
184