// Auto-generated file. Do not edit!
//   Template: src/f32-spmm/neon.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

10 #include <assert.h>
11 
12 #include <arm_neon.h>
13 
14 #include <xnnpack/spmm.h>
15 
16 
xnn_f32_spmm_ukernel_12x1__neonfma(uint32_t m,uint32_t n,const float * restrict a,const float * restrict weights,const int32_t * restrict widx_dmap,const uint32_t * restrict nidx_nnzmap,float * restrict c,const union xnn_f32_output_params params[restrict static1])17 void xnn_f32_spmm_ukernel_12x1__neonfma(
18     uint32_t m,
19     uint32_t n,
20     const float*restrict a,
21     const float*restrict weights,
22     const int32_t*restrict widx_dmap,
23     const uint32_t*restrict nidx_nnzmap,
24     float*restrict c,
25     const union xnn_f32_output_params params[restrict static 1])
26 {
27   assert(m != 0);
28 
29   const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
30   const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
31   size_t i = m;
32   while XNN_LIKELY(i >= 12) {
33     const float*restrict w = weights;
34     const int32_t* dmap = widx_dmap;
35     const uint32_t* nnzmap = nidx_nnzmap;
36     size_t j = n;
37     do {
38       uint32_t nnz = *nnzmap++;
39       float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
40       float32x4_t vacc4567 = vacc0123;
41       float32x4_t vacc89AB = vacc0123;
42       if XNN_LIKELY(nnz != 0) {
43         do {
44           const intptr_t diff = *dmap++;
45           const float32x4_t va0123 = vld1q_f32(a);
46           const float32x4_t va4567 = vld1q_f32(a + 4);
47           const float32x4_t va89AB = vld1q_f32(a + 8);
48           a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
49           const float32x4_t vb = vld1q_dup_f32(w); w += 1;
50           vacc0123 = vfmaq_f32(vacc0123, va0123, vb);
51           vacc4567 = vfmaq_f32(vacc4567, va4567, vb);
52           vacc89AB = vfmaq_f32(vacc89AB, va89AB, vb);
53         } while (--nnz != 0);
54       }
55       float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
56       float32x4_t vout4567 = vminq_f32(vacc4567, vmax);
57       float32x4_t vout89AB = vminq_f32(vacc89AB, vmax);
58       vout0123 = vmaxq_f32(vout0123, vmin);
59       vout4567 = vmaxq_f32(vout4567, vmin);
60       vout89AB = vmaxq_f32(vout89AB, vmin);
61       vst1q_f32(c, vout0123);
62       vst1q_f32(c + 4, vout4567);
63       vst1q_f32(c + 8, vout89AB);
64       c += m;
65     } while (--j != 0);
66     c -= m * n;
67     c += 12;
68     a += 12;
69     i -= 12;
70   }
71   if XNN_UNLIKELY(i != 0) {
72     if (i & 8) {
73       const float*restrict w = weights;
74       const int32_t* dmap = widx_dmap;
75       const uint32_t* nnzmap = nidx_nnzmap;
76       size_t j = n;
77       do {
78         uint32_t nnz = *nnzmap++;
79         float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
80         float32x4_t vacc4567 = vacc0123;
81         if XNN_LIKELY(nnz != 0) {
82           do {
83             const intptr_t diff = *dmap++;
84             const float32x4_t va0123 = vld1q_f32(a);
85             const float32x4_t va4567 = vld1q_f32(a + 4);
86             a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
87             const float32x4_t vb = vld1q_dup_f32(w); w += 1;
88             vacc0123 = vfmaq_f32(vacc0123, va0123, vb);
89             vacc4567 = vfmaq_f32(vacc4567, va4567, vb);
90           } while (--nnz != 0);
91         }
92         float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
93         float32x4_t vout4567 = vminq_f32(vacc4567, vmax);
94         vout0123 = vmaxq_f32(vout0123, vmin);
95         vout4567 = vmaxq_f32(vout4567, vmin);
96         vst1q_f32(c, vout0123);
97         vst1q_f32(c + 4, vout4567);
98         c += m;
99       } while (--j != 0);
100       c -= m * n;
101       c += 8;
102       a += 8;
103     }
104     if (i & 4) {
105       const float*restrict w = weights;
106       const int32_t* dmap = widx_dmap;
107       const uint32_t* nnzmap = nidx_nnzmap;
108       size_t j = n;
109       do {
110         uint32_t nnz = *nnzmap++;
111         float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;
112         if XNN_LIKELY(nnz != 0) {
113           do {
114             const intptr_t diff = *dmap++;
115             const float32x4_t va0123 = vld1q_f32(a);
116             a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
117             const float32x4_t vb = vld1q_dup_f32(w); w += 1;
118             vacc0123 = vfmaq_f32(vacc0123, va0123, vb);
119           } while (--nnz != 0);
120         }
121         float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
122         vout0123 = vmaxq_f32(vout0123, vmin);
123         vst1q_f32(c, vout0123);
124         c += m;
125       } while (--j != 0);
126       c -= m * n;
127       c += 4;
128       a += 4;
129     }
130     if (i & 2) {
131       const float*restrict w = weights;
132       const int32_t* dmap = widx_dmap;
133       const uint32_t* nnzmap = nidx_nnzmap;
134       size_t j = n;
135       do {
136         uint32_t nnz = *nnzmap++;
137         float32x2_t vacc01 = vld1_dup_f32(w); w += 1;
138         if XNN_LIKELY(nnz != 0) {
139           do {
140             const intptr_t diff = *dmap++;
141             const float32x2_t va01 = vld1_f32(a);
142             a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
143             const float32x2_t vb = vld1_dup_f32(w); w += 1;
144             vacc01 = vfma_f32(vacc01, va01, vb);
145           } while (--nnz != 0);
146         }
147         float32x2_t vout01 = vmin_f32(vacc01, vget_low_f32(vmax));
148         vout01 = vmax_f32(vout01, vget_low_f32(vmin));
149         vst1_f32(c, vout01);
150         c += m;
151       } while (--j != 0);
152       c -= m * n;
153       c += 2;
154       a += 2;
155     }
156     if (i & 1) {
157       const float*restrict w = weights;
158       const int32_t* dmap = widx_dmap;
159       const uint32_t* nnzmap = nidx_nnzmap;
160       size_t j = n;
161       do {
162         uint32_t nnz = *nnzmap++;
163         float32x2_t vacc0 = vld1_dup_f32(w); w += 1;
164         if XNN_LIKELY(nnz != 0) {
165           do {
166             const intptr_t diff = *dmap++;
167             const float32x2_t va0 = vld1_dup_f32(a);
168             a = (const float*restrict) ((uintptr_t) a + (uintptr_t) diff);
169             const float32x2_t vb = vld1_dup_f32(w); w += 1;
170             vacc0 = vfma_f32(vacc0, va0, vb);
171           } while (--nnz != 0);
172         }
173         float32x2_t vout0 = vmin_f32(vacc0, vget_low_f32(vmax));
174         vout0 = vmax_f32(vout0, vget_low_f32(vmin));
175         vst1_lane_f32(c, vout0, 0);
176         c += m;
177       } while (--j != 0);
178       c -= m * n;
179       c += 1;
180       a += 1;
181     }
182   }
183 }
184