1
2 #include "pf_conv.h"
3
4 #include <string.h>
5 #include <assert.h>
6
7 #include <algorithm>
8
9 #if 0
10 #include <stdio.h>
11
12 #define DPRINT(...) fprintf(stderr, __VA_ARGS__)
13
14 #else
15 #define DPRINT(...) do { } while (0)
16 #endif
17
18
19 #ifdef HAVE_MIPP
20 #include <mipp.h>
21 #endif
22
23
24 #ifndef CONV_ARCH_POST
25 #error CONV_ARCH_POST not defined
26 #endif
27
// Preprocessor helpers.
// PP_STRINGIFY turns its argument into a string literal without expanding it;
// PP_TOSTRING adds one level of indirection so that macro arguments are
// expanded first and the *result* is stringified.
#define PP_STRINGIFY(TOKEN) #TOKEN
#define PP_TOSTRING(TOKEN) PP_STRINGIFY(TOKEN)

// Token pasting with the same two-level trick: PP_CONCAT expands its
// arguments before PP_CONCAT_IMPL glues them together.
#define PP_CONCAT_IMPL(lhs, rhs) lhs##rhs
#define PP_CONCAT(lhs, rhs) PP_CONCAT_IMPL( lhs, rhs )

// Builds the per-architecture symbol name: <NAME>_<CONV_ARCH_POST>.
#define ARCHFUNCNAME(NAME) PP_CONCAT(NAME##_,CONV_ARCH_POST)
34
35
ARCHFUNCNAME(id)36 const char * ARCHFUNCNAME(id)()
37 {
38 return PP_TOSTRING(CONV_ARCH_POST);
39 }
40
41
ARCHFUNCNAME(conv_float_simd_size)42 int ARCHFUNCNAME(conv_float_simd_size)()
43 {
44 #if defined(MIPP_NO_INTRINSICS) || !defined(HAVE_MIPP)
45 // have a completely MIPP independent implementation
46 return 1;
47 #else
48 return mipp::N<float>();
49 #endif
50 }
51
52
ARCHFUNCNAME(conv_float_move_rest)53 void ARCHFUNCNAME(conv_float_move_rest)(float * RESTRICT s, conv_buffer_state * RESTRICT state)
54 {
55 int R = state->size - state->offset; // this many samples from prev conv_float were not processed
56 if (R > 0)
57 {
58 // memmove(s, &s[state->offset], R * sizeof(s[0])); // move them to the begin
59 std::copy(&s[state->offset], &s[state->size], s);
60 }
61 else
62 R = 0;
63 state->offset = 0; // data - to be processed - is at begin
64 state->size = R; // this many unprocessed samples
65 }
66
67
ARCHFUNCNAME(conv_cplx_move_rest)68 void ARCHFUNCNAME(conv_cplx_move_rest)(complexf * RESTRICT s, conv_buffer_state * RESTRICT state)
69 {
70 int R = state->size - state->offset; // this many samples from prev conv_float were not processed
71 if (R > 0)
72 {
73 // memmove(s, &s[state->offset], R * sizeof(s[0])); // move them to the begin
74 std::copy(&s[state->offset], &s[state->size], s);
75 }
76 else
77 R = 0;
78 state->offset = 0; // data - to be processed - is at begin
79 state->size = R; // this many unprocessed samples
80 }
81
82
83 #if defined(MIPP_NO_INTRINSICS)
84 // have a completely MIPP independent implementation
85 // #error missing HAVE_MIPP: there is no MIPP-independent implementation
86
ARCHFUNCNAME(conv_float_inplace)87 int ARCHFUNCNAME(conv_float_inplace)(
88 float * RESTRICT s, conv_buffer_state * RESTRICT state,
89 const float * RESTRICT filter, const int sz_filter
90 )
91 {
92 const int off0 = state->offset;
93 const int sz_s = state->size;
94 int offset;
95
96 for ( offset = off0; offset + sz_filter <= sz_s; ++offset)
97 {
98 float accu = 0.0F;
99 for (int k = 0; k < sz_filter; ++k)
100 accu += s[offset+k] * filter[k];
101 s[offset] = accu;
102 }
103
104 state->offset = offset;
105 return offset - off0;
106 }
107
108
ARCHFUNCNAME(conv_float_oop)109 int ARCHFUNCNAME(conv_float_oop)(
110 const float * RESTRICT s, conv_buffer_state * RESTRICT state,
111 const float * RESTRICT filter, const int sz_filter,
112 float * RESTRICT y
113 )
114 {
115 const int off0 = state->offset;
116 const int sz_s = state->size;
117 int offset;
118
119 for ( offset = off0; offset + sz_filter <= sz_s; ++offset)
120 {
121 float accu = 0.0F;
122 for (int k = 0; k < sz_filter; ++k)
123 accu += s[offset+k] * filter[k];
124 y[offset] = accu;
125 }
126
127 state->offset = offset;
128 return offset - off0;
129 }
130
131
ARCHFUNCNAME(conv_cplx_float_oop)132 int ARCHFUNCNAME(conv_cplx_float_oop)(
133 const complexf * RESTRICT s_cplx, conv_buffer_state * RESTRICT state,
134 const float * RESTRICT filter, const int sz_filter,
135 complexf * RESTRICT y_cplx
136 )
137 {
138 const int off0 = state->offset;
139 const int sz_s = state->size;
140 const int sz_f = sz_filter;
141 int offset;
142
143 for ( offset = off0; offset + sz_f <= sz_s; ++offset)
144 {
145 float accu_re = 0.0F;
146 float accu_im = 0.0F;
147 for (int k = 0; k < sz_filter; ++k)
148 {
149 accu_re = s_cplx[offset+k].i * filter[k]; // accu += rS * rH;
150 accu_im = s_cplx[offset+k].q * filter[k]; // accu += rS * rH;
151 }
152 y_cplx[offset].i = accu_re; // == hadd() == sum of real parts
153 y_cplx[offset].q = accu_im; // == hadd() == sum of imag parts
154 }
155
156 state->offset = offset;
157 return offset - off0;
158 }
159
160
161 #elif defined(HAVE_MIPP)
162
163
ARCHFUNCNAME(conv_float_inplace)164 int ARCHFUNCNAME(conv_float_inplace)(
165 float * RESTRICT s, conv_buffer_state * RESTRICT state,
166 const float * RESTRICT filter, const int sz_filter
167 )
168 {
169 assert( (sz_filter % mipp::N<float>()) == 0 ); // size of filter must be divisible by conv_float_simd_size()
170
171 mipp::Reg<float> accu, rS, rH;
172 const int off0 = state->offset;
173 const int sz_s = state->size;
174 int offset;
175
176 for ( offset = off0; offset + sz_filter <= sz_s; ++offset)
177 {
178 accu.set0();
179 for (int k = 0; k < sz_filter; k += mipp::N<float>())
180 {
181 rS.load(&s[offset+k]);
182 rH.load(&filter[k]);
183 accu = mipp::fmadd(rS, rH, accu); // accu += rS * rH;
184 }
185 s[offset] = accu.sum(); // == hadd()
186 }
187
188 state->offset = offset;
189 return offset - off0;
190 }
191
192
ARCHFUNCNAME(conv_float_oop)193 int ARCHFUNCNAME(conv_float_oop)(
194 const float * RESTRICT s, conv_buffer_state * RESTRICT state,
195 const float * RESTRICT filter, const int sz_filter,
196 float * RESTRICT y
197 )
198 {
199 assert( (sz_filter % mipp::N<float>()) == 0 ); // size of filter must be divisible by conv_float_simd_size()
200
201 mipp::Reg<float> accu, rS, rH;
202 const int off0 = state->offset;
203 const int sz_s = state->size;
204 int offset;
205
206 for ( offset = off0; offset + sz_filter <= sz_s; ++offset)
207 {
208 accu.set0();
209 for (int k = 0; k < sz_filter; k += mipp::N<float>())
210 {
211 rS.loadu(&s[offset+k]);
212 rH.load(&filter[k]);
213 accu = mipp::fmadd(rS, rH, accu); // accu += rS * rH;
214 }
215 y[offset] = accu.sum(); // == hadd()
216 }
217
218 state->offset = offset;
219 return offset - off0;
220 }
221
222
ARCHFUNCNAME(conv_cplx_float_oop)223 int ARCHFUNCNAME(conv_cplx_float_oop)(
224 const complexf * RESTRICT s_cplx, conv_buffer_state * RESTRICT state,
225 const float * RESTRICT filter, const int sz_filter,
226 complexf * RESTRICT y_cplx
227 )
228 {
229 assert( (sz_filter % mipp::N<float>()) == 0 ); // size of filter must be divisible by conv_float_simd_size()
230 const float * RESTRICT s = &(s_cplx[0].i);
231 float * RESTRICT y = &(y_cplx[0].i);
232
233 mipp::Regx2<float> accu_x2, rS_x2, H_x2;
234 const int off0 = 2 * state->offset;
235 const int sz_s = 2 * state->size;
236 const int sz_f2 = 2 * sz_filter;
237 int offset;
238
239 for ( offset = off0; offset + sz_f2 <= sz_s; offset += 2)
240 {
241 accu_x2.val[0].set0();
242 accu_x2.val[1].set0();
243 for (int k = 0; k < sz_filter; k += mipp::N<float>())
244 {
245 mipp::Reg<float> rH;
246 rS_x2.loadu(&s[offset+2*k]);
247 rH.load(&filter[k]);
248 H_x2 = mipp::interleave<float>(rH, rH);
249 accu_x2.val[0] = mipp::fmadd(rS_x2.val[0], H_x2.val[0], accu_x2.val[0]); // accu += rS * rH;
250 accu_x2.val[1] = mipp::fmadd(rS_x2.val[1], H_x2.val[1], accu_x2.val[1]); // accu += rS * rH;
251 }
252 H_x2 = mipp::deinterleave(accu_x2);
253 y[offset] = H_x2.val[0].sum(); // == hadd() == sum of real parts
254 y[offset+1] = H_x2.val[1].sum(); // == hadd() == sum of imag parts
255 }
256
257 state->offset = offset /2;
258 return (offset - off0) / 2;
259 }
260
261 #endif
262
263
264 static const conv_f_ptrs conv_ptrs =
265 {
266 PP_TOSTRING(CONV_ARCH_POST),
267 #ifndef MIPP_NO_INTRINSICS
268 1,
269 #else
270 0,
271 #endif
272
273 ARCHFUNCNAME(id),
274 ARCHFUNCNAME(conv_float_simd_size),
275
276 #if defined(MIPP_NO_INTRINSICS) || defined(HAVE_MIPP)
277 ARCHFUNCNAME(conv_float_move_rest),
278 ARCHFUNCNAME(conv_float_inplace),
279 ARCHFUNCNAME(conv_float_oop),
280
281 ARCHFUNCNAME(conv_cplx_move_rest),
282 ARCHFUNCNAME(conv_cplx_float_oop)
283 #else
284 nullptr,
285 nullptr,
286 nullptr,
287
288 nullptr,
289 nullptr
290 #endif
291 };
292
293
ARCHFUNCNAME(conv_ptrs)294 const conv_f_ptrs* ARCHFUNCNAME(conv_ptrs)()
295 {
296 DPRINT("arch pointer for '%s':\n", conv_ptrs.id);
297 if (!strcmp(conv_ptrs.id, "none"))
298 return &conv_ptrs;
299
300 #if defined(MIPP_NO_INTRINSICS)
301 DPRINT("arch pointer for '%s' - BUT defined(MIPP_NO_INTRINSICS)\n", conv_ptrs.id);
302 return &conv_ptrs;
303 #elif defined(HAVE_MIPP)
304 DPRINT("arch pointer for '%s' - defined(HAVE_MIPP)\n", conv_ptrs.id);
305 DPRINT("'%s': conv_ptrs.using_mipp %d\n", conv_ptrs.id, conv_ptrs.using_mipp);
306 DPRINT("'%s': simd_size() %d\n", conv_ptrs.id, conv_ptrs.fp_conv_float_simd_size());
307 if (conv_ptrs.using_mipp && conv_ptrs.fp_conv_float_simd_size() > 1)
308 return &conv_ptrs;
309 else
310 DPRINT("arch pointer for '%s': HAVE_MIPP BUT using_mipp %d, float_simd_size %d\n", conv_ptrs.id, conv_ptrs.using_mipp, conv_ptrs.fp_conv_float_simd_size());
311 #else
312 DPRINT("arch pointer for '%s': neither MIPP_NO_INTRINSICS nor HAVE_MIPP\n", conv_ptrs.id);
313 #endif
314 DPRINT("arch pointer for '%s' => nullptr\n", conv_ptrs.id);
315 return nullptr;
316 }
317
318 #if defined(__cplusplus) && (__cplusplus >= 201703L)
319 [[maybe_unused]]
320 #endif
321 static f_conv_ptrs test_f_ptrs = ARCHFUNCNAME(conv_ptrs);
322
323