• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 
2 #include <math.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include <assert.h>
6 
7 #include <algorithm>
8 #include <random>
9 #include <cstdint>
10 #include <complex>
11 
12 #include "papi_perf_counter.h"
13 
14 //#if defined(HAVE_MIPP) && !defined(NO_MIPP)
15 #if defined(HAVE_MIPP)
16 #include <mipp.h>
17 
18 #define MIPP_VECTOR  mipp::vector
19 #else
20 #define MIPP_VECTOR  std::vector
21 #endif
22 
23 #include "pf_conv_dispatcher.h"
24 #include "pf_conv.h"
25 
26 
27 #define TEST_WITH_MIN_LEN     0
28 
29 
generate_rng_vec(int M,int N=-1,int seed_value=1)30 MIPP_VECTOR<float> generate_rng_vec(int M, int N = -1, int seed_value = 1)
31 {
32     MIPP_VECTOR<float> v(N < 0 ? M : N);
33     std::mt19937 g;
34     g.seed(seed_value);
35     constexpr float scale = 1.0F / (1.0F + float(INT_FAST32_MAX));
36     for (int k = 0; k < M; ++k)
37         v[k] = float(int_fast32_t(g())) * scale;
38     for (int k = M; k < N; ++k)
39         v[k] = 0.0F;
40     return v;
41 }
42 
43 
bench_oop_core(const conv_f_ptrs & conv_arch,const float * signal,const int sz_signal,const float * filter,const int sz_filter,const int blockLen,float * y)44 int bench_oop_core(
45         const conv_f_ptrs & conv_arch,
46         const float * signal, const int sz_signal,
47         const float * filter, const int sz_filter,
48         const int blockLen,
49         float * y
50         )
51 {
52     conv_buffer_state state;
53     const auto conv_oop = conv_arch.fp_conv_float_oop;
54     int n_out_sum = 0;
55     state.offset = 0;
56     state.size = 0;
57     papi_perf_counter perf_counter(1);
58     for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
59     {
60         state.size += blockLen;
61         int n_out = conv_oop(signal, &state, filter, sz_filter, y);
62         n_out_sum += n_out;
63     }
64     return n_out_sum;
65 }
66 
bench_inplace_core(const conv_f_ptrs & conv_arch,float * signal,const int sz_signal,const float * filter,const int sz_filter,const int blockLen)67 int bench_inplace_core(
68         const conv_f_ptrs & conv_arch,
69         float * signal, const int sz_signal,
70         const float * filter, const int sz_filter,
71         const int blockLen
72         )
73 {
74     conv_buffer_state state;
75     const auto conv_inplace = conv_arch.fp_conv_float_inplace;
76     int n_out_sum = 0;
77     state.offset = 0;
78     state.size = 0;
79     papi_perf_counter perf_counter(1);
80     for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
81     {
82         state.size += blockLen;
83         int n_out = conv_inplace(signal, &state, filter, sz_filter);
84         n_out_sum += n_out;
85     }
86     return n_out_sum;
87 }
88 
89 
bench_oop(const conv_f_ptrs & conv_arch,float * buffer,const float * signal,const int sz_signal,const float * filter,const int sz_filter,const int blockLen,float * y)90 int bench_oop(
91         const conv_f_ptrs & conv_arch,
92         float * buffer,
93         const float * signal, const int sz_signal,
94         const float * filter, const int sz_filter,
95         const int blockLen,
96         float * y
97         )
98 {
99     conv_buffer_state state;
100     const auto conv_oop = conv_arch.fp_conv_float_oop;
101     const auto move_rest = conv_arch.fp_conv_float_move_rest;
102     int n_out_sum = 0;
103     state.offset = 0;
104     state.size = 0;
105     papi_perf_counter perf_counter(1);
106     for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
107     {
108         move_rest(buffer, &state);
109         //memcpy(buffer+state.size, &s[off], B * sizeof(s[0]));
110         std::copy(&signal[off], &signal[off+blockLen], buffer+state.size);
111         state.size += blockLen;
112         int n_out = conv_oop(buffer, &state, filter, sz_filter, &y[n_out_sum]);
113         n_out_sum += n_out;
114     }
115     return n_out_sum;
116 }
117 
bench_cx_real_oop(const conv_f_ptrs & conv_arch,complexf * buffer,const float * signal_re,const int sz_signal_re,const float * filter,const int sz_filter,const int blockLen,float * y_re)118 int bench_cx_real_oop(
119         const conv_f_ptrs & conv_arch,
120         complexf * buffer,
121         const float * signal_re, const int sz_signal_re,
122         const float * filter, const int sz_filter,
123         const int blockLen,
124         float * y_re
125         )
126 {
127     conv_buffer_state state;
128     const auto conv_oop = conv_arch.fp_conv_cplx_float_oop;
129     const auto move_rest = conv_arch.fp_conv_cplx_move_rest;
130     // interpret buffer, signal and output vector y  as complex data
131     complexf * y = reinterpret_cast<complexf *>(y_re);
132     const complexf * signal = reinterpret_cast<const complexf *>(signal_re);
133     const int sz_signal = sz_signal_re / 2;
134     int n_out_sum = 0;
135     state.offset = 0;
136     state.size = 0;
137     papi_perf_counter perf_counter(1);
138     for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
139     {
140         move_rest(buffer, &state);
141         //memcpy(buffer+state.size, &s[off], B * sizeof(s[0]));
142         std::copy(&signal[off], &signal[off+blockLen], &buffer[state.size]);
143         state.size += blockLen;
144         int n_out = conv_oop(buffer, &state, filter, sz_filter, &y[n_out_sum]);
145         n_out_sum += n_out;
146     }
147     return n_out_sum;
148 }
149 
150 
main(int argc,char * argv[])151 int main(int argc, char *argv[])
152 {
153     // cli defaults:
154     // process up to 64 MSample (512 MByte) in blocks of 1 kSamples (=64 kByte) with filterLen 128
155     int arch = 0, N = 64 * 1024 * 1024;
156     int filterLen = 128, blockLen = 1024;
157     int seed_sig = 1, seed_filter = 2;
158     bool verbose = false, exitFromUsage = false, showUsage = (argc <= 1);
159 
160     for (int i = 1; i < argc; ++i)
161     {
162         if (i+1 < argc && !strcmp(argv[i], "-a"))
163             arch = atoi(argv[++i]);
164         else if (i+1 < argc && !strcmp(argv[i], "-n"))
165             N = atoi(argv[++i]) * 1024 * 1024;
166         else if (i+1 < argc && !strcmp(argv[i], "-f"))
167             filterLen = atoi(argv[++i]);
168         else if (i+1 < argc && !strcmp(argv[i], "-b"))
169             blockLen = atoi(argv[++i]);
170         else if (i+1 < argc && !strcmp(argv[i], "-ss"))
171             seed_sig = atoi(argv[++i]);
172         else if (i+1 < argc && !strcmp(argv[i], "-sf"))
173             seed_filter = atoi(argv[++i]);
174         else if (!strcmp(argv[i], "-v"))
175             verbose = true;
176         else if (!strcmp(argv[i], "-h"))
177             showUsage = exitFromUsage = true;
178         else
179             fprintf(stderr, "warning: ignoring/skipping unknown option '%s'\n", argv[i]);
180     }
181 
182     int num_arch = 0;
183     const ptr_to_conv_f_ptrs * conv_arch_ptrs = get_all_conv_arch_ptrs(&num_arch);
184 
185     if (verbose)
186     {
187         fprintf(stderr, "num_arch is %d\n", num_arch);
188         for (int a = 0; a < num_arch; ++a)
189             if (conv_arch_ptrs[a])
190                 fprintf(stderr, " arch %d is '%s'\n", a, conv_arch_ptrs[a]->id );
191             else
192                 fprintf(stderr, " arch %d is nullptr !!!\n", a );
193         fprintf(stderr, "\n");
194     }
195 
196     if ( arch < 0 || arch >= num_arch || !blockLen || !N || !filterLen || showUsage )
197     {
198         fprintf(stderr, "%s [-v] [-a <arch>] [-n <total # of MSamples> [-f <filter length>] [-b <blockLength in samples>]\n", argv[0]);
199         fprintf(stderr, "    [-ss <random seed for signal>] [-sf <random seed for filter coeffs>]\n");
200         fprintf(stderr, "arch is one of:");
201         for (int a = 0; a < num_arch; ++a)
202             if (conv_arch_ptrs[a])
203                 fprintf(stderr, " %d for '%s'%s", a, conv_arch_ptrs[a]->id, (a < num_arch-1 ? ",":"") );
204         fprintf(stderr, "\n");
205         if ( exitFromUsage || !blockLen || !N || !filterLen || arch < 0 || arch >= num_arch )
206             return 0;
207     }
208 
209     if (verbose)
210     {
211         #ifdef HAVE_PAPI
212         fprintf(stderr, "PAPI is available\n");
213         #else
214         fprintf(stderr, "PAPI is NOT available!\n");
215         #endif
216     }
217     #if !defined(HAVE_MIPP)
218     fprintf(stderr, "MIPP is NOT available!\n");
219     #endif
220 
221     //int float_simd_size[num_arch];
222     int max_simd_size = -1;
223     for (int a = 0; a < num_arch; ++a)
224     {
225         if (conv_arch_ptrs[a])
226         {
227             const int sz = conv_arch_ptrs[a]->fp_conv_float_simd_size();
228             //float_simd_size[a] = sz;
229             if (max_simd_size < sz)
230                 max_simd_size = sz;
231             if (verbose)
232                 fprintf(stderr, "float simd size for '%s': %d\n", conv_arch_ptrs[a]->id, sz);
233         }
234         //else
235         //    float_simd_size[a] = 0;
236     }
237     //const int max_simd_size = *std::max_element( &float_simd_size[0], &float_simd_size[num_arch] );
238     if (verbose)
239         fprintf(stderr, "max float simd size: %d\n", max_simd_size);
240 
241 #if TEST_WITH_MIN_LEN
242     filterLen = 2;
243 #endif
244 
245     // round up filter length
246     filterLen = max_simd_size * ( ( filterLen + max_simd_size -1 ) / max_simd_size );
247 
248 #if TEST_WITH_MIN_LEN
249     blockLen = 1;
250     N = 2 * (3 + filterLen);    // produce 3+1 samples
251 #endif
252 
253     if (!conv_arch_ptrs[arch])
254     {
255         fprintf(stderr, "Error: architecture %d is NOT available!\n", arch);
256         return 1;
257     }
258     const conv_f_ptrs & conv_arch =  *conv_arch_ptrs[arch];
259     if (verbose)
260         fprintf(stderr, "arch is using mipp: %d\n", conv_arch.using_mipp);
261 
262     fprintf(stderr, "processing N = %d MSamples with block length of %d samples with filter length %d taps on '%s'\n",
263         N / (1024 * 1024), blockLen, filterLen, conv_arch.id );
264 
265     MIPP_VECTOR<float> s = generate_rng_vec(N + 1, N + 1, seed_sig);
266     MIPP_VECTOR<float> y(N + 1, 0.0F);
267     MIPP_VECTOR<float> filter = generate_rng_vec(filterLen, filterLen, seed_filter);
268     MIPP_VECTOR<float> buffer(blockLen + filterLen + 1, 0.0F);
269     MIPP_VECTOR<complexf> buffer_cx(blockLen + filterLen + 1);
270 
271 #if 1 && TEST_WITH_MIN_LEN
272     for (int k = 0; k < N; ++k)
273         s[k] = (k+1);
274     for (int k = 0; k < filterLen; ++k)
275         filter[k] = (k+1);
276 #endif
277 
278     s[N] = 123.0F;
279     y[N] = 321.0F;
280     buffer[blockLen + filterLen] = 789.0F;
281     buffer_cx[blockLen + filterLen].i = 987.0F;
282 
283     fprintf(stderr, "\nrunning out-of-place convolution core for '%s':\n", conv_arch.id);
284     int n_oop_out = bench_oop_core(conv_arch, s.data(), N, filter.data(), filterLen, blockLen, y.data());
285     fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
286 #if TEST_WITH_MIN_LEN
287     for (int k = 0; k < n_oop_out; ++k )
288         fprintf(stderr, "y[%2d] = %g\n", k, y[k]);
289     fprintf(stderr, "\n");
290 #endif
291 
292     fprintf(stderr, "\nrunning out-of-place convolution for '%s':\n", conv_arch.id);
293     n_oop_out = bench_oop(conv_arch, buffer.data(), s.data(), N, filter.data(), filterLen, blockLen, y.data());
294     fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
295     assert(s[N] == 123.0F);
296     assert(y[N] == 321.0F);
297     assert(buffer[blockLen + filterLen] == 789.0F);
298     assert(buffer_cx[blockLen + filterLen].i == 987.0F);
299 #if TEST_WITH_MIN_LEN
300     for (int k = 0; k < n_oop_out; ++k )
301         fprintf(stderr, "y[%2d] = %g\n", k, y[k]);
302     fprintf(stderr, "\n");
303 #endif
304 
305     fprintf(stderr, "\nrunning out-of-place complex/real convolution for '%s':\n", conv_arch.id);
306     n_oop_out = bench_cx_real_oop(conv_arch, buffer_cx.data(), s.data(), N, filter.data(), filterLen, blockLen, y.data());
307     fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
308     assert(s[N] == 123.0F);
309     assert(y[N] == 321.0F);
310     assert(buffer[blockLen + filterLen] == 789.0F);
311     assert(buffer_cx[blockLen + filterLen].i == 987.0F);
312 #if TEST_WITH_MIN_LEN
313     fprintf(stderr, "complex output (%d complex samples):\n", n_oop_out);
314     for (int k = 0; k < n_oop_out; ++k )
315         fprintf(stderr, "y[%2d] = %g  %+g * i\n", k, y[2*k], y[2*k+1]);
316     fprintf(stderr, "\n");
317 
318     const std::complex<float> * sc = reinterpret_cast< std::complex<float>* >( s.data() );
319     const int Nc = N /2;
320     fprintf(stderr, "reference with std::complex<float>:\n");
321     for (int off = 0; off +filterLen <= Nc; ++off )
322     {
323         std::complex<float> sum(0.0F, 0.0F);
324         for (int k=0; k < filterLen; ++k)
325             sum += sc[off+k] * filter[k];
326         fprintf(stderr, "yv[%2d] = %g  %+g * i\n", off, sum.real(), sum.imag() );
327     }
328 #endif
329 
330     fprintf(stderr, "\nrunning inplace convolution core for '%s':\n", conv_arch.id);
331     int n_inp_out = bench_inplace_core(conv_arch, s.data(), N, filter.data(), filterLen, blockLen);
332     fprintf(stderr, "inp produced %d output samples\n", n_inp_out);
333     assert(s[N] == 123.0F);
334     assert(y[N] == 321.0F);
335     assert(buffer[blockLen + filterLen] == 789.0F);
336     assert(buffer_cx[blockLen + filterLen].i == 987.0F);
337 #if TEST_WITH_MIN_LEN
338     for (int k = 0; k < n_inp_out; ++k )
339         fprintf(stderr, "y[%2d] = %g\n", k, s[k]);
340     fprintf(stderr, "\n");
341 #endif
342 
343     fprintf(stderr, "\n");
344     return 0;
345 }
346