#include "pf_conv.h" #include #include #include #if 0 #include #define DPRINT(...) fprintf(stderr, __VA_ARGS__) #else #define DPRINT(...) do { } while (0) #endif #ifdef HAVE_MIPP #include #endif #ifndef CONV_ARCH_POST #error CONV_ARCH_POST not defined #endif #define PP_STRINGIFY(X) #X #define PP_TOSTRING(X) PP_STRINGIFY(X) #define PP_CONCAT_IMPL(x, y) x##y #define PP_CONCAT(x, y) PP_CONCAT_IMPL( x, y ) #define ARCHFUNCNAME(X) PP_CONCAT(X##_,CONV_ARCH_POST) const char * ARCHFUNCNAME(id)() { return PP_TOSTRING(CONV_ARCH_POST); } int ARCHFUNCNAME(conv_float_simd_size)() { #if defined(MIPP_NO_INTRINSICS) || !defined(HAVE_MIPP) // have a completely MIPP independent implementation return 1; #else return mipp::N(); #endif } void ARCHFUNCNAME(conv_float_move_rest)(float * RESTRICT s, conv_buffer_state * RESTRICT state) { int R = state->size - state->offset; // this many samples from prev conv_float were not processed if (R > 0) { // memmove(s, &s[state->offset], R * sizeof(s[0])); // move them to the begin std::copy(&s[state->offset], &s[state->size], s); } else R = 0; state->offset = 0; // data - to be processed - is at begin state->size = R; // this many unprocessed samples } void ARCHFUNCNAME(conv_cplx_move_rest)(complexf * RESTRICT s, conv_buffer_state * RESTRICT state) { int R = state->size - state->offset; // this many samples from prev conv_float were not processed if (R > 0) { // memmove(s, &s[state->offset], R * sizeof(s[0])); // move them to the begin std::copy(&s[state->offset], &s[state->size], s); } else R = 0; state->offset = 0; // data - to be processed - is at begin state->size = R; // this many unprocessed samples } #if defined(MIPP_NO_INTRINSICS) // have a completely MIPP independent implementation // #error missing HAVE_MIPP: there is no MIPP-independent implementation int ARCHFUNCNAME(conv_float_inplace)( float * RESTRICT s, conv_buffer_state * RESTRICT state, const float * RESTRICT filter, const int sz_filter ) { const int off0 = state->offset; const int sz_s = state->size; int offset; for ( offset = off0; offset + sz_filter <= sz_s; ++offset) { float accu = 0.0F; for (int k = 0; k < sz_filter; ++k) accu += s[offset+k] * filter[k]; s[offset] = accu; } state->offset = offset; return offset - off0; } int ARCHFUNCNAME(conv_float_oop)( const float * RESTRICT s, conv_buffer_state * RESTRICT state, const float * RESTRICT filter, const int sz_filter, float * RESTRICT y ) { const int off0 = state->offset; const int sz_s = state->size; int offset; for ( offset = off0; offset + sz_filter <= sz_s; ++offset) { float accu = 0.0F; for (int k = 0; k < sz_filter; ++k) accu += s[offset+k] * filter[k]; y[offset] = accu; } state->offset = offset; return offset - off0; } int ARCHFUNCNAME(conv_cplx_float_oop)( const complexf * RESTRICT s_cplx, conv_buffer_state * RESTRICT state, const float * RESTRICT filter, const int sz_filter, complexf * RESTRICT y_cplx ) { const int off0 = state->offset; const int sz_s = state->size; const int sz_f = sz_filter; int offset; for ( offset = off0; offset + sz_f <= sz_s; ++offset) { float accu_re = 0.0F; float accu_im = 0.0F; for (int k = 0; k < sz_filter; ++k) { accu_re = s_cplx[offset+k].i * filter[k]; // accu += rS * rH; accu_im = s_cplx[offset+k].q * filter[k]; // accu += rS * rH; } y_cplx[offset].i = accu_re; // == hadd() == sum of real parts y_cplx[offset].q = accu_im; // == hadd() == sum of imag parts } state->offset = offset; return offset - off0; } #elif defined(HAVE_MIPP) int ARCHFUNCNAME(conv_float_inplace)( float * RESTRICT s, conv_buffer_state * RESTRICT state, const float * RESTRICT filter, const int sz_filter ) { assert( (sz_filter % mipp::N()) == 0 ); // size of filter must be divisible by conv_float_simd_size() mipp::Reg accu, rS, rH; const int off0 = state->offset; const int sz_s = state->size; int offset; for ( offset = off0; offset + sz_filter <= sz_s; ++offset) { accu.set0(); for (int k = 0; k < sz_filter; k += mipp::N()) { rS.load(&s[offset+k]); rH.load(&filter[k]); accu = mipp::fmadd(rS, rH, accu); // accu += rS * rH; } s[offset] = accu.sum(); // == hadd() } state->offset = offset; return offset - off0; } int ARCHFUNCNAME(conv_float_oop)( const float * RESTRICT s, conv_buffer_state * RESTRICT state, const float * RESTRICT filter, const int sz_filter, float * RESTRICT y ) { assert( (sz_filter % mipp::N()) == 0 ); // size of filter must be divisible by conv_float_simd_size() mipp::Reg accu, rS, rH; const int off0 = state->offset; const int sz_s = state->size; int offset; for ( offset = off0; offset + sz_filter <= sz_s; ++offset) { accu.set0(); for (int k = 0; k < sz_filter; k += mipp::N()) { rS.loadu(&s[offset+k]); rH.load(&filter[k]); accu = mipp::fmadd(rS, rH, accu); // accu += rS * rH; } y[offset] = accu.sum(); // == hadd() } state->offset = offset; return offset - off0; } int ARCHFUNCNAME(conv_cplx_float_oop)( const complexf * RESTRICT s_cplx, conv_buffer_state * RESTRICT state, const float * RESTRICT filter, const int sz_filter, complexf * RESTRICT y_cplx ) { assert( (sz_filter % mipp::N()) == 0 ); // size of filter must be divisible by conv_float_simd_size() const float * RESTRICT s = &(s_cplx[0].i); float * RESTRICT y = &(y_cplx[0].i); mipp::Regx2 accu_x2, rS_x2, H_x2; const int off0 = 2 * state->offset; const int sz_s = 2 * state->size; const int sz_f2 = 2 * sz_filter; int offset; for ( offset = off0; offset + sz_f2 <= sz_s; offset += 2) { accu_x2.val[0].set0(); accu_x2.val[1].set0(); for (int k = 0; k < sz_filter; k += mipp::N()) { mipp::Reg rH; rS_x2.loadu(&s[offset+2*k]); rH.load(&filter[k]); H_x2 = mipp::interleave(rH, rH); accu_x2.val[0] = mipp::fmadd(rS_x2.val[0], H_x2.val[0], accu_x2.val[0]); // accu += rS * rH; accu_x2.val[1] = mipp::fmadd(rS_x2.val[1], H_x2.val[1], accu_x2.val[1]); // accu += rS * rH; } H_x2 = mipp::deinterleave(accu_x2); y[offset] = H_x2.val[0].sum(); // == hadd() == sum of real parts y[offset+1] = H_x2.val[1].sum(); // == hadd() == sum of imag parts } state->offset = offset /2; return (offset - off0) / 2; } #endif static const conv_f_ptrs conv_ptrs = { PP_TOSTRING(CONV_ARCH_POST), #ifndef MIPP_NO_INTRINSICS 1, #else 0, #endif ARCHFUNCNAME(id), ARCHFUNCNAME(conv_float_simd_size), #if defined(MIPP_NO_INTRINSICS) || defined(HAVE_MIPP) ARCHFUNCNAME(conv_float_move_rest), ARCHFUNCNAME(conv_float_inplace), ARCHFUNCNAME(conv_float_oop), ARCHFUNCNAME(conv_cplx_move_rest), ARCHFUNCNAME(conv_cplx_float_oop) #else nullptr, nullptr, nullptr, nullptr, nullptr #endif }; const conv_f_ptrs* ARCHFUNCNAME(conv_ptrs)() { DPRINT("arch pointer for '%s':\n", conv_ptrs.id); if (!strcmp(conv_ptrs.id, "none")) return &conv_ptrs; #if defined(MIPP_NO_INTRINSICS) DPRINT("arch pointer for '%s' - BUT defined(MIPP_NO_INTRINSICS)\n", conv_ptrs.id); return &conv_ptrs; #elif defined(HAVE_MIPP) DPRINT("arch pointer for '%s' - defined(HAVE_MIPP)\n", conv_ptrs.id); DPRINT("'%s': conv_ptrs.using_mipp %d\n", conv_ptrs.id, conv_ptrs.using_mipp); DPRINT("'%s': simd_size() %d\n", conv_ptrs.id, conv_ptrs.fp_conv_float_simd_size()); if (conv_ptrs.using_mipp && conv_ptrs.fp_conv_float_simd_size() > 1) return &conv_ptrs; else DPRINT("arch pointer for '%s': HAVE_MIPP BUT using_mipp %d, float_simd_size %d\n", conv_ptrs.id, conv_ptrs.using_mipp, conv_ptrs.fp_conv_float_simd_size()); #else DPRINT("arch pointer for '%s': neither MIPP_NO_INTRINSICS nor HAVE_MIPP\n", conv_ptrs.id); #endif DPRINT("arch pointer for '%s' => nullptr\n", conv_ptrs.id); return nullptr; } #if defined(__cplusplus) && (__cplusplus >= 201703L) [[maybe_unused]] #endif static f_conv_ptrs test_f_ptrs = ARCHFUNCNAME(conv_ptrs);