• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2020 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  * Unless required by applicable law or agreed to in writing, software
10  *
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include "intrinsic_utils.h"
20 
21 #include <algorithm>
22 #include <array>
23 #include <cmath>
24 #include <functional>
25 #include <utility>
26 #include <vector>
27 
28 #include <assert.h>
29 
30 // We conditionally include neon optimizations for ARM devices
31 #pragma push_macro("USE_NEON")
32 #undef USE_NEON
33 
34 #if defined(__ARM_NEON__) || defined(__aarch64__)
35 #include <arm_neon.h>
36 #define USE_NEON
37 #endif
38 
39 // Use dither to prevent subnormals for CPUs that raise an exception.
40 #pragma push_macro("USE_DITHER")
41 #undef USE_DITHER
42 
43 #if defined(__i386__) || defined(__x86_x64__)
44 #define USE_DITHER
45 #endif
46 
47 namespace android::audio_utils {
48 
49 static constexpr size_t kBiquadNumCoefs  = 5;
50 static constexpr size_t kBiquadNumDelays = 2;
51 
52 /**
53  * The BiquadDirect2Transpose is a low overhead
54  * Biquad filter with coefficients b0, b1, b2, a1, a2.
55  *
56  * This can be used by itself, but it is preferred for best data management
57  * to use the BiquadFilter abstraction below.
58  *
59  * T is the data type (scalar or vector).
60  * F is the filter coefficient type.  It is either a scalar or vector (matching T).
61  */
62 template <typename T, typename F>
63 struct BiquadDirect2Transpose {
64     F coef_[5]; // these are stored with the denominator a's negated.
65     T s_[2]; // delay states
66 
67     // These are the coefficient occupancies we optimize for (from b0, b1, b2, a1, a2)
68     // as expressed by a bitmask.
69     static inline constexpr size_t required_occupancies_[] = {
70         0x1,  // constant scale
71         0x3,  // single zero
72         0x7,  // double zero
73         0x9,  // single pole
74         0xb,  // (11) first order IIR
75         0x1b, // (27) double pole + single zero
76         0x1f, // (31) second order IIR (full Biquad)
77     };
78 
79     // Take care the order of arguments - starts with b's then goes to a's.
80     // The a's are "positive" reference, some filters take negative.
81     BiquadDirect2Transpose(const F& b0, const F& b1, const F& b2, const F& a1, const F& a2,
82             const T& s0 = {}, const T& s1 = {})
83         // : coef_{b0, b1, b2, -a1, -a2}
84         : coef_{ b0,
85             b1,
86             b2,
87             intrinsics::vneg(a1),
88             intrinsics::vneg(a2) }
89         , s_{ s0, s1 } {
90     }
91 
92     // D is the data type.  It must be the same element type of T or F.
93     // Take care the order of input and output.
94     template<typename D, size_t OCCUPANCY = 0x1f>
95     __attribute__((always_inline)) // required for 1ch speedup (30% faster)
processBiquadDirect2Transpose96     void process(D* output, const D* input, size_t frames, size_t stride) {
97         using namespace intrinsics;
98         // For SSE it is possible to vdup F to T if F is scalar.
99         const F b0 = coef_[0];         // b0
100         const F b1 = coef_[1];         // b1
101         const F b2 = coef_[2];         // b2
102         const F negativeA1 = coef_[3]; // -a1
103         const F negativeA2 = coef_[4]; // -a2
104         T s[2] = { s_[0], s_[1] };
105         T xn, yn; // OK to declare temps outside loop rather than at the point of initialization.
106 #ifdef USE_DITHER
107         constexpr D DITHER_VALUE = std::numeric_limits<float>::min() * (1 << 24); // use FLOAT
108         T dither = vdupn<T>(DITHER_VALUE); // NEON does not have vector + scalar acceleration.
109 #endif
110 
111         // Unroll control.  Make sure the constexpr remains constexpr :-).
112         constexpr size_t CHANNELS = sizeof(T) / sizeof(D);
113         constexpr size_t UNROLL_CHANNEL_LOWER_LIMIT = 2;   // below this won't be unrolled.
114         constexpr size_t UNROLL_CHANNEL_UPPER_LIMIT = 16;  // above this won't be unrolled.
115         constexpr size_t UNROLL_LOOPS = (CHANNELS >= UNROLL_CHANNEL_LOWER_LIMIT &&
116                 CHANNELS <= UNROLL_CHANNEL_UPPER_LIMIT) ? 2 : 1;
117         size_t remainder = 0;
118         if constexpr (UNROLL_LOOPS > 1) {
119             remainder = frames % UNROLL_LOOPS;
120             frames /= UNROLL_LOOPS;
121         }
122 
123         // For this lambda, attribute always_inline must be used to inline past CHANNELS > 4.
124         // The other alternative is to use a MACRO, but that doesn't read as well.
125         const auto KERNEL = [&]() __attribute__((always_inline)) {
126             xn = vld1<T>(input);
127             input += stride;
128 #ifdef USE_DITHER
129             xn = vadd(xn, dither);
130             dither = vneg(dither);
131 #endif
132 
133             yn = s[0];
134             if constexpr (OCCUPANCY >> 0 & 1) {
135                 yn = vmla(yn, b0, xn);
136             }
137             vst1(output, yn);
138             output += stride;
139 
140             s[0] = s[1];
141             if constexpr (OCCUPANCY >> 3 & 1) {
142                 s[0] = vmla(s[0], negativeA1, yn);
143             }
144             if constexpr (OCCUPANCY >> 1 & 1) {
145                 s[0] = vmla(s[0], b1, xn);
146             }
147             if constexpr (OCCUPANCY >> 2 & 1) {
148                 s[1] = vmul(b2, xn);
149             } else {
150                 s[1] = vdupn<T>(0.f);
151             }
152             if constexpr (OCCUPANCY >> 4 & 1) {
153                 s[1] = vmla(s[1], negativeA2, yn);
154             }
155         };
156 
157         while (frames > 0) {
158             #pragma unroll
159             for (size_t i = 0; i < UNROLL_LOOPS; ++i) {
160                 KERNEL();
161             }
162             frames--;
163         }
164         if constexpr (UNROLL_LOOPS > 1) {
165             for (size_t i = 0; i < remainder; ++i) {
166                 KERNEL();
167             }
168         }
169         s_[0] = s[0];
170         s_[1] = s[1];
171     }
172 };
173 
174 /**
175  * A state space formulation of filtering converts a n-th order difference equation update
176  * to a first order vector difference equation. For the Biquad filter, the state space form
177  * has improved numerical precision properties with poles near the unit circle as well as
178  * increased speed due to better parallelization of the state update [1][2].
179  *
180  * [1] Raph Levien: (observerable canonical form)
181  * https://github.com/google/music-synthesizer-for-android/blob/master/lab/biquad%20in%20two.ipynb
182  *
183  * [2] Julius O Smith III: (controllable canonical form)
184  * https://ccrma.stanford.edu/~jos/filters/State_Space_Filters.html
185  *
186  * The signal flow is as follows, where for scalar x and y, the matrix D is a scalar d.
187  *
188  *
189  *        +------[ d ]--------------------------+
190  *        |                         S           |
191  *  x ----+--[ B ]--(+)--[ z^-1 ]---+---[ C ]--(+)--- y
192  *                   |              |
193  *                   +----[ A ]-----+
194  *
195  * The 2nd order Biquad IIR coefficients are as follows in observerable canonical form:
196  *
197  * y = [C_1 C_2] | S_1 | + d x
198  *               | S_2 |
199  *
200  *
201  * | S_1 | = | A_11 A_12 | | S_1 | + | B_1 | x
202  * | S_2 |   | A_21 A_22 | | S_2 |   | B_2 |
203  *
204  *
205  * A_11 = -a1
206  * A_12 = 1
207  * A_21 = -a2
208  * A_22 = 0
209  *
210  * B_1 = b1 - b0 * a1
211  * B_2 = b2 - b0 * a2
212  *
213  * C_1 = 1
214  * C_2 = 0
215  *
216  * d = b0
217  *
218  * Implementation details: The state space filter is typically expressed in either observable or
219  * controllable canonical form [3].  We follow the observable form here.
220  * Raph [4] discovered that the single channel Biquad implementation can be further optimized
221  * by doing 2 filter updates at once (improving speed on NEON by about 20%).
222  * Doing 2 updates at once costs 8 multiplies / sample instead of 5 multiples / sample,
223  * but uses a 4 x 4 matrix multiply, exploiting single cycle multiply-add SIMD throughput.
224  *
225  * [3] Mathworks
226  * https://www.mathworks.com/help/control/ug/canonical-state-space-realizations.html
227  * [4] Raph Levien
228  * https://github.com/kysko/music-synthesizer-for-android/blob/master/lab/biquad%20in%20two.ipynb
229  *
230  * The template variables
231  * T is the data type (scalar or vector).
232  * F is the filter coefficient type.  It is either a scalar or vector (matching T).
233  */
234 template <typename T, typename F, bool SEPARATE_CHANNEL_OPTIMIZATION = false>
235 struct BiquadStateSpace {
236     F coef_[5]; // these are stored as state-space converted.
237     T s_[2]; // delay states
238 
239     // These are the coefficient occupancies we optimize for (from b0, b1, b2, a1, a2)
240     // as expressed by a bitmask.  This must include 31.
241     static inline constexpr size_t required_occupancies_[] = {
242         1,  // constant scale
243         3,  // single zero
244         7,  // double zero
245         9,  // single pole
246         11, // first order IIR
247         27, // double pole + single zero
248         31, // second order IIR (full Biquad)
249     };
250 
251     // Take care the order of arguments - starts with b's then goes to a's.
252     // The a's are "positive" reference, some filters take negative.
253     BiquadStateSpace(const F& b0, const F& b1, const F& b2, const F& a1, const F& a2,
254             const T& s0 = {}, const T& s1 = {})
255         // : coef_{b0, b1 - b0 * a1, b2 - b0 * a2, -a1, -a2}
256         : coef_{ b0,
257             intrinsics::vsub(b1, intrinsics::vmul(b0, a1)),
258             intrinsics::vsub(b2, intrinsics::vmul(b0, a2)),
259             intrinsics::vneg(a1),
260             intrinsics::vneg(a2) }
261         , s_{s0, s1} {
262     }
263 
264     // D is the data type.  It must be the same element type of T or F.
265     // Take care the order of input and output.
266     template<typename D, size_t OCCUPANCY = 0x1f>
processBiquadStateSpace267     void process(D* output, const D* input, size_t frames, size_t stride) {
268         using namespace intrinsics;
269         const F b0 = coef_[0];         // b0
270         const F b1ss = coef_[1];       // b1 - b0 * a1,
271         const F b2ss = coef_[2];       // b2 - b0 * a2,
272         const F negativeA1 = coef_[3]; // -a1
273         const F negativeA2 = coef_[4]; // -a2
274         T s[2] = { s_[0], s_[1] };
275         T x, new_s0; // OK to declare temps here rather than at the point of initialization.
276 #ifdef USE_DITHER
277         constexpr D DITHER_VALUE = std::numeric_limits<float>::min() * (1 << 24); // use FLOAT
278         T dither = vdupn<T>(DITHER_VALUE); // NEON does not have vector + scalar acceleration.
279 #endif
280         constexpr bool b0_present = (OCCUPANCY & 0x1) != 0;
281         constexpr bool a1_present = (OCCUPANCY & 0x8) != 0;
282         constexpr bool a2_present = (OCCUPANCY & 0x10) != 0;
283         constexpr bool b1ss_present = (OCCUPANCY & 0x2) != 0 ||
284                 (b0_present && a1_present);
285         constexpr bool b2ss_present = (OCCUPANCY & 0x4) != 0 ||
286                 (b0_present && a2_present);
287 
288         // Unroll control.  Make sure the constexpr remains constexpr :-).
289         constexpr size_t CHANNELS = sizeof(T) / sizeof(D);
290         constexpr size_t UNROLL_CHANNEL_LOWER_LIMIT = 1;   // below this won't be unrolled.
291         constexpr size_t UNROLL_CHANNEL_UPPER_LIMIT = 16;  // above this won't be unrolled.
292         constexpr size_t UNROLL_LOOPS = (CHANNELS >= UNROLL_CHANNEL_LOWER_LIMIT &&
293                 CHANNELS <= UNROLL_CHANNEL_UPPER_LIMIT) ? 2 : 1;
294 
295         if constexpr (SEPARATE_CHANNEL_OPTIMIZATION && CHANNELS == 1 && OCCUPANCY >= 11) {
296             // Special acceleration which computes 2 samples at a time.
297             // see reference [4] for computation of this matrix.
298             intrinsics::internal_array_t<T, 4> A[4] = {
299                 {b0, b1ss, negativeA1 * b1ss + b2ss, negativeA2 * b1ss},
300                 {0, b0, b1ss, b2ss},
301                 {1, negativeA1, negativeA2 + negativeA1 * negativeA1, negativeA1 * negativeA2},
302                 {0, 1, negativeA1, negativeA2},
303                 };
304             intrinsics::internal_array_t<T, 4> y;
305             while (frames > 1) {
306                 x = vld1<T>(input);
307                 input += stride;
308 #ifdef USE_DITHER
309                 x = vadd(x, dither);
310                 dither = vneg(dither);
311 #endif
312                 y = vmul(A[0], x);
313 
314                 x = vld1<T>(input);
315                 input += stride;
316 #ifdef USE_DITHER
317                 x = vadd(x, dither);
318                 dither = vneg(dither);
319 #endif
320                 y = vmla(y, A[1], x);
321                 y = vmla(y, A[2], s[0]);
322                 y = vmla(y, A[3], s[1]);
323 
324                 vst1(output, y.v[0]);
325                 output += stride;
326 
327                 vst1(output, y.v[1]);
328                 output += stride;
329 
330                 s[0] = y.v[2];
331                 s[1] = y.v[3];
332                 frames -= 2;
333             }
334             if (frames == 0) {
335                 s_[0] = s[0];
336                 s_[1] = s[1];
337                 return;
338             }
339         }
340 
341         size_t remainder = 0;
342         if constexpr (UNROLL_LOOPS > 1) {
343             remainder = frames % UNROLL_LOOPS;
344             frames /= UNROLL_LOOPS;
345         }
346 
347         // For this lambda, attribute always_inline must be used to inline past CHANNELS > 4.
348         // The other alternative is to use a MACRO, but that doesn't read as well.
349         const auto KERNEL = [&]() __attribute__((always_inline)) {
350             x = vld1<T>(input);
351             input += stride;
352 #ifdef USE_DITHER
353             x = vadd(x, dither);
354             dither = vneg(dither);
355 #endif
356             // vst1(output, vadd(s[0], vmul(b0, x)));
357             // output += stride;
358             // new_s0 = vadd(vadd(vmul(b1ss, x), vmul(negativeA1, s[0])), s[1]);
359             // s[1] = vadd(vmul(b2ss, x), vmul(negativeA2, s[0]));
360 
361             if constexpr (b0_present) {
362                 vst1(output, vadd(s[0], vmul(b0, x)));
363             } else /* constexpr */ {
364                 vst1(output, s[0]);
365             }
366             output += stride;
367             new_s0 = s[1];
368             if constexpr (b1ss_present) {
369                 new_s0 = vadd(new_s0, vmul(b1ss, x));
370             }
371             if constexpr (a1_present) {
372                 new_s0 = vadd(new_s0, vmul(negativeA1, s[0]));
373             }
374             if constexpr (b2ss_present) {
375                 s[1] = vmul(b2ss, x);
376                 if constexpr (a2_present) {
377                     s[1] = vadd(s[1], vmul(negativeA2, s[0]));
378                 }
379             } else if constexpr (a2_present) {
380                 s[1] = vmul(negativeA2, s[0]);
381             }
382             s[0] = new_s0;
383         };
384 
385         while (frames > 0) {
386             #pragma unroll
387             for (size_t i = 0; i < UNROLL_LOOPS; ++i) {
388                 KERNEL();
389             }
390             frames--;
391         }
392         if constexpr (UNROLL_LOOPS > 1) {
393             for (size_t i = 0; i < remainder; ++i) {
394                 KERNEL();
395             }
396         }
397         s_[0] = s[0];
398         s_[1] = s[1];
399     }
400 };
401 
402 namespace details {
403 
404 // Helper methods for constructing a constexpr array of function pointers.
405 // As function pointers are efficient and have no constructor/destructor
406 // this is preferred over std::function.
407 //
408 // SC stands for SAME_COEF_PER_CHANNEL, a compile time boolean constant.
409 template <template <size_t, bool, typename ...> typename F, bool SC, size_t... Is>
make_functional_array_from_index_sequence(std::index_sequence<Is...>)410 static inline constexpr auto make_functional_array_from_index_sequence(std::index_sequence<Is...>) {
411     using first_t = decltype(&F<0, false>::func);  // type from function
412     using result_t = std::array<first_t, sizeof...(Is)>;   // type of array
413     return result_t{{F<Is, SC>::func...}};      // initialize with functions.
414 }
415 
416 template <template <size_t, bool, typename ...> typename F, size_t M, bool SC>
make_functional_array()417 static inline constexpr auto make_functional_array() {
418     return make_functional_array_from_index_sequence<F, SC>(std::make_index_sequence<M>());
419 }
420 
421 // Returns true if the poles are stable for a Biquad.
422 template <typename D>
isStable(const D & a1,const D & a2)423 static inline constexpr bool isStable(const D& a1, const D& a2) {
424     return fabs(a2) < D(1) && fabs(a1) < D(1) + a2;
425 }
426 
427 // Simplifies Biquad coefficients.
428 // TODO: consider matched pole/zero cancellation.
429 //       consider subnormal elimination for Intel processors.
430 template <typename D, typename T>
reduceCoefficients(const T & coef)431 std::array<D, kBiquadNumCoefs> reduceCoefficients(const T& coef) {
432     std::array<D, kBiquadNumCoefs> lcoef;
433     if (coef.size() == kBiquadNumCoefs + 1) {
434         // General form of Biquad.
435         // Remove matched z^-1 factors in top and bottom (e.g. coefs[0] == coefs[3] == 0).
436         size_t offset = 0;
437         for (; offset < 2 && coef[offset] == 0 && coef[offset + 3] == 0; ++offset);
438         assert(coefs[offset + 3] != 0); // hmm... shouldn't we be causal?
439 
440         // Normalize 6 coefficients to 5 for storage.
441         lcoef[0] = coef[offset] / coef[offset + 3];
442         for (size_t i = 1; i + offset < 3; ++i) {
443             lcoef[i] = coef[i + offset] / coef[offset + 3];
444             lcoef[i + 2] = coef[i + offset + 3] / coef[offset + 3];
445          }
446     } else if (coef.size() == kBiquadNumCoefs) {
447         std::copy(coef.begin(), coef.end(), lcoef.begin());
448     } else {
449         assert(coef.size() == kBiquadNumCoefs + 1 || coef.size() == kBiquadNumCoefs);
450     }
451     return lcoef;
452 }
453 
454 // Sets a container of coefficients to storage.
455 template <typename D, typename T, typename DEST>
setCoefficients(DEST & dest,size_t offset,size_t stride,size_t channelCount,const T & coef)456 static inline void setCoefficients(
457         DEST& dest, size_t offset, size_t stride, size_t channelCount, const T& coef) {
458     auto lcoef = reduceCoefficients<D, T>(coef);
459     // replicate as needed
460     for (size_t i = 0; i < kBiquadNumCoefs; ++i) {
461         for (size_t j = 0; j < channelCount; ++j) {
462             dest[i * stride + offset + j] = lcoef[i];
463         }
464     }
465 }
466 
467 // Helper function to zero channels in the input buffer.
468 // This is used for the degenerate coefficient case which results in all zeroes.
469 template <typename D>
zeroChannels(D * out,size_t frames,size_t stride,size_t channelCount)470 void zeroChannels(D *out, size_t frames, size_t stride, size_t channelCount) {
471     if (stride == channelCount) {
472         memset(out, 0, sizeof(float) * frames * channelCount);
473     } else {
474         for (size_t i = 0; i < frames; i++) {
475             memset(out, 0, sizeof(float) * channelCount);
476             out += stride;
477         }
478     }
479 }
480 
481 template <typename ConstOptions,
482         size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename T, typename F>
biquad_filter_func_impl(F * out,const F * in,size_t frames,size_t stride,size_t channelCount,F * delays,const F * coefs,size_t localStride)483 void biquad_filter_func_impl(F *out, const F *in, size_t frames, size_t stride,
484         size_t channelCount, F *delays, const F *coefs, size_t localStride) {
485     using namespace android::audio_utils::intrinsics;
486 
487     constexpr size_t elements = sizeof(T) / sizeof(F); // how many float elements in T.
488     const size_t coefStride = SAME_COEF_PER_CHANNEL ? 1 : localStride;
489     using CoefType = std::conditional_t<SAME_COEF_PER_CHANNEL, F, T>;
490     using KernelType = typename ConstOptions::template FilterType<T, CoefType>;
491 
492     for (size_t i = 0; i < channelCount; i += elements) {
493         T s1 = vld1<T>(&delays[0]);
494         T s2 = vld1<T>(&delays[localStride]);
495 
496         KernelType kernel(
497                 vld1<CoefType>(coefs), vld1<CoefType>(coefs + coefStride),
498                 vld1<CoefType>(coefs + coefStride * 2), vld1<CoefType>(coefs + coefStride * 3),
499                 vld1<CoefType>(coefs + coefStride * 4),
500                 s1, s2);
501         if constexpr (!SAME_COEF_PER_CHANNEL) coefs += elements;
502         kernel.template process<F, OCCUPANCY>(&out[i], &in[i], frames, stride);
503         vst1(&delays[0], kernel.s_[0]);
504         vst1(&delays[localStride], kernel.s_[1]);
505         delays += elements;
506     }
507 }
508 
509 // Find the nearest occupancy mask that includes all the desired bits.
510 template <typename T, size_t N>
nearestOccupancy(T occupancy,const T (& occupancies)[N])511 static constexpr size_t nearestOccupancy(T occupancy, const T (&occupancies)[N]) {
512     if (occupancy < 32) {
513         for (auto test : occupancies) {
514             if ((occupancy & test) == occupancy) return test;
515         }
516     }
517     return 31;
518 }
519 
520 enum FILTER_OPTION {
521     FILTER_OPTION_SCALAR_ONLY = (1 << 0),
522 };
523 
524 
525 /**
526  * DefaultBiquadConstOptions holds the default set of options for customizing
527  * the Biquad filter at compile time.
528  *
529  * Consider inheriting from this structure and overriding the options
530  * desired; this is backward compatible to new options added in the future.
531  */
532 struct DefaultBiquadConstOptions {
533 
534     // Sets the Biquad filter type.
535     // Can be one of the already defined BiquadDirect2Transpose or BiquadStateSpace
536     // filter kernels; also can be a user defined filter kernel as well.
537     template <typename T, typename F>
538     using FilterType = BiquadStateSpace<T, F, false /* SEPARATE_CHANNEL_OPTIMIZATION */>;
539 };
540 
541 #define BIQUAD_FILTER_CASE(N, ... /* type */) \
542             case N: { \
543                 using VectorType = __VA_ARGS__; \
544                 biquad_filter_func_impl<ConstOptions, \
545                         nearestOccupancy(OCCUPANCY, \
546                                 ConstOptions::template FilterType<VectorType, D> \
547                                             ::required_occupancies_), \
548                         SAME_COEF_PER_CHANNEL, VectorType>( \
549                         out + offset, in + offset, frames, stride, remaining, \
550                         delays + offset, c, localStride); \
551                 goto exit; \
552             }
553 
554 template <typename ConstOptions, size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
biquad_filter_func(D * out,const D * in,size_t frames,size_t stride,size_t channelCount,D * delays,const D * coefs,size_t localStride,FILTER_OPTION filterOptions)555 void biquad_filter_func(D *out, const D *in, size_t frames, size_t stride,
556         size_t channelCount, D *delays, const D *coefs, size_t localStride,
557         FILTER_OPTION filterOptions) {
558     if constexpr ((OCCUPANCY & 7) == 0) { // all b's are zero, output is zero.
559         zeroChannels(out, frames, stride, channelCount);
560         return;
561     }
562 
563     // Possible alternative intrinsic types for 2, 9, 15 float elements.
564     // using alt_2_t = struct {struct { float a; float b; } s; };
565     // using alt_9_t = struct { struct { float32x4x2_t a; float b; } s; };
566     // using alt_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
567 
568 #ifdef USE_NEON
569     // use NEON types to ensure we have the proper intrinsic acceleration.
570     using alt_16_t = float32x4x4_t;
571     using alt_8_t = float32x4x2_t;
572     using alt_4_t = float32x4_t;
573 #else
574     // Use C++ types, no NEON needed.
575     using alt_16_t = intrinsics::internal_array_t<float, 16>;
576     using alt_8_t = intrinsics::internal_array_t<float, 8>;
577     using alt_4_t = intrinsics::internal_array_t<float, 4>;
578 #endif
579 
580     for (size_t offset = 0; offset < channelCount; ) {
581         size_t remaining = channelCount - offset;
582         auto *c = SAME_COEF_PER_CHANNEL ? coefs : coefs + offset;
583         if (filterOptions & FILTER_OPTION_SCALAR_ONLY) goto scalar;
584         if constexpr (std::is_same_v<D, float>) {
585             switch (remaining) {
586             default:
587                 if (remaining >= 16) {
588                     remaining &= ~15;
589                     biquad_filter_func_impl<ConstOptions,
590                             nearestOccupancy(OCCUPANCY,
591                                     ConstOptions::template FilterType<D, D>
592                                                 ::required_occupancies_),
593                             SAME_COEF_PER_CHANNEL, alt_16_t>(
594                             out + offset, in + offset, frames, stride, remaining,
595                             delays + offset, c, localStride);
596                     offset += remaining;
597                     continue;
598                 }
599                 break;  // case 1 handled at bottom.
600             BIQUAD_FILTER_CASE(15, intrinsics::internal_array_t<float, 15>)
601             BIQUAD_FILTER_CASE(14, intrinsics::internal_array_t<float, 14>)
602             BIQUAD_FILTER_CASE(13, intrinsics::internal_array_t<float, 13>)
603             BIQUAD_FILTER_CASE(12, intrinsics::internal_array_t<float, 12>)
604             BIQUAD_FILTER_CASE(11, intrinsics::internal_array_t<float, 11>)
605             BIQUAD_FILTER_CASE(10, intrinsics::internal_array_t<float, 10>)
606             BIQUAD_FILTER_CASE(9, intrinsics::internal_array_t<float, 9>)
607             BIQUAD_FILTER_CASE(8, alt_8_t)
608             BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<float, 7>)
609             BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<float, 6>)
610             BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<float, 5>)
611             BIQUAD_FILTER_CASE(4, alt_4_t)
612             BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<float, 3>)
613             BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<float, 2>)
614             // BIQUAD_FILTER_CASE(1, BiquadFilterType, intrinsics::internal_array_t<float, 1>)
615             }
616         } else if constexpr (std::is_same_v<D, double>) {
617 #if defined(__aarch64__)
618             switch (remaining) {
619             default:
620                 if (remaining >= 8) {
621                     remaining &= ~7;
622                     biquad_filter_func_impl<ConstOptions,
623                             nearestOccupancy(OCCUPANCY,
624                                      ConstOptions::template FilterType<D, D>
625                                                  ::required_occupancies_),
626                             SAME_COEF_PER_CHANNEL,
627                             intrinsics::internal_array_t<double, 8>>(
628                             out + offset, in + offset, frames, stride, remaining,
629                             delays + offset, c, localStride);
630                     offset += remaining;
631                     continue;
632                 }
633                 break; // case 1 handled at bottom.
634             BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<double, 7>)
635             BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<double, 6>)
636             BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<double, 5>)
637             BIQUAD_FILTER_CASE(4, intrinsics::internal_array_t<double, 4>)
638             BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<double, 3>)
639             BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<double, 2>)
640             };
641 #endif
642         }
643         scalar:
644         // Essentially the code below is scalar, the same as
645         // biquad_filter_1fast<OCCUPANCY, SAME_COEF_PER_CHANNEL>,
646         // but formulated with NEON intrinsic-like call pattern.
647         biquad_filter_func_impl<ConstOptions,
648                 nearestOccupancy(OCCUPANCY,
649                         ConstOptions::template FilterType<D, D>::required_occupancies_),
650                  SAME_COEF_PER_CHANNEL, D>(
651                 out + offset, in + offset, frames, stride, remaining,
652                 delays + offset, c, localStride);
653         offset += remaining;
654     }
655     exit:;
656 }
657 
658 } // namespace details
659 
660 /**
661  * BiquadFilter
662  *
663  * A multichannel Biquad filter implementation of the following transfer function.
664  *
665  * \f[
666  *  H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
667  *               { 1   + a_1 z^{-1} + a_2 z^{-2} }
668  * \f]
669  *
670  * <!--
671  *        b_0 + b_1 z^{-1} + b_2 z^{-2}
672  *  H(z)= -----------------------------
673  *        1 + a_1 z^{-1} + a_2 z^{-2}
674  * -->
675  *
676  *  Details:
677  *    1. The transposed direct type 2 implementation allows zeros to be computed
678  *       before poles in the internal state for improved filter precision and
679  *       better time-varying coefficient performance.
680  *    2. We optimize for zero coefficients using a compile-time generated function table.
681  *    3. We optimize for vector operations using column vector operations with stride
682  *       into interleaved audio data.
683  *    4. The denominator coefficients a_1 and a_2 are stored in positive form, despite the
684  *       negated form being slightly simpler for optimization (addition is commutative but
685  *       subtraction is not commutative).  This is to permit obtaining the coefficients
686  *       as a const reference.
687  *
688  *       Compatibility issue: Some Biquad libraries store the denominator coefficients
689  *       in negated form.  We explicitly negate before entering into the inner loop.
690  *    5. The full 6 coefficient Biquad filter form with a_0 != 1 may be used for setting
691  *       coefficients.  See setCoefficients() below.
692  *
693  * If SAME_COEFFICIENTS_PER_CHANNEL is false, then mCoefs is stored interleaved by channel.
694  *
695  * The Biquad filter update equation in transposed Direct form 2 is as follows:
696  *
697  * \f{eqnarray*}{
698  * y[n] &=& b0 * x[n] + s1[n - 1] \\
699  * s1[n] &=& s2[n - 1] + b1 * x[n] - a1 * y[n] \\
700  * s2[n] &=& b2 * x[n] - a2 * y[n]
701  * \f}
702  *
703  * For the transposed Direct form 2 update equation s1 and s2 represent the delay state
704  * contained in the internal vector mDelays[].  This is stored interleaved by channel.
705  *
706  * Use -ffast-math` to permit associative math optimizations to get non-zero optimization as
707  * we do not rely on strict C operator precedence and associativity here.
708  * TODO(b/159373530): Use compound statement scoped pragmas instead of `-ffast-math`.
709  *
710  * \param D type variable representing the data type, one of float or double.
711  *         The default is float.
712  * \param SAME_COEF_PER_CHANNEL bool which is true if all the Biquad coefficients
713  *         are shared between channels, or false if the Biquad coefficients
714  *         may differ between channels. The default is true.
715  */
716 
717 template <typename D = float,
718         bool SAME_COEF_PER_CHANNEL = true,
719         typename ConstOptions = details::DefaultBiquadConstOptions>
720 class BiquadFilter {
721 public:
722     template <typename T = std::array<D, kBiquadNumCoefs>>
723     explicit BiquadFilter(size_t channelCount,
724             const T& coefs = {}, bool optimized = true)
mChannelCount(channelCount)725             : mChannelCount(channelCount)
726             , mCoefs(kBiquadNumCoefs * (SAME_COEF_PER_CHANNEL ? 1 : mChannelCount))
727             , mDelays(channelCount * kBiquadNumDelays) {
728         setCoefficients(coefs, optimized);
729     }
730 
731     // copy constructors
BiquadFilter(const BiquadFilter<D,SAME_COEF_PER_CHANNEL> & other)732     BiquadFilter(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) {
733         *this = other;
734     }
735 
BiquadFilter(BiquadFilter<D,SAME_COEF_PER_CHANNEL> && other)736     BiquadFilter(BiquadFilter<D, SAME_COEF_PER_CHANNEL>&& other) {
737         *this = std::move(other);
738     }
739 
740     // copy assignment
741     BiquadFilter<D, SAME_COEF_PER_CHANNEL>& operator=(
742             const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) {
743         mChannelCount = other.mChannelCount;
744         mCoefs = other.mCoefs;
745         mDelays = other.mDelays;
746         return *this;
747     }
748 
749     BiquadFilter<D, SAME_COEF_PER_CHANNEL>& operator=(
750             BiquadFilter<D, SAME_COEF_PER_CHANNEL>&& other) {
751         mChannelCount = other.mChannelCount;
752         mCoefs = std::move(other.mCoefs);
753         mDelays = std::move(other.mDelays);
754         return *this;
755     }
756 
757     // operator overloads for equality tests
758     bool operator==(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) const {
759         return mChannelCount == other.mChannelCount
760                 && mCoefs == other.mCoefs
761                 && mDelays == other.mDelays;
762     }
763 
764     bool operator!=(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) const {
765         return !operator==(other);
766     }
767 
768     /**
769      * \brief Sets filter coefficients
770      *
771      * \param coefs  pointer to the filter coefficients array.
772      * \param optimized whether to use processor optimized function (optional, defaults true).
773      * \return true if the BiquadFilter is stable, otherwise, return false.
774      *
775      * The input coefficients are interpreted in the following manner:
776      *
777      * If size of container is 5 (normalized Biquad):
778      * coefs[0] is b0,
779      * coefs[1] is b1,
780      * coefs[2] is b2,
781      * coefs[3] is a1,
782      * coefs[4] is a2.
783      *
784      * \f[
785      *  H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
786      *               { 1   + a_1 z^{-1} + a_2 z^{-2} }
787      * \f]
788      * <!--
789      *        b_0 + b_1 z^{-1} + b_2 z^{-2}
790      *  H(z)= -----------------------------
791      *        1 + a_1 z^{-1} + a_2 z^{-2}
792      * -->
793      *
794      * If size of container is 6 (general Biquad):
795      * coefs[0] is b0,
796      * coefs[1] is b1,
797      * coefs[2] is b2,
798      * coefs[3] is a0,
799      * coefs[4] is a1,
800      * coefs[5] is a2.
801      *
802      * \f[
803      *  H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
804      *               { a_0 + a_1 z^{-1} + a_2 z^{-2} }
805      * \f]
806      * <!--
807      *        b_0 + b_1 z^{-1} + b_2 z^{-2}
808      *  H(z)= -----------------------------
809      *        a_0 + a_1 z^{-1} + a_2 z^{-2}
810      * -->
811      *
812      * The internal representation is a normalized Biquad.
813      */
814     template <typename T = std::array<D, kBiquadNumCoefs>>
815     bool setCoefficients(const T& coefs, bool optimized = true) {
816         if constexpr (SAME_COEF_PER_CHANNEL) {
817             details::setCoefficients<D, T>(
818                     mCoefs, 0 /* offset */, 1 /* stride */, 1 /* channelCount */, coefs);
819         } else {
820             if (coefs.size() == mCoefs.size()) {
821                 std::copy(coefs.begin(), coefs.end(), mCoefs.begin());
822             } else {
823                 details::setCoefficients<D, T>(
824                         mCoefs, 0 /* offset */, mChannelCount, mChannelCount, coefs);
825             }
826         }
827         setOptimization(optimized);
828         return isStable();
829     }
830 
831     /**
832      * Sets coefficients for one of the filter channels, specified by channelIndex.
833      *
834      * This method is only available if SAME_COEF_PER_CHANNEL is false.
835      *
836      * \param coefs the coefficients to set.
837      * \param channelIndex the particular channel index to set.
838      * \param optimized whether to use optimized function (optional, defaults true).
839      */
840     template <typename T = std::array<D, kBiquadNumCoefs>>
841     bool setCoefficients(const T& coefs, size_t channelIndex, bool optimized = true) {
842         static_assert(!SAME_COEF_PER_CHANNEL);
843 
844         details::setCoefficients<D, T>(
845                 mCoefs, channelIndex, mChannelCount, 1 /* channelCount */, coefs);
846         setOptimization(optimized);
847         return isStable();
848     }
849 
850     /**
851      * Returns the coefficients as a const vector reference.
852      *
853      * If multichannel and the template variable SAME_COEF_PER_CHANNEL is true,
854      * the coefficients are interleaved by channel.
855      */
getCoefficients()856     const std::vector<D>& getCoefficients() const {
857         return mCoefs;
858     }
859 
860     /**
861      * Returns true if the filter is stable.
862      *
863      * \param channelIndex ignored if SAME_COEF_PER_CHANNEL is true,
864      *        asserts if channelIndex >= channel count (zero based index).
865      */
866     bool isStable(size_t channelIndex = 0) const {
867         if constexpr (SAME_COEF_PER_CHANNEL) {
868             return details::isStable(mCoefs[3], mCoefs[4]);
869         } else {
870             assert(channelIndex < mChannelCount);
871             return details::isStable(
872                     mCoefs[3 * mChannelCount + channelIndex],
873                     mCoefs[4 * mChannelCount + channelIndex]);
874         }
875     }
876 
877     /**
878      * Updates the filter function based on processor optimization.
879      *
880      * \param optimized if true, enables Processor based optimization.
881      */
setOptimization(bool optimized)882     void setOptimization(bool optimized) {
883         // Determine which coefficients are nonzero as a bit field.
884         size_t category = 0;
885         for (size_t i = 0; i < kBiquadNumCoefs; ++i) {
886             if constexpr (SAME_COEF_PER_CHANNEL) {
887                 category |= (mCoefs[i] != 0) << i;
888             } else {
889                 for (size_t j = 0; j < mChannelCount; ++j) {
890                     if (mCoefs[i * mChannelCount + j] != 0) {
891                         category |= 1 << i;
892                         break;
893                     }
894                 }
895             }
896         }
897 
898         // Select the proper filtering function from our array.
899         if (optimized) {
900             mFilterOptions = (details::FILTER_OPTION)
901                     (mFilterOptions & ~details::FILTER_OPTION_SCALAR_ONLY);
902         } else {
903              mFilterOptions = (details::FILTER_OPTION)
904                      (mFilterOptions | details::FILTER_OPTION_SCALAR_ONLY);
905         }
906         mFunc = mFilterFuncs[category];
907     }
908 
909     /**
910      * \brief Filters the input data
911      *
912      * \param out     pointer to the output data
913      * \param in      pointer to the input data
914      * \param frames  number of audio frames to be processed
915      */
process(D * out,const D * in,size_t frames)916     void process(D* out, const D* in, size_t frames) {
917         process(out, in, frames, mChannelCount);
918     }
919 
920     /**
921      * \brief Filters the input data with stride
922      *
923      * \param out     pointer to the output data
924      * \param in      pointer to the input data
925      * \param frames  number of audio frames to be processed
926      * \param stride  the total number of samples associated with a frame, if not channelCount.
927      */
process(D * out,const D * in,size_t frames,size_t stride)928     void process(D* out, const D* in, size_t frames, size_t stride) {
929         assert(stride >= mChannelCount);
930         mFunc(out, in, frames, stride, mChannelCount, mDelays.data(),
931                 mCoefs.data(), mChannelCount, mFilterOptions);
932     }
933 
934     /**
935      * EXPERIMENTAL:
936      * Processes 1D input data, with mChannel Biquads, using sliding window parallelism.
937      *
938      * Instead of considering mChannel Biquads as one-per-input channel, this method treats
939      * the mChannel biquads as applied in sequence to a single 1D input stream,
940      * with the last channel count Biquad being applied first.
941      *
942      * input audio data -> BQ_{n-1} -> BQ{n-2} -> BQ_{n-3} -> BQ_{0} -> output
943      *
944      * TODO: Make this code efficient for NEON and split the destination from the source.
945      *
946      * Theoretically this code should be much faster for 1D input if one has 4+ Biquads to be
947      * sequentially applied, but in practice it is *MUCH* slower.
948      * On NEON, the data cannot be written then read in-place without incurring
949      * memory stall penalties.  A shifting NEON holding register is required to make this
950      * a practical improvement.
951      */
process1D(D * inout,size_t frames)952     void process1D(D* inout, size_t frames) {
953         size_t remaining = mChannelCount;
954 #ifdef USE_NEON
955         // We apply NEON acceleration striped with 4 filters (channels) at once.
956         // Filters operations commute, nevertheless we apply the filters in order.
957         if (frames >= 2 * mChannelCount) {
958             constexpr size_t channelBlock = 4;
959             for (; remaining >= channelBlock; remaining -= channelBlock) {
960                 const size_t baseIdx = remaining - channelBlock;
961                 // This is the 1D accelerated method.
962                 // prime the data pipe.
963                 for (size_t i = 0; i < channelBlock - 1; ++i) {
964                     size_t fromEnd = remaining - i - 1;
965                     auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
966                     auto delays = mDelays.data() + fromEnd;
967                     mFunc(inout, inout, 1 /* frames */, 1 /* stride */, i + 1,
968                             delays, coefs, mChannelCount, mFilterOptions);
969                 }
970 
971                 auto delays = mDelays.data() + baseIdx;
972                 auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : baseIdx);
973                 // Parallel processing - we use a sliding window doing channelBlock at once,
974                 // sliding one audio sample at a time.
975                 mFunc(inout, inout,
976                         frames - channelBlock + 1, 1 /* stride */, channelBlock,
977                         delays, coefs, mChannelCount, mFilterOptions);
978 
979                 // drain data pipe.
980                 for (size_t i = 1; i < channelBlock; ++i) {
981                     mFunc(inout + frames - channelBlock + i, inout + frames - channelBlock + i,
982                             1 /* frames */, 1 /* stride */, channelBlock - i,
983                             delays, coefs, mChannelCount, mFilterOptions);
984                 }
985             }
986         }
987 #endif
988         // For short data sequences, we use the serial single channel logical equivalent
989         for (; remaining > 0; --remaining) {
990             size_t fromEnd = remaining - 1;
991             auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
992             mFunc(inout, inout,
993                     frames, 1 /* stride */, 1 /* channelCount */,
994                     mDelays.data() + fromEnd, coefs, mChannelCount, mFilterOptions);
995         }
996     }
997 
998     /**
999      * \brief Clears the delay elements
1000      *
1001      * This function clears the delay elements representing the filter state.
1002      */
clear()1003     void clear() {
1004         std::fill(mDelays.begin(), mDelays.end(), 0.f);
1005     }
1006 
1007     /**
1008      * \brief Sets the internal delays from a vector
1009      *
1010      * For a multichannel stream, the delays are interleaved by channel:
1011      * delays[2 * i + 0] is s1 of i-th channel,
1012      * delays[2 * i + 1] is s2 of i-th channel,
1013      * where index i runs from 0 to (mChannelCount - 1).
1014      *
1015      * \param delays reference to vector containing delays.
1016      */
setDelays(std::vector<D> & delays)1017     void setDelays(std::vector<D>& delays) {
1018         assert(delays.size() == mDelays.size());
1019         mDelays = std::move(delays);
1020     }
1021 
1022     /**
1023      * \brief Gets delay elements as a vector
1024      *
1025      * For a multichannel stream, the delays are interleaved by channel:
1026      * delays[2 * i + 0] is s1 of i-th channel,
1027      * delays[2 * i + 1] is s2 of i-th channel,
1028      * where index i runs from 0 to (mChannelCount - 1).
1029      *
1030      * \return a const vector reference of delays.
1031      */
getDelays()1032     const std::vector<D>& getDelays() const {
1033         return mDelays;
1034     }
1035 
1036 private:
1037     /* const */ size_t mChannelCount; // not const because we can assign to it on operator equals.
1038 
1039     /*
1040      * \var D mCoefs
1041      * \brief Stores the filter coefficients
1042      *
1043      * If SAME_COEF_PER_CHANNEL is false, the filter coefficients are stored
1044      * interleaved by channel.
1045      */
1046     std::vector<D> mCoefs;
1047 
1048     /**
1049      * \var D mDelays
1050      * \brief The delay state.
1051      *
1052      * The delays are stored channel interleaved in the following manner,
1053      * mDelays[2 * i + 0] is s1 of i-th channel
1054      * mDelays[2 * i + 1] is s2 of i-th channel
1055      * index i runs from 0 to (mChannelCount - 1).
1056      */
1057     std::vector<D> mDelays;
1058 
1059     details::FILTER_OPTION mFilterOptions{};
1060 
1061     // Consider making a separate delegation class.
1062     /*
1063      * We store an array of functions based on the occupancy.
1064      *
1065      * OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
1066      * b0 b1 b2 a1 a2  (from lsb to msb)
1067      *
1068      *  static inline constexpr std::array<filter_func*, M> mArray = {
1069      *     biquad_filter_func<0>,
1070      *     biquad_filter_func<1>,
1071      *     biquad_filter_func<2>,
1072      *      ...
1073      *     biquad_filter_func<(1 << kBiquadNumCoefs) - 1>,
1074      *  };
1075      *
1076      * Every time the coefficients are changed, we select the processing function from
1077      * this table.
1078      */
1079 
1080     // Used to build the functional array.
1081     template <size_t OCCUPANCY, bool SC> // note SC == SAME_COEF_PER_CHANNEL
1082     struct FuncWrap {
funcFuncWrap1083         static void func(D* out, const D *in, size_t frames, size_t stride,
1084                 size_t channelCount, D *delays, const D *coef, size_t localStride,
1085                 details::FILTER_OPTION filterOptions) {
1086             constexpr size_t NEAREST_OCCUPANCY =
1087                 details::nearestOccupancy(
1088                         OCCUPANCY, ConstOptions::template FilterType<D, D>
1089                                                ::required_occupancies_);
1090             details::biquad_filter_func<ConstOptions, NEAREST_OCCUPANCY, SC>(
1091                     out, in, frames, stride, channelCount, delays, coef, localStride,
1092                     filterOptions);
1093         }
1094     };
1095 
1096     // Vector optimized array of functions.
1097     static inline constexpr auto mFilterFuncs =
1098             details::make_functional_array<
1099                     FuncWrap, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();
1100 
1101     /**
1102      * \var filter_func* mFunc
1103      *
1104      * The current filter function selected for the channel occupancy of the Biquad.
1105      * It will be one of mFilterFuncs.
1106      */
1107     std::decay_t<decltype(mFilterFuncs[0])> mFunc;
1108 };
1109 
1110 } // namespace android::audio_utils
1111 
1112 #pragma pop_macro("USE_DITHER")
1113 #pragma pop_macro("USE_NEON")
1114