1 /*
2 * Copyright (C) 2020 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #pragma once
18
19 #include "intrinsic_utils.h"
20
21 #include <algorithm>
22 #include <array>
23 #include <cmath>
24 #include <functional>
25 #include <utility>
26 #include <vector>
27
28 #include <assert.h>
29
30 // We conditionally include neon optimizations for ARM devices
31 #pragma push_macro("USE_NEON")
32 #undef USE_NEON
33
34 #if defined(__ARM_NEON__) || defined(__aarch64__)
35 #include <arm_neon.h>
36 #define USE_NEON
37 #endif
38
39 // Use dither to prevent subnormals for CPUs that raise an exception.
40 #pragma push_macro("USE_DITHER")
41 #undef USE_DITHER
42
43 #if defined(__i386__) || defined(__x86_64__)
44 #define USE_DITHER
45 #endif
46
47 namespace android::audio_utils {
48
49 static constexpr size_t kBiquadNumCoefs = 5;
50 static constexpr size_t kBiquadNumDelays = 2;
51
52 /**
53 * The BiquadDirect2Transpose is a low overhead
54 * Biquad filter with coefficients b0, b1, b2, a1, a2.
55 *
56 * This can be used by itself, but it is preferred for best data management
57 * to use the BiquadFilter abstraction below.
58 *
59 * T is the data type (scalar or vector).
60 * F is the filter coefficient type. It is either a scalar or vector (matching T).
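*
* A minimal single-channel usage sketch (illustrative only; the coefficient and
* buffer names below are assumed, not defined in this file):
*
*   BiquadDirect2Transpose<float, float> kernel(b0, b1, b2, a1, a2);
*   kernel.process<float>(out, in, frames, 1);  // stride 1 for contiguous mono data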
61 */
62 template <typename T, typename F>
63 struct BiquadDirect2Transpose {
64 F coef_[5]; // these are stored with the denominator a's negated.
65 T s_[2]; // delay states
66
67 // These are the coefficient occupancies we optimize for (from b0, b1, b2, a1, a2)
68 // as expressed by a bitmask.
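// For example, 0xb (0b01011) selects b0, b1 and a1: the first order IIR case below.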
69 static inline constexpr size_t required_occupancies_[] = {
70 0x1, // constant scale
71 0x3, // single zero
72 0x7, // double zero
73 0x9, // single pole
74 0xb, // (11) first order IIR
75 0x1b, // (27) double pole + single zero
76 0x1f, // (31) second order IIR (full Biquad)
77 };
78
79 // Take care with the order of arguments: the b's come first, then the a's.
80 // The a's are in "positive" reference; some filters use the negated form.
81 BiquadDirect2Transpose(const F& b0, const F& b1, const F& b2, const F& a1, const F& a2,
82 const T& s0 = {}, const T& s1 = {})
83 // : coef_{b0, b1, b2, -a1, -a2}
84 : coef_{ b0,
85 b1,
86 b2,
87 intrinsics::vneg(a1),
88 intrinsics::vneg(a2) }
89 , s_{ s0, s1 } {
90 }
91
92 // D is the data type. It must be the same as the element type of T and F.
93 // Take care with the order of the output and input arguments.
94 template<typename D, size_t OCCUPANCY = 0x1f>
95 __attribute__((always_inline)) // required for 1ch speedup (30% faster)
96 void process(D* output, const D* input, size_t frames, size_t stride) {
97 using namespace intrinsics;
98 // For SSE it is possible to vdup F to T if F is scalar.
99 const F b0 = coef_[0]; // b0
100 const F b1 = coef_[1]; // b1
101 const F b2 = coef_[2]; // b2
102 const F negativeA1 = coef_[3]; // -a1
103 const F negativeA2 = coef_[4]; // -a2
104 T s[2] = { s_[0], s_[1] };
105 T xn, yn; // OK to declare temps outside loop rather than at the point of initialization.
106 #ifdef USE_DITHER
107 constexpr D DITHER_VALUE = std::numeric_limits<float>::min() * (1 << 24); // use FLOAT
108 T dither = vdupn<T>(DITHER_VALUE); // NEON does not have vector + scalar acceleration.
109 #endif
110
111 // Unroll control. Make sure the constexpr remains constexpr :-).
112 constexpr size_t CHANNELS = sizeof(T) / sizeof(D);
113 constexpr size_t UNROLL_CHANNEL_LOWER_LIMIT = 2; // below this won't be unrolled.
114 constexpr size_t UNROLL_CHANNEL_UPPER_LIMIT = 16; // above this won't be unrolled.
115 constexpr size_t UNROLL_LOOPS = (CHANNELS >= UNROLL_CHANNEL_LOWER_LIMIT &&
116 CHANNELS <= UNROLL_CHANNEL_UPPER_LIMIT) ? 2 : 1;
117 size_t remainder = 0;
118 if constexpr (UNROLL_LOOPS > 1) {
119 remainder = frames % UNROLL_LOOPS;
120 frames /= UNROLL_LOOPS;
121 }
122
123 // For this lambda, attribute always_inline must be used to inline past CHANNELS > 4.
124 // The other alternative is to use a MACRO, but that doesn't read as well.
125 const auto KERNEL = [&]() __attribute__((always_inline)) {
126 xn = vld1<T>(input);
127 input += stride;
128 #ifdef USE_DITHER
129 xn = vadd(xn, dither);
130 dither = vneg(dither);
131 #endif
132
133 yn = s[0];
134 if constexpr (OCCUPANCY >> 0 & 1) {
135 yn = vmla(yn, b0, xn);
136 }
137 vst1(output, yn);
138 output += stride;
139
140 s[0] = s[1];
141 if constexpr (OCCUPANCY >> 3 & 1) {
142 s[0] = vmla(s[0], negativeA1, yn);
143 }
144 if constexpr (OCCUPANCY >> 1 & 1) {
145 s[0] = vmla(s[0], b1, xn);
146 }
147 if constexpr (OCCUPANCY >> 2 & 1) {
148 s[1] = vmul(b2, xn);
149 } else {
150 s[1] = vdupn<T>(0.f);
151 }
152 if constexpr (OCCUPANCY >> 4 & 1) {
153 s[1] = vmla(s[1], negativeA2, yn);
154 }
155 };
156
157 while (frames > 0) {
158 #pragma unroll
159 for (size_t i = 0; i < UNROLL_LOOPS; ++i) {
160 KERNEL();
161 }
162 frames--;
163 }
164 if constexpr (UNROLL_LOOPS > 1) {
165 for (size_t i = 0; i < remainder; ++i) {
166 KERNEL();
167 }
168 }
169 s_[0] = s[0];
170 s_[1] = s[1];
171 }
172 };
173
174 /**
175 * A state space formulation of filtering converts a n-th order difference equation update
176 * to a first order vector difference equation. For the Biquad filter, the state space form
177 * has improved numerical precision properties with poles near the unit circle as well as
178 * increased speed due to better parallelization of the state update [1][2].
179 *
180 * [1] Raph Levien: (observable canonical form)
181 * https://github.com/google/music-synthesizer-for-android/blob/master/lab/biquad%20in%20two.ipynb
182 *
183 * [2] Julius O Smith III: (controllable canonical form)
184 * https://ccrma.stanford.edu/~jos/filters/State_Space_Filters.html
185 *
186 * The signal flow is as follows, where for scalar x and y, the matrix D is a scalar d.
187 *
188 *
189 *       +------[ d ]--------------------------+
190 *       |                S                    |
191 * x ----+--[ B ]--(+)--[ z^-1 ]---+---[ C ]--(+)--- y
192 *                  |              |
193 *                  +----[ A ]-----+
194 *
195 * The 2nd order Biquad IIR coefficients are as follows in observable canonical form:
196 *
197 * y = [C_1 C_2] | S_1 | + d x
198 *               | S_2 |
199 *
200 *
201 * | S_1 | = | A_11 A_12 | | S_1 | + | B_1 | x
202 * | S_2 |   | A_21 A_22 | | S_2 |   | B_2 |
203 *
204 *
205 * A_11 = -a1
206 * A_12 = 1
207 * A_21 = -a2
208 * A_22 = 0
209 *
210 * B_1 = b1 - b0 * a1
211 * B_2 = b2 - b0 * a2
212 *
213 * C_1 = 1
214 * C_2 = 0
215 *
216 * d = b0
217 *
218 * Implementation details: The state space filter is typically expressed in either observable or
219 * controllable canonical form [3]. We follow the observable form here.
220 * Raph [4] discovered that the single channel Biquad implementation can be further optimized
221 * by doing 2 filter updates at once (improving speed on NEON by about 20%).
222 * Doing 2 updates at once costs 8 multiplies / sample instead of 5 multiplies / sample,
223 * but uses a 4 x 4 matrix multiply, exploiting single cycle multiply-add SIMD throughput.
224 *
225 * [3] Mathworks
226 * https://www.mathworks.com/help/control/ug/canonical-state-space-realizations.html
227 * [4] Raph Levien
228 * https://github.com/kysko/music-synthesizer-for-android/blob/master/lab/biquad%20in%20two.ipynb
229 *
230 * The template variables
231 * T is the data type (scalar or vector).
232 * F is the filter coefficient type. It is either a scalar or vector (matching T).
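*
* As a small numeric illustration of the conversion performed by the constructor
* below (the coefficient values are assumed for this sketch), a normalized Biquad
* with b0 = 0.5, b1 = 1.0, b2 = 0.5, a1 = -0.6, a2 = 0.2 is stored as:
*
*   coef_[0] = b0           =  0.5
*   coef_[1] = b1 - b0 * a1 =  1.0 - 0.5 * -0.6 =  1.3
*   coef_[2] = b2 - b0 * a2 =  0.5 - 0.5 *  0.2 =  0.4
*   coef_[3] = -a1          =  0.6
*   coef_[4] = -a2          = -0.2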
233 */
234 template <typename T, typename F, bool SEPARATE_CHANNEL_OPTIMIZATION = false>
235 struct BiquadStateSpace {
236 F coef_[5]; // these are stored as state-space converted.
237 T s_[2]; // delay states
238
239 // These are the coefficient occupancies we optimize for (from b0, b1, b2, a1, a2)
240 // as expressed by a bitmask. This must include 31.
241 static inline constexpr size_t required_occupancies_[] = {
242 1, // constant scale
243 3, // single zero
244 7, // double zero
245 9, // single pole
246 11, // first order IIR
247 27, // double pole + single zero
248 31, // second order IIR (full Biquad)
249 };
250
251 // Take care with the order of arguments: the b's come first, then the a's.
252 // The a's are in "positive" reference; some filters use the negated form.
253 BiquadStateSpace(const F& b0, const F& b1, const F& b2, const F& a1, const F& a2,
254 const T& s0 = {}, const T& s1 = {})
255 // : coef_{b0, b1 - b0 * a1, b2 - b0 * a2, -a1, -a2}
256 : coef_{ b0,
257 intrinsics::vsub(b1, intrinsics::vmul(b0, a1)),
258 intrinsics::vsub(b2, intrinsics::vmul(b0, a2)),
259 intrinsics::vneg(a1),
260 intrinsics::vneg(a2) }
261 , s_{s0, s1} {
262 }
263
264 // D is the data type. It must be the same as the element type of T and F.
265 // Take care with the order of the output and input arguments.
266 template<typename D, size_t OCCUPANCY = 0x1f>
267 void process(D* output, const D* input, size_t frames, size_t stride) {
268 using namespace intrinsics;
269 const F b0 = coef_[0]; // b0
270 const F b1ss = coef_[1]; // b1 - b0 * a1,
271 const F b2ss = coef_[2]; // b2 - b0 * a2,
272 const F negativeA1 = coef_[3]; // -a1
273 const F negativeA2 = coef_[4]; // -a2
274 T s[2] = { s_[0], s_[1] };
275 T x, new_s0; // OK to declare temps here rather than at the point of initialization.
276 #ifdef USE_DITHER
277 constexpr D DITHER_VALUE = std::numeric_limits<float>::min() * (1 << 24); // use FLOAT
278 T dither = vdupn<T>(DITHER_VALUE); // NEON does not have vector + scalar acceleration.
279 #endif
280 constexpr bool b0_present = (OCCUPANCY & 0x1) != 0;
281 constexpr bool a1_present = (OCCUPANCY & 0x8) != 0;
282 constexpr bool a2_present = (OCCUPANCY & 0x10) != 0;
283 constexpr bool b1ss_present = (OCCUPANCY & 0x2) != 0 ||
284 (b0_present && a1_present);
285 constexpr bool b2ss_present = (OCCUPANCY & 0x4) != 0 ||
286 (b0_present && a2_present);
287
288 // Unroll control. Make sure the constexpr remains constexpr :-).
289 constexpr size_t CHANNELS = sizeof(T) / sizeof(D);
290 constexpr size_t UNROLL_CHANNEL_LOWER_LIMIT = 1; // below this won't be unrolled.
291 constexpr size_t UNROLL_CHANNEL_UPPER_LIMIT = 16; // above this won't be unrolled.
292 constexpr size_t UNROLL_LOOPS = (CHANNELS >= UNROLL_CHANNEL_LOWER_LIMIT &&
293 CHANNELS <= UNROLL_CHANNEL_UPPER_LIMIT) ? 2 : 1;
294
295 if constexpr (SEPARATE_CHANNEL_OPTIMIZATION && CHANNELS == 1 && OCCUPANCY >= 11) {
296 // Special acceleration which computes 2 samples at a time.
297 // see reference [4] for computation of this matrix.
298 intrinsics::internal_array_t<T, 4> A[4] = {
299 {b0, b1ss, negativeA1 * b1ss + b2ss, negativeA2 * b1ss},
300 {0, b0, b1ss, b2ss},
301 {1, negativeA1, negativeA2 + negativeA1 * negativeA1, negativeA1 * negativeA2},
302 {0, 1, negativeA1, negativeA2},
303 };
304 intrinsics::internal_array_t<T, 4> y;
305 while (frames > 1) {
306 x = vld1<T>(input);
307 input += stride;
308 #ifdef USE_DITHER
309 x = vadd(x, dither);
310 dither = vneg(dither);
311 #endif
312 y = vmul(A[0], x);
313
314 x = vld1<T>(input);
315 input += stride;
316 #ifdef USE_DITHER
317 x = vadd(x, dither);
318 dither = vneg(dither);
319 #endif
320 y = vmla(y, A[1], x);
321 y = vmla(y, A[2], s[0]);
322 y = vmla(y, A[3], s[1]);
323
324 vst1(output, y.v[0]);
325 output += stride;
326
327 vst1(output, y.v[1]);
328 output += stride;
329
330 s[0] = y.v[2];
331 s[1] = y.v[3];
332 frames -= 2;
333 }
334 if (frames == 0) {
335 s_[0] = s[0];
336 s_[1] = s[1];
337 return;
338 }
339 }
340
341 size_t remainder = 0;
342 if constexpr (UNROLL_LOOPS > 1) {
343 remainder = frames % UNROLL_LOOPS;
344 frames /= UNROLL_LOOPS;
345 }
346
347 // For this lambda, attribute always_inline must be used to inline past CHANNELS > 4.
348 // The other alternative is to use a MACRO, but that doesn't read as well.
349 const auto KERNEL = [&]() __attribute__((always_inline)) {
350 x = vld1<T>(input);
351 input += stride;
352 #ifdef USE_DITHER
353 x = vadd(x, dither);
354 dither = vneg(dither);
355 #endif
356 // vst1(output, vadd(s[0], vmul(b0, x)));
357 // output += stride;
358 // new_s0 = vadd(vadd(vmul(b1ss, x), vmul(negativeA1, s[0])), s[1]);
359 // s[1] = vadd(vmul(b2ss, x), vmul(negativeA2, s[0]));
360
361 if constexpr (b0_present) {
362 vst1(output, vadd(s[0], vmul(b0, x)));
363 } else /* constexpr */ {
364 vst1(output, s[0]);
365 }
366 output += stride;
367 new_s0 = s[1];
368 if constexpr (b1ss_present) {
369 new_s0 = vadd(new_s0, vmul(b1ss, x));
370 }
371 if constexpr (a1_present) {
372 new_s0 = vadd(new_s0, vmul(negativeA1, s[0]));
373 }
374 if constexpr (b2ss_present) {
375 s[1] = vmul(b2ss, x);
376 if constexpr (a2_present) {
377 s[1] = vadd(s[1], vmul(negativeA2, s[0]));
378 }
379 } else if constexpr (a2_present) {
380 s[1] = vmul(negativeA2, s[0]);
381 }
382 s[0] = new_s0;
383 };
384
385 while (frames > 0) {
386 #pragma unroll
387 for (size_t i = 0; i < UNROLL_LOOPS; ++i) {
388 KERNEL();
389 }
390 frames--;
391 }
392 if constexpr (UNROLL_LOOPS > 1) {
393 for (size_t i = 0; i < remainder; ++i) {
394 KERNEL();
395 }
396 }
397 s_[0] = s[0];
398 s_[1] = s[1];
399 }
400 };
401
402 namespace details {
403
404 // Helper methods for constructing a constexpr array of function pointers.
405 // As function pointers are efficient and have no constructor/destructor
406 // this is preferred over std::function.
407 //
408 // SC stands for SAME_COEF_PER_CHANNEL, a compile time boolean constant.
409 template <template <size_t, bool, typename ...> typename F, bool SC, size_t... Is>
410 static inline constexpr auto make_functional_array_from_index_sequence(std::index_sequence<Is...>) {
411 using first_t = decltype(&F<0, false>::func); // type from function
412 using result_t = std::array<first_t, sizeof...(Is)>; // type of array
413 return result_t{{F<Is, SC>::func...}}; // initialize with functions.
414 }
415
416 template <template <size_t, bool, typename ...> typename F, size_t M, bool SC>
417 static inline constexpr auto make_functional_array() {
418 return make_functional_array_from_index_sequence<F, SC>(std::make_index_sequence<M>());
419 }
420
421 // Returns true if the poles are stable for a Biquad.
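// Equivalently, both poles of 1 + a1 z^-1 + a2 z^-2 lie strictly inside the unit
// circle, i.e. the stability triangle condition |a2| < 1 and |a1| < 1 + a2.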
422 template <typename D>
423 static inline constexpr bool isStable(const D& a1, const D& a2) {
424 return fabs(a2) < D(1) && fabs(a1) < D(1) + a2;
425 }
426
427 // Simplifies Biquad coefficients.
428 // TODO: consider matched pole/zero cancellation.
429 // consider subnormal elimination for Intel processors.
430 template <typename D, typename T>
431 std::array<D, kBiquadNumCoefs> reduceCoefficients(const T& coef) {
432 std::array<D, kBiquadNumCoefs> lcoef;
433 if (coef.size() == kBiquadNumCoefs + 1) {
434 // General form of Biquad.
435 // Remove matched z^-1 factors in top and bottom (e.g. coefs[0] == coefs[3] == 0).
436 size_t offset = 0;
437 for (; offset < 2 && coef[offset] == 0 && coef[offset + 3] == 0; ++offset);
438 assert(coef[offset + 3] != 0); // the remaining leading denominator coefficient must be nonzero (causality).
439
440 // Normalize 6 coefficients to 5 for storage.
441 lcoef[0] = coef[offset] / coef[offset + 3];
442 for (size_t i = 1; i + offset < 3; ++i) {
443 lcoef[i] = coef[i + offset] / coef[offset + 3];
444 lcoef[i + 2] = coef[i + offset + 3] / coef[offset + 3];
445 }
446 } else if (coef.size() == kBiquadNumCoefs) {
447 std::copy(coef.begin(), coef.end(), lcoef.begin());
448 } else {
449 assert(coef.size() == kBiquadNumCoefs + 1 || coef.size() == kBiquadNumCoefs);
450 }
451 return lcoef;
452 }
453
454 // Sets a container of coefficients to storage.
455 template <typename D, typename T, typename DEST>
456 static inline void setCoefficients(
457 DEST& dest, size_t offset, size_t stride, size_t channelCount, const T& coef) {
458 auto lcoef = reduceCoefficients<D, T>(coef);
459 // replicate as needed
460 for (size_t i = 0; i < kBiquadNumCoefs; ++i) {
461 for (size_t j = 0; j < channelCount; ++j) {
462 dest[i * stride + offset + j] = lcoef[i];
463 }
464 }
465 }
466
467 // Helper function to zero channels in the input buffer.
468 // This is used for the degenerate coefficient case which results in all zeroes.
469 template <typename D>
470 void zeroChannels(D *out, size_t frames, size_t stride, size_t channelCount) {
471 if (stride == channelCount) {
472 memset(out, 0, sizeof(D) * frames * channelCount);
473 } else {
474 for (size_t i = 0; i < frames; i++) {
475 memset(out, 0, sizeof(D) * channelCount);
476 out += stride;
477 }
478 }
479 }
480
481 template <typename ConstOptions,
482 size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename T, typename F>
483 void biquad_filter_func_impl(F *out, const F *in, size_t frames, size_t stride,
484 size_t channelCount, F *delays, const F *coefs, size_t localStride) {
485 using namespace android::audio_utils::intrinsics;
486
487 constexpr size_t elements = sizeof(T) / sizeof(F); // how many float elements in T.
488 const size_t coefStride = SAME_COEF_PER_CHANNEL ? 1 : localStride;
489 using CoefType = std::conditional_t<SAME_COEF_PER_CHANNEL, F, T>;
490 using KernelType = typename ConstOptions::template FilterType<T, CoefType>;
491
492 for (size_t i = 0; i < channelCount; i += elements) {
493 T s1 = vld1<T>(&delays[0]);
494 T s2 = vld1<T>(&delays[localStride]);
495
496 KernelType kernel(
497 vld1<CoefType>(coefs), vld1<CoefType>(coefs + coefStride),
498 vld1<CoefType>(coefs + coefStride * 2), vld1<CoefType>(coefs + coefStride * 3),
499 vld1<CoefType>(coefs + coefStride * 4),
500 s1, s2);
501 if constexpr (!SAME_COEF_PER_CHANNEL) coefs += elements;
502 kernel.template process<F, OCCUPANCY>(&out[i], &in[i], frames, stride);
503 vst1(&delays[0], kernel.s_[0]);
504 vst1(&delays[localStride], kernel.s_[1]);
505 delays += elements;
506 }
507 }
508
509 // Find the nearest occupancy mask that includes all the desired bits.
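// For example, given the required_occupancies_ lists above, a desired occupancy of 0x5
// (b0 and b2 nonzero) maps to the nearest superset 0x7 (double zero); any pattern not
// covered by the list falls back to 31 (the full Biquad).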
510 template <typename T, size_t N>
511 static constexpr size_t nearestOccupancy(T occupancy, const T (&occupancies)[N]) {
512 if (occupancy < 32) {
513 for (auto test : occupancies) {
514 if ((occupancy & test) == occupancy) return test;
515 }
516 }
517 return 31;
518 }
519
520 enum FILTER_OPTION {
521 FILTER_OPTION_SCALAR_ONLY = (1 << 0),
522 };
523
524
525 /**
526 * DefaultBiquadConstOptions holds the default set of options for customizing
527 * the Biquad filter at compile time.
528 *
529 * Consider inheriting from this structure and overriding the options
530 * desired; this is backward compatible to new options added in the future.
531 */
532 struct DefaultBiquadConstOptions {
533
534 // Sets the Biquad filter type.
535 // Can be one of the already defined BiquadDirect2Transpose or BiquadStateSpace
536 // filter kernels; also can be a user defined filter kernel as well.
537 template <typename T, typename F>
538 using FilterType = BiquadStateSpace<T, F, false /* SEPARATE_CHANNEL_OPTIMIZATION */>;
539 };
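
// A hypothetical customization sketch (not part of this header): a ConstOptions struct
// that selects the BiquadDirect2Transpose kernel instead of the default BiquadStateSpace
// kernel. The name MyBiquadConstOptions is illustrative only.
//
//   struct MyBiquadConstOptions : public DefaultBiquadConstOptions {
//       template <typename T, typename F>
//       using FilterType = BiquadDirect2Transpose<T, F>;
//   };
//
//   // Used as: BiquadFilter<float, true, MyBiquadConstOptions> filter(channelCount, coefs);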
540
541 #define BIQUAD_FILTER_CASE(N, ... /* type */) \
542 case N: { \
543 using VectorType = __VA_ARGS__; \
544 biquad_filter_func_impl<ConstOptions, \
545 nearestOccupancy(OCCUPANCY, \
546 ConstOptions::template FilterType<VectorType, D> \
547 ::required_occupancies_), \
548 SAME_COEF_PER_CHANNEL, VectorType>( \
549 out + offset, in + offset, frames, stride, remaining, \
550 delays + offset, c, localStride); \
551 goto exit; \
552 }
553
554 template <typename ConstOptions, size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
555 void biquad_filter_func(D *out, const D *in, size_t frames, size_t stride,
556 size_t channelCount, D *delays, const D *coefs, size_t localStride,
557 FILTER_OPTION filterOptions) {
558 if constexpr ((OCCUPANCY & 7) == 0) { // all b's are zero, output is zero.
559 zeroChannels(out, frames, stride, channelCount);
560 return;
561 }
562
563 // Possible alternative intrinsic types for 2, 9, 15 float elements.
564 // using alt_2_t = struct {struct { float a; float b; } s; };
565 // using alt_9_t = struct { struct { float32x4x2_t a; float b; } s; };
566 // using alt_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };
567
568 #ifdef USE_NEON
569 // use NEON types to ensure we have the proper intrinsic acceleration.
570 using alt_16_t = float32x4x4_t;
571 using alt_8_t = float32x4x2_t;
572 using alt_4_t = float32x4_t;
573 #else
574 // Use C++ types, no NEON needed.
575 using alt_16_t = intrinsics::internal_array_t<float, 16>;
576 using alt_8_t = intrinsics::internal_array_t<float, 8>;
577 using alt_4_t = intrinsics::internal_array_t<float, 4>;
578 #endif
579
580 for (size_t offset = 0; offset < channelCount; ) {
581 size_t remaining = channelCount - offset;
582 auto *c = SAME_COEF_PER_CHANNEL ? coefs : coefs + offset;
583 if (filterOptions & FILTER_OPTION_SCALAR_ONLY) goto scalar;
584 if constexpr (std::is_same_v<D, float>) {
585 switch (remaining) {
586 default:
587 if (remaining >= 16) {
588 remaining &= ~15;
589 biquad_filter_func_impl<ConstOptions,
590 nearestOccupancy(OCCUPANCY,
591 ConstOptions::template FilterType<D, D>
592 ::required_occupancies_),
593 SAME_COEF_PER_CHANNEL, alt_16_t>(
594 out + offset, in + offset, frames, stride, remaining,
595 delays + offset, c, localStride);
596 offset += remaining;
597 continue;
598 }
599 break; // case 1 handled at bottom.
600 BIQUAD_FILTER_CASE(15, intrinsics::internal_array_t<float, 15>)
601 BIQUAD_FILTER_CASE(14, intrinsics::internal_array_t<float, 14>)
602 BIQUAD_FILTER_CASE(13, intrinsics::internal_array_t<float, 13>)
603 BIQUAD_FILTER_CASE(12, intrinsics::internal_array_t<float, 12>)
604 BIQUAD_FILTER_CASE(11, intrinsics::internal_array_t<float, 11>)
605 BIQUAD_FILTER_CASE(10, intrinsics::internal_array_t<float, 10>)
606 BIQUAD_FILTER_CASE(9, intrinsics::internal_array_t<float, 9>)
607 BIQUAD_FILTER_CASE(8, alt_8_t)
608 BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<float, 7>)
609 BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<float, 6>)
610 BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<float, 5>)
611 BIQUAD_FILTER_CASE(4, alt_4_t)
612 BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<float, 3>)
613 BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<float, 2>)
614 // BIQUAD_FILTER_CASE(1, BiquadFilterType, intrinsics::internal_array_t<float, 1>)
615 }
616 } else if constexpr (std::is_same_v<D, double>) {
617 #if defined(__aarch64__)
618 switch (remaining) {
619 default:
620 if (remaining >= 8) {
621 remaining &= ~7;
622 biquad_filter_func_impl<ConstOptions,
623 nearestOccupancy(OCCUPANCY,
624 ConstOptions::template FilterType<D, D>
625 ::required_occupancies_),
626 SAME_COEF_PER_CHANNEL,
627 intrinsics::internal_array_t<double, 8>>(
628 out + offset, in + offset, frames, stride, remaining,
629 delays + offset, c, localStride);
630 offset += remaining;
631 continue;
632 }
633 break; // case 1 handled at bottom.
634 BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<double, 7>)
635 BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<double, 6>)
636 BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<double, 5>)
637 BIQUAD_FILTER_CASE(4, intrinsics::internal_array_t<double, 4>)
638 BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<double, 3>)
639 BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<double, 2>)
640 };
641 #endif
642 }
643 scalar:
644 // Essentially the code below is scalar, the same as
645 // biquad_filter_1fast<OCCUPANCY, SAME_COEF_PER_CHANNEL>,
646 // but formulated with NEON intrinsic-like call pattern.
647 biquad_filter_func_impl<ConstOptions,
648 nearestOccupancy(OCCUPANCY,
649 ConstOptions::template FilterType<D, D>::required_occupancies_),
650 SAME_COEF_PER_CHANNEL, D>(
651 out + offset, in + offset, frames, stride, remaining,
652 delays + offset, c, localStride);
653 offset += remaining;
654 }
655 exit:;
656 }
657
658 } // namespace details
659
660 /**
661 * BiquadFilter
662 *
663 * A multichannel Biquad filter implementation of the following transfer function.
664 *
665 * \f[
666 * H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
667 * { 1 + a_1 z^{-1} + a_2 z^{-2} }
668 * \f]
669 *
670 * <!--
671 * b_0 + b_1 z^{-1} + b_2 z^{-2}
672 * H(z)= -----------------------------
673 * 1 + a_1 z^{-1} + a_2 z^{-2}
674 * -->
675 *
676 * Details:
677 * 1. The transposed direct type 2 implementation allows zeros to be computed
678 * before poles in the internal state for improved filter precision and
679 * better time-varying coefficient performance.
680 * 2. We optimize for zero coefficients using a compile-time generated function table.
681 * 3. We optimize for vector operations using column vector operations with stride
682 * into interleaved audio data.
683 * 4. The denominator coefficients a_1 and a_2 are stored in positive form, despite the
684 * negated form being slightly simpler for optimization (addition is commutative but
685 * subtraction is not commutative). This is to permit obtaining the coefficients
686 * as a const reference.
687 *
688 * Compatibility issue: Some Biquad libraries store the denominator coefficients
689 * in negated form. We explicitly negate before entering into the inner loop.
690 * 5. The full 6 coefficient Biquad filter form with a_0 != 1 may be used for setting
691 * coefficients. See setCoefficients() below.
692 *
693 * If SAME_COEF_PER_CHANNEL is false, then mCoefs is stored interleaved by channel.
694 *
695 * The Biquad filter update equation in transposed Direct form 2 is as follows:
696 *
697 * \f{eqnarray*}{
698 * y[n] &=& b0 * x[n] + s1[n - 1] \\
699 * s1[n] &=& s2[n - 1] + b1 * x[n] - a1 * y[n] \\
700 * s2[n] &=& b2 * x[n] - a2 * y[n]
701 * \f}
702 *
703 * For the transposed Direct form 2 update equation s1 and s2 represent the delay state
704 * contained in the internal vector mDelays[]. This is stored interleaved by channel.
705 *
706 * Use `-ffast-math` to permit associative math optimizations and obtain effective loop
707 * optimization, as we do not rely on strict C operator precedence and associativity here.
708 * TODO(b/159373530): Use compound statement scoped pragmas instead of `-ffast-math`.
709 *
710 * \param D type variable representing the data type, one of float or double.
711 * The default is float.
712 * \param SAME_COEF_PER_CHANNEL bool which is true if all the Biquad coefficients
713 * are shared between channels, or false if the Biquad coefficients
714 * may differ between channels. The default is true.
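*
* A minimal usage sketch (stereo float; the coefficient array "coefs" holding
* { b0, b1, b2, a1, a2 } and the interleaved buffers "out" and "in" are assumed):
*
*   android::audio_utils::BiquadFilter<float> filter(2, coefs);  // 2 channels
*   filter.process(out, in, frames);  // processes frames of interleaved stereo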
715 */
716
717 template <typename D = float,
718 bool SAME_COEF_PER_CHANNEL = true,
719 typename ConstOptions = details::DefaultBiquadConstOptions>
720 class BiquadFilter {
721 public:
722 template <typename T = std::array<D, kBiquadNumCoefs>>
723 explicit BiquadFilter(size_t channelCount,
724 const T& coefs = {}, bool optimized = true)
725 : mChannelCount(channelCount)
726 , mCoefs(kBiquadNumCoefs * (SAME_COEF_PER_CHANNEL ? 1 : mChannelCount))
727 , mDelays(channelCount * kBiquadNumDelays) {
728 setCoefficients(coefs, optimized);
729 }
730
731 // copy constructors
732 BiquadFilter(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) {
733 *this = other;
734 }
735
736 BiquadFilter(BiquadFilter<D, SAME_COEF_PER_CHANNEL>&& other) {
737 *this = std::move(other);
738 }
739
740 // copy assignment
741 BiquadFilter<D, SAME_COEF_PER_CHANNEL>& operator=(
742 const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) {
743 mChannelCount = other.mChannelCount;
744 mCoefs = other.mCoefs;
745 mDelays = other.mDelays;
746 return *this;
747 }
748
749 BiquadFilter<D, SAME_COEF_PER_CHANNEL>& operator=(
750 BiquadFilter<D, SAME_COEF_PER_CHANNEL>&& other) {
751 mChannelCount = other.mChannelCount;
752 mCoefs = std::move(other.mCoefs);
753 mDelays = std::move(other.mDelays);
754 return *this;
755 }
756
757 // operator overloads for equality tests
758 bool operator==(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) const {
759 return mChannelCount == other.mChannelCount
760 && mCoefs == other.mCoefs
761 && mDelays == other.mDelays;
762 }
763
764 bool operator!=(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) const {
765 return !operator==(other);
766 }
767
768 /**
769 * \brief Sets filter coefficients
770 *
771 * \param coefs pointer to the filter coefficients array.
772 * \param optimized whether to use processor optimized function (optional, defaults true).
773 * \return true if the BiquadFilter is stable, false otherwise.
774 *
775 * The input coefficients are interpreted in the following manner:
776 *
777 * If size of container is 5 (normalized Biquad):
778 * coefs[0] is b0,
779 * coefs[1] is b1,
780 * coefs[2] is b2,
781 * coefs[3] is a1,
782 * coefs[4] is a2.
783 *
784 * \f[
785 * H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
786 * { 1 + a_1 z^{-1} + a_2 z^{-2} }
787 * \f]
788 * <!--
789 * b_0 + b_1 z^{-1} + b_2 z^{-2}
790 * H(z)= -----------------------------
791 * 1 + a_1 z^{-1} + a_2 z^{-2}
792 * -->
793 *
794 * If size of container is 6 (general Biquad):
795 * coefs[0] is b0,
796 * coefs[1] is b1,
797 * coefs[2] is b2,
798 * coefs[3] is a0,
799 * coefs[4] is a1,
800 * coefs[5] is a2.
801 *
802 * \f[
803 * H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
804 * { a_0 + a_1 z^{-1} + a_2 z^{-2} }
805 * \f]
806 * <!--
807 * b_0 + b_1 z^{-1} + b_2 z^{-2}
808 * H(z)= -----------------------------
809 * a_0 + a_1 z^{-1} + a_2 z^{-2}
810 * -->
811 *
812 * The internal representation is a normalized Biquad.
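*
* For example (values assumed for illustration), the general 6 coefficient container
* { 1, 2, 1, 2, 0.2, 0.4 } is normalized by a0 = 2 and stored internally as the
* 5 coefficient Biquad { 0.5, 1, 0.5, 0.1, 0.2 }:
*
*   std::vector<float> coefs{ 1.f, 2.f, 1.f, 2.f, 0.2f, 0.4f };
*   filter.setCoefficients(coefs);  // returns true, as the result is stable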
813 */
814 template <typename T = std::array<D, kBiquadNumCoefs>>
815 bool setCoefficients(const T& coefs, bool optimized = true) {
816 if constexpr (SAME_COEF_PER_CHANNEL) {
817 details::setCoefficients<D, T>(
818 mCoefs, 0 /* offset */, 1 /* stride */, 1 /* channelCount */, coefs);
819 } else {
820 if (coefs.size() == mCoefs.size()) {
821 std::copy(coefs.begin(), coefs.end(), mCoefs.begin());
822 } else {
823 details::setCoefficients<D, T>(
824 mCoefs, 0 /* offset */, mChannelCount, mChannelCount, coefs);
825 }
826 }
827 setOptimization(optimized);
828 return isStable();
829 }
830
831 /**
832 * Sets coefficients for one of the filter channels, specified by channelIndex.
833 *
834 * This method is only available if SAME_COEF_PER_CHANNEL is false.
835 *
836 * \param coefs the coefficients to set.
837 * \param channelIndex the particular channel index to set.
838 * \param optimized whether to use optimized function (optional, defaults true).
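*
* A usage sketch (the lowpassCoefs and highpassCoefs arrays are assumed):
*
*   BiquadFilter<float, false> filter(2);        // per-channel coefficients
*   filter.setCoefficients(lowpassCoefs, 0);     // channel 0
*   filter.setCoefficients(highpassCoefs, 1);    // channel 1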
839 */
840 template <typename T = std::array<D, kBiquadNumCoefs>>
841 bool setCoefficients(const T& coefs, size_t channelIndex, bool optimized = true) {
842 static_assert(!SAME_COEF_PER_CHANNEL);
843
844 details::setCoefficients<D, T>(
845 mCoefs, channelIndex, mChannelCount, 1 /* channelCount */, coefs);
846 setOptimization(optimized);
847 return isStable();
848 }
849
850 /**
851 * Returns the coefficients as a const vector reference.
852 *
853 * If multichannel and the template variable SAME_COEF_PER_CHANNEL is false,
854 * the coefficients are interleaved by channel.
855 */
856 const std::vector<D>& getCoefficients() const {
857 return mCoefs;
858 }
859
860 /**
861 * Returns true if the filter is stable.
862 *
863 * \param channelIndex ignored if SAME_COEF_PER_CHANNEL is true,
864 * asserts if channelIndex >= channel count (zero based index).
865 */
866 bool isStable(size_t channelIndex = 0) const {
867 if constexpr (SAME_COEF_PER_CHANNEL) {
868 return details::isStable(mCoefs[3], mCoefs[4]);
869 } else {
870 assert(channelIndex < mChannelCount);
871 return details::isStable(
872 mCoefs[3 * mChannelCount + channelIndex],
873 mCoefs[4 * mChannelCount + channelIndex]);
874 }
875 }
876
877 /**
878 * Updates the filter function based on processor optimization.
879 *
880 * \param optimized if true, enables Processor based optimization.
881 */
882 void setOptimization(bool optimized) {
883 // Determine which coefficients are nonzero as a bit field.
884 size_t category = 0;
885 for (size_t i = 0; i < kBiquadNumCoefs; ++i) {
886 if constexpr (SAME_COEF_PER_CHANNEL) {
887 category |= (mCoefs[i] != 0) << i;
888 } else {
889 for (size_t j = 0; j < mChannelCount; ++j) {
890 if (mCoefs[i * mChannelCount + j] != 0) {
891 category |= 1 << i;
892 break;
893 }
894 }
895 }
896 }
897
898 // Select the proper filtering function from our array.
899 if (optimized) {
900 mFilterOptions = (details::FILTER_OPTION)
901 (mFilterOptions & ~details::FILTER_OPTION_SCALAR_ONLY);
902 } else {
903 mFilterOptions = (details::FILTER_OPTION)
904 (mFilterOptions | details::FILTER_OPTION_SCALAR_ONLY);
905 }
906 mFunc = mFilterFuncs[category];
907 }
908
909 /**
910 * \brief Filters the input data
911 *
912 * \param out pointer to the output data
913 * \param in pointer to the input data
914 * \param frames number of audio frames to be processed
915 */
916 void process(D* out, const D* in, size_t frames) {
917 process(out, in, frames, mChannelCount);
918 }
919
920 /**
921 * \brief Filters the input data with stride
922 *
923 * \param out pointer to the output data
924 * \param in pointer to the input data
925 * \param frames number of audio frames to be processed
926 * \param stride the total number of samples associated with a frame, if not channelCount.
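*
* For example (a sketch; the buffers are assumed), a 2 channel filter applied to the
* first two channels of a 6 channel interleaved buffer uses a stride of 6:
*
*   filter.process(out, in, frames, 6);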
927 */
928 void process(D* out, const D* in, size_t frames, size_t stride) {
929 assert(stride >= mChannelCount);
930 mFunc(out, in, frames, stride, mChannelCount, mDelays.data(),
931 mCoefs.data(), mChannelCount, mFilterOptions);
932 }
933
934 /**
935 * EXPERIMENTAL:
936 * Processes 1D input data, with mChannelCount Biquads, using sliding window parallelism.
937 *
938 * Instead of considering the mChannelCount Biquads as one-per-input channel, this method
939 * treats the mChannelCount Biquads as applied in sequence to a single 1D input stream,
940 * with the Biquad of the last channel being applied first.
941 *
942 * input audio data -> BQ_{n-1} -> BQ_{n-2} -> BQ_{n-3} -> ... -> BQ_{0} -> output
943 *
944 * TODO: Make this code efficient for NEON and split the destination from the source.
945 *
946 * Theoretically this code should be much faster for 1D input if one has 4+ Biquads to be
947 * sequentially applied, but in practice it is *MUCH* slower.
948 * On NEON, the data cannot be written then read in-place without incurring
949 * memory stall penalties. A shifting NEON holding register is required to make this
950 * a practical improvement.
951 */
952 void process1D(D* inout, size_t frames) {
953 size_t remaining = mChannelCount;
954 #ifdef USE_NEON
955 // We apply NEON acceleration striped with 4 filters (channels) at once.
956 // Filters operations commute, nevertheless we apply the filters in order.
957 if (frames >= 2 * mChannelCount) {
958 constexpr size_t channelBlock = 4;
959 for (; remaining >= channelBlock; remaining -= channelBlock) {
960 const size_t baseIdx = remaining - channelBlock;
961 // This is the 1D accelerated method.
962 // prime the data pipe.
963 for (size_t i = 0; i < channelBlock - 1; ++i) {
964 size_t fromEnd = remaining - i - 1;
965 auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
966 auto delays = mDelays.data() + fromEnd;
967 mFunc(inout, inout, 1 /* frames */, 1 /* stride */, i + 1,
968 delays, coefs, mChannelCount, mFilterOptions);
969 }
970
971 auto delays = mDelays.data() + baseIdx;
972 auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : baseIdx);
973 // Parallel processing - we use a sliding window doing channelBlock at once,
974 // sliding one audio sample at a time.
975 mFunc(inout, inout,
976 frames - channelBlock + 1, 1 /* stride */, channelBlock,
977 delays, coefs, mChannelCount, mFilterOptions);
978
979 // drain data pipe.
980 for (size_t i = 1; i < channelBlock; ++i) {
981 mFunc(inout + frames - channelBlock + i, inout + frames - channelBlock + i,
982 1 /* frames */, 1 /* stride */, channelBlock - i,
983 delays, coefs, mChannelCount, mFilterOptions);
984 }
985 }
986 }
987 #endif
988 // For short data sequences, we use the serial single channel logical equivalent
989 for (; remaining > 0; --remaining) {
990 size_t fromEnd = remaining - 1;
991 auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
992 mFunc(inout, inout,
993 frames, 1 /* stride */, 1 /* channelCount */,
994 mDelays.data() + fromEnd, coefs, mChannelCount, mFilterOptions);
995 }
996 }
997
998 /**
999 * \brief Clears the delay elements
1000 *
1001 * This function clears the delay elements representing the filter state.
1002 */
1003 void clear() {
1004 std::fill(mDelays.begin(), mDelays.end(), 0.f);
1005 }
1006
1007 /**
1008 * \brief Sets the internal delays from a vector
1009 *
1010 * For a multichannel stream, the delays are interleaved by channel:
1011 * delays[2 * i + 0] is s1 of i-th channel,
1012 * delays[2 * i + 1] is s2 of i-th channel,
1013 * where index i runs from 0 to (mChannelCount - 1).
1014 *
1015 * \param delays reference to vector containing delays.
1016 */
1017 void setDelays(std::vector<D>& delays) {
1018 assert(delays.size() == mDelays.size());
1019 mDelays = std::move(delays);
1020 }
1021
1022 /**
1023 * \brief Gets delay elements as a vector
1024 *
1025 * For a multichannel stream, the delays are interleaved by channel:
1026 * delays[2 * i + 0] is s1 of i-th channel,
1027 * delays[2 * i + 1] is s2 of i-th channel,
1028 * where index i runs from 0 to (mChannelCount - 1).
1029 *
1030 * \return a const vector reference of delays.
1031 */
1032 const std::vector<D>& getDelays() const {
1033 return mDelays;
1034 }
1035
1036 private:
1037 /* const */ size_t mChannelCount; // not const because we can assign to it on operator equals.
1038
1039 /*
1040 * \var D mCoefs
1041 * \brief Stores the filter coefficients
1042 *
1043 * If SAME_COEF_PER_CHANNEL is false, the filter coefficients are stored
1044 * interleaved by channel.
1045 */
1046 std::vector<D> mCoefs;
1047
1048 /**
1049 * \var D mDelays
1050 * \brief The delay state.
1051 *
1052 * The delays are stored channel interleaved in the following manner,
1053 * mDelays[2 * i + 0] is s1 of i-th channel
1054 * mDelays[2 * i + 1] is s2 of i-th channel
1055 * index i runs from 0 to (mChannelCount - 1).
1056 */
1057 std::vector<D> mDelays;
1058
1059 details::FILTER_OPTION mFilterOptions{};
1060
1061 // Consider making a separate delegation class.
1062 /*
1063 * We store an array of functions based on the occupancy.
1064 *
1065 * OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
1066 * b0 b1 b2 a1 a2 (from lsb to msb)
1067 *
1068 * static inline constexpr std::array<filter_func*, M> mArray = {
1069 * biquad_filter_func<0>,
1070 * biquad_filter_func<1>,
1071 * biquad_filter_func<2>,
1072 * ...
1073 * biquad_filter_func<(1 << kBiquadNumCoefs) - 1>,
1074 * };
1075 *
1076 * Every time the coefficients are changed, we select the processing function from
1077 * this table.
1078 */
1079
1080 // Used to build the functional array.
1081 template <size_t OCCUPANCY, bool SC> // note SC == SAME_COEF_PER_CHANNEL
1082 struct FuncWrap {
1083 static void func(D* out, const D *in, size_t frames, size_t stride,
1084 size_t channelCount, D *delays, const D *coef, size_t localStride,
1085 details::FILTER_OPTION filterOptions) {
1086 constexpr size_t NEAREST_OCCUPANCY =
1087 details::nearestOccupancy(
1088 OCCUPANCY, ConstOptions::template FilterType<D, D>
1089 ::required_occupancies_);
1090 details::biquad_filter_func<ConstOptions, NEAREST_OCCUPANCY, SC>(
1091 out, in, frames, stride, channelCount, delays, coef, localStride,
1092 filterOptions);
1093 }
1094 };
1095
1096 // Vector optimized array of functions.
1097 static inline constexpr auto mFilterFuncs =
1098 details::make_functional_array<
1099 FuncWrap, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();
1100
1101 /**
1102 * \var filter_func* mFunc
1103 *
1104 * The current filter function selected for the channel occupancy of the Biquad.
1105 * It will be one of mFilterFuncs.
1106 */
1107 std::decay_t<decltype(mFilterFuncs[0])> mFunc;
1108 };
1109
1110 } // namespace android::audio_utils
1111
1112 #pragma pop_macro("USE_DITHER")
1113 #pragma pop_macro("USE_NEON")
1114