/*
 * Copyright (C) 2020 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H
#define ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H

#include "intrinsic_utils.h"

#include <algorithm>
#include <array>
#include <cmath>
#include <cstring>
#include <functional>
#include <limits>
#include <utility>
#include <vector>

#include <assert.h>

// We conditionally include neon optimizations for ARM devices
#pragma push_macro("USE_NEON")
#undef USE_NEON

#if defined(__ARM_NEON__) || defined(__aarch64__)
#include <arm_neon.h>
#define USE_NEON
#endif

namespace android::audio_utils {

static constexpr size_t kBiquadNumCoefs = 5;
static constexpr size_t kBiquadNumDelays = 2;

namespace details {
// Helper methods for constructing a constexpr array of function pointers.
// As function pointers are efficient and have no constructor/destructor
// this is preferred over std::function.
//
// SC stands for SAME_COEF_PER_CHANNEL, a compile time boolean constant.
template <template <size_t, bool, typename ...> typename F, bool SC, size_t... Is>
static inline constexpr auto make_functional_array_from_index_sequence(std::index_sequence<Is...>) {
    using first_t = decltype(&F<0, false>::func);  // type from function
    using result_t = std::array<first_t, sizeof...(Is)>;  // type of array
    return result_t{{F<Is, SC>::func...}};  // initialize with functions.
}

template <template <size_t, bool, typename ...> typename F, size_t M, bool SC>
static inline constexpr auto make_functional_array() {
    return make_functional_array_from_index_sequence<F, SC>(std::make_index_sequence<M>());
}

// Returns true if the poles are stable for a Biquad.
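// For example (illustrative values): a1 = -1.8, a2 = 0.81 (a double pole at z = 0.9)
// satisfies |a2| < 1 and |a1| < 1 + a2 and is stable, whereas a1 = -2.0, a2 = 1.0
// (poles on the unit circle at z = 1) fails |a2| < 1 and is rejected.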
template <typename D>
static inline constexpr bool isStable(const D& a1, const D& a2) {
    return fabs(a2) < D(1) && fabs(a1) < D(1) + a2;
}

// Simplifies Biquad coefficients.
// TODO: consider matched pole/zero cancellation.
//       consider subnormal elimination for Intel processors.
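//
// For example (illustrative numbers): the general 6-coefficient set
// {b0, b1, b2, a0, a1, a2} = {1, 2, 1, 2, 0.2, 0.1} is divided through by a0 = 2
// and stored as the normalized 5-coefficient set {0.5, 1, 0.5, 0.1, 0.05}.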
template <typename D, typename T>
std::array<D, kBiquadNumCoefs> reduceCoefficients(const T& coef) {
    std::array<D, kBiquadNumCoefs> lcoef;
    if (coef.size() == kBiquadNumCoefs + 1) {
        // General form of Biquad.
        // Remove matched z^-1 factors in top and bottom (e.g. coef[0] == coef[3] == 0).
        size_t offset = 0;
        for (; offset < 2 && coef[offset] == 0 && coef[offset + 3] == 0; ++offset);
        assert(coef[offset + 3] != 0); // hmm... shouldn't we be causal?

        // Normalize 6 coefficients to 5 for storage.
        lcoef[0] = coef[offset] / coef[offset + 3];
        for (size_t i = 1; i + offset < 3; ++i) {
            lcoef[i] = coef[i + offset] / coef[offset + 3];
            lcoef[i + 2] = coef[i + offset + 3] / coef[offset + 3];
        }
    } else if (coef.size() == kBiquadNumCoefs) {
        std::copy(coef.begin(), coef.end(), lcoef.begin());
    } else {
        assert(coef.size() == kBiquadNumCoefs + 1 || coef.size() == kBiquadNumCoefs);
    }
    return lcoef;
}

// Sets a container of coefficients to storage.
template <typename D, typename T, typename DEST>
static inline void setCoefficients(
        DEST& dest, size_t offset, size_t stride, size_t channelCount, const T& coef) {
    auto lcoef = reduceCoefficients<D, T>(coef);
    // replicate as needed
    for (size_t i = 0; i < kBiquadNumCoefs; ++i) {
        for (size_t j = 0; j < channelCount; ++j) {
            dest[i * stride + offset + j] = lcoef[i];
        }
    }
}

// For biquad_filter_fast, we template based on whether coef[i] is nonzero - this should be
// determined in a constexpr fashion for optimization.
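//
// For example, OCCUPANCY == 1 (only b0 nonzero) reduces to a simple gain,
// OCCUPANCY == 7 keeps all three zeros (an FIR), and OCCUPANCY == 31 is the full Biquad.
// The bit assignment (b0, b1, b2, a1, a2 from lsb to msb) matches the update loop below.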

// Helper which takes a stride to allow column processing of interleaved audio streams.
template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
void biquad_filter_1fast(D *out, const D *in, size_t frames, size_t stride,
        size_t channelCount, D *delays, const D *coefs, size_t localStride) {
#if defined(__i386__) || defined(__x86_64__)
    D delta = std::numeric_limits<float>::min() * (1 << 24);
#endif
    D b0, b1, b2, negativeA1, negativeA2;

    if constexpr (SAME_COEF_PER_CHANNEL) {
        b0 = coefs[0];
        b1 = coefs[1];
        b2 = coefs[2];
        negativeA1 = -coefs[3];
        negativeA2 = -coefs[4];
    }
    for (size_t i = 0; i < channelCount; ++i) {
        if constexpr (!SAME_COEF_PER_CHANNEL) {
            b0 = coefs[0];
            b1 = coefs[localStride];
            b2 = coefs[2 * localStride];
            negativeA1 = -coefs[3 * localStride];
            negativeA2 = -coefs[4 * localStride];
            ++coefs;
        }

        D s1n1 = delays[0];
        D s2n1 = delays[localStride];
        const D *input = &in[i];
        D *output = &out[i];
        for (size_t j = frames; j > 0; --j) {
            // Adding a delta to avoid subnormal exception handling on the x86/x64 platform;
            // this is not a problem with the ARM platform. The delta will not affect the
            // precision of the result.
#if defined(__i386__) || defined(__x86_64__)
            const D xn = *input + delta;
#else
            const D xn = *input;
#endif
            D yn = (OCCUPANCY >> 0 & 1) * b0 * xn + s1n1;
            s1n1 = (OCCUPANCY >> 1 & 1) * b1 * xn + (OCCUPANCY >> 3 & 1) * negativeA1 * yn + s2n1;
            s2n1 = (OCCUPANCY >> 2 & 1) * b2 * xn + (OCCUPANCY >> 4 & 1) * negativeA2 * yn;

            input += stride;

            *output = yn;
            output += stride;

#if defined(__i386__) || defined(__x86_64__)
            delta = -delta;
#endif
        }
        delays[0] = s1n1;
        delays[localStride] = s2n1;
        ++delays;
    }
}

// Helper function to zero channels in the output buffer.
// This is used for the degenerate coefficient case which results in all zeroes.
template <typename D>
void zeroChannels(D *out, size_t frames, size_t stride, size_t channelCount) {
    if (stride == channelCount) {
        memset(out, 0, sizeof(D) * frames * channelCount);
    } else {
        for (size_t i = 0; i < frames; i++) {
            memset(out, 0, sizeof(D) * channelCount);
            out += stride;
        }
    }
}

template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
void biquad_filter_fast(D *out, const D *in, size_t frames, size_t stride,
        size_t channelCount, D *delays, const D *coefs, size_t localStride) {
    if constexpr ((OCCUPANCY & 7) == 0) { // all b's are zero, output is zero.
        zeroChannels(out, frames, stride, channelCount);
        return;
    }
    biquad_filter_1fast<OCCUPANCY, SAME_COEF_PER_CHANNEL>(
            out, in, frames, stride, channelCount, delays, coefs, localStride);
}

#ifdef USE_NEON

template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename T, typename F>
void biquad_filter_neon_impl(F *out, const F *in, size_t frames, size_t stride,
        size_t channelCount, F *delays, const F *coefs, size_t localStride) {
    using namespace android::audio_utils::intrinsics;

    constexpr size_t elements = sizeof(T) / sizeof(F); // how many float elements in T.
    T b0, b1, b2, negativeA1, negativeA2;
    if constexpr (SAME_COEF_PER_CHANNEL) {
        b0 = vdupn<T>(coefs[0]);
        b1 = vdupn<T>(coefs[1]);
        b2 = vdupn<T>(coefs[2]);
        negativeA1 = vneg(vdupn<T>(coefs[3]));
        negativeA2 = vneg(vdupn<T>(coefs[4]));
    }
    for (size_t i = 0; i < channelCount; i += elements) {
        if constexpr (!SAME_COEF_PER_CHANNEL) {
            b0 = vld1<T>(coefs);
            b1 = vld1<T>(coefs + localStride);
            b2 = vld1<T>(coefs + localStride * 2);
            negativeA1 = vneg(vld1<T>(coefs + localStride * 3));
            negativeA2 = vneg(vld1<T>(coefs + localStride * 4));
            coefs += elements;
        }
        T s1 = vld1<T>(&delays[0]);
        T s2 = vld1<T>(&delays[localStride]);
        const F *input = &in[i];
        F *output = &out[i];
        for (size_t j = frames; j > 0; --j) {
            T xn = vld1<T>(input);
            T yn = s1;

            if constexpr (OCCUPANCY >> 0 & 1) {
                yn = vmla(yn, b0, xn);
            }
            s1 = s2;
            if constexpr (OCCUPANCY >> 3 & 1) {
                s1 = vmla(s1, negativeA1, yn);
            }
            if constexpr (OCCUPANCY >> 1 & 1) {
                s1 = vmla(s1, b1, xn);
            }
            if constexpr (OCCUPANCY >> 2 & 1) {
                s2 = vmul(b2, xn);
            } else {
                s2 = vdupn<T>(0.f);
            }
            if constexpr (OCCUPANCY >> 4 & 1) {
                s2 = vmla(s2, negativeA2, yn);
            }

            input += stride;
            vst1(output, yn);
            output += stride;
        }
        vst1(&delays[0], s1);
        vst1(&delays[localStride], s2);
        delays += elements;
    }
}

#define BIQUAD_FILTER_CASE(N, ... /* type */) \
    case N: { \
        biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, __VA_ARGS__>( \
                out + offset, in + offset, frames, stride, remaining, \
                delays + offset, c, localStride); \
        goto exit; \
    }

template <size_t OCCUPANCY, bool SAME_COEF_PER_CHANNEL, typename D>
void biquad_filter_neon(D *out, const D *in, size_t frames, size_t stride,
        size_t channelCount, D *delays, const D *coefs, size_t localStride) {
    if constexpr ((OCCUPANCY & 7) == 0) { // all b's are zero, output is zero.
        zeroChannels(out, frames, stride, channelCount);
        return;
    }

    // Possible alternative intrinsic types for 2, 9, 15 float elements.
    // using alt_2_t = struct {struct { float a; float b; } s; };
    // using alt_9_t = struct { struct { float32x4x2_t a; float b; } s; };
    // using alt_15_t = struct { struct { float32x4x2_t a; struct { float v[7]; } b; } s; };

    for (size_t offset = 0; offset < channelCount; ) {
        size_t remaining = channelCount - offset;
        auto *c = SAME_COEF_PER_CHANNEL ? coefs : coefs + offset;
        if constexpr (std::is_same_v<D, float>) {
            switch (remaining) {
            default:
                if (remaining >= 16) {
                    remaining &= ~15;
                    biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, float32x4x4_t>(
                            out + offset, in + offset, frames, stride, remaining,
                            delays + offset, c, localStride);
                    offset += remaining;
                    continue;
                }
                break; // case 1 handled at bottom.
            BIQUAD_FILTER_CASE(15, intrinsics::internal_array_t<float, 15>)
            BIQUAD_FILTER_CASE(14, intrinsics::internal_array_t<float, 14>)
            BIQUAD_FILTER_CASE(13, intrinsics::internal_array_t<float, 13>)
            BIQUAD_FILTER_CASE(12, intrinsics::internal_array_t<float, 12>)
            BIQUAD_FILTER_CASE(11, intrinsics::internal_array_t<float, 11>)
            BIQUAD_FILTER_CASE(10, intrinsics::internal_array_t<float, 10>)
            BIQUAD_FILTER_CASE(9, intrinsics::internal_array_t<float, 9>)
            // We choose the NEON intrinsic type over internal_array for 8 to
            // check if there is any performance difference in benchmark (should be similar).
            // BIQUAD_FILTER_CASE(8, intrinsics::internal_array_t<float, 8>)
            BIQUAD_FILTER_CASE(8, float32x4x2_t)
            BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<float, 7>)
            BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<float, 6>)
            BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<float, 5>)
            BIQUAD_FILTER_CASE(4, float32x4_t)
            // We choose the NEON intrinsic type over internal_array for 4 to
            // check if there is any performance difference in benchmark (should be similar).
            // BIQUAD_FILTER_CASE(4, intrinsics::internal_array_t<float, 4>)
            BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<float, 3>)
            BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<float, 2>)
            }
        } else if constexpr (std::is_same_v<D, double>) {
#if defined(__aarch64__)
            switch (remaining) {
            default:
                if (remaining >= 8) {
                    remaining &= ~7;
                    biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL,
                            intrinsics::internal_array_t<double, 8>>(
                            out + offset, in + offset, frames, stride, remaining,
                            delays + offset, c, localStride);
                    offset += remaining;
                    continue;
                }
                break; // case 1 handled at bottom.
            BIQUAD_FILTER_CASE(7, intrinsics::internal_array_t<double, 7>)
            BIQUAD_FILTER_CASE(6, intrinsics::internal_array_t<double, 6>)
            BIQUAD_FILTER_CASE(5, intrinsics::internal_array_t<double, 5>)
            BIQUAD_FILTER_CASE(4, intrinsics::internal_array_t<double, 4>)
            BIQUAD_FILTER_CASE(3, intrinsics::internal_array_t<double, 3>)
            BIQUAD_FILTER_CASE(2, intrinsics::internal_array_t<double, 2>)
            }
#endif
        }
        // Essentially the code below is scalar, the same as
        // biquad_filter_1fast<OCCUPANCY, SAME_COEF_PER_CHANNEL>,
        // but formulated with NEON intrinsic-like call pattern.
        biquad_filter_neon_impl<OCCUPANCY, SAME_COEF_PER_CHANNEL, D>(
                out + offset, in + offset, frames, stride, remaining,
                delays + offset, c, localStride);
        offset += remaining;
    }
    exit:;
}

#endif // USE_NEON

} // namespace details

/**
 * BiquadFilter
 *
 * A multichannel Biquad filter implementation of the following transfer function.
 *
 * \f[
 *  H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
 *               { 1 + a_1 z^{-1} + a_2 z^{-2} }
 * \f]
 *
 * <!--
 *        b_0 + b_1 z^{-1} + b_2 z^{-2}
 *  H(z)= -----------------------------
 *          1 + a_1 z^{-1} + a_2 z^{-2}
 * -->
 *
 * Details:
 *   1. The transposed Direct Form 2 implementation allows zeros to be computed
 *      before poles in the internal state for improved filter precision and
 *      better time-varying coefficient performance.
 *   2. We optimize for zero coefficients using a compile-time generated function table.
 *   3. We optimize for vector operations using column vector operations with stride
 *      into interleaved audio data.
 *   4. The denominator coefficients a_1 and a_2 are stored in positive form, despite the
 *      negated form being slightly simpler for optimization (addition is commutative but
 *      subtraction is not commutative). This is to permit obtaining the coefficients
 *      as a const reference.
 *
 *      Compatibility issue: Some Biquad libraries store the denominator coefficients
 *      in negated form. We explicitly negate before entering into the inner loop.
 *   5. The full 6 coefficient Biquad filter form with a_0 != 1 may be used for setting
 *      coefficients. See setCoefficients() below.
 *
 * If SAME_COEF_PER_CHANNEL is false, then mCoefs is stored interleaved by channel.
 *
 * The Biquad filter update equation in transposed Direct Form 2 is as follows:
 *
 * \f{eqnarray*}{
 * y[n] &=& b0 * x[n] + s1[n - 1] \\
 * s1[n] &=& s2[n - 1] + b1 * x[n] - a1 * y[n] \\
 * s2[n] &=& b2 * x[n] - a2 * y[n]
 * \f}
 *
 * For the transposed Direct Form 2 update equation, s1 and s2 represent the delay state
 * contained in the internal vector mDelays[]. This is stored interleaved by channel.
 *
 * Use `-ffast-math` to permit associative math optimizations, as we do not rely on
 * strict C operator precedence and associativity here.
 * TODO(b/159373530): Use compound statement scoped pragmas instead of `-ffast-math`.
 *
 * \param D type variable representing the data type, one of float or double.
 *        The default is float.
 * \param SAME_COEF_PER_CHANNEL bool which is true if all the Biquad coefficients
 *        are shared between channels, or false if the Biquad coefficients
 *        may differ between channels. The default is true.
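 *
 * Example usage (a minimal sketch; the coefficient values shown are hypothetical
 * placeholders, not taken from this library):
 *
 * \code
 *   // Stereo filter sharing one set of normalized coefficients {b0, b1, b2, a1, a2}.
 *   constexpr size_t channelCount = 2;
 *   std::array<float, android::audio_utils::kBiquadNumCoefs> coefs{
 *           0.2f, 0.4f, 0.2f, -0.3f, 0.1f};
 *   android::audio_utils::BiquadFilter<float> filter(channelCount, coefs);
 *
 *   // in and out are interleaved stereo buffers holding `frames` audio frames.
 *   filter.process(out, in, frames);
 * \endcode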
 */
template <typename D = float, bool SAME_COEF_PER_CHANNEL = true>
class BiquadFilter {
public:
    template <typename T = std::array<D, kBiquadNumCoefs>>
    explicit BiquadFilter(size_t channelCount,
            const T& coefs = {}, bool optimized = true)
            : mChannelCount(channelCount)
            , mCoefs(kBiquadNumCoefs * (SAME_COEF_PER_CHANNEL ? 1 : mChannelCount))
            , mDelays(channelCount * kBiquadNumDelays) {
        setCoefficients(coefs, optimized);
    }

    // copy constructors
    BiquadFilter(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) {
        *this = other;
    }

    BiquadFilter(BiquadFilter<D, SAME_COEF_PER_CHANNEL>&& other) {
        *this = std::move(other);
    }

    // copy assignment
    BiquadFilter<D, SAME_COEF_PER_CHANNEL>& operator=(
            const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) {
        mChannelCount = other.mChannelCount;
        mCoefs = other.mCoefs;
        mDelays = other.mDelays;
        return *this;
    }

    BiquadFilter<D, SAME_COEF_PER_CHANNEL>& operator=(
            BiquadFilter<D, SAME_COEF_PER_CHANNEL>&& other) {
        mChannelCount = other.mChannelCount;
        mCoefs = std::move(other.mCoefs);
        mDelays = std::move(other.mDelays);
        return *this;
    }

    // operator overloads for equality tests
    bool operator==(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) const {
        return mChannelCount == other.mChannelCount
                && mCoefs == other.mCoefs
                && mDelays == other.mDelays;
    }

    bool operator!=(const BiquadFilter<D, SAME_COEF_PER_CHANNEL>& other) const {
        return !operator==(other);
    }

    /**
     * \brief Sets filter coefficients
     *
     * \param coefs a container of the filter coefficients.
     * \param optimized whether to use the processor optimized function (optional, defaults true).
     * \return true if the resulting Biquad filter is stable, otherwise false.
     *
     * The input coefficients are interpreted in the following manner:
     *
     * If size of container is 5 (normalized Biquad):
     * coefs[0] is b0,
     * coefs[1] is b1,
     * coefs[2] is b2,
     * coefs[3] is a1,
     * coefs[4] is a2.
     *
     * \f[
     *  H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
     *               { 1 + a_1 z^{-1} + a_2 z^{-2} }
     * \f]
     * <!--
     *        b_0 + b_1 z^{-1} + b_2 z^{-2}
     *  H(z)= -----------------------------
     *          1 + a_1 z^{-1} + a_2 z^{-2}
     * -->
     *
     * If size of container is 6 (general Biquad):
     * coefs[0] is b0,
     * coefs[1] is b1,
     * coefs[2] is b2,
     * coefs[3] is a0,
     * coefs[4] is a1,
     * coefs[5] is a2.
     *
     * \f[
     *  H(z) = \frac { b_0 + b_1 z^{-1} + b_2 z^{-2} }
     *               { a_0 + a_1 z^{-1} + a_2 z^{-2} }
     * \f]
     * <!--
     *        b_0 + b_1 z^{-1} + b_2 z^{-2}
     *  H(z)= -----------------------------
     *        a_0 + a_1 z^{-1} + a_2 z^{-2}
     * -->
     *
     * The internal representation is a normalized Biquad.
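     *
     * For example (illustrative numbers only), the general 6-coefficient container
     * {0.5, 1.0, 0.5, 4.0, 0.4, 0.2} is normalized by a0 = 4 and stored internally
     * as the 5-coefficient set {0.125, 0.25, 0.125, 0.1, 0.05}.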
     */
    template <typename T = std::array<D, kBiquadNumCoefs>>
    bool setCoefficients(const T& coefs, bool optimized = true) {
        if constexpr (SAME_COEF_PER_CHANNEL) {
            details::setCoefficients<D, T>(
                    mCoefs, 0 /* offset */, 1 /* stride */, 1 /* channelCount */, coefs);
        } else {
            if (coefs.size() == mCoefs.size()) {
                std::copy(coefs.begin(), coefs.end(), mCoefs.begin());
            } else {
                details::setCoefficients<D, T>(
                        mCoefs, 0 /* offset */, mChannelCount, mChannelCount, coefs);
            }
        }
        setOptimization(optimized);
        return isStable();
    }

    /**
     * Sets coefficients for one of the filter channels, specified by channelIndex.
     *
     * This method is only available if SAME_COEF_PER_CHANNEL is false.
     *
     * \param coefs the coefficients to set.
     * \param channelIndex the particular channel index to set.
     * \param optimized whether to use optimized function (optional, defaults true).
     * \return true if the Biquad filter for this channel is stable, otherwise false.
     */
    template <typename T = std::array<D, kBiquadNumCoefs>>
    bool setCoefficients(const T& coefs, size_t channelIndex, bool optimized = true) {
        static_assert(!SAME_COEF_PER_CHANNEL);

        details::setCoefficients<D, T>(
                mCoefs, channelIndex, mChannelCount, 1 /* channelCount */, coefs);
        setOptimization(optimized);
        return isStable(channelIndex);
    }

    /**
     * Returns the coefficients as a const vector reference.
     *
     * If multichannel and the template variable SAME_COEF_PER_CHANNEL is false,
     * the coefficients are stored interleaved by channel.
     */
    const std::vector<D>& getCoefficients() const {
        return mCoefs;
    }

    /**
     * Returns true if the filter is stable.
     *
     * \param channelIndex ignored if SAME_COEF_PER_CHANNEL is true,
     *        asserts if channelIndex >= channel count (zero based index).
     */
    bool isStable(size_t channelIndex = 0) const {
        if constexpr (SAME_COEF_PER_CHANNEL) {
            return details::isStable(mCoefs[3], mCoefs[4]);
        } else {
            assert(channelIndex < mChannelCount);
            return details::isStable(
                    mCoefs[3 * mChannelCount + channelIndex],
                    mCoefs[4 * mChannelCount + channelIndex]);
        }
    }

    /**
     * Updates the filter function based on processor optimization.
     *
     * \param optimized if true, enables processor based optimization.
     */
    void setOptimization(bool optimized) {
        // Determine which coefficients are nonzero as a bit field.
        size_t category = 0;
        for (size_t i = 0; i < kBiquadNumCoefs; ++i) {
            if constexpr (SAME_COEF_PER_CHANNEL) {
                category |= (mCoefs[i] != 0) << i;
            } else {
                for (size_t j = 0; j < mChannelCount; ++j) {
                    if (mCoefs[i * mChannelCount + j] != 0) {
                        category |= 1 << i;
                        break;
                    }
                }
            }
        }

        // Select the proper filtering function from our array.
        (void)optimized; // avoid unused variable warning.
        mFunc = mFilterFast[category]; // default if we don't have processor optimization.

#ifdef USE_NEON
        /* if constexpr (std::is_same_v<D, float>) */ {
            if (optimized) {
                mFunc = mFilterNeon[category];
            }
        }
#endif
    }

    /**
     * \brief Filters the input data
     *
     * \param out pointer to the output data
     * \param in pointer to the input data
     * \param frames number of audio frames to be processed
     */
    void process(D* out, const D *in, size_t frames) {
        process(out, in, frames, mChannelCount);
    }

    /**
     * \brief Filters the input data with stride
     *
     * \param out pointer to the output data
     * \param in pointer to the input data
     * \param frames number of audio frames to be processed
     * \param stride the total number of samples associated with a frame, if not channelCount.
     */
    void process(D* out, const D *in, size_t frames, size_t stride) {
        assert(stride >= mChannelCount);
        mFunc(out, in, frames, stride, mChannelCount, mDelays.data(),
                mCoefs.data(), mChannelCount);
    }

    /**
     * EXPERIMENTAL:
     * Processes 1D input data, with mChannelCount Biquads, using sliding window parallelism.
     *
     * Instead of considering mChannelCount Biquads as one-per-input channel, this method
     * treats the mChannelCount Biquads as applied in sequence to a single 1D input stream,
     * with the Biquad for the highest channel index applied first.
     *
     * input audio data -> BQ_{n-1} -> BQ_{n-2} -> ... -> BQ_{1} -> BQ_{0} -> output
     *
     * TODO: Make this code efficient for NEON and split the destination from the source.
     *
     * Theoretically this code should be much faster for 1D input if one has 4+ Biquads to be
     * sequentially applied, but in practice it is *MUCH* slower.
     * On NEON, the data cannot be written then read in-place without incurring
     * memory stall penalties. A shifting NEON holding register is required to make this
     * a practical improvement.
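     *
     * A minimal usage sketch (the per-stage coefficient values below are hypothetical
     * placeholders; any set of stable Biquad stages could be cascaded this way):
     *
     * \code
     *   constexpr size_t stages = 4;  // one Biquad per cascade stage.
     *   // SAME_COEF_PER_CHANNEL == false so each stage may hold different coefficients.
     *   android::audio_utils::BiquadFilter<float, false> cascade(stages);
     *   for (size_t i = 0; i < stages; ++i) {
     *       cascade.setCoefficients(std::array<float, 5>{0.2f, 0.4f, 0.2f, -0.3f, 0.1f}, i);
     *   }
     *   cascade.process1D(buffer, frames);  // buffer holds `frames` mono samples.
     * \endcode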
     */
    void process1D(D* inout, size_t frames) {
        size_t remaining = mChannelCount;
#ifdef USE_NEON
        // We apply NEON acceleration striped with 4 filters (channels) at once.
        // Filter operations commute; nevertheless we apply the filters in order.
        if (frames >= 2 * mChannelCount) {
            constexpr size_t channelBlock = 4;
            for (; remaining >= channelBlock; remaining -= channelBlock) {
                const size_t baseIdx = remaining - channelBlock;
                // This is the 1D accelerated method.
                // prime the data pipe.
                for (size_t i = 0; i < channelBlock - 1; ++i) {
                    size_t fromEnd = remaining - i - 1;
                    auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
                    auto delays = mDelays.data() + fromEnd;
                    mFunc(inout, inout, 1 /* frames */, 1 /* stride */, i + 1,
                            delays, coefs, mChannelCount);
                }

                auto delays = mDelays.data() + baseIdx;
                auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : baseIdx);
                // Parallel processing - we use a sliding window doing channelBlock at once,
                // sliding one audio sample at a time.
                mFunc(inout, inout,
                        frames - channelBlock + 1, 1 /* stride */, channelBlock,
                        delays, coefs, mChannelCount);

                // drain data pipe.
                for (size_t i = 1; i < channelBlock; ++i) {
                    mFunc(inout + frames - channelBlock + i, inout + frames - channelBlock + i,
                            1 /* frames */, 1 /* stride */, channelBlock - i,
                            delays, coefs, mChannelCount);
                }
            }
        }
#endif
        // For short data sequences, we use the serial single-channel logical equivalent.
        for (; remaining > 0; --remaining) {
            size_t fromEnd = remaining - 1;
            auto coefs = mCoefs.data() + (SAME_COEF_PER_CHANNEL ? 0 : fromEnd);
            mFunc(inout, inout,
                    frames, 1 /* stride */, 1 /* channelCount */,
                    mDelays.data() + fromEnd, coefs, mChannelCount);
        }
    }

    /**
     * \brief Clears the delay elements
     *
     * This function clears the delay elements representing the filter state.
     */
    void clear() {
        std::fill(mDelays.begin(), mDelays.end(), 0.f);
    }

    /**
     * \brief Sets the internal delays from a vector
     *
     * For a multichannel stream, the delays are interleaved by channel:
     * delays[2 * i + 0] is s1 of i-th channel,
     * delays[2 * i + 1] is s2 of i-th channel,
     * where index i runs from 0 to (mChannelCount - 1).
     *
     * \param delays reference to vector containing delays; its contents are moved out,
     *        leaving the vector in a valid but unspecified state.
     */
    void setDelays(std::vector<D>& delays) {
        assert(delays.size() == mDelays.size());
        mDelays = std::move(delays);
    }

    /**
     * \brief Gets delay elements as a vector
     *
     * For a multichannel stream, the delays are interleaved by channel:
     * delays[2 * i + 0] is s1 of i-th channel,
     * delays[2 * i + 1] is s2 of i-th channel,
     * where index i runs from 0 to (mChannelCount - 1).
     *
     * \return a const vector reference of delays.
     */
    const std::vector<D>& getDelays() const {
        return mDelays;
    }

private:
    /* const */ size_t mChannelCount; // not const because we assign to it in operator=().

    /**
     * \var D mCoefs
     * \brief Stores the filter coefficients
     *
     * If SAME_COEF_PER_CHANNEL is false, the filter coefficients are stored
     * interleaved by channel.
     */
    std::vector<D> mCoefs;

    /**
     * \var D mDelays
     * \brief The delay state.
     *
     * The delays are stored channel interleaved in the following manner,
     * mDelays[2 * i + 0] is s1 of i-th channel
     * mDelays[2 * i + 1] is s2 of i-th channel
     * index i runs from 0 to (mChannelCount - 1).
     */
    std::vector<D> mDelays;

    using filter_func = decltype(details::biquad_filter_fast<0, true, D>);

    /**
     * \var filter_func* mFunc
     *
     * The current filter function selected for the channel occupancy of the Biquad.
     */
    filter_func *mFunc;

    // Create a functional wrapper to feed "biquad_filter_fast" to
    // make_functional_array() to populate the array.
    //
    // OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
    // b0 b1 b2 a1 a2 (from lsb to msb)
    template <size_t OCCUPANCY, bool SC> // note SC == SAME_COEF_PER_CHANNEL
    struct FuncWrap {
        template<typename T>
        static constexpr size_t nearest() {
            // Combine cases to both improve expected performance and reduce code space.
            // Some occupancy masks provide worse performance than more occupied masks.
            constexpr size_t required_occupancies[] = {
                1,  // constant scale
                3,  // single zero
                7,  // double zero
                9,  // single pole
                // 11, // first order IIR (unnecessary optimization, close enough to 31).
                27, // double pole + single zero
                31, // second order IIR (full Biquad)
            };
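            // nearest() returns the smallest listed mask that covers all bits of OCCUPANCY.
            // For example (illustrative), OCCUPANCY == 5 (b0 and b2 nonzero) is not listed,
            // so it is handled by the double-zero specialization 7.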
            if constexpr (OCCUPANCY < 32) {
                for (auto test : required_occupancies) {
                    if ((OCCUPANCY & test) == OCCUPANCY) return test;
                }
            } else {
                static_assert(intrinsics::dependent_false_v<T>);
            }
            return 0; // never gets here.
        }

        static void func(D* out, const D *in, size_t frames, size_t stride,
                size_t channelCount, D *delays, const D *coef, size_t localStride) {
            constexpr size_t NEAREST_OCCUPANCY = nearest<D>();
            details::biquad_filter_fast<NEAREST_OCCUPANCY, SC>(
                    out, in, frames, stride, channelCount, delays, coef, localStride);
        }
    };

    /**
     * \var mFilterFast
     *
     * std::array of functions based on coefficient occupancy.
     *
     * static inline constexpr std::array<filter_func*, M> mArray = {
     *     biquad_filter_fast<0>,
     *     biquad_filter_fast<1>,
     *     biquad_filter_fast<2>,
     *      ...
     *     biquad_filter_fast<(1 << kBiquadNumCoefs) - 1>,
     * };
     *
     * Every time the coefficients are changed, we select the processing function from
     * this table.
     */
    static inline constexpr auto mFilterFast =
            details::make_functional_array<
                    FuncWrap, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();

#ifdef USE_NEON
    // OCCUPANCY is a bitmask corresponding to the presence of nonzero Biquad coefficients
    // b0 b1 b2 a1 a2 (from lsb to msb)

    template <size_t OCCUPANCY, bool SC> // note SC == SAME_COEF_PER_CHANNEL
    struct FuncWrapNeon {
        template<typename T>
        static constexpr size_t nearest() {
            // combine cases to both improve expected performance and reduce code space.
            //
            // This lists the occupancies we will specialize functions for.
            constexpr size_t required_occupancies[] = {
                1,  // constant scale
                3,  // single zero
                7,  // double zero
                9,  // single pole
                11, // first order IIR
                27, // double pole + single zero
                31, // second order IIR (full Biquad)
            };
            if constexpr (OCCUPANCY < 32) {
                for (auto test : required_occupancies) {
                    if ((OCCUPANCY & test) == OCCUPANCY) return test;
                }
            } else {
                static_assert(intrinsics::dependent_false_v<T>);
            }
            return 0; // never gets here.
        }

        static void func(D* out, const D *in, size_t frames, size_t stride,
                size_t channelCount, D *delays, const D *coef, size_t localStride) {
            constexpr size_t NEAREST_OCCUPANCY = nearest<D>();
            details::biquad_filter_neon<NEAREST_OCCUPANCY, SC>(
                    out, in, frames, stride, channelCount, delays, coef, localStride);
        }
    };

    // Neon optimized array of functions.
    static inline constexpr auto mFilterNeon =
            details::make_functional_array<
                    FuncWrapNeon, 1 << kBiquadNumCoefs, SAME_COEF_PER_CHANNEL>();
#endif // USE_NEON
};

} // namespace android::audio_utils

#pragma pop_macro("USE_NEON")

#endif // !ANDROID_AUDIO_UTILS_BIQUAD_FILTER_H