1 /* Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6 /* Copyright (C) 2011 Google Inc. All rights reserved.
7 * Use of this source code is governed by a BSD-style license that can be
8 * found in the LICENSE.WEBKIT file.
9 */
10
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14
15 #include "drc_math.h"
16 #include "drc_kernel.h"
17
18 #define MAX_PRE_DELAY_FRAMES 1024
19 #define MAX_PRE_DELAY_FRAMES_MASK (MAX_PRE_DELAY_FRAMES - 1)
20 #define DEFAULT_PRE_DELAY_FRAMES 256
21 #define DIVISION_FRAMES 32
22 #define DIVISION_FRAMES_MASK (DIVISION_FRAMES - 1)
23
24 #define assert_on_compile(e) ((void)sizeof(char[1 - 2 * !(e)]))
25 #define assert_on_compile_is_power_of_2(n) \
26 assert_on_compile((n) != 0 && (((n) & ((n)-1)) == 0))
27
28 const float uninitialized_value = -1;
29 static int drc_math_initialized;
30
dk_init(struct drc_kernel * dk,float sample_rate)31 void dk_init(struct drc_kernel *dk, float sample_rate)
32 {
33 int i;
34
35 if (!drc_math_initialized) {
36 drc_math_initialized = 1;
37 drc_math_init();
38 }
39
40 dk->sample_rate = sample_rate;
41 dk->detector_average = 0;
42 dk->compressor_gain = 1;
43 dk->enabled = 0;
44 dk->processed = 0;
45 dk->last_pre_delay_frames = DEFAULT_PRE_DELAY_FRAMES;
46 dk->pre_delay_read_index = 0;
47 dk->pre_delay_write_index = DEFAULT_PRE_DELAY_FRAMES;
48 dk->max_attack_compression_diff_db = -INFINITY;
49 dk->ratio = uninitialized_value;
50 dk->slope = uninitialized_value;
51 dk->linear_threshold = uninitialized_value;
52 dk->db_threshold = uninitialized_value;
53 dk->db_knee = uninitialized_value;
54 dk->knee_threshold = uninitialized_value;
55 dk->ratio_base = uninitialized_value;
56 dk->K = uninitialized_value;
57
58 assert_on_compile_is_power_of_2(DIVISION_FRAMES);
59 assert_on_compile(DIVISION_FRAMES % 4 == 0);
60 /* Allocate predelay buffers */
61 assert_on_compile_is_power_of_2(MAX_PRE_DELAY_FRAMES);
62 for (i = 0; i < DRC_NUM_CHANNELS; i++) {
63 size_t size = sizeof(float) * MAX_PRE_DELAY_FRAMES;
64 dk->pre_delay_buffers[i] = (float *)calloc(1, size);
65 }
66 }
67
dk_free(struct drc_kernel * dk)68 void dk_free(struct drc_kernel *dk)
69 {
70 int i;
71 for (i = 0; i < DRC_NUM_CHANNELS; ++i)
72 free(dk->pre_delay_buffers[i]);
73 }
74
75 /* Sets the pre-delay (lookahead) buffer size */
set_pre_delay_time(struct drc_kernel * dk,float pre_delay_time)76 static void set_pre_delay_time(struct drc_kernel *dk, float pre_delay_time)
77 {
78 int i;
79 /* Re-configure look-ahead section pre-delay if delay time has
80 * changed. */
81 unsigned pre_delay_frames = pre_delay_time * dk->sample_rate;
82 pre_delay_frames = min(pre_delay_frames, MAX_PRE_DELAY_FRAMES - 1);
83
84 /* Make pre_delay_frames multiplies of DIVISION_FRAMES. This way we
85 * won't split a division of samples into two blocks of memory, so it is
86 * easier to process. This may make the actual delay time slightly less
87 * than the specified value, but the difference is less than 1ms. */
88 pre_delay_frames &= ~DIVISION_FRAMES_MASK;
89
90 /* We need at least one division buffer, so the incoming data won't
91 * overwrite the output data */
92 pre_delay_frames = max(pre_delay_frames, DIVISION_FRAMES);
93
94 if (dk->last_pre_delay_frames != pre_delay_frames) {
95 dk->last_pre_delay_frames = pre_delay_frames;
96 for (i = 0; i < DRC_NUM_CHANNELS; ++i) {
97 size_t size = sizeof(float) * MAX_PRE_DELAY_FRAMES;
98 memset(dk->pre_delay_buffers[i], 0, size);
99 }
100
101 dk->pre_delay_read_index = 0;
102 dk->pre_delay_write_index = pre_delay_frames;
103 }
104 }
105
106 /* Exponential curve for the knee. It is 1st derivative matched at
107 * dk->linear_threshold and asymptotically approaches the value
108 * dk->linear_threshold + 1 / k.
109 *
110 * This is used only when calculating the static curve, not used when actually
111 * compress the input data (knee_curveK below is used instead).
112 */
knee_curve(struct drc_kernel * dk,float x,float k)113 static float knee_curve(struct drc_kernel *dk, float x, float k)
114 {
115 /* Linear up to threshold. */
116 if (x < dk->linear_threshold)
117 return x;
118
119 return dk->linear_threshold +
120 (1 - knee_expf(-k * (x - dk->linear_threshold))) / k;
121 }
122
123 /* Approximate 1st derivative with input and output expressed in dB. This slope
124 * is equal to the inverse of the compression "ratio". In other words, a
125 * compression ratio of 20 would be a slope of 1/20.
126 */
slope_at(struct drc_kernel * dk,float x,float k)127 static float slope_at(struct drc_kernel *dk, float x, float k)
128 {
129 if (x < dk->linear_threshold)
130 return 1;
131
132 float x2 = x * 1.001;
133
134 float x_db = linear_to_decibels(x);
135 float x2Db = linear_to_decibels(x2);
136
137 float y_db = linear_to_decibels(knee_curve(dk, x, k));
138 float y2Db = linear_to_decibels(knee_curve(dk, x2, k));
139
140 float m = (y2Db - y_db) / (x2Db - x_db);
141
142 return m;
143 }
144
k_at_slope(struct drc_kernel * dk,float desired_slope)145 static float k_at_slope(struct drc_kernel *dk, float desired_slope)
146 {
147 float x_db = dk->db_threshold + dk->db_knee;
148 float x = decibels_to_linear(x_db);
149
150 /* Approximate k given initial values. */
151 float minK = 0.1;
152 float maxK = 10000;
153 float k = 5;
154 int i;
155
156 for (i = 0; i < 15; ++i) {
157 /* A high value for k will more quickly asymptotically approach
158 * a slope of 0. */
159 float slope = slope_at(dk, x, k);
160
161 if (slope < desired_slope) {
162 /* k is too high. */
163 maxK = k;
164 } else {
165 /* k is too low. */
166 minK = k;
167 }
168
169 /* Re-calculate based on geometric mean. */
170 k = sqrtf(minK * maxK);
171 }
172
173 return k;
174 }
175
update_static_curve_parameters(struct drc_kernel * dk,float db_threshold,float db_knee,float ratio)176 static void update_static_curve_parameters(struct drc_kernel *dk,
177 float db_threshold, float db_knee,
178 float ratio)
179 {
180 if (db_threshold != dk->db_threshold || db_knee != dk->db_knee ||
181 ratio != dk->ratio) {
182 /* Threshold and knee. */
183 dk->db_threshold = db_threshold;
184 dk->linear_threshold = decibels_to_linear(db_threshold);
185 dk->db_knee = db_knee;
186
187 /* Compute knee parameters. */
188 dk->ratio = ratio;
189 dk->slope = 1 / dk->ratio;
190
191 float k = k_at_slope(dk, 1 / dk->ratio);
192 dk->K = k;
193 /* See knee_curveK() for details */
194 dk->knee_alpha = dk->linear_threshold + 1 / k;
195 dk->knee_beta = -expf(k * dk->linear_threshold) / k;
196
197 dk->knee_threshold = decibels_to_linear(db_threshold + db_knee);
198 /* See volume_gain() for details */
199 float y0 = knee_curve(dk, dk->knee_threshold, k);
200 dk->ratio_base = y0 * powf(dk->knee_threshold, -dk->slope);
201 }
202 }
203
204 /* This is the knee part of the compression curve. Returns the output level
205 * given the input level x. */
knee_curveK(struct drc_kernel * dk,float x)206 static float knee_curveK(struct drc_kernel *dk, float x)
207 {
208 /* The formula in knee_curveK is dk->linear_threshold +
209 * (1 - expf(-k * (x - dk->linear_threshold))) / k
210 * which simplifies to (alpha + beta * expf(gamma))
211 * where alpha = dk->linear_threshold + 1 / k
212 * beta = -expf(k * dk->linear_threshold) / k
213 * gamma = -k * x
214 */
215 return dk->knee_alpha + dk->knee_beta * knee_expf(-dk->K * x);
216 }
217
218 /* Full compression curve with constant ratio after knee. Returns the ratio of
219 * output and input signal. */
volume_gain(struct drc_kernel * dk,float x)220 static float volume_gain(struct drc_kernel *dk, float x)
221 {
222 float y;
223
224 if (x < dk->knee_threshold) {
225 if (x < dk->linear_threshold)
226 return 1;
227 y = knee_curveK(dk, x) / x;
228 } else {
229 /* Constant ratio after knee.
230 * log(y/y0) = s * log(x/x0)
231 * => y = y0 * (x/x0)^s
232 * => y = [y0 * (1/x0)^s] * x^s
233 * => y = dk->ratio_base * x^s
234 * => y/x = dk->ratio_base * x^(s - 1)
235 * => y/x = dk->ratio_base * e^(log(x) * (s - 1))
236 */
237 y = dk->ratio_base * knee_expf(logf(x) * (dk->slope - 1));
238 }
239
240 return y;
241 }
242
dk_set_parameters(struct drc_kernel * dk,float db_threshold,float db_knee,float ratio,float attack_time,float release_time,float pre_delay_time,float db_post_gain,float releaseZone1,float releaseZone2,float releaseZone3,float releaseZone4)243 void dk_set_parameters(struct drc_kernel *dk, float db_threshold, float db_knee,
244 float ratio, float attack_time, float release_time,
245 float pre_delay_time, float db_post_gain,
246 float releaseZone1, float releaseZone2,
247 float releaseZone3, float releaseZone4)
248 {
249 float sample_rate = dk->sample_rate;
250
251 update_static_curve_parameters(dk, db_threshold, db_knee, ratio);
252
253 /* Makeup gain. */
254 float full_range_gain = volume_gain(dk, 1);
255 float full_range_makeup_gain = 1 / full_range_gain;
256
257 /* Empirical/perceptual tuning. */
258 full_range_makeup_gain = powf(full_range_makeup_gain, 0.6f);
259
260 dk->main_linear_gain =
261 decibels_to_linear(db_post_gain) * full_range_makeup_gain;
262
263 /* Attack parameters. */
264 attack_time = max(0.001f, attack_time);
265 dk->attack_frames = attack_time * sample_rate;
266
267 /* Release parameters. */
268 float release_frames = sample_rate * release_time;
269
270 /* Detector release time. */
271 float sat_release_time = 0.0025f;
272 float sat_release_frames = sat_release_time * sample_rate;
273 dk->sat_release_frames_inv_neg = -1 / sat_release_frames;
274 dk->sat_release_rate_at_neg_two_db =
275 decibels_to_linear(-2 * dk->sat_release_frames_inv_neg) - 1;
276
277 /* Create a smooth function which passes through four points.
278 * Polynomial of the form y = a + b*x + c*x^2 + d*x^3 + e*x^4
279 */
280 float y1 = release_frames * releaseZone1;
281 float y2 = release_frames * releaseZone2;
282 float y3 = release_frames * releaseZone3;
283 float y4 = release_frames * releaseZone4;
284
285 /* All of these coefficients were derived for 4th order polynomial curve
286 * fitting where the y values match the evenly spaced x values as
287 * follows: (y1 : x == 0, y2 : x == 1, y3 : x == 2, y4 : x == 3)
288 */
289 dk->kA = 0.9999999999999998f * y1 + 1.8432219684323923e-16f * y2 -
290 1.9373394351676423e-16f * y3 + 8.824516011816245e-18f * y4;
291 dk->kB = -1.5788320352845888f * y1 + 2.3305837032074286f * y2 -
292 0.9141194204840429f * y3 + 0.1623677525612032f * y4;
293 dk->kC = 0.5334142869106424f * y1 - 1.272736789213631f * y2 +
294 0.9258856042207512f * y3 - 0.18656310191776226f * y4;
295 dk->kD = 0.08783463138207234f * y1 - 0.1694162967925622f * y2 +
296 0.08588057951595272f * y3 - 0.00429891410546283f * y4;
297 dk->kE = -0.042416883008123074f * y1 + 0.1115693827987602f * y2 -
298 0.09764676325265872f * y3 + 0.028494263462021576f * y4;
299
300 /* x ranges from 0 -> 3 0 1 2 3
301 * -15 -10 -5 0db
302 *
303 * y calculates adaptive release frames depending on the amount of
304 * compression.
305 */
306 set_pre_delay_time(dk, pre_delay_time);
307 }
308
dk_set_enabled(struct drc_kernel * dk,int enabled)309 void dk_set_enabled(struct drc_kernel *dk, int enabled)
310 {
311 dk->enabled = enabled;
312 }
313
314 /* Updates the envelope_rate used for the next division */
dk_update_envelope(struct drc_kernel * dk)315 static void dk_update_envelope(struct drc_kernel *dk)
316 {
317 const float kA = dk->kA;
318 const float kB = dk->kB;
319 const float kC = dk->kC;
320 const float kD = dk->kD;
321 const float kE = dk->kE;
322 const float attack_frames = dk->attack_frames;
323
324 /* Calculate desired gain */
325 float desired_gain = dk->detector_average;
326
327 /* Pre-warp so we get desired_gain after sin() warp below. */
328 float scaled_desired_gain = warp_asinf(desired_gain);
329
330 /* Deal with envelopes */
331
332 /* envelope_rate is the rate we slew from current compressor level to
333 * the desired level. The exact rate depends on if we're attacking or
334 * releasing and by how much.
335 */
336 float envelope_rate;
337
338 int is_releasing = scaled_desired_gain > dk->compressor_gain;
339
340 /* compression_diff_db is the difference between current compression
341 * level and the desired level. */
342 float compression_diff_db =
343 linear_to_decibels(dk->compressor_gain / scaled_desired_gain);
344
345 if (is_releasing) {
346 /* Release mode - compression_diff_db should be negative dB */
347 dk->max_attack_compression_diff_db = -INFINITY;
348
349 /* Fix gremlins. */
350 if (isbadf(compression_diff_db))
351 compression_diff_db = -1;
352
353 /* Adaptive release - higher compression (lower
354 * compression_diff_db) releases faster. Contain within range:
355 * -12 -> 0 then scale to go from 0 -> 3
356 */
357 float x = compression_diff_db;
358 x = max(-12.0f, x);
359 x = min(0.0f, x);
360 x = 0.25f * (x + 12);
361
362 /* Compute adaptive release curve using 4th order polynomial.
363 * Normal values for the polynomial coefficients would create a
364 * monotonically increasing function.
365 */
366 float x2 = x * x;
367 float x3 = x2 * x;
368 float x4 = x2 * x2;
369 float release_frames =
370 kA + kB * x + kC * x2 + kD * x3 + kE * x4;
371
372 #define kSpacingDb 5
373 float db_per_frame = kSpacingDb / release_frames;
374 envelope_rate = decibels_to_linear(db_per_frame);
375 } else {
376 /* Attack mode - compression_diff_db should be positive dB */
377
378 /* Fix gremlins. */
379 if (isbadf(compression_diff_db))
380 compression_diff_db = 1;
381
382 /* As long as we're still in attack mode, use a rate based off
383 * the largest compression_diff_db we've encountered so far.
384 */
385 dk->max_attack_compression_diff_db =
386 max(dk->max_attack_compression_diff_db,
387 compression_diff_db);
388
389 float eff_atten_diff_db =
390 max(0.5f, dk->max_attack_compression_diff_db);
391
392 float x = 0.25f / eff_atten_diff_db;
393 envelope_rate = 1 - powf(x, 1 / attack_frames);
394 }
395
396 dk->envelope_rate = envelope_rate;
397 dk->scaled_desired_gain = scaled_desired_gain;
398 }
399
400 /* For a division of frames, take the absolute values of left channel and right
401 * channel, store the maximum of them in output. */
402 #if defined(__aarch64__)
max_abs_division(float * output,const float * data0,const float * data1)403 static inline void max_abs_division(float *output, const float *data0,
404 const float *data1)
405 {
406 int count = DIVISION_FRAMES / 4;
407
408 // clang-format off
409 __asm__ __volatile__(
410 "1: \n"
411 "ld1 {v0.4s}, [%[data0]], #16 \n"
412 "ld1 {v1.4s}, [%[data1]], #16 \n"
413 "fabs v0.4s, v0.4s \n"
414 "fabs v1.4s, v1.4s \n"
415 "fmax v0.4s, v0.4s, v1.4s \n"
416 "st1 {v0.4s}, [%[output]], #16 \n"
417 "subs %w[count], %w[count], #1 \n"
418 "b.ne 1b \n"
419 : /* output */
420 [data0]"+r"(data0),
421 [data1]"+r"(data1),
422 [output]"+r"(output),
423 [count]"+r"(count)
424 : /* input */
425 : /* clobber */
426 "v0", "v1", "memory", "cc");
427 // clang-format on
428 }
429 #elif defined(__ARM_NEON__)
max_abs_division(float * output,const float * data0,const float * data1)430 static inline void max_abs_division(float *output, const float *data0,
431 const float *data1)
432 {
433 int count = DIVISION_FRAMES / 4;
434
435 // clang-format off
436 __asm__ __volatile__(
437 "1: \n"
438 "vld1.32 {q0}, [%[data0]]! \n"
439 "vld1.32 {q1}, [%[data1]]! \n"
440 "vabs.f32 q0, q0 \n"
441 "vabs.f32 q1, q1 \n"
442 "vmax.f32 q0, q1 \n"
443 "vst1.32 {q0}, [%[output]]! \n"
444 "subs %[count], #1 \n"
445 "bne 1b \n"
446 : /* output */
447 [data0]"+r"(data0),
448 [data1]"+r"(data1),
449 [output]"+r"(output),
450 [count]"+r"(count)
451 : /* input */
452 : /* clobber */
453 "q0", "q1", "memory", "cc");
454 // clang-format on
455 }
456 #elif defined(__SSE3__)
457 #include <emmintrin.h>
max_abs_division(float * output,const float * data0,const float * data1)458 static inline void max_abs_division(float *output, const float *data0,
459 const float *data1)
460 {
461 __m128 x, y;
462 int count = DIVISION_FRAMES / 4;
463 // clang-format off
464 __asm__ __volatile__(
465 "1: \n"
466 "lddqu (%[data0]), %[x] \n"
467 "lddqu (%[data1]), %[y] \n"
468 "andps %[mask], %[x] \n"
469 "andps %[mask], %[y] \n"
470 "maxps %[y], %[x] \n"
471 "movdqu %[x], (%[output]) \n"
472 "add $16, %[data0] \n"
473 "add $16, %[data1] \n"
474 "add $16, %[output] \n"
475 "sub $1, %[count] \n"
476 "jnz 1b \n"
477 : /* output */
478 [data0]"+r"(data0),
479 [data1]"+r"(data1),
480 [output]"+r"(output),
481 [count]"+r"(count),
482 [x]"=&x"(x),
483 [y]"=&x"(y)
484 : /* input */
485 [mask]"x"(_mm_set1_epi32(0x7fffffff))
486 : /* clobber */
487 "memory", "cc");
488 // clang-format on
489 }
490 #else
max_abs_division(float * output,const float * data0,const float * data1)491 static inline void max_abs_division(float *output, const float *data0,
492 const float *data1)
493 {
494 int i;
495 for (i = 0; i < DIVISION_FRAMES; i++)
496 output[i] = fmaxf(fabsf(data0[i]), fabsf(data1[i]));
497 }
498 #endif
499
500 /* Update detector_average from the last input division. */
dk_update_detector_average(struct drc_kernel * dk)501 static void dk_update_detector_average(struct drc_kernel *dk)
502 {
503 float abs_input_array[DIVISION_FRAMES];
504 const float sat_release_frames_inv_neg = dk->sat_release_frames_inv_neg;
505 const float sat_release_rate_at_neg_two_db =
506 dk->sat_release_rate_at_neg_two_db;
507 float detector_average = dk->detector_average;
508 int div_start, i;
509
510 /* Calculate the start index of the last input division */
511 if (dk->pre_delay_write_index == 0) {
512 div_start = MAX_PRE_DELAY_FRAMES - DIVISION_FRAMES;
513 } else {
514 div_start = dk->pre_delay_write_index - DIVISION_FRAMES;
515 }
516
517 /* The max abs value across all channels for this frame */
518 max_abs_division(abs_input_array, &dk->pre_delay_buffers[0][div_start],
519 &dk->pre_delay_buffers[1][div_start]);
520
521 for (i = 0; i < DIVISION_FRAMES; i++) {
522 /* Compute compression amount from un-delayed signal */
523 float abs_input = abs_input_array[i];
524
525 /* Calculate shaped power on undelayed input. Put through
526 * shaping curve. This is linear up to the threshold, then
527 * enters a "knee" portion followed by the "ratio" portion. The
528 * transition from the threshold to the knee is smooth (1st
529 * derivative matched). The transition from the knee to the
530 * ratio portion is smooth (1st derivative matched).
531 */
532 float gain = volume_gain(dk, abs_input);
533 int is_release = (gain > detector_average);
534 if (is_release) {
535 if (gain > NEG_TWO_DB) {
536 detector_average +=
537 (gain - detector_average) *
538 sat_release_rate_at_neg_two_db;
539 } else {
540 float gain_db = linear_to_decibels(gain);
541 float db_per_frame =
542 gain_db * sat_release_frames_inv_neg;
543 float sat_release_rate =
544 decibels_to_linear(db_per_frame) - 1;
545 detector_average += (gain - detector_average) *
546 sat_release_rate;
547 }
548 } else {
549 detector_average = gain;
550 }
551
552 /* Fix gremlins. */
553 if (isbadf(detector_average))
554 detector_average = 1.0f;
555 else
556 detector_average = min(detector_average, 1.0f);
557 }
558
559 dk->detector_average = detector_average;
560 }
561
562 /* Calculate compress_gain from the envelope and apply total_gain to compress
563 * the next output division. */
564 /* TODO(fbarchard): Port to aarch64 */
565 #if defined(__ARM_NEON__)
566 #include <arm_neon.h>
dk_compress_output(struct drc_kernel * dk)567 static void dk_compress_output(struct drc_kernel *dk)
568 {
569 const float main_linear_gain = dk->main_linear_gain;
570 const float envelope_rate = dk->envelope_rate;
571 const float scaled_desired_gain = dk->scaled_desired_gain;
572 const float compressor_gain = dk->compressor_gain;
573 const int div_start = dk->pre_delay_read_index;
574 float *ptr_left = &dk->pre_delay_buffers[0][div_start];
575 float *ptr_right = &dk->pre_delay_buffers[1][div_start];
576 int count = DIVISION_FRAMES / 4;
577
578 /* See warp_sinf() for the details for the constants. */
579 const float32x4_t A7 = vdupq_n_f32(-4.3330336920917034149169921875e-3f);
580 const float32x4_t A5 = vdupq_n_f32(7.9434238374233245849609375e-2f);
581 const float32x4_t A3 = vdupq_n_f32(-0.645892798900604248046875f);
582 const float32x4_t A1 = vdupq_n_f32(1.5707910060882568359375f);
583
584 /* Exponential approach to desired gain. */
585 if (envelope_rate < 1) {
586 float c = compressor_gain - scaled_desired_gain;
587 float r = 1 - envelope_rate;
588 float32x4_t x0 = { c * r, c * r * r, c * r * r * r,
589 c * r * r * r * r };
590 float32x4_t x, x2, x4, left, right, tmp1, tmp2;
591
592 // clang-format off
593 __asm__ __volatile(
594 "b 2f \n"
595 "1: \n"
596 "vmul.f32 %q[x0], %q[r4] \n"
597 "2: \n"
598 "vld1.32 {%e[left],%f[left]}, [%[ptr_left]] \n"
599 "vld1.32 {%e[right],%f[right]}, [%[ptr_right]] \n"
600 "vadd.f32 %q[x], %q[x0], %q[base] \n"
601 /* Calculate warp_sin() for four values in x. */
602 "vmul.f32 %q[x2], %q[x], %q[x] \n"
603 "vmov.f32 %q[tmp1], %q[A5] \n"
604 "vmov.f32 %q[tmp2], %q[A1] \n"
605 "vmul.f32 %q[x4], %q[x2], %q[x2] \n"
606 "vmla.f32 %q[tmp1], %q[A7], %q[x2] \n"
607 "vmla.f32 %q[tmp2], %q[A3], %q[x2] \n"
608 "vmla.f32 %q[tmp2], %q[tmp1], %q[x4] \n"
609 "vmul.f32 %q[tmp2], %q[tmp2], %q[x] \n"
610 /* Now tmp2 contains the result of warp_sin(). */
611 "vmul.f32 %q[tmp2], %q[tmp2], %q[g] \n"
612 "vmul.f32 %q[left], %q[tmp2] \n"
613 "vmul.f32 %q[right], %q[tmp2] \n"
614 "vst1.32 {%e[left],%f[left]}, [%[ptr_left]]! \n"
615 "vst1.32 {%e[right],%f[right]}, [%[ptr_right]]! \n"
616 "subs %[count], #1 \n"
617 "bne 1b \n"
618 : /* output */
619 "=r"(count),
620 "=r"(ptr_left),
621 "=r"(ptr_right),
622 "=w"(x0),
623 [x]"=&w"(x),
624 [x2]"=&w"(x2),
625 [x4]"=&w"(x4),
626 [left]"=&w"(left),
627 [right]"=&w"(right),
628 [tmp1]"=&w"(tmp1),
629 [tmp2]"=&w"(tmp2)
630 : /* input */
631 [count]"0"(count),
632 [ptr_left]"1"(ptr_left),
633 [ptr_right]"2"(ptr_right),
634 [x0]"3"(x0),
635 [A1]"w"(A1),
636 [A3]"w"(A3),
637 [A5]"w"(A5),
638 [A7]"w"(A7),
639 [base]"w"(vdupq_n_f32(scaled_desired_gain)),
640 [r4]"w"(vdupq_n_f32(r*r*r*r)),
641 [g]"w"(vdupq_n_f32(main_linear_gain))
642 : /* clobber */
643 "memory", "cc");
644 // clang-format on
645 dk->compressor_gain = x[3];
646 } else {
647 float c = compressor_gain;
648 float r = envelope_rate;
649 float32x4_t x = { c * r, c * r * r, c * r * r * r,
650 c * r * r * r * r };
651 float32x4_t x2, x4, left, right, tmp1, tmp2;
652
653 // clang-format off
654 __asm__ __volatile(
655 "b 2f \n"
656 "1: \n"
657 "vmul.f32 %q[x], %q[r4] \n"
658 "2: \n"
659 "vld1.32 {%e[left],%f[left]}, [%[ptr_left]] \n"
660 "vld1.32 {%e[right],%f[right]}, [%[ptr_right]] \n"
661 "vmin.f32 %q[x], %q[one] \n"
662 /* Calculate warp_sin() for four values in x. */
663 "vmul.f32 %q[x2], %q[x], %q[x] \n"
664 "vmov.f32 %q[tmp1], %q[A5] \n"
665 "vmov.f32 %q[tmp2], %q[A1] \n"
666 "vmul.f32 %q[x4], %q[x2], %q[x2] \n"
667 "vmla.f32 %q[tmp1], %q[A7], %q[x2] \n"
668 "vmla.f32 %q[tmp2], %q[A3], %q[x2] \n"
669 "vmla.f32 %q[tmp2], %q[tmp1], %q[x4] \n"
670 "vmul.f32 %q[tmp2], %q[tmp2], %q[x] \n"
671 /* Now tmp2 contains the result of warp_sin(). */
672 "vmul.f32 %q[tmp2], %q[tmp2], %q[g] \n"
673 "vmul.f32 %q[left], %q[tmp2] \n"
674 "vmul.f32 %q[right], %q[tmp2] \n"
675 "vst1.32 {%e[left],%f[left]}, [%[ptr_left]]! \n"
676 "vst1.32 {%e[right],%f[right]}, [%[ptr_right]]! \n"
677 "subs %[count], #1 \n"
678 "bne 1b \n"
679 : /* output */
680 "=r"(count),
681 "=r"(ptr_left),
682 "=r"(ptr_right),
683 "=w"(x),
684 [x2]"=&w"(x2),
685 [x4]"=&w"(x4),
686 [left]"=&w"(left),
687 [right]"=&w"(right),
688 [tmp1]"=&w"(tmp1),
689 [tmp2]"=&w"(tmp2)
690 : /* input */
691 [count]"0"(count),
692 [ptr_left]"1"(ptr_left),
693 [ptr_right]"2"(ptr_right),
694 [x]"3"(x),
695 [A1]"w"(A1),
696 [A3]"w"(A3),
697 [A5]"w"(A5),
698 [A7]"w"(A7),
699 [one]"w"(vdupq_n_f32(1)),
700 [r4]"w"(vdupq_n_f32(r*r*r*r)),
701 [g]"w"(vdupq_n_f32(main_linear_gain))
702 : /* clobber */
703 "memory", "cc");
704 // clang-format on
705 dk->compressor_gain = x[3];
706 }
707 }
708 #elif defined(__SSE3__) && defined(__x86_64__)
709 #include <emmintrin.h>
dk_compress_output(struct drc_kernel * dk)710 static void dk_compress_output(struct drc_kernel *dk)
711 {
712 const float main_linear_gain = dk->main_linear_gain;
713 const float envelope_rate = dk->envelope_rate;
714 const float scaled_desired_gain = dk->scaled_desired_gain;
715 const float compressor_gain = dk->compressor_gain;
716 const int div_start = dk->pre_delay_read_index;
717 float *ptr_left = &dk->pre_delay_buffers[0][div_start];
718 float *ptr_right = &dk->pre_delay_buffers[1][div_start];
719 int count = DIVISION_FRAMES / 4;
720
721 /* See warp_sinf() for the details for the constants. */
722 const __m128 A7 = _mm_set1_ps(-4.3330336920917034149169921875e-3f);
723 const __m128 A5 = _mm_set1_ps(7.9434238374233245849609375e-2f);
724 const __m128 A3 = _mm_set1_ps(-0.645892798900604248046875f);
725 const __m128 A1 = _mm_set1_ps(1.5707910060882568359375f);
726
727 /* Exponential approach to desired gain. */
728 if (envelope_rate < 1) {
729 float c = compressor_gain - scaled_desired_gain;
730 float r = 1 - envelope_rate;
731 __m128 x0 = { c * r, c * r * r, c * r * r * r,
732 c * r * r * r * r };
733 __m128 x, x2, x4, left, right, tmp1, tmp2;
734
735 // clang-format off
736 __asm__ __volatile(
737 "jmp 2f \n"
738 "1: \n"
739 "mulps %[r4], %[x0] \n"
740 "2: \n"
741 "lddqu (%[ptr_left]), %[left] \n"
742 "lddqu (%[ptr_right]), %[right] \n"
743 "movaps %[x0], %[x] \n"
744 "addps %[base], %[x] \n"
745 /* Calculate warp_sin() for four values in x. */
746 "movaps %[x], %[x2] \n"
747 "mulps %[x], %[x2] \n"
748 "movaps %[x2], %[x4] \n"
749 "movaps %[x2], %[tmp1] \n"
750 "movaps %[x2], %[tmp2] \n"
751 "mulps %[x2], %[x4] \n"
752 "mulps %[A7], %[tmp1] \n"
753 "mulps %[A3], %[tmp2] \n"
754 "addps %[A5], %[tmp1] \n"
755 "addps %[A1], %[tmp2] \n"
756 "mulps %[x4], %[tmp1] \n"
757 "addps %[tmp1], %[tmp2] \n"
758 "mulps %[x], %[tmp2] \n"
759 /* Now tmp2 contains the result of warp_sin(). */
760 "mulps %[g], %[tmp2] \n"
761 "mulps %[tmp2], %[left] \n"
762 "mulps %[tmp2], %[right] \n"
763 "movdqu %[left], (%[ptr_left]) \n"
764 "movdqu %[right], (%[ptr_right]) \n"
765 "add $16, %[ptr_left] \n"
766 "add $16, %[ptr_right] \n"
767 "sub $1, %[count] \n"
768 "jne 1b \n"
769 : /* output */
770 "=r"(count),
771 "=r"(ptr_left),
772 "=r"(ptr_right),
773 "=x"(x0),
774 [x]"=&x"(x),
775 [x2]"=&x"(x2),
776 [x4]"=&x"(x4),
777 [left]"=&x"(left),
778 [right]"=&x"(right),
779 [tmp1]"=&x"(tmp1),
780 [tmp2]"=&x"(tmp2)
781 : /* input */
782 [count]"0"(count),
783 [ptr_left]"1"(ptr_left),
784 [ptr_right]"2"(ptr_right),
785 [x0]"3"(x0),
786 [A1]"x"(A1),
787 [A3]"x"(A3),
788 [A5]"x"(A5),
789 [A7]"x"(A7),
790 [base]"x"(_mm_set1_ps(scaled_desired_gain)),
791 [r4]"x"(_mm_set1_ps(r*r*r*r)),
792 [g]"x"(_mm_set1_ps(main_linear_gain))
793 : /* clobber */
794 "memory", "cc");
795 // clang-format on
796 dk->compressor_gain = x[3];
797 } else {
798 /* See warp_sinf() for the details for the constants. */
799 __m128 A7 = _mm_set1_ps(-4.3330336920917034149169921875e-3f);
800 __m128 A5 = _mm_set1_ps(7.9434238374233245849609375e-2f);
801 __m128 A3 = _mm_set1_ps(-0.645892798900604248046875f);
802 __m128 A1 = _mm_set1_ps(1.5707910060882568359375f);
803
804 float c = compressor_gain;
805 float r = envelope_rate;
806 __m128 x = { c * r, c * r * r, c * r * r * r,
807 c * r * r * r * r };
808 __m128 x2, x4, left, right, tmp1, tmp2;
809
810 // clang-format off
811 __asm__ __volatile(
812 "jmp 2f \n"
813 "1: \n"
814 "mulps %[r4], %[x] \n"
815 "2: \n"
816 "lddqu (%[ptr_left]), %[left] \n"
817 "lddqu (%[ptr_right]), %[right] \n"
818 "minps %[one], %[x] \n"
819 /* Calculate warp_sin() for four values in x. */
820 "movaps %[x], %[x2] \n"
821 "mulps %[x], %[x2] \n"
822 "movaps %[x2], %[x4] \n"
823 "movaps %[x2], %[tmp1] \n"
824 "movaps %[x2], %[tmp2] \n"
825 "mulps %[x2], %[x4] \n"
826 "mulps %[A7], %[tmp1] \n"
827 "mulps %[A3], %[tmp2] \n"
828 "addps %[A5], %[tmp1] \n"
829 "addps %[A1], %[tmp2] \n"
830 "mulps %[x4], %[tmp1] \n"
831 "addps %[tmp1], %[tmp2] \n"
832 "mulps %[x], %[tmp2] \n"
833 /* Now tmp2 contains the result of warp_sin(). */
834 "mulps %[g], %[tmp2] \n"
835 "mulps %[tmp2], %[left] \n"
836 "mulps %[tmp2], %[right] \n"
837 "movdqu %[left], (%[ptr_left]) \n"
838 "movdqu %[right], (%[ptr_right]) \n"
839 "add $16, %[ptr_left] \n"
840 "add $16, %[ptr_right] \n"
841 "sub $1, %[count] \n"
842 "jne 1b \n"
843 : /* output */
844 "=r"(count),
845 "=r"(ptr_left),
846 "=r"(ptr_right),
847 "=x"(x),
848 [x2]"=&x"(x2),
849 [x4]"=&x"(x4),
850 [left]"=&x"(left),
851 [right]"=&x"(right),
852 [tmp1]"=&x"(tmp1),
853 [tmp2]"=&x"(tmp2)
854 : /* input */
855 [count]"0"(count),
856 [ptr_left]"1"(ptr_left),
857 [ptr_right]"2"(ptr_right),
858 [x]"3"(x),
859 [A1]"x"(A1),
860 [A3]"x"(A3),
861 [A5]"x"(A5),
862 [A7]"x"(A7),
863 [one]"x"(_mm_set1_ps(1)),
864 [r4]"x"(_mm_set1_ps(r*r*r*r)),
865 [g]"x"(_mm_set1_ps(main_linear_gain))
866 : /* clobber */
867 "memory", "cc");
868 // clang-format on
869 dk->compressor_gain = x[3];
870 }
871 }
872 #else
dk_compress_output(struct drc_kernel * dk)873 static void dk_compress_output(struct drc_kernel *dk)
874 {
875 const float main_linear_gain = dk->main_linear_gain;
876 const float envelope_rate = dk->envelope_rate;
877 const float scaled_desired_gain = dk->scaled_desired_gain;
878 const float compressor_gain = dk->compressor_gain;
879 const int div_start = dk->pre_delay_read_index;
880 float *ptr_left = &dk->pre_delay_buffers[0][div_start];
881 float *ptr_right = &dk->pre_delay_buffers[1][div_start];
882 int count = DIVISION_FRAMES / 4;
883
884 int i, j;
885
886 /* Exponential approach to desired gain. */
887 if (envelope_rate < 1) {
888 /* Attack - reduce gain to desired. */
889 float c = compressor_gain - scaled_desired_gain;
890 float base = scaled_desired_gain;
891 float r = 1 - envelope_rate;
892 float x[4] = { c * r, c * r * r, c * r * r * r,
893 c * r * r * r * r };
894 float r4 = r * r * r * r;
895
896 i = 0;
897 while (1) {
898 for (j = 0; j < 4; j++) {
899 /* Warp pre-compression gain to smooth out sharp
900 * exponential transition points.
901 */
902 float post_warp_compressor_gain =
903 warp_sinf(x[j] + base);
904
905 /* Calculate total gain using main gain. */
906 float total_gain = main_linear_gain *
907 post_warp_compressor_gain;
908
909 /* Apply final gain. */
910 *ptr_left++ *= total_gain;
911 *ptr_right++ *= total_gain;
912 }
913
914 if (++i == count)
915 break;
916
917 for (j = 0; j < 4; j++)
918 x[j] = x[j] * r4;
919 }
920
921 dk->compressor_gain = x[3] + base;
922 } else {
923 /* Release - exponentially increase gain to 1.0 */
924 float c = compressor_gain;
925 float r = envelope_rate;
926 float x[4] = { c * r, c * r * r, c * r * r * r,
927 c * r * r * r * r };
928 float r4 = r * r * r * r;
929
930 i = 0;
931 while (1) {
932 for (j = 0; j < 4; j++) {
933 /* Warp pre-compression gain to smooth out sharp
934 * exponential transition points.
935 */
936 float post_warp_compressor_gain =
937 warp_sinf(x[j]);
938
939 /* Calculate total gain using main gain. */
940 float total_gain = main_linear_gain *
941 post_warp_compressor_gain;
942
943 /* Apply final gain. */
944 *ptr_left++ *= total_gain;
945 *ptr_right++ *= total_gain;
946 }
947
948 if (++i == count)
949 break;
950
951 for (j = 0; j < 4; j++)
952 x[j] = min(1.0f, x[j] * r4);
953 }
954
955 dk->compressor_gain = x[3];
956 }
957 }
958 #endif
959
960 /* After one complete divison of samples have been received (and one divison of
961 * samples have been output), we calculate shaped power average
962 * (detector_average) from the input division, update envelope parameters from
963 * detector_average, then prepare the next output division by applying the
964 * envelope to compress the samples.
965 */
dk_process_one_division(struct drc_kernel * dk)966 static void dk_process_one_division(struct drc_kernel *dk)
967 {
968 dk_update_detector_average(dk);
969 dk_update_envelope(dk);
970 dk_compress_output(dk);
971 }
972
973 /* Copy the input data to the pre-delay buffer, and copy the output data back to
974 * the input buffer */
dk_copy_fragment(struct drc_kernel * dk,float * data_channels[],unsigned frame_index,int frames_to_process)975 static void dk_copy_fragment(struct drc_kernel *dk, float *data_channels[],
976 unsigned frame_index, int frames_to_process)
977 {
978 int write_index = dk->pre_delay_write_index;
979 int read_index = dk->pre_delay_read_index;
980 int j;
981
982 for (j = 0; j < DRC_NUM_CHANNELS; ++j) {
983 memcpy(&dk->pre_delay_buffers[j][write_index],
984 &data_channels[j][frame_index],
985 frames_to_process * sizeof(float));
986 memcpy(&data_channels[j][frame_index],
987 &dk->pre_delay_buffers[j][read_index],
988 frames_to_process * sizeof(float));
989 }
990
991 dk->pre_delay_write_index =
992 (write_index + frames_to_process) & MAX_PRE_DELAY_FRAMES_MASK;
993 dk->pre_delay_read_index =
994 (read_index + frames_to_process) & MAX_PRE_DELAY_FRAMES_MASK;
995 }
996
997 /* Delay the input sample only and don't do other processing. This is used when
998 * the kernel is disabled. We want to do this to match the processing delay in
999 * kernels of other bands.
1000 */
dk_process_delay_only(struct drc_kernel * dk,float * data_channels[],unsigned count)1001 static void dk_process_delay_only(struct drc_kernel *dk, float *data_channels[],
1002 unsigned count)
1003 {
1004 int read_index = dk->pre_delay_read_index;
1005 int write_index = dk->pre_delay_write_index;
1006 int i = 0;
1007
1008 while (i < count) {
1009 int j;
1010 int small = min(read_index, write_index);
1011 int large = max(read_index, write_index);
1012 /* chunk is the minimum of readable samples in contiguous
1013 * buffer, writable samples in contiguous buffer, and the
1014 * available input samples. */
1015 int chunk = min(large - small, MAX_PRE_DELAY_FRAMES - large);
1016 chunk = min(chunk, count - i);
1017 for (j = 0; j < DRC_NUM_CHANNELS; ++j) {
1018 memcpy(&dk->pre_delay_buffers[j][write_index],
1019 &data_channels[j][i], chunk * sizeof(float));
1020 memcpy(&data_channels[j][i],
1021 &dk->pre_delay_buffers[j][read_index],
1022 chunk * sizeof(float));
1023 }
1024 read_index = (read_index + chunk) & MAX_PRE_DELAY_FRAMES_MASK;
1025 write_index = (write_index + chunk) & MAX_PRE_DELAY_FRAMES_MASK;
1026 i += chunk;
1027 }
1028
1029 dk->pre_delay_read_index = read_index;
1030 dk->pre_delay_write_index = write_index;
1031 }
1032
dk_process(struct drc_kernel * dk,float * data_channels[],unsigned count)1033 void dk_process(struct drc_kernel *dk, float *data_channels[], unsigned count)
1034 {
1035 int i = 0;
1036 int fragment;
1037
1038 if (!dk->enabled) {
1039 dk_process_delay_only(dk, data_channels, count);
1040 return;
1041 }
1042
1043 if (!dk->processed) {
1044 dk_update_envelope(dk);
1045 dk_compress_output(dk);
1046 dk->processed = 1;
1047 }
1048
1049 int offset = dk->pre_delay_write_index & DIVISION_FRAMES_MASK;
1050 while (i < count) {
1051 fragment = min(DIVISION_FRAMES - offset, count - i);
1052 dk_copy_fragment(dk, data_channels, i, fragment);
1053 i += fragment;
1054 offset = (offset + fragment) & DIVISION_FRAMES_MASK;
1055
1056 /* Process the input division (32 frames). */
1057 if (offset == 0)
1058 dk_process_one_division(dk);
1059 }
1060 }
1061