1 /* Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file.
4 */
5
6 /* Copyright (C) 2011 Google Inc. All rights reserved.
7 * Use of this source code is governed by a BSD-style license that can be
8 * found in the LICENSE.WEBKIT file.
9 */
10
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14
15 #include "drc_math.h"
16 #include "drc_kernel.h"
17
18 #define MAX_PRE_DELAY_FRAMES 1024
19 #define MAX_PRE_DELAY_FRAMES_MASK (MAX_PRE_DELAY_FRAMES - 1)
20 #define DEFAULT_PRE_DELAY_FRAMES 256
21 #define DIVISION_FRAMES 32
22 #define DIVISION_FRAMES_MASK (DIVISION_FRAMES - 1)
23
24 #define assert_on_compile(e) ((void)sizeof(char[1 - 2 * !(e)]))
25 #define assert_on_compile_is_power_of_2(n) \
26 assert_on_compile((n) != 0 && (((n) & ((n) - 1)) == 0))
27
28 const float uninitialized_value = -1;
29 static int drc_math_initialized;
30
dk_init(struct drc_kernel * dk,float sample_rate)31 void dk_init(struct drc_kernel *dk, float sample_rate)
32 {
33 int i;
34
35 if (!drc_math_initialized) {
36 drc_math_initialized = 1;
37 drc_math_init();
38 }
39
40 dk->sample_rate = sample_rate;
41 dk->detector_average = 0;
42 dk->compressor_gain = 1;
43 dk->enabled = 0;
44 dk->processed = 0;
45 dk->last_pre_delay_frames = DEFAULT_PRE_DELAY_FRAMES;
46 dk->pre_delay_read_index = 0;
47 dk->pre_delay_write_index = DEFAULT_PRE_DELAY_FRAMES;
48 dk->max_attack_compression_diff_db = -INFINITY;
49 dk->ratio = uninitialized_value;
50 dk->slope = uninitialized_value;
51 dk->linear_threshold = uninitialized_value;
52 dk->db_threshold = uninitialized_value;
53 dk->db_knee = uninitialized_value;
54 dk->knee_threshold = uninitialized_value;
55 dk->ratio_base = uninitialized_value;
56 dk->K = uninitialized_value;
57
58 assert_on_compile_is_power_of_2(DIVISION_FRAMES);
59 assert_on_compile(DIVISION_FRAMES % 4 == 0);
60 /* Allocate predelay buffers */
61 assert_on_compile_is_power_of_2(MAX_PRE_DELAY_FRAMES);
62 for (i = 0; i < DRC_NUM_CHANNELS; i++) {
63 size_t size = sizeof(float) * MAX_PRE_DELAY_FRAMES;
64 dk->pre_delay_buffers[i] = (float *)calloc(1, size);
65 }
66 }
67
dk_free(struct drc_kernel * dk)68 void dk_free(struct drc_kernel *dk)
69 {
70 int i;
71 for (i = 0; i < DRC_NUM_CHANNELS; ++i)
72 free(dk->pre_delay_buffers[i]);
73 }
74
75 /* Sets the pre-delay (lookahead) buffer size */
set_pre_delay_time(struct drc_kernel * dk,float pre_delay_time)76 static void set_pre_delay_time(struct drc_kernel *dk, float pre_delay_time)
77 {
78 int i;
79 /* Re-configure look-ahead section pre-delay if delay time has
80 * changed. */
81 unsigned pre_delay_frames = pre_delay_time * dk->sample_rate;
82 pre_delay_frames = min(pre_delay_frames, MAX_PRE_DELAY_FRAMES - 1);
83
84 /* Make pre_delay_frames multiplies of DIVISION_FRAMES. This way we
85 * won't split a division of samples into two blocks of memory, so it is
86 * easier to process. This may make the actual delay time slightly less
87 * than the specified value, but the difference is less than 1ms. */
88 pre_delay_frames &= ~DIVISION_FRAMES_MASK;
89
90 /* We need at least one division buffer, so the incoming data won't
91 * overwrite the output data */
92 pre_delay_frames = max(pre_delay_frames, DIVISION_FRAMES);
93
94 if (dk->last_pre_delay_frames != pre_delay_frames) {
95 dk->last_pre_delay_frames = pre_delay_frames;
96 for (i = 0; i < DRC_NUM_CHANNELS; ++i) {
97 size_t size = sizeof(float) * MAX_PRE_DELAY_FRAMES;
98 memset(dk->pre_delay_buffers[i], 0, size);
99 }
100
101 dk->pre_delay_read_index = 0;
102 dk->pre_delay_write_index = pre_delay_frames;
103 }
104 }
105
106 /* Exponential curve for the knee. It is 1st derivative matched at
107 * dk->linear_threshold and asymptotically approaches the value
108 * dk->linear_threshold + 1 / k.
109 *
110 * This is used only when calculating the static curve, not used when actually
111 * compress the input data (knee_curveK below is used instead).
112 */
knee_curve(struct drc_kernel * dk,float x,float k)113 static float knee_curve(struct drc_kernel *dk, float x, float k)
114 {
115 /* Linear up to threshold. */
116 if (x < dk->linear_threshold)
117 return x;
118
119 return dk->linear_threshold +
120 (1 - knee_expf(-k * (x - dk->linear_threshold))) / k;
121 }
122
123 /* Approximate 1st derivative with input and output expressed in dB. This slope
124 * is equal to the inverse of the compression "ratio". In other words, a
125 * compression ratio of 20 would be a slope of 1/20.
126 */
slope_at(struct drc_kernel * dk,float x,float k)127 static float slope_at(struct drc_kernel *dk, float x, float k)
128 {
129 if (x < dk->linear_threshold)
130 return 1;
131
132 float x2 = x * 1.001;
133
134 float x_db = linear_to_decibels(x);
135 float x2Db = linear_to_decibels(x2);
136
137 float y_db = linear_to_decibels(knee_curve(dk, x, k));
138 float y2Db = linear_to_decibels(knee_curve(dk, x2, k));
139
140 float m = (y2Db - y_db) / (x2Db - x_db);
141
142 return m;
143 }
144
k_at_slope(struct drc_kernel * dk,float desired_slope)145 static float k_at_slope(struct drc_kernel *dk, float desired_slope)
146 {
147 float x_db = dk->db_threshold + dk->db_knee;
148 float x = decibels_to_linear(x_db);
149
150 /* Approximate k given initial values. */
151 float minK = 0.1;
152 float maxK = 10000;
153 float k = 5;
154 int i;
155
156 for (i = 0; i < 15; ++i) {
157 /* A high value for k will more quickly asymptotically approach
158 * a slope of 0. */
159 float slope = slope_at(dk, x, k);
160
161 if (slope < desired_slope) {
162 /* k is too high. */
163 maxK = k;
164 } else {
165 /* k is too low. */
166 minK = k;
167 }
168
169 /* Re-calculate based on geometric mean. */
170 k = sqrtf(minK * maxK);
171 }
172
173 return k;
174 }
175
update_static_curve_parameters(struct drc_kernel * dk,float db_threshold,float db_knee,float ratio)176 static void update_static_curve_parameters(struct drc_kernel *dk,
177 float db_threshold,
178 float db_knee, float ratio)
179 {
180 if (db_threshold != dk->db_threshold || db_knee != dk->db_knee ||
181 ratio != dk->ratio) {
182 /* Threshold and knee. */
183 dk->db_threshold = db_threshold;
184 dk->linear_threshold = decibels_to_linear(db_threshold);
185 dk->db_knee = db_knee;
186
187 /* Compute knee parameters. */
188 dk->ratio = ratio;
189 dk->slope = 1 / dk->ratio;
190
191 float k = k_at_slope(dk, 1 / dk->ratio);
192 dk->K = k;
193 /* See knee_curveK() for details */
194 dk->knee_alpha = dk->linear_threshold + 1 / k;
195 dk->knee_beta = -expf(k * dk->linear_threshold) / k;
196
197 dk->knee_threshold = decibels_to_linear(db_threshold + db_knee);
198 /* See volume_gain() for details */
199 float y0 = knee_curve(dk, dk->knee_threshold, k);
200 dk->ratio_base = y0 * powf(dk->knee_threshold, -dk->slope);
201 }
202 }
203
204 /* This is the knee part of the compression curve. Returns the output level
205 * given the input level x. */
knee_curveK(struct drc_kernel * dk,float x)206 static float knee_curveK(struct drc_kernel *dk, float x)
207 {
208 /* The formula in knee_curveK is dk->linear_threshold +
209 * (1 - expf(-k * (x - dk->linear_threshold))) / k
210 * which simplifies to (alpha + beta * expf(gamma))
211 * where alpha = dk->linear_threshold + 1 / k
212 * beta = -expf(k * dk->linear_threshold) / k
213 * gamma = -k * x
214 */
215 return dk->knee_alpha + dk->knee_beta * knee_expf(-dk->K * x);
216 }
217
218 /* Full compression curve with constant ratio after knee. Returns the ratio of
219 * output and input signal. */
volume_gain(struct drc_kernel * dk,float x)220 static float volume_gain(struct drc_kernel *dk, float x)
221 {
222 float y;
223
224 if (x < dk->knee_threshold) {
225 if (x < dk->linear_threshold)
226 return 1;
227 y = knee_curveK(dk, x) / x;
228 } else {
229 /* Constant ratio after knee.
230 * log(y/y0) = s * log(x/x0)
231 * => y = y0 * (x/x0)^s
232 * => y = [y0 * (1/x0)^s] * x^s
233 * => y = dk->ratio_base * x^s
234 * => y/x = dk->ratio_base * x^(s - 1)
235 * => y/x = dk->ratio_base * e^(log(x) * (s - 1))
236 */
237 y = dk->ratio_base * knee_expf(logf(x) * (dk->slope - 1));
238 }
239
240 return y;
241 }
242
dk_set_parameters(struct drc_kernel * dk,float db_threshold,float db_knee,float ratio,float attack_time,float release_time,float pre_delay_time,float db_post_gain,float releaseZone1,float releaseZone2,float releaseZone3,float releaseZone4)243 void dk_set_parameters(struct drc_kernel *dk,
244 float db_threshold,
245 float db_knee,
246 float ratio,
247 float attack_time,
248 float release_time,
249 float pre_delay_time,
250 float db_post_gain,
251 float releaseZone1,
252 float releaseZone2,
253 float releaseZone3,
254 float releaseZone4)
255 {
256 float sample_rate = dk->sample_rate;
257
258 update_static_curve_parameters(dk, db_threshold, db_knee, ratio);
259
260 /* Makeup gain. */
261 float full_range_gain = volume_gain(dk, 1);
262 float full_range_makeup_gain = 1 / full_range_gain;
263
264 /* Empirical/perceptual tuning. */
265 full_range_makeup_gain = powf(full_range_makeup_gain, 0.6f);
266
267 dk->master_linear_gain = decibels_to_linear(db_post_gain) *
268 full_range_makeup_gain;
269
270 /* Attack parameters. */
271 attack_time = max(0.001f, attack_time);
272 dk->attack_frames = attack_time * sample_rate;
273
274 /* Release parameters. */
275 float release_frames = sample_rate * release_time;
276
277 /* Detector release time. */
278 float sat_release_time = 0.0025f;
279 float sat_release_frames = sat_release_time * sample_rate;
280 dk->sat_release_frames_inv_neg = -1 / sat_release_frames;
281 dk->sat_release_rate_at_neg_two_db =
282 decibels_to_linear(-2 * dk->sat_release_frames_inv_neg) - 1;
283
284 /* Create a smooth function which passes through four points.
285 * Polynomial of the form y = a + b*x + c*x^2 + d*x^3 + e*x^4
286 */
287 float y1 = release_frames * releaseZone1;
288 float y2 = release_frames * releaseZone2;
289 float y3 = release_frames * releaseZone3;
290 float y4 = release_frames * releaseZone4;
291
292 /* All of these coefficients were derived for 4th order polynomial curve
293 * fitting where the y values match the evenly spaced x values as
294 * follows: (y1 : x == 0, y2 : x == 1, y3 : x == 2, y4 : x == 3)
295 */
296 dk->kA = 0.9999999999999998f*y1 + 1.8432219684323923e-16f*y2
297 - 1.9373394351676423e-16f*y3 + 8.824516011816245e-18f*y4;
298 dk->kB = -1.5788320352845888f*y1 + 2.3305837032074286f*y2
299 - 0.9141194204840429f*y3 + 0.1623677525612032f*y4;
300 dk->kC = 0.5334142869106424f*y1 - 1.272736789213631f*y2
301 + 0.9258856042207512f*y3 - 0.18656310191776226f*y4;
302 dk->kD = 0.08783463138207234f*y1 - 0.1694162967925622f*y2
303 + 0.08588057951595272f*y3 - 0.00429891410546283f*y4;
304 dk->kE = -0.042416883008123074f*y1 + 0.1115693827987602f*y2
305 - 0.09764676325265872f*y3 + 0.028494263462021576f*y4;
306
307 /* x ranges from 0 -> 3 0 1 2 3
308 * -15 -10 -5 0db
309 *
310 * y calculates adaptive release frames depending on the amount of
311 * compression.
312 */
313 set_pre_delay_time(dk, pre_delay_time);
314 }
315
dk_set_enabled(struct drc_kernel * dk,int enabled)316 void dk_set_enabled(struct drc_kernel *dk, int enabled)
317 {
318 dk->enabled = enabled;
319 }
320
321 /* Updates the envelope_rate used for the next division */
dk_update_envelope(struct drc_kernel * dk)322 static void dk_update_envelope(struct drc_kernel *dk)
323 {
324 const float kA = dk->kA;
325 const float kB = dk->kB;
326 const float kC = dk->kC;
327 const float kD = dk->kD;
328 const float kE = dk->kE;
329 const float attack_frames = dk->attack_frames;
330
331 /* Calculate desired gain */
332 float desired_gain = dk->detector_average;
333
334 /* Pre-warp so we get desired_gain after sin() warp below. */
335 float scaled_desired_gain = warp_asinf(desired_gain);
336
337 /* Deal with envelopes */
338
339 /* envelope_rate is the rate we slew from current compressor level to
340 * the desired level. The exact rate depends on if we're attacking or
341 * releasing and by how much.
342 */
343 float envelope_rate;
344
345 int is_releasing = scaled_desired_gain > dk->compressor_gain;
346
347 /* compression_diff_db is the difference between current compression
348 * level and the desired level. */
349 float compression_diff_db = linear_to_decibels(
350 dk->compressor_gain / scaled_desired_gain);
351
352 if (is_releasing) {
353 /* Release mode - compression_diff_db should be negative dB */
354 dk->max_attack_compression_diff_db = -INFINITY;
355
356 /* Fix gremlins. */
357 if (isbadf(compression_diff_db))
358 compression_diff_db = -1;
359
360 /* Adaptive release - higher compression (lower
361 * compression_diff_db) releases faster. Contain within range:
362 * -12 -> 0 then scale to go from 0 -> 3
363 */
364 float x = compression_diff_db;
365 x = max(-12.0f, x);
366 x = min(0.0f, x);
367 x = 0.25f * (x + 12);
368
369 /* Compute adaptive release curve using 4th order polynomial.
370 * Normal values for the polynomial coefficients would create a
371 * monotonically increasing function.
372 */
373 float x2 = x * x;
374 float x3 = x2 * x;
375 float x4 = x2 * x2;
376 float release_frames = kA + kB * x + kC * x2 + kD * x3 +
377 kE * x4;
378
379 #define kSpacingDb 5
380 float db_per_frame = kSpacingDb / release_frames;
381 envelope_rate = decibels_to_linear(db_per_frame);
382 } else {
383 /* Attack mode - compression_diff_db should be positive dB */
384
385 /* Fix gremlins. */
386 if (isbadf(compression_diff_db))
387 compression_diff_db = 1;
388
389 /* As long as we're still in attack mode, use a rate based off
390 * the largest compression_diff_db we've encountered so far.
391 */
392 dk->max_attack_compression_diff_db = max(
393 dk->max_attack_compression_diff_db,
394 compression_diff_db);
395
396 float eff_atten_diff_db =
397 max(0.5f, dk->max_attack_compression_diff_db);
398
399 float x = 0.25f / eff_atten_diff_db;
400 envelope_rate = 1 - powf(x, 1 / attack_frames);
401 }
402
403 dk->envelope_rate = envelope_rate;
404 dk->scaled_desired_gain = scaled_desired_gain;
405 }
406
407 /* For a division of frames, take the absolute values of left channel and right
408 * channel, store the maximum of them in output. */
409 #if defined(__aarch64__)
max_abs_division(float * output,const float * data0,const float * data1)410 static inline void max_abs_division(float *output,
411 const float *data0, const float *data1)
412 {
413 int count = DIVISION_FRAMES / 4;
414
415 __asm__ __volatile__(
416 "1: \n"
417 "ld1 {v0.4s}, [%[data0]], #16 \n"
418 "ld1 {v1.4s}, [%[data1]], #16 \n"
419 "fabs v0.4s, v0.4s \n"
420 "fabs v1.4s, v1.4s \n"
421 "fmax v0.4s, v0.4s, v1.4s \n"
422 "st1 {v0.4s}, [%[output]], #16 \n"
423 "subs %w[count], %w[count], #1 \n"
424 "b.ne 1b \n"
425 : /* output */
426 [data0]"+r"(data0),
427 [data1]"+r"(data1),
428 [output]"+r"(output),
429 [count]"+r"(count)
430 : /* input */
431 : /* clobber */
432 "v0", "v1", "memory", "cc"
433 );
434 }
435 #elif defined(__ARM_NEON__)
max_abs_division(float * output,const float * data0,const float * data1)436 static inline void max_abs_division(float *output,
437 const float *data0, const float *data1)
438 {
439 int count = DIVISION_FRAMES / 4;
440
441 __asm__ __volatile__(
442 "1: \n"
443 "vld1.32 {q0}, [%[data0]]! \n"
444 "vld1.32 {q1}, [%[data1]]! \n"
445 "vabs.f32 q0, q0 \n"
446 "vabs.f32 q1, q1 \n"
447 "vmax.f32 q0, q1 \n"
448 "vst1.32 {q0}, [%[output]]! \n"
449 "subs %[count], #1 \n"
450 "bne 1b \n"
451 : /* output */
452 [data0]"+r"(data0),
453 [data1]"+r"(data1),
454 [output]"+r"(output),
455 [count]"+r"(count)
456 : /* input */
457 : /* clobber */
458 "q0", "q1", "memory", "cc"
459 );
460 }
461 #elif defined(__SSE3__)
462 #include <emmintrin.h>
max_abs_division(float * output,const float * data0,const float * data1)463 static inline void max_abs_division(float *output,
464 const float *data0, const float *data1)
465 {
466 __m128 x, y;
467 int count = DIVISION_FRAMES / 4;
468
469 __asm__ __volatile__(
470 "1: \n"
471 "lddqu (%[data0]), %[x] \n"
472 "lddqu (%[data1]), %[y] \n"
473 "andps %[mask], %[x] \n"
474 "andps %[mask], %[y] \n"
475 "maxps %[y], %[x] \n"
476 "movdqu %[x], (%[output]) \n"
477 "add $16, %[data0] \n"
478 "add $16, %[data1] \n"
479 "add $16, %[output] \n"
480 "sub $1, %[count] \n"
481 "jnz 1b \n"
482 : /* output */
483 [data0]"+r"(data0),
484 [data1]"+r"(data1),
485 [output]"+r"(output),
486 [count]"+r"(count),
487 [x]"=&x"(x),
488 [y]"=&x"(y)
489 : /* input */
490 [mask]"x"(_mm_set1_epi32(0x7fffffff))
491 : /* clobber */
492 "memory", "cc"
493 );
494 }
495 #else
max_abs_division(float * output,const float * data0,const float * data1)496 static inline void max_abs_division(float *output,
497 const float *data0, const float *data1)
498 {
499 int i;
500 for (i = 0; i < DIVISION_FRAMES; i++)
501 output[i] = fmaxf(fabsf(data0[i]), fabsf(data1[i]));
502 }
503 #endif
504
505 /* Update detector_average from the last input division. */
dk_update_detector_average(struct drc_kernel * dk)506 static void dk_update_detector_average(struct drc_kernel *dk)
507 {
508 float abs_input_array[DIVISION_FRAMES];
509 const float sat_release_frames_inv_neg = dk->sat_release_frames_inv_neg;
510 const float sat_release_rate_at_neg_two_db =
511 dk->sat_release_rate_at_neg_two_db;
512 float detector_average = dk->detector_average;
513 int div_start, i;
514
515 /* Calculate the start index of the last input division */
516 if (dk->pre_delay_write_index == 0) {
517 div_start = MAX_PRE_DELAY_FRAMES - DIVISION_FRAMES;
518 } else {
519 div_start = dk->pre_delay_write_index - DIVISION_FRAMES;
520 }
521
522 /* The max abs value across all channels for this frame */
523 max_abs_division(abs_input_array,
524 &dk->pre_delay_buffers[0][div_start],
525 &dk->pre_delay_buffers[1][div_start]);
526
527 for (i = 0; i < DIVISION_FRAMES; i++) {
528 /* Compute compression amount from un-delayed signal */
529 float abs_input = abs_input_array[i];
530
531 /* Calculate shaped power on undelayed input. Put through
532 * shaping curve. This is linear up to the threshold, then
533 * enters a "knee" portion followed by the "ratio" portion. The
534 * transition from the threshold to the knee is smooth (1st
535 * derivative matched). The transition from the knee to the
536 * ratio portion is smooth (1st derivative matched).
537 */
538 float gain = volume_gain(dk, abs_input);
539 int is_release = (gain > detector_average);
540 if (is_release) {
541 if (gain > NEG_TWO_DB) {
542 detector_average += (gain - detector_average) *
543 sat_release_rate_at_neg_two_db;
544 } else {
545 float gain_db = linear_to_decibels(gain);
546 float db_per_frame = gain_db *
547 sat_release_frames_inv_neg;
548 float sat_release_rate =
549 decibels_to_linear(db_per_frame) - 1;
550 detector_average += (gain - detector_average) *
551 sat_release_rate;
552 }
553 } else {
554 detector_average = gain;
555 }
556
557 /* Fix gremlins. */
558 if (isbadf(detector_average))
559 detector_average = 1.0f;
560 else
561 detector_average = min(detector_average, 1.0f);
562 }
563
564 dk->detector_average = detector_average;
565 }
566
567 /* Calculate compress_gain from the envelope and apply total_gain to compress
568 * the next output division. */
569 /* TODO(fbarchard): Port to aarch64 */
570 #if defined(__ARM_NEON__)
571 #include <arm_neon.h>
dk_compress_output(struct drc_kernel * dk)572 static void dk_compress_output(struct drc_kernel *dk)
573 {
574 const float master_linear_gain = dk->master_linear_gain;
575 const float envelope_rate = dk->envelope_rate;
576 const float scaled_desired_gain = dk->scaled_desired_gain;
577 const float compressor_gain = dk->compressor_gain;
578 const int div_start = dk->pre_delay_read_index;
579 float *ptr_left = &dk->pre_delay_buffers[0][div_start];
580 float *ptr_right = &dk->pre_delay_buffers[1][div_start];
581 int count = DIVISION_FRAMES / 4;
582
583 /* See warp_sinf() for the details for the constants. */
584 const float32x4_t A7 = vdupq_n_f32(-4.3330336920917034149169921875e-3f);
585 const float32x4_t A5 = vdupq_n_f32(7.9434238374233245849609375e-2f);
586 const float32x4_t A3 = vdupq_n_f32(-0.645892798900604248046875f);
587 const float32x4_t A1 = vdupq_n_f32(1.5707910060882568359375f);
588
589 /* Exponential approach to desired gain. */
590 if (envelope_rate < 1) {
591 float c = compressor_gain - scaled_desired_gain;
592 float r = 1 - envelope_rate;
593 float32x4_t x0 = {c*r, c*r*r, c*r*r*r, c*r*r*r*r};
594 float32x4_t x, x2, x4, left, right, tmp1, tmp2;
595
596 __asm__ __volatile(
597 "b 2f \n"
598 "1: \n"
599 "vmul.f32 %q[x0], %q[r4] \n"
600 "2: \n"
601 "vld1.32 {%e[left],%f[left]}, [%[ptr_left]] \n"
602 "vld1.32 {%e[right],%f[right]}, [%[ptr_right]] \n"
603 "vadd.f32 %q[x], %q[x0], %q[base] \n"
604 /* Calculate warp_sin() for four values in x. */
605 "vmul.f32 %q[x2], %q[x], %q[x] \n"
606 "vmov.f32 %q[tmp1], %q[A5] \n"
607 "vmov.f32 %q[tmp2], %q[A1] \n"
608 "vmul.f32 %q[x4], %q[x2], %q[x2] \n"
609 "vmla.f32 %q[tmp1], %q[A7], %q[x2] \n"
610 "vmla.f32 %q[tmp2], %q[A3], %q[x2] \n"
611 "vmla.f32 %q[tmp2], %q[tmp1], %q[x4] \n"
612 "vmul.f32 %q[tmp2], %q[tmp2], %q[x] \n"
613 /* Now tmp2 contains the result of warp_sin(). */
614 "vmul.f32 %q[tmp2], %q[tmp2], %q[g] \n"
615 "vmul.f32 %q[left], %q[tmp2] \n"
616 "vmul.f32 %q[right], %q[tmp2] \n"
617 "vst1.32 {%e[left],%f[left]}, [%[ptr_left]]! \n"
618 "vst1.32 {%e[right],%f[right]}, [%[ptr_right]]! \n"
619 "subs %[count], #1 \n"
620 "bne 1b \n"
621 : /* output */
622 "=r"(count),
623 "=r"(ptr_left),
624 "=r"(ptr_right),
625 "=w"(x0),
626 [x]"=&w"(x),
627 [x2]"=&w"(x2),
628 [x4]"=&w"(x4),
629 [left]"=&w"(left),
630 [right]"=&w"(right),
631 [tmp1]"=&w"(tmp1),
632 [tmp2]"=&w"(tmp2)
633 : /* input */
634 [count]"0"(count),
635 [ptr_left]"1"(ptr_left),
636 [ptr_right]"2"(ptr_right),
637 [x0]"3"(x0),
638 [A1]"w"(A1),
639 [A3]"w"(A3),
640 [A5]"w"(A5),
641 [A7]"w"(A7),
642 [base]"w"(vdupq_n_f32(scaled_desired_gain)),
643 [r4]"w"(vdupq_n_f32(r*r*r*r)),
644 [g]"w"(vdupq_n_f32(master_linear_gain))
645 : /* clobber */
646 "memory", "cc"
647 );
648 dk->compressor_gain = x[3];
649 } else {
650 float c = compressor_gain;
651 float r = envelope_rate;
652 float32x4_t x = {c*r, c*r*r, c*r*r*r, c*r*r*r*r};
653 float32x4_t x2, x4, left, right, tmp1, tmp2;
654
655 __asm__ __volatile(
656 "b 2f \n"
657 "1: \n"
658 "vmul.f32 %q[x], %q[r4] \n"
659 "2: \n"
660 "vld1.32 {%e[left],%f[left]}, [%[ptr_left]] \n"
661 "vld1.32 {%e[right],%f[right]}, [%[ptr_right]] \n"
662 "vmin.f32 %q[x], %q[one] \n"
663 /* Calculate warp_sin() for four values in x. */
664 "vmul.f32 %q[x2], %q[x], %q[x] \n"
665 "vmov.f32 %q[tmp1], %q[A5] \n"
666 "vmov.f32 %q[tmp2], %q[A1] \n"
667 "vmul.f32 %q[x4], %q[x2], %q[x2] \n"
668 "vmla.f32 %q[tmp1], %q[A7], %q[x2] \n"
669 "vmla.f32 %q[tmp2], %q[A3], %q[x2] \n"
670 "vmla.f32 %q[tmp2], %q[tmp1], %q[x4] \n"
671 "vmul.f32 %q[tmp2], %q[tmp2], %q[x] \n"
672 /* Now tmp2 contains the result of warp_sin(). */
673 "vmul.f32 %q[tmp2], %q[tmp2], %q[g] \n"
674 "vmul.f32 %q[left], %q[tmp2] \n"
675 "vmul.f32 %q[right], %q[tmp2] \n"
676 "vst1.32 {%e[left],%f[left]}, [%[ptr_left]]! \n"
677 "vst1.32 {%e[right],%f[right]}, [%[ptr_right]]! \n"
678 "subs %[count], #1 \n"
679 "bne 1b \n"
680 : /* output */
681 "=r"(count),
682 "=r"(ptr_left),
683 "=r"(ptr_right),
684 "=w"(x),
685 [x2]"=&w"(x2),
686 [x4]"=&w"(x4),
687 [left]"=&w"(left),
688 [right]"=&w"(right),
689 [tmp1]"=&w"(tmp1),
690 [tmp2]"=&w"(tmp2)
691 : /* input */
692 [count]"0"(count),
693 [ptr_left]"1"(ptr_left),
694 [ptr_right]"2"(ptr_right),
695 [x]"3"(x),
696 [A1]"w"(A1),
697 [A3]"w"(A3),
698 [A5]"w"(A5),
699 [A7]"w"(A7),
700 [one]"w"(vdupq_n_f32(1)),
701 [r4]"w"(vdupq_n_f32(r*r*r*r)),
702 [g]"w"(vdupq_n_f32(master_linear_gain))
703 : /* clobber */
704 "memory", "cc"
705 );
706 dk->compressor_gain = x[3];
707 }
708 }
709 #elif defined(__SSE3__) && defined(__x86_64__)
710 #include <emmintrin.h>
dk_compress_output(struct drc_kernel * dk)711 static void dk_compress_output(struct drc_kernel *dk)
712 {
713 const float master_linear_gain = dk->master_linear_gain;
714 const float envelope_rate = dk->envelope_rate;
715 const float scaled_desired_gain = dk->scaled_desired_gain;
716 const float compressor_gain = dk->compressor_gain;
717 const int div_start = dk->pre_delay_read_index;
718 float *ptr_left = &dk->pre_delay_buffers[0][div_start];
719 float *ptr_right = &dk->pre_delay_buffers[1][div_start];
720 int count = DIVISION_FRAMES / 4;
721
722 /* See warp_sinf() for the details for the constants. */
723 const __m128 A7 = _mm_set1_ps(-4.3330336920917034149169921875e-3f);
724 const __m128 A5 = _mm_set1_ps(7.9434238374233245849609375e-2f);
725 const __m128 A3 = _mm_set1_ps(-0.645892798900604248046875f);
726 const __m128 A1 = _mm_set1_ps(1.5707910060882568359375f);
727
728 /* Exponential approach to desired gain. */
729 if (envelope_rate < 1) {
730 float c = compressor_gain - scaled_desired_gain;
731 float r = 1 - envelope_rate;
732 __m128 x0 = {c*r, c*r*r, c*r*r*r, c*r*r*r*r};
733 __m128 x, x2, x4, left, right, tmp1, tmp2;
734
735 __asm__ __volatile(
736 "jmp 2f \n"
737 "1: \n"
738 "mulps %[r4], %[x0] \n"
739 "2: \n"
740 "lddqu (%[ptr_left]), %[left] \n"
741 "lddqu (%[ptr_right]), %[right] \n"
742 "movaps %[x0], %[x] \n"
743 "addps %[base], %[x] \n"
744 /* Calculate warp_sin() for four values in x. */
745 "movaps %[x], %[x2] \n"
746 "mulps %[x], %[x2] \n"
747 "movaps %[x2], %[x4] \n"
748 "movaps %[x2], %[tmp1] \n"
749 "movaps %[x2], %[tmp2] \n"
750 "mulps %[x2], %[x4] \n"
751 "mulps %[A7], %[tmp1] \n"
752 "mulps %[A3], %[tmp2] \n"
753 "addps %[A5], %[tmp1] \n"
754 "addps %[A1], %[tmp2] \n"
755 "mulps %[x4], %[tmp1] \n"
756 "addps %[tmp1], %[tmp2] \n"
757 "mulps %[x], %[tmp2] \n"
758 /* Now tmp2 contains the result of warp_sin(). */
759 "mulps %[g], %[tmp2] \n"
760 "mulps %[tmp2], %[left] \n"
761 "mulps %[tmp2], %[right] \n"
762 "movdqu %[left], (%[ptr_left]) \n"
763 "movdqu %[right], (%[ptr_right]) \n"
764 "add $16, %[ptr_left] \n"
765 "add $16, %[ptr_right] \n"
766 "sub $1, %[count] \n"
767 "jne 1b \n"
768 : /* output */
769 "=r"(count),
770 "=r"(ptr_left),
771 "=r"(ptr_right),
772 "=x"(x0),
773 [x]"=&x"(x),
774 [x2]"=&x"(x2),
775 [x4]"=&x"(x4),
776 [left]"=&x"(left),
777 [right]"=&x"(right),
778 [tmp1]"=&x"(tmp1),
779 [tmp2]"=&x"(tmp2)
780 : /* input */
781 [count]"0"(count),
782 [ptr_left]"1"(ptr_left),
783 [ptr_right]"2"(ptr_right),
784 [x0]"3"(x0),
785 [A1]"x"(A1),
786 [A3]"x"(A3),
787 [A5]"x"(A5),
788 [A7]"x"(A7),
789 [base]"x"(_mm_set1_ps(scaled_desired_gain)),
790 [r4]"x"(_mm_set1_ps(r*r*r*r)),
791 [g]"x"(_mm_set1_ps(master_linear_gain))
792 : /* clobber */
793 "memory", "cc"
794 );
795 dk->compressor_gain = x[3];
796 } else {
797 /* See warp_sinf() for the details for the constants. */
798 __m128 A7 = _mm_set1_ps(-4.3330336920917034149169921875e-3f);
799 __m128 A5 = _mm_set1_ps(7.9434238374233245849609375e-2f);
800 __m128 A3 = _mm_set1_ps(-0.645892798900604248046875f);
801 __m128 A1 = _mm_set1_ps(1.5707910060882568359375f);
802
803 float c = compressor_gain;
804 float r = envelope_rate;
805 __m128 x = {c*r, c*r*r, c*r*r*r, c*r*r*r*r};
806 __m128 x2, x4, left, right, tmp1, tmp2;
807
808 __asm__ __volatile(
809 "jmp 2f \n"
810 "1: \n"
811 "mulps %[r4], %[x] \n"
812 "2: \n"
813 "lddqu (%[ptr_left]), %[left] \n"
814 "lddqu (%[ptr_right]), %[right] \n"
815 "minps %[one], %[x] \n"
816 /* Calculate warp_sin() for four values in x. */
817 "movaps %[x], %[x2] \n"
818 "mulps %[x], %[x2] \n"
819 "movaps %[x2], %[x4] \n"
820 "movaps %[x2], %[tmp1] \n"
821 "movaps %[x2], %[tmp2] \n"
822 "mulps %[x2], %[x4] \n"
823 "mulps %[A7], %[tmp1] \n"
824 "mulps %[A3], %[tmp2] \n"
825 "addps %[A5], %[tmp1] \n"
826 "addps %[A1], %[tmp2] \n"
827 "mulps %[x4], %[tmp1] \n"
828 "addps %[tmp1], %[tmp2] \n"
829 "mulps %[x], %[tmp2] \n"
830 /* Now tmp2 contains the result of warp_sin(). */
831 "mulps %[g], %[tmp2] \n"
832 "mulps %[tmp2], %[left] \n"
833 "mulps %[tmp2], %[right] \n"
834 "movdqu %[left], (%[ptr_left]) \n"
835 "movdqu %[right], (%[ptr_right]) \n"
836 "add $16, %[ptr_left] \n"
837 "add $16, %[ptr_right] \n"
838 "sub $1, %[count] \n"
839 "jne 1b \n"
840 : /* output */
841 "=r"(count),
842 "=r"(ptr_left),
843 "=r"(ptr_right),
844 "=x"(x),
845 [x2]"=&x"(x2),
846 [x4]"=&x"(x4),
847 [left]"=&x"(left),
848 [right]"=&x"(right),
849 [tmp1]"=&x"(tmp1),
850 [tmp2]"=&x"(tmp2)
851 : /* input */
852 [count]"0"(count),
853 [ptr_left]"1"(ptr_left),
854 [ptr_right]"2"(ptr_right),
855 [x]"3"(x),
856 [A1]"x"(A1),
857 [A3]"x"(A3),
858 [A5]"x"(A5),
859 [A7]"x"(A7),
860 [one]"x"(_mm_set1_ps(1)),
861 [r4]"x"(_mm_set1_ps(r*r*r*r)),
862 [g]"x"(_mm_set1_ps(master_linear_gain))
863 : /* clobber */
864 "memory", "cc"
865 );
866 dk->compressor_gain = x[3];
867 }
868 }
869 #else
dk_compress_output(struct drc_kernel * dk)870 static void dk_compress_output(struct drc_kernel *dk)
871 {
872 const float master_linear_gain = dk->master_linear_gain;
873 const float envelope_rate = dk->envelope_rate;
874 const float scaled_desired_gain = dk->scaled_desired_gain;
875 const float compressor_gain = dk->compressor_gain;
876 const int div_start = dk->pre_delay_read_index;
877 float *ptr_left = &dk->pre_delay_buffers[0][div_start];
878 float *ptr_right = &dk->pre_delay_buffers[1][div_start];
879 int count = DIVISION_FRAMES / 4;
880
881 int i, j;
882
883 /* Exponential approach to desired gain. */
884 if (envelope_rate < 1) {
885 /* Attack - reduce gain to desired. */
886 float c = compressor_gain - scaled_desired_gain;
887 float base = scaled_desired_gain;
888 float r = 1 - envelope_rate;
889 float x[4] = {c*r, c*r*r, c*r*r*r, c*r*r*r*r};
890 float r4 = r*r*r*r;
891
892 i = 0;
893 while (1) {
894 for (j = 0; j < 4; j++) {
895 /* Warp pre-compression gain to smooth out sharp
896 * exponential transition points.
897 */
898 float post_warp_compressor_gain =
899 warp_sinf(x[j] + base);
900
901 /* Calculate total gain using master gain. */
902 float total_gain = master_linear_gain *
903 post_warp_compressor_gain;
904
905 /* Apply final gain. */
906 *ptr_left++ *= total_gain;
907 *ptr_right++ *= total_gain;
908 }
909
910 if (++i == count)
911 break;
912
913 for (j = 0; j < 4; j++)
914 x[j] = x[j] * r4;
915 }
916
917 dk->compressor_gain = x[3] + base;
918 } else {
919 /* Release - exponentially increase gain to 1.0 */
920 float c = compressor_gain;
921 float r = envelope_rate;
922 float x[4] = {c*r, c*r*r, c*r*r*r, c*r*r*r*r};
923 float r4 = r*r*r*r;
924
925 i = 0;
926 while (1) {
927 for (j = 0; j < 4; j++) {
928 /* Warp pre-compression gain to smooth out sharp
929 * exponential transition points.
930 */
931 float post_warp_compressor_gain =
932 warp_sinf(x[j]);
933
934 /* Calculate total gain using master gain. */
935 float total_gain = master_linear_gain *
936 post_warp_compressor_gain;
937
938 /* Apply final gain. */
939 *ptr_left++ *= total_gain;
940 *ptr_right++ *= total_gain;
941 }
942
943 if (++i == count)
944 break;
945
946 for (j = 0; j < 4; j++)
947 x[j] = min(1.0f, x[j] * r4);
948 }
949
950 dk->compressor_gain = x[3];
951 }
952 }
953 #endif
954
955 /* After one complete divison of samples have been received (and one divison of
956 * samples have been output), we calculate shaped power average
957 * (detector_average) from the input division, update envelope parameters from
958 * detector_average, then prepare the next output division by applying the
959 * envelope to compress the samples.
960 */
dk_process_one_division(struct drc_kernel * dk)961 static void dk_process_one_division(struct drc_kernel *dk)
962 {
963 dk_update_detector_average(dk);
964 dk_update_envelope(dk);
965 dk_compress_output(dk);
966 }
967
968 /* Copy the input data to the pre-delay buffer, and copy the output data back to
969 * the input buffer */
dk_copy_fragment(struct drc_kernel * dk,float * data_channels[],unsigned frame_index,int frames_to_process)970 static void dk_copy_fragment(struct drc_kernel *dk, float *data_channels[],
971 unsigned frame_index, int frames_to_process)
972 {
973 int write_index = dk->pre_delay_write_index;
974 int read_index = dk->pre_delay_read_index;
975 int j;
976
977 for (j = 0; j < DRC_NUM_CHANNELS; ++j) {
978 memcpy(&dk->pre_delay_buffers[j][write_index],
979 &data_channels[j][frame_index],
980 frames_to_process * sizeof(float));
981 memcpy(&data_channels[j][frame_index],
982 &dk->pre_delay_buffers[j][read_index],
983 frames_to_process * sizeof(float));
984 }
985
986 dk->pre_delay_write_index = (write_index + frames_to_process) &
987 MAX_PRE_DELAY_FRAMES_MASK;
988 dk->pre_delay_read_index = (read_index + frames_to_process) &
989 MAX_PRE_DELAY_FRAMES_MASK;
990 }
991
992 /* Delay the input sample only and don't do other processing. This is used when
993 * the kernel is disabled. We want to do this to match the processing delay in
994 * kernels of other bands.
995 */
dk_process_delay_only(struct drc_kernel * dk,float * data_channels[],unsigned count)996 static void dk_process_delay_only(struct drc_kernel *dk, float *data_channels[],
997 unsigned count)
998 {
999 int read_index = dk->pre_delay_read_index;
1000 int write_index = dk->pre_delay_write_index;
1001 int i = 0;
1002
1003 while (i < count) {
1004 int j;
1005 int small = min(read_index, write_index);
1006 int large = max(read_index, write_index);
1007 /* chunk is the minimum of readable samples in contiguous
1008 * buffer, writable samples in contiguous buffer, and the
1009 * available input samples. */
1010 int chunk = min(large - small, MAX_PRE_DELAY_FRAMES - large);
1011 chunk = min(chunk, count - i);
1012 for (j = 0; j < DRC_NUM_CHANNELS; ++j) {
1013 memcpy(&dk->pre_delay_buffers[j][write_index],
1014 &data_channels[j][i],
1015 chunk * sizeof(float));
1016 memcpy(&data_channels[j][i],
1017 &dk->pre_delay_buffers[j][read_index],
1018 chunk * sizeof(float));
1019 }
1020 read_index = (read_index + chunk) & MAX_PRE_DELAY_FRAMES_MASK;
1021 write_index = (write_index + chunk) & MAX_PRE_DELAY_FRAMES_MASK;
1022 i += chunk;
1023 }
1024
1025 dk->pre_delay_read_index = read_index;
1026 dk->pre_delay_write_index = write_index;
1027 }
1028
dk_process(struct drc_kernel * dk,float * data_channels[],unsigned count)1029 void dk_process(struct drc_kernel *dk, float *data_channels[], unsigned count)
1030 {
1031 int i = 0;
1032 int fragment;
1033
1034 if (!dk->enabled) {
1035 dk_process_delay_only(dk, data_channels, count);
1036 return;
1037 }
1038
1039 if (!dk->processed) {
1040 dk_update_envelope(dk);
1041 dk_compress_output(dk);
1042 dk->processed = 1;
1043 }
1044
1045 int offset = dk->pre_delay_write_index & DIVISION_FRAMES_MASK;
1046 while (i < count) {
1047 fragment = min(DIVISION_FRAMES - offset, count - i);
1048 dk_copy_fragment(dk, data_channels, i, fragment);
1049 i += fragment;
1050 offset = (offset + fragment) & DIVISION_FRAMES_MASK;
1051
1052 /* Process the input division (32 frames). */
1053 if (offset == 0)
1054 dk_process_one_division(dk);
1055 }
1056 }
1057