1 /**
2 * Copyright 2021-2023 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_AUDIO_UTILS_H_
17 #define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_AUDIO_UTILS_H_
18
19 #include <Eigen/Dense>
20 #include <unsupported/Eigen/FFT>
21
22 #include <algorithm>
23 #include <cmath>
24 #include <complex>
25 #include <limits>
26 #include <memory>
27 #include <random>
28 #include <string>
29 #include <vector>
30
31 #include "minddata/dataset/core/tensor.h"
32 #include "minddata/dataset/kernels/data/data_utils.h"
33 #include "minddata/dataset/kernels/tensor_op.h"
34 #include "minddata/dataset/util/status.h"
35 #include "minddata/dataset/util/validators.h"
36
37 constexpr double PI = 3.141592653589793;
38 constexpr int kMinAudioDim = 1;
39 constexpr int kDefaultAudioDim = 2;
40 constexpr int TWO = 2;
41 constexpr float HALF = 0.5;
42
43 namespace mindspore {
44 namespace dataset {
45 /// \brief Turn a tensor from the power/amplitude scale to the decibel scale.
46 /// \param input/output: Tensor of shape <..., freq, time>.
47 /// \param multiplier: power - 10, amplitude - 20.
48 /// \param amin: lower bound.
49 /// \param db_multiplier: multiplier for decibels.
50 /// \param top_db: the lower bound for decibels cut-off.
51 /// \return Status code.
52 template <typename T>
AmplitudeToDB(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,T multiplier,T amin,T db_multiplier,T top_db)53 Status AmplitudeToDB(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T multiplier, T amin,
54 T db_multiplier, T top_db) {
55 TensorShape input_shape = input->shape();
56 TensorShape to_shape = input_shape.Rank() == 2
57 ? TensorShape({1, 1, input_shape[-2], input_shape[-1]})
58 : TensorShape({input->Size() / (input_shape[-3] * input_shape[-2] * input_shape[-1]),
59 input_shape[-3], input_shape[-2], input_shape[-1]});
60 RETURN_IF_NOT_OK(Tensor::CreateFromTensor(input, output));
61 RETURN_IF_NOT_OK((*output)->Reshape(to_shape));
62
63 std::vector<T> max_val;
64 uint64_t step = to_shape[-3] * input_shape[-2] * input_shape[-1];
65 uint64_t cnt = 0;
66 T temp_max = std::numeric_limits<T>::lowest();
67 for (auto itr = (*output)->begin<T>(); itr != (*output)->end<T>(); itr++) {
68 // do clamp
69 *itr = *itr < amin ? log10(amin) * multiplier : log10(*itr) * multiplier;
70 *itr -= multiplier * db_multiplier;
71 // calculate max by axis
72 cnt++;
73 if ((*itr) > temp_max) {
74 temp_max = *itr;
75 }
76 if (cnt % step == 0) {
77 max_val.push_back(temp_max);
78 temp_max = std::numeric_limits<T>::lowest();
79 }
80 }
81
82 if (!std::isnan(top_db)) {
83 uint64_t ind = 0;
84 for (auto itr = (*output)->begin<T>(); itr != (*output)->end<T>(); itr++, ind++) {
85 T lower_bound = max_val[ind / step] - top_db;
86 *itr = std::max((*itr), lower_bound);
87 }
88 }
89 RETURN_IF_NOT_OK((*output)->Reshape(input_shape));
90 return Status::OK();
91 }
92
93 /// \brief Use custom thread pools.
94 /// \param task: Lambda functions to be processed.
95 /// \param input_size: Size of the input.
96 /// \param block_size: Customized size for each thread processing.
97 /// \return Status code.
98 Status AudioParallelLaunch(const std::function<void(size_t, size_t, size_t, size_t)> &task, size_t input_size,
99 float block_size);
100
101 /// \brief Compute the thread nums.
102 /// \param input_size: Size of the input.
103 /// \param block_size: Customized size for each thread processing.
104 /// \param task_num: The final calculated number of threads.
105 /// \param once_compute_size: The final calculated size for each thread processing.
106 /// \return Status code.
107 Status CountThreadNums(size_t input_size, float block_size, size_t *task_num, size_t *once_compute_size);
108
109 /// \brief Calculate the angles of the complex numbers.
110 /// \param input/output: Tensor of shape <..., time>.
111 template <typename T>
Angle(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)112 Status Angle(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
113 TensorShape shape = input->shape();
114 std::vector output_shape = shape.AsVector();
115 output_shape.pop_back();
116 std::shared_ptr<Tensor> out;
117 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape(output_shape), input->type(), &out));
118
119 std::vector<std::thread> threads;
120 size_t input_size = input->Size();
121 float block_size = 30000;
122 auto task = [&input, &out, &input_size](size_t start, size_t end, size_t num, size_t task_num) {
123 T o;
124 T x;
125 T y;
126
127 if ((task_num - num) == 1) {
128 end = input_size;
129 }
130
131 auto itr_start = input->begin<T>();
132 itr_start += start;
133 auto itr_end = input->begin<T>();
134 itr_end += end;
135 auto itr_out = out->begin<T>();
136 size_t offset = start / TWO;
137 itr_out += offset;
138
139 // calculate norm, using: .pow(2.).sum(-1).pow(0.5 * power)
140 for (auto itr = itr_start; itr != itr_end; itr++) {
141 x = *itr;
142 itr++;
143 y = *itr;
144 o = std::atan2(y, x);
145 *itr_out = o;
146 itr_out++;
147 }
148 };
149 RETURN_IF_NOT_OK(AudioParallelLaunch(task, input_size, block_size));
150 *output = out;
151 return Status::OK();
152 }
153
154 Status Bartlett(std::shared_ptr<Tensor> *output, int len);
155
156 /// \brief Perform a biquad filter of input tensor.
157 /// \param input/output: Tensor of shape <..., time>.
158 /// \param a0: denominator coefficient of current output y[n], typically 1.
159 /// \param a1: denominator coefficient of current output y[n-1].
160 /// \param a2: denominator coefficient of current output y[n-2].
161 /// \param b0: numerator coefficient of current input, x[n].
162 /// \param b1: numerator coefficient of input one time step ago x[n-1].
163 /// \param b2: numerator coefficient of input two time steps ago x[n-2].
164 /// \return Status code.
165 template <typename T>
Biquad(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,T b0,T b1,T b2,T a0,T a1,T a2)166 Status Biquad(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T b0, T b1, T b2, T a0, T a1,
167 T a2) {
168 std::vector<T> a_coeffs;
169 std::vector<T> b_coeffs;
170 a_coeffs.push_back(a0);
171 a_coeffs.push_back(a1);
172 a_coeffs.push_back(a2);
173 b_coeffs.push_back(b0);
174 b_coeffs.push_back(b1);
175 b_coeffs.push_back(b2);
176 return LFilter(input, output, a_coeffs, b_coeffs, true);
177 }
178
179 /// \brief Apply contrast effect.
180 /// \param input/output: Tensor of shape <..., time>.
181 /// \param enhancement_amount: controls the amount of the enhancement.
182 /// \return Status code.
183 template <typename T>
Contrast(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,T enhancement_amount)184 Status Contrast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T enhancement_amount) {
185 const float enhancement_zoom = 750.0;
186 T enhancement_amount_value = enhancement_amount / enhancement_zoom;
187 TensorShape output_shape{input->shape()};
188 std::shared_ptr<Tensor> out;
189 RETURN_IF_NOT_OK(Tensor::CreateEmpty(output_shape, input->type(), &out));
190
191 // Get the thread num and once compute size.
192 float block_size = 6000;
193 size_t input_size = input->Size();
194
195 auto task = [&input, &out, &input_size, &enhancement_amount_value](size_t start, size_t end, size_t num,
196 size_t task_num) {
197 if ((task_num - num) == 1) {
198 end = input_size;
199 }
200
201 auto itr_out = out->begin<T>();
202 itr_out += start;
203 auto tmp_start = input->begin<T>();
204 tmp_start += start;
205 auto tmp_end = input->begin<T>();
206 tmp_end += end;
207
208 for (auto itr_in = tmp_start; itr_in != tmp_end; itr_in++) {
209 // PI / 2 is half of the constant PI
210 T temp1 = (*itr_in) * (PI / TWO);
211 T temp2 = enhancement_amount_value * std::sin(temp1 * 4);
212 *itr_out = std::sin(temp1 + temp2);
213 itr_out++;
214 }
215 };
216 RETURN_IF_NOT_OK(AudioParallelLaunch(task, input_size, block_size));
217 *output = out;
218 return Status::OK();
219 }
220
221 /// \brief Apply DBToAmplitude effect.
222 /// \param input/output: Tensor of shape <...,time>
223 /// \param ref: Reference which the output will be scaled by.
224 /// \param power: If power equals 1, will compute DB to power. If 0.5, will compute DB to amplitude.
225 /// \return Status code
226 template <typename T>
DBToAmplitude(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,T ref,T power)227 Status DBToAmplitude(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T ref, T power) {
228 std::shared_ptr<Tensor> out;
229 RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), input->type(), &out));
230 auto itr_out = out->begin<T>();
231 constexpr int64_t pow_factor_x = 10;
232 constexpr double pow_factor_y = 0.1;
233 for (auto itr_in = input->begin<T>(); itr_in != input->end<T>(); itr_in++) {
234 *itr_out = ref * pow(pow(pow_factor_x, (*itr_in) * pow_factor_y), power);
235 itr_out++;
236 }
237 *output = out;
238 return Status::OK();
239 }
240
241 /// \brief Apply a DC shift to the audio.
242 /// \param input/output: Tensor of shape <...,time>.
243 /// \param shift: the amount to shift the audio.
244 /// \param limiter_gain: used only on peaks to prevent clipping.
245 /// \return Status code.
246 template <typename T>
DCShift(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,float shift,float limiter_gain)247 Status DCShift(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float shift, float limiter_gain) {
248 RETURN_IF_NOT_OK(Tensor::CreateFromTensor(input, output));
249 float limiter_threshold = 0.0;
250 if (std::fabs(shift - limiter_gain) > std::numeric_limits<float>::epsilon() &&
251 std::fabs(shift) > std::numeric_limits<float>::epsilon()) {
252 limiter_threshold = 1.0 - (std::abs(shift) - limiter_gain);
253 for (auto itr = (*output)->begin<T>(); itr != (*output)->end<T>(); itr++) {
254 if (*itr > limiter_threshold && shift > 0) {
255 T peak = (*itr - limiter_threshold) * limiter_gain / (1 - limiter_threshold);
256 T sample = (peak + limiter_threshold + shift);
257 *itr = sample > limiter_threshold ? limiter_threshold : sample;
258 } else if (*itr < -limiter_threshold && shift < 0) {
259 T peak = (*itr + limiter_threshold) * limiter_gain / (1 - limiter_threshold);
260 T sample = (peak + limiter_threshold + shift);
261 *itr = sample < -limiter_threshold ? -limiter_threshold : sample;
262 } else {
263 T sample = (*itr + shift);
264 *itr = (sample > 1 || sample < -1) ? (sample > 1 ? 1 : -1) : sample;
265 }
266 }
267 } else {
268 for (auto itr = (*output)->begin<T>(); itr != (*output)->end<T>(); itr++) {
269 T sample = (*itr + shift);
270 *itr = sample > 1 || sample < -1 ? (sample > 1 ? 1 : -1) : sample;
271 }
272 }
273 return Status::OK();
274 }
275
276 /// \brief Apply amplification or attenuation to the whole waveform.
277 /// \param input/output: Tensor of shape <..., time>.
278 /// \param gain_db: Gain adjustment in decibels (dB).
279 /// \return Status code.
280 template <typename T>
Gain(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,T gain_db)281 Status Gain(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T gain_db) {
282 RETURN_IF_NOT_OK(Tensor::CreateFromTensor(input, output));
283 if (gain_db == 0) {
284 return Status::OK();
285 }
286
287 T radio = pow(10, gain_db / 20);
288 for (auto itr = (*output)->begin<T>(); itr != (*output)->end<T>(); ++itr) {
289 *itr = (*itr) * radio;
290 }
291 return Status::OK();
292 }
293
294 /// \brief Perform an IIR filter by evaluating difference equation.
295 /// \param input/output: Tensor of shape <..., time>
296 /// \param a_coeffs: denominator coefficients of difference equation of dimension of (n_order + 1).
297 /// \param b_coeffs: numerator coefficients of difference equation of dimension of (n_order + 1).
298 /// \param clamp: If True, clamp the output signal to be in the range [-1, 1] (Default: True).
299 /// \return Status code
300 template <typename T>
LFilter(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,std::vector<T> a_coeffs,std::vector<T> b_coeffs,bool clamp)301 Status LFilter(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, std::vector<T> a_coeffs,
302 std::vector<T> b_coeffs, bool clamp) {
303 // pack batch
304 TensorShape input_shape = input->shape();
305 TensorShape toShape({input->Size() / input_shape[-1], input_shape[-1]});
306 input->Reshape(toShape);
307 auto shape_0 = static_cast<size_t>(input->shape()[0]);
308 auto shape_1 = static_cast<size_t>(input->shape()[1]);
309 std::vector<T> signal;
310 std::shared_ptr<Tensor> out;
311 std::vector<T> out_vect(shape_0 * shape_1);
312 size_t x_idx = 0;
313 size_t channel_idx = 1;
314 size_t m_num_order = b_coeffs.size() - 1;
315 size_t m_den_order = a_coeffs.size() - 1;
316 CHECK_FAIL_RETURN_UNEXPECTED(a_coeffs[0] != static_cast<T>(0),
317 "Invalid data, the first value of 'a_coeffs' should not be 0, but got 0.");
318 // init A_coeffs and B_coeffs by div(a0)
319 for (size_t i = 1; i < a_coeffs.size(); i++) {
320 a_coeffs[i] /= a_coeffs[0];
321 }
322 for (size_t i = 0; i < b_coeffs.size(); i++) {
323 b_coeffs[i] /= a_coeffs[0];
324 }
325 // Sliding window
326 std::vector<T> m_px(m_num_order + 1, static_cast<T>(0));
327 std::vector<T> m_py(m_den_order + 1, static_cast<T>(0));
328
329 // Tensor -> vector
330 for (auto itr = input->begin<T>(); itr != input->end<T>();) {
331 while (x_idx < shape_1 * channel_idx) {
332 signal.push_back(*itr);
333 itr++;
334 x_idx++;
335 }
336 // Sliding window
337 for (size_t j = 0; j < m_den_order; j++) {
338 m_px[j] = static_cast<T>(0);
339 }
340 for (size_t j = 0; j <= m_den_order; j++) {
341 m_py[j] = static_cast<T>(0);
342 }
343 // Each channel is processed with the sliding window
344 for (size_t i = x_idx - shape_1; i < x_idx; i++) {
345 m_px[m_num_order] = signal[i];
346 for (size_t j = 0; j < m_num_order + 1; j++) {
347 m_py[m_num_order] += b_coeffs[j] * m_px[m_num_order - j];
348 }
349 for (size_t j = 1; j < m_den_order + 1; j++) {
350 m_py[m_num_order] -= a_coeffs[j] * m_py[m_num_order - j];
351 }
352 if (clamp) {
353 if (m_py[m_num_order] > static_cast<T>(1))
354 out_vect[i] = static_cast<T>(1);
355 else if (m_py[m_num_order] < static_cast<T>(-1))
356 out_vect[i] = static_cast<T>(-1);
357 else
358 out_vect[i] = m_py[m_num_order];
359 } else {
360 out_vect[i] = m_py[m_num_order];
361 }
362 if (i + 1 == x_idx) {
363 continue;
364 }
365 for (size_t j = 0; j < m_num_order; j++) {
366 m_px[j] = m_px[j + 1];
367 }
368 for (size_t j = 0; j < m_num_order; j++) {
369 m_py[j] = m_py[j + 1];
370 }
371 m_py[m_num_order] = static_cast<T>(0);
372 }
373 if (x_idx % shape_1 == 0) {
374 ++channel_idx;
375 }
376 }
377 // unpack batch
378 RETURN_IF_NOT_OK(Tensor::CreateFromVector(out_vect, input_shape, &out));
379 *output = out;
380 return Status::OK();
381 }
382
383 /// \brief Generate linearly spaced vector.
384 /// \param[in] start Value of the startpoint.
385 /// \param[in] end Value of the endpoint.
386 /// \param[in] n N points in the output tensor.
387 /// \param[out] output Tensor has n points with linearly space.
388 /// The spacing between the points is (end - start) / (n - 1).
389 /// \return Status return code.
390 template <typename T>
Linspace(std::shared_ptr<Tensor> * output,T start,T end,int32_t n)391 Status Linspace(std::shared_ptr<Tensor> *output, T start, T end, int32_t n) {
392 RETURN_IF_NOT_OK(ValidateNoGreaterThan("Linspace", "start", start, "end", end));
393 int hundred = 100;
394 n = std::isnan(static_cast<double>(n * 1.0)) ? hundred : n;
395 CHECK_FAIL_RETURN_UNEXPECTED(n >= 0, "Linspace: input param n must be non-negative.");
396
397 TensorShape out_shape({n});
398 std::vector<T> linear_vect(n);
399 T interval = (n == 1) ? 0 : ((end - start) / (n - 1));
400 for (auto i = 0; i < linear_vect.size(); ++i) {
401 linear_vect[i] = start + i * interval;
402 }
403 std::shared_ptr<Tensor> out_t;
404 RETURN_IF_NOT_OK(Tensor::CreateFromVector(linear_vect, out_shape, &out_t));
405 linear_vect.clear();
406 linear_vect.shrink_to_fit();
407 *output = out_t;
408 return Status::OK();
409 }
410
411 template <typename T>
CreateTriangularFilterbank(std::shared_ptr<Tensor> * output,const std::shared_ptr<Tensor> & all_freqs,const std::shared_ptr<Tensor> & f_pts)412 Status CreateTriangularFilterbank(std::shared_ptr<Tensor> *output, const std::shared_ptr<Tensor> &all_freqs,
413 const std::shared_ptr<Tensor> &f_pts) {
414 // calculate the difference between each mel point and each stft freq point in hertz.
415 std::vector<T> f_diff;
416 auto iter_fpts1 = f_pts->begin<T>();
417 auto iter_fpts2 = f_pts->begin<T>();
418 ++iter_fpts2;
419 for (size_t i = 1; i < f_pts->Size(); i++) {
420 f_diff.push_back(*iter_fpts2 - *iter_fpts1);
421 ++iter_fpts2;
422 ++iter_fpts1;
423 }
424
425 std::vector<T> slopes;
426 TensorShape slopes_shape({all_freqs->Size(), f_pts->Size()});
427 auto iter_all_freq = all_freqs->begin<T>();
428 for (; iter_all_freq != all_freqs->end<T>(); ++iter_all_freq) {
429 auto iter_f_pts = f_pts->begin<T>();
430 for (; iter_f_pts != f_pts->end<T>(); ++iter_f_pts) {
431 slopes.push_back(*iter_f_pts - *iter_all_freq);
432 }
433 }
434
435 // calculate up and down slopes for creating overlapping triangles.
436 std::vector<T> down_slopes;
437 TensorShape down_slopes_shape({all_freqs->Size(), f_pts->Size() - 2});
438 for (size_t row = 0; row < down_slopes_shape[0]; row++) {
439 for (size_t col = 0; col < down_slopes_shape[1]; col++) {
440 down_slopes.push_back(-slopes[col + row * f_pts->Size()] / f_diff[col]);
441 }
442 }
443
444 std::vector<T> up_slopes;
445 TensorShape up_slopes_shape({all_freqs->Size(), f_pts->Size() - 2});
446 for (size_t row = 0; row < up_slopes_shape[0]; row++) {
447 for (size_t col = 2; col < f_pts->Size(); col++) {
448 up_slopes.push_back(slopes[col + row * f_pts->Size()] / f_diff[col - 1]);
449 }
450 }
451
452 // clip the value of triangles and save into fb.
453 std::vector<T> fb;
454 T zero = 0;
455 TensorShape fb_shape({all_freqs->Size(), f_pts->Size() - 2});
456 for (size_t i = 0; i < down_slopes.size(); i++) {
457 fb.push_back(std::max(zero, std::min(down_slopes[i], up_slopes[i])));
458 }
459
460 std::shared_ptr<Tensor> fb_tensor;
461 RETURN_IF_NOT_OK(Tensor::CreateFromVector(fb, fb_shape, &fb_tensor));
462 *output = fb_tensor;
463 return Status::OK();
464 }
465
466 /// \brief Create a frequency transformation matrix with shape (n_freqs, n_mels).
467 /// \param output Tensor of the frequency transformation matrix.
468 /// \param n_freqs: Number of frequency.
469 /// \param f_min: Minimum of frequency in Hz.
470 /// \param f_max: Maximum of frequency in Hz.
471 /// \param n_mels: Number of mel filterbanks.
472 /// \param sample_rate: Sample rate.
473 /// \param norm: Norm to use, can be NormTyppe::kSlaney or NormTyppe::kNone.
474 /// \param mel_type: Scale to use, can be MelTyppe::kSlaney or MelTyppe::kHtk.
475 /// \return Status code.
476 template <typename T>
CreateFbanks(std::shared_ptr<Tensor> * output,int32_t n_freqs,float f_min,float f_max,int32_t n_mels,int32_t sample_rate,NormType norm,MelType mel_type)477 Status CreateFbanks(std::shared_ptr<Tensor> *output, int32_t n_freqs, float f_min, float f_max, int32_t n_mels,
478 int32_t sample_rate, NormType norm, MelType mel_type) {
479 // min_log_hz, min_log_mel, logstep and f_sp are the const of the mel value equation.
480 const double min_log_hz = 1000.0;
481 const double min_log_mel = 1000 / (200.0 / 3);
482 const double logstep = log(6.4) / 27.0;
483 const double f_sp = 200.0 / 3;
484
485 // hez_to_mel_c and mel_to_hz_c are the const coefficient of mel frequency cepstrum.
486 const double hz_to_mel_c = 2595.0;
487 const double mel_to_hz_c = 700.0;
488
489 // all_freqs is equivalent filterbank construction.
490 std::shared_ptr<Tensor> all_freqs;
491 // the sampling frequency is at least twice the highest frequency of the signal.
492 const double signal_times = 2;
493 RETURN_IF_NOT_OK(Linspace<T>(&all_freqs, 0, sample_rate / signal_times, n_freqs));
494
495 // calculate mel value by f_min and f_max.
496 double m_min = 0.0;
497 double m_max = 0.0;
498 if (mel_type == MelType::kHtk) {
499 m_min = hz_to_mel_c * log10(1.0 + (f_min / mel_to_hz_c));
500 m_max = hz_to_mel_c * log10(1.0 + (f_max / mel_to_hz_c));
501 } else {
502 m_min = (f_min - 0.0) / f_sp;
503 m_max = (f_max - 0.0) / f_sp;
504 if (f_min >= min_log_hz) {
505 m_min = min_log_mel + log(f_min / min_log_hz) / logstep;
506 }
507 if (f_max >= min_log_hz) {
508 m_max = min_log_mel + log(f_max / min_log_hz) / logstep;
509 }
510 }
511
512 // m_pts is mel value sequence in linspace of (m_min, m_max).
513 std::shared_ptr<Tensor> m_pts;
514 const int32_t bias = 2;
515 RETURN_IF_NOT_OK(Linspace<T>(&m_pts, m_min, m_max, n_mels + bias));
516
517 // f_pts saves hertz(mel) though 700.0 * (10.0 **(mel/ 2595.0) - 1.).
518 std::shared_ptr<Tensor> f_pts;
519 const double htk_mel_c = 10.0;
520 RETURN_IF_NOT_OK(Tensor::CreateEmpty(m_pts->shape(), m_pts->type(), &f_pts));
521
522 if (mel_type == MelType::kHtk) {
523 auto iter_f = f_pts->begin<T>();
524 auto iter_m = m_pts->begin<T>();
525 for (; iter_m != m_pts->end<T>(); ++iter_m) {
526 *iter_f = mel_to_hz_c * (pow(htk_mel_c, *iter_m / hz_to_mel_c) - 1.0);
527 ++iter_f;
528 }
529 } else {
530 auto iter_f = f_pts->begin<T>();
531 auto iter_m = m_pts->begin<T>();
532 for (; iter_m != m_pts->end<T>(); iter_m++, iter_f++) {
533 *iter_f = f_sp * (*iter_m);
534 }
535 iter_f = f_pts->begin<T>();
536 iter_m = m_pts->begin<T>();
537 for (; iter_m != m_pts->end<T>(); iter_m++, iter_f++) {
538 if (*iter_m >= min_log_mel) {
539 *iter_f = min_log_hz * exp(logstep * (*iter_m - min_log_mel));
540 }
541 }
542 }
543
544 // create filterbank
545 TensorShape fb_shape({all_freqs->Size(), f_pts->Size() - 2});
546 std::shared_ptr<Tensor> fb;
547 RETURN_IF_NOT_OK(CreateTriangularFilterbank<T>(&fb, all_freqs, f_pts));
548
549 // normalize with Slaney
550 std::vector<T> enorm;
551 if (norm == NormType::kSlaney) {
552 auto iter_f_pts_0 = f_pts->begin<T>();
553 auto iter_f_pts_2 = f_pts->begin<T>();
554 iter_f_pts_2++;
555 iter_f_pts_2++;
556 for (; iter_f_pts_2 != f_pts->end<T>(); iter_f_pts_0++, iter_f_pts_2++) {
557 enorm.push_back(2.0f / (*iter_f_pts_2 - *iter_f_pts_0));
558 }
559 auto iter_fb = fb->begin<T>();
560 for (size_t row = 0; row < fb_shape[0]; row++) {
561 for (size_t col = 0; col < fb_shape[1]; col++) {
562 *iter_fb = (*iter_fb) * enorm[col];
563 iter_fb++;
564 }
565 }
566 enorm.clear();
567 }
568
569 // anomaly detection.
570 auto iter_fb = fb->begin<T>();
571 std::vector<T> max_val(fb_shape[1], 0);
572 for (size_t row = 0; row < fb_shape[0]; row++) {
573 for (size_t col = 0; col < fb_shape[1]; col++) {
574 max_val[col] = std::max(max_val[col], *iter_fb);
575 iter_fb++;
576 }
577 }
578 for (size_t col = 0; col < fb_shape[1]; col++) {
579 if (max_val[col] < 1e-8) {
580 MS_LOG(WARNING) << "MelscaleFbanks: at least one mel filterbank is all zeros, check if the value for 'n_mels' " +
581 std::to_string(n_mels) + " is set too high or the value for 'n_freqs' " +
582 std::to_string(n_freqs) + " is set too low.";
583 break;
584 }
585 }
586 *output = fb;
587 return Status::OK();
588 }
589
590 /// \brief Creates a linear triangular filterbank.
591 /// \param output Tensor of a linear triangular filterbank.
592 /// \param n_freqs: Number of frequency.
593 /// \param f_min: Minimum of frequency in Hz.
594 /// \param f_max: Maximum of frequency in Hz.
595 /// \param n_filter: Number of (linear) triangular filter.
596 /// \param sample_rate: Sample rate.
597 /// \return Status code.
598 Status CreateLinearFbanks(std::shared_ptr<Tensor> *output, int32_t n_freqs, float f_min, float f_max, int32_t n_filter,
599 int32_t sample_rate);
600
601 /// \brief Convert normal STFT to STFT at the Mel scale.
602 /// \param input: Input audio tensor.
603 /// \param output: Mel scale audio tensor.
604 /// \param n_mels: Number of mel filter.
605 /// \param sample_rate: Sample rate of the signal.
606 /// \param f_min: Minimum frequency.
607 /// \param f_max: Maximum frequency.
608 /// \param n_stft: Number of bins in STFT.
609 /// \param norm: Enum, NormType::kSlaney or NormType::kNone. If norm is NormType::kSlaney, divide the triangle mel
610 /// weight by the width of the mel band.
611 /// \param mel_type: Type of calculate mel type, value should be MelType::kHtk or MelType::kSlaney.
612 /// \return Status code.
613 template <typename T>
MelScale(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t n_mels,int32_t sample_rate,T f_min,T f_max,int32_t n_stft,NormType norm,MelType mel_type)614 Status MelScale(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_mels,
615 int32_t sample_rate, T f_min, T f_max, int32_t n_stft, NormType norm, MelType mel_type) {
616 // pack
617 TensorShape input_shape = input->shape();
618 TensorShape input_reshape({input->Size() / input_shape[-1] / input_shape[-2], input_shape[-2], input_shape[-1]});
619 RETURN_IF_NOT_OK(input->Reshape(input_reshape));
620
621 int rows = input_reshape[1];
622 int cols = input_reshape[2];
623
624 // unpack
625 std::vector<int64_t> out_shape_vec = input_shape.AsVector();
626 out_shape_vec[input_shape.Size() - 1] = cols;
627 out_shape_vec[input_shape.Size() - TWO] = n_mels;
628 TensorShape output_shape(out_shape_vec);
629
630 RETURN_IF_NOT_OK(Tensor::CreateEmpty(output_shape, input->type(), output));
631 if (n_mels == 0) {
632 return Status::OK();
633 }
634 auto out_in = (*output)->GetMutableBuffer();
635 size_t t_size = sizeof(T);
636
637 // gen freq bin mat
638 std::shared_ptr<Tensor> freq_bin_mat;
639 RETURN_IF_NOT_OK(CreateFbanks<T>(&freq_bin_mat, n_stft, f_min, f_max, n_mels, sample_rate, norm, mel_type));
640 auto data_ptr = &*freq_bin_mat->begin<T>();
641 Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> matrix_fb(data_ptr, n_mels, n_stft);
642 auto matrix_fb_t = matrix_fb.transpose();
643
644 for (size_t c = 0; c < input_reshape[0]; c++) {
645 Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> matrix_c(
646 static_cast<T *>(&*input->begin<T>() + rows * cols * c), cols, rows);
647 auto mat_res = matrix_c * matrix_fb_t;
648 size_t mat_res_size = mat_res.size() * t_size;
649 size_t offset = mat_res_size * c;
650 int ret_code =
651 memcpy_s(reinterpret_cast<void *>(out_in + offset), mat_res_size, mat_res.eval().data(), mat_res_size);
652 CHECK_FAIL_RETURN_UNEXPECTED(ret_code == EOK,
653 "Failed to copy data into std::vector, ret code: " + std::to_string(ret_code) + ".");
654 }
655
656 return Status::OK();
657 }
658
659 /// \brief Transform audio signal into spectrogram.
660 /// \param[in] n_fft Size of FFT, creates n_fft / 2 + 1 bins.
661 /// \param[in] win_length Window size.
662 /// \param[in] hop_length Length of hop between STFT windows.
663 /// \param[in] pad Two sided padding of signal.
664 /// \param[in] window A function to create a window tensor
665 /// that is applied/multiplied to each frame/window.
666 /// \param[in] power Exponent for the magnitude spectrogram.
667 /// \param[in] normalized Whether to normalize by magnitude after stft.
668 /// \param[in] center Whether to pad waveform on both sides.
669 /// \param[in] pad_mode Controls the padding method used when center is true.
670 /// \param[in] onesided Controls whether to return half of results to avoid redundancy.
671 /// \return Status code.
672 Status Spectrogram(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int pad, WindowType window,
673 int n_fft, int hop_length, int win_length, float power, bool normalized, bool center,
674 BorderType pad_mode, bool onesided);
675
676 /// \brief Transform audio signal into spectrogram.
677 /// \param[in] input Tensor of shape <..., time>.
678 /// \param[out] output Tensor of shape <..., time>.
679 /// \param[in] sample_rate The sample rate of input tensor.
680 /// \param[in] n_fft Size of FFT, creates n_fft / 2 + 1 bins.
681 /// \param[in] win_length Window size.
682 /// \param[in] hop_length Length of hop between STFT windows.
683 /// \param[in] pad Two sided padding of signal.
684 /// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
685 /// \return Status code.
686 Status SpectralCentroid(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int sample_rate,
687 int n_fft, int win_length, int hop_length, int pad, WindowType window);
688
689 /// \brief Stretch STFT in time at a given rate, without changing the pitch.
690 /// \param input: Tensor of shape <..., freq, time>.
691 /// \param rate: Stretch factor.
692 /// \param phase_advance: Expected phase advance in each bin.
693 /// \param output: Tensor after stretch in time domain.
694 /// \return Status code.
695 Status TimeStretch(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float rate, float hop_length,
696 int32_t n_freq);
697
698 /// \brief Stretch STFT in time at a given rate, without changing the pitch.
699 /// \param[in] input Tensor of shape <..., freq, time, 2>.
700 /// \param[in] output Tensor of shape <..., freq, ceil(time/rate), 2>.
701 /// \param[in] rate Speed-up factor.
702 /// \param[in] phase_advance Expected phase advance in each bin in shape of (freq, 1).
703 /// \return Status code.
704 Status PhaseVocoder(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float rate,
705 const std::shared_ptr<Tensor> &phase_advance);
706
707 /// \brief Apply a mask along axis.
708 /// \param input: Tensor of shape <..., freq, time>.
709 /// \param output: Tensor of shape <..., freq, time>.
710 /// \param mask_param: Number of columns to be masked will be uniformly sampled from [0, mask_param].
711 /// \param mask_value: Value to assign to the masked columns.
712 /// \param axis: Axis to apply masking on (1 -> frequency, 2 -> time).
713 /// \param rnd: Number generator.
714 /// \return Status code.
715 Status RandomMaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t mask_param,
716 float mask_value, int axis, std::mt19937 *rnd);
717
718 /// \brief Apply a mask along axis. All examples will have the same mask interval.
719 /// \param input: Tensor of shape <..., freq, time>.
720 /// \param output: Tensor of shape <..., freq, time>.
721 /// \param mask_width: The width of the mask.
722 /// \param mask_start: Starting position of the mask.
723 /// Mask will be applied from indices [mask_start, mask_start + mask_width).
724 /// \param mask_value: Value to assign to the masked columns.
725 /// \param axis: Axis to apply masking on (1 -> frequency, 2 -> time).
726 /// \return Status code.
727 Status MaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t mask_width,
728 int32_t mask_start, float mask_value, int32_t axis);
729
730 /// \brief Create a DCT transformation matrix with shape (n_mels, n_mfcc), normalized depending on norm.
731 /// \param n_mfcc: Number of mfc coefficients to retain, the value must be greater than 0.
732 /// \param n_mels: Number of mel filterbanks, the value must be greater than 0.
733 /// \param norm: Norm to use, can be NormMode::kNone or NormMode::kOrtho.
734 /// \return Status code.
735 Status Dct(std::shared_ptr<Tensor> *output, int32_t n_mfcc, int32_t n_mels, NormMode norm);
736
737 /// \brief Compute the norm of complex tensor input.
738 /// \param power Power of the norm description (optional).
739 /// \param input Tensor shape of <..., complex=2>.
740 /// \param output Tensor shape of <..., >.
741 /// \return Status code.
742 Status ComplexNorm(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float power);
743
744 /// \brief Stochastic gradient descent.
745 /// \param[in] input Input tensor.
746 /// \param[out] output Output tensor.
747 /// \param[in] grad Input grad for params.
748 /// \param[in] lr Learning rate.
749 /// \param[in] momentum Momentum factor.
750 /// \param[in] dampening Dampening for momentum.
751 /// \param[in] weight_decay Weight decay.
752 /// \param[in] nesterov Whether enable nesterov momentum.
753 /// \param[in] stat Stat.
754 /// \return Status code.
755 template <typename T>
756 Status SGD(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, const std::shared_ptr<Tensor> &grad,
757 float lr, float momentum = 0.0, float dampening = 0.0, float weight_decay = 0.0, bool nesterov = false,
758 float stat = 0.0) {
759 size_t elem_num = input->Size();
760 std::vector<T> accum(elem_num);
761 std::shared_ptr<Tensor> output_param;
762 std::vector<T> out_param(elem_num);
763 int ind = 0;
764 auto itr_inp = input->begin<T>();
765 auto itr_grad = grad->begin<T>();
766 while (itr_inp != input->end<T>() && itr_grad != grad->end<T>()) {
767 T grad_new = (*itr_grad);
768 if (weight_decay > static_cast<float>(0.0)) {
769 grad_new += (*itr_inp) * static_cast<T>(weight_decay);
770 }
771 if (momentum > 0) {
772 if (stat > 0) {
773 accum[ind] = grad_new;
774 stat = 0;
775 } else {
776 accum[ind] = accum[ind] * momentum + (1 - static_cast<T>(dampening)) * grad_new;
777 }
778 if (nesterov) {
779 grad_new += accum[ind] * momentum;
780 } else {
781 grad_new = accum[ind];
782 }
783 }
784 out_param[ind] = (*itr_inp) - lr * grad_new;
785 itr_inp++;
786 itr_grad++;
787 ind++;
788 }
789
790 RETURN_IF_NOT_OK(Tensor::CreateFromVector(out_param, TensorShape({input->Size()}), &output_param));
791 *output = output_param;
792 return Status::OK();
793 }
794
795 /// \brief Use conversion matrix to solve normal STFT from mel frequency STFT.
796 /// \param input Tensor of shape <..., n_mels, time>.
797 /// \param output Tensor of shape <..., freq, time>.
798 /// \param n_stft Number of bins in STFT, the value must be greater than 0.
799 /// \param n_mels Number of mel filter, the value must be greater than 0.
800 /// \param sample_rate Sample rate of the signal, the value can't be zero.
801 /// \param f_min Minimum frequency, the value must be greater than or equal to 0.
802 /// \param f_max Maximum frequency, the value must be greater than 0.
803 /// \param max_iter Maximum number of optimization iterations, the value must be greater than 0.
804 /// \param tolerance_loss Value of loss to stop optimization at, the value must be greater than or equal to 0.
805 /// \param tolerance_change Difference in losses to stop optimization at, the value must be greater than or equal to 0.
806 /// \param sgd_lr Learning rate for SGD optimizer, the value must be greater than or equal to 0.
807 /// \param sgd_momentum Momentum factor for SGD optimizer, the value must be greater than or equal to 0.
808 /// \param norm Type of norm, value should be NormType::kSlaney or NormType::kNone. If norm is NormType::kSlaney,
809 /// divide the triangle mel weight by the width of the mel band.
810 /// \param mel_type Type of mel, value should be MelType::kHtk or MelType::kSlaney.
811 /// \param rnd Random generator.
812 /// \return Status code.
813 Status InverseMelScale(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_stft,
814 int32_t n_mels, int32_t sample_rate, float f_min, float f_max, int32_t max_iter,
815 float tolerance_loss, float tolerance_change, float sgd_lr, float sgd_momentum, NormType norm,
816 MelType mel_type, std::mt19937 *rnd);
817
818 /// \brief Create InverseSpectrogram for a raw audio signal.
819 /// \param[in] input Input tensor.
820 /// \param[out] output Output tensor.
821 /// \param[in] length The output length of the waveform.
822 /// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins.
823 /// \param[in] win_length Window size.
824 /// \param[in] hop_length Length of hop between STFT windows.
825 /// \param[in] pad Two sided padding of signal.
826 /// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
827 /// \param[in] normalized Whether to normalize by magnitude after stft.
828 /// \param[in] center Whether the signal in spectrogram was padded on both sides.
829 /// \param[in] pad_mode Controls the padding method used when center is True.
830 /// \param[in] onesided Controls whether spectrogram was used to return half of results to avoid redundancy.
831 /// \return Status return code.
832 Status InverseSpectrogram(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t length,
833 int32_t n_fft, int32_t win_length, int32_t hop_length, int32_t pad, WindowType window,
834 bool normalized, bool center, BorderType pad_mode, bool onesided);
835
836 /// \brief Decode mu-law encoded signal.
837 /// \param input Tensor of shape <..., time>.
838 /// \param output Tensor of shape <..., time>.
839 /// \param quantization_channels Number of channels.
840 /// \return Status code.
841 Status MuLawDecoding(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output,
842 int32_t quantization_channels);
843
844 /// \brief Encode signal based on mu-law companding.
845 /// \param input Tensor of shape <..., time>.
846 /// \param output Tensor of shape <..., time>.
847 /// \param quantization_channels Number of channels.
848 /// \return Status code.
849 Status MuLawEncoding(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output,
850 int32_t quantization_channels);
851
852 /// \brief Apply a overdrive effect to the audio.
853 /// \param input Tensor of shape <..., time>.
854 /// \param output Tensor of shape <..., time>.
855 /// \param gain Coefficient of overload in dB.
856 /// \param color Coefficient of translation.
857 /// \return Status code.
858 template <typename T>
Overdrive(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,float gain,float color)859 Status Overdrive(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float gain, float color) {
860 TensorShape input_shape = input->shape();
861 // input->2D.
862 auto rows = input->Size() / input_shape[-1];
863 auto cols = input_shape[-1];
864 TensorShape to_shape({rows, cols});
865 RETURN_IF_NOT_OK(input->Reshape(to_shape));
866 // apply dB2Linear on gain, 20dB is expect to gain.
867 float gain_ex = exp(gain * log(10) / 20.0);
868 constexpr int64_t translation_factor = 200;
869 color = color / translation_factor;
870 // declare the array used to store the input.
871 std::vector<T> input_vec;
872 // out_vec is used to save the result of applying overdrive.
873 std::vector<T> out_vec;
874 // store intermediate results of input.
875 std::vector<T> temp;
876 constexpr double temp_factor = 3.0;
877 // scale and pan the input two-dimensional sound wave array to a certain extent.
878 for (auto itr = input->begin<T>(); itr != input->end<T>(); itr++) {
879 // store the value of traverse the input.
880 T temp_fp = *itr;
881 input_vec.push_back(temp_fp);
882 // use 0 to initialize out_vec.
883 out_vec.push_back(0);
884 T temp_fp2 = temp_fp * gain_ex + color;
885 // 0.5 + 2/3 * 0.75 = 1, zoom and shift the sound.
886 if (temp_fp2 < -1) {
887 // -2.0 / 3.0 is -2/3 in the formula.
888 temp.push_back(-2.0 / temp_factor);
889 } else if (temp_fp2 > 1) {
890 // 2.0 / 3.0 is 2/3 in the formula.
891 temp.push_back(2.0 / temp_factor);
892 } else {
893 temp.push_back(temp_fp2 - temp_fp2 * temp_fp2 * temp_fp2 / temp_factor);
894 }
895 }
896 // last_in and last_out are the intermediate values for processing each moment.
897 std::vector<T> last_in;
898 std::vector<T> last_out;
899 for (size_t i = 0; i < cols; i++) {
900 last_in.push_back(0.0);
901 last_out.push_back(0.0);
902 }
903 // overdrive core loop.
904 for (size_t i = 0; i < cols; i++) {
905 size_t index = 0;
906 // calculate the value of each moment according to the rules of overdrive.
907 for (size_t j = i; j < rows * cols; j += cols, index++) {
908 // 0.995 is the preservation ratio of sound waves.
909 last_out[index] = temp[j] - last_in[index] + last_out[index] * 0.995;
910 last_in[index] = temp[j];
911 // 0.5 + 2/3 * 0.75 = 1, zoom and shift the sound.
912 T temp_fp = input_vec[j] * 0.5 + last_out[index] * 0.75;
913 // clamp min=-1, max=1.
914 if (temp_fp < -1) {
915 out_vec[j] = -1.0;
916 } else if (temp_fp > 1) {
917 out_vec[j] = 1.0;
918 } else {
919 out_vec[j] = temp_fp;
920 }
921 }
922 }
923 // move data to output tensor.
924 std::shared_ptr<Tensor> out;
925 RETURN_IF_NOT_OK(Tensor::CreateFromVector(out_vec, input_shape, &out));
926 *output = out;
927 return Status::OK();
928 }
929
930 /// \brief Add a fade in and/or fade out to an input.
931 /// \param[in] input: The input tensor.
932 /// \param[out] output: Added fade in and/or fade out audio with the same shape.
933 /// \param[in] fade_in_len: Length of fade-in (time frames).
934 /// \param[in] fade_out_len: Length of fade-out (time frames).
935 /// \param[in] fade_shape: Shape of fade.
936 Status Fade(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t fade_in_len,
937 int32_t fade_out_len, FadeShape fade_shape);
938
939 /// \brief Add a volume to an waveform.
940 /// \param input/output: Tensor of shape <..., time>.
941 /// \param gain: Gain value, varies according to the value of gain_type.
942 /// \param gain_type: Type of gain, should be one of [GainType::kAmplitude, GainType::kDb, GainType::kPower].
943 /// \return Status code.
944 template <typename T>
Vol(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,T gain,GainType gain_type)945 Status Vol(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T gain, GainType gain_type) {
946 RETURN_IF_NOT_OK(Tensor::CreateFromTensor(input, output));
947 const T lower_bound = -1;
948 const T upper_bound = 1;
949
950 // DB is a unit which converts a numeric value into decibel scale and for conversion, we have to use log10
951 // A(in dB) = 20log10(A in amplitude)
952 // When referring to measurements of power quantities, a ratio can be expressed as a level in decibels by evaluating
953 // ten times the base-10 logarithm of the ratio of the measured quantity to reference value
954 // A(in dB) = 10log10(A in power)
955 const int power_factor_div = 20;
956 const int power_factor_mul = 10;
957 const int base = 10;
958
959 if (gain_type == GainType::kDb) {
960 if (gain != 0) {
961 gain = std::pow(base, (gain / power_factor_div));
962 }
963 } else if (gain_type == GainType::kPower) {
964 gain = power_factor_mul * std::log10(gain);
965 gain = std::pow(base, (gain / power_factor_div));
966 }
967
968 for (auto itr = (*output)->begin<T>(); itr != (*output)->end<T>(); itr++) {
969 if (gain != 0 || gain_type == GainType::kAmplitude) {
970 *itr = (*itr) * gain;
971 }
972 *itr = std::min(std::max((*itr), lower_bound), upper_bound);
973 }
974
975 return Status::OK();
976 }
977
978 /// \brief Separate a complex-valued spectrogram with shape (…, 2) into its magnitude and phase.
979 /// \param input: Complex tensor.
980 /// \param output: The magnitude and phase of the complex tensor.
981 /// \param power: Power of the norm.
982 Status Magphase(const TensorRow &input, TensorRow *output, float power);
983
984 /// \brief Compute Normalized Cross-Correlation Function (NCCF).
985 /// \param input: Tensor of shape <channel,waveform_length>.
986 /// \param output: Tensor of shape <channel, num_of_frames, lags>.
987 /// \param sample_rate: The sample rate of the waveform (Hz).
988 /// \param frame_time: Duration of a frame.
989 /// \param freq_low: Lowest frequency that can be detected (Hz).
990 /// \return Status code.
991 template <typename T>
ComputeNccf(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t sample_rate,float frame_time,int32_t freq_low)992 Status ComputeNccf(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
993 float frame_time, int32_t freq_low) {
994 auto channel = input->shape()[0];
995 auto waveform_length = input->shape()[1];
996 size_t idx = 0;
997 size_t channel_idx = 1;
998 int32_t lags = static_cast<int32_t>(ceil(static_cast<float>(sample_rate) / freq_low));
999 int32_t frame_size = static_cast<int32_t>(ceil(sample_rate * frame_time));
1000 int32_t num_of_frames = static_cast<int32_t>(ceil(static_cast<float>(waveform_length) / frame_size));
1001 int32_t p = lags + num_of_frames * frame_size - waveform_length;
1002 TensorShape output_shape({channel, num_of_frames, lags});
1003 DataType intput_type = input->type();
1004 RETURN_IF_NOT_OK(Tensor::CreateEmpty(output_shape, intput_type, output));
1005 // pad p 0 in -1 dimension
1006 std::vector<T> signal;
1007 // Tensor -> vector
1008 for (auto itr = input->begin<T>(); itr != input->end<T>();) {
1009 while (idx < waveform_length * channel_idx) {
1010 signal.push_back(*itr);
1011 ++itr;
1012 ++idx;
1013 }
1014 // Each channel is processed with the sliding window
1015 // waveform:[channel, time] --> waveform:[channel, time+p]
1016 for (size_t i = 0; i < p; ++i) {
1017 signal.push_back(static_cast<T>(0.0));
1018 }
1019 if (idx % waveform_length == 0) {
1020 ++channel_idx;
1021 }
1022 }
1023 // compute ncc
1024 for (dsize_t lag = 1; lag <= lags; ++lag) {
1025 // compute one ncc
1026 // one ncc out
1027 std::vector<T> out;
1028 channel_idx = 1;
1029 idx = 0;
1030 size_t win_idx = 0;
1031 size_t waveform_length_p = waveform_length + p;
1032 // Traversal signal
1033 for (auto itr = signal.begin(); itr != signal.end();) {
1034 // Each channel is processed with the sliding window
1035 size_t s1 = idx;
1036 size_t s2 = idx + lag;
1037 size_t frame_count = 0;
1038 T s1_norm = static_cast<T>(0);
1039 T s2_norm = static_cast<T>(0);
1040 T ncc_umerator = static_cast<T>(0);
1041 T ncc = static_cast<T>(0);
1042 while (idx < waveform_length_p * channel_idx) {
1043 // Sliding window
1044 if (frame_count == num_of_frames) {
1045 ++itr;
1046 ++idx;
1047 continue;
1048 }
1049 if (win_idx < frame_size) {
1050 ncc_umerator += signal[s1] * signal[s2];
1051 s1_norm += signal[s1] * signal[s1];
1052 s2_norm += signal[s2] * signal[s2];
1053 ++win_idx;
1054 ++s1;
1055 ++s2;
1056 }
1057 if (win_idx == frame_size) {
1058 if (s1_norm != static_cast<T>(0.0) && s2_norm != static_cast<T>(0.0)) {
1059 ncc = ncc_umerator / s1_norm / s2_norm;
1060 } else {
1061 ncc = static_cast<T>(0.0);
1062 }
1063 out.push_back(ncc);
1064 ncc_umerator = static_cast<T>(0.0);
1065 s1_norm = static_cast<T>(0.0);
1066 s2_norm = static_cast<T>(0.0);
1067 ++frame_count;
1068 win_idx = 0;
1069 }
1070 ++itr;
1071 ++idx;
1072 }
1073 if (idx % waveform_length_p == 0) {
1074 ++channel_idx;
1075 }
1076 } // compute one ncc
1077 // cat tensor
1078 auto itr_out = out.begin();
1079 for (dsize_t row_idx = 0; row_idx < channel; ++row_idx) {
1080 for (dsize_t frame_idx = 0; frame_idx < num_of_frames; ++frame_idx) {
1081 RETURN_IF_NOT_OK((*output)->SetItemAt({row_idx, frame_idx, lag - 1}, *itr_out));
1082 ++itr_out;
1083 }
1084 }
1085 } // compute ncc
1086 return Status::OK();
1087 }
1088
1089 /// \brief For each frame, take the highest value of NCCF.
1090 /// \param input: Tensor of shape <channel, num_of_frames, lags>.
1091 /// \param output: Tensor of shape <channel, num_of_frames>.
1092 /// \param sample_rate: The sample rate of the waveform (Hz).
1093 /// \param freq_high: Highest frequency that can be detected (Hz).
1094 /// \return Status code.
1095 template <typename T>
FindMaxPerFrame(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t sample_rate,int32_t freq_high)1096 Status FindMaxPerFrame(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
1097 int32_t freq_high) {
1098 std::vector<T> signal;
1099 std::vector<int> out;
1100 auto channel = input->shape()[0];
1101 auto num_of_frames = input->shape()[1];
1102 auto lags = input->shape()[2];
1103 CHECK_FAIL_RETURN_UNEXPECTED(freq_high != 0, "DetectPitchFrequency: freq_high can not be zero.");
1104 auto lag_min = static_cast<int32_t>(ceil(static_cast<float>(sample_rate) / freq_high));
1105 TensorShape out_shape({channel, num_of_frames});
1106 // pack batch
1107 for (auto itr = input->begin<T>(); itr != input->end<T>(); ++itr) {
1108 signal.push_back(*itr);
1109 }
1110 // find the best nccf
1111 T best_max_value = static_cast<T>(0.0);
1112 T half_max_value = static_cast<T>(0.0);
1113 int32_t best_max_indices = 0;
1114 int32_t half_max_indices = 0;
1115 auto thresh = static_cast<T>(0.99);
1116 auto lags_half = lags / 2;
1117 for (dsize_t channel_idx = 0; channel_idx < channel; ++channel_idx) {
1118 for (dsize_t frame_idx = 0; frame_idx < num_of_frames; ++frame_idx) {
1119 auto index_01 = channel_idx * num_of_frames * lags + frame_idx * lags + lag_min;
1120 best_max_value = signal[index_01];
1121 half_max_value = signal[index_01];
1122 best_max_indices = lag_min;
1123 half_max_indices = lag_min;
1124 for (dsize_t lag_idx = 0; lag_idx < lags; ++lag_idx) {
1125 if (lag_idx > lag_min) {
1126 auto index_02 = channel_idx * num_of_frames * lags + frame_idx * lags + lag_idx;
1127 if (signal[index_02] > best_max_value) {
1128 best_max_value = signal[index_02];
1129 best_max_indices = lag_idx;
1130 if (lag_idx < lags_half) {
1131 half_max_value = signal[index_02];
1132 half_max_indices = lag_idx;
1133 }
1134 }
1135 }
1136 }
1137 // Add back minimal lag
1138 // Add 1 empirical calibration offset
1139 if (half_max_value > best_max_value * thresh) {
1140 out.push_back(half_max_indices + 1);
1141 } else {
1142 out.push_back(best_max_indices + 1);
1143 }
1144 }
1145 }
1146 // unpack batch
1147 RETURN_IF_NOT_OK(Tensor::CreateFromVector(out, out_shape, output));
1148 return Status::OK();
1149 }
1150
1151 /// \brief Apply median smoothing to the 1D tensor over the given window.
1152 /// \param input: Tensor of shape<channel, num_of_frames>.
1153 /// \param output: Tensor of shape <channel, num_of_window>.
1154 /// \param win_length: The window length for median smoothing (in number of frames).
1155 /// \return Status code.
1156 Status MedianSmoothing(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t win_length);
1157
1158 /// \brief Detect pitch frequency.
1159 /// \param input: Tensor of shape <channel,waveform_length>.
1160 /// \param output: Tensor of shape <channel, num_of_frames, lags>.
1161 /// \param sample_rate: The sample rate of the waveform (Hz).
1162 /// \param frame_time: Duration of a frame.
1163 /// \param win_length: The window length for median smoothing (in number of frames).
1164 /// \param freq_low: Lowest frequency that can be detected (Hz).
1165 /// \param freq_high: Highest frequency that can be detected (Hz).
1166 /// \return Status code.
1167 Status DetectPitchFrequency(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
1168 float frame_time, int32_t win_length, int32_t freq_low, int32_t freq_high);
1169
1170 /// \brief A helper function for phaser, generates a table with given parameters.
1171 /// \param output: Tensor of shape <time>.
1172 /// \param type: can choose DataType::DE_FLOAT32 or DataType::DE_INT32.
1173 /// \param modulation: Modulation of the input tensor.
1174 /// It can be one of Modulation.kSinusoidal or Modulation.kTriangular.
1175 /// \param table_size: The length of table.
1176 /// \param min: Calculate the sampling rate within the delay time.
1177 /// \param max: Calculate the sampling rate within the delay and delay depth time.
1178 /// \param phase: Phase offset of function.
1179 /// \return Status code.
1180 Status GenerateWaveTable(std::shared_ptr<Tensor> *output, const DataType &type, Modulation modulation,
1181 int32_t table_size, float min, float max, float phase);
1182
1183 /// \brief Apply a phaser effect to the audio.
1184 /// \param input Tensor of shape <..., time>.
1185 /// \param output Tensor of shape <..., time>.
1186 /// \param sample_rate Sampling rate of the waveform.
1187 /// \param gain_in Desired input gain at the boost (or attenuation) in dB.
1188 /// \param gain_out Desired output gain at the boost (or attenuation) in dB.
1189 /// \param delay_ms Desired delay in milli seconds.
1190 /// \param decay Desired decay relative to gain-in.
1191 /// \param mod_speed Modulation speed in Hz.
1192 /// \param sinusoidal If true, use sinusoidal modulation. If false, use triangular modulation.
1193 /// \return Status code.
1194 template <typename T>
Phaser(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t sample_rate,float gain_in,float gain_out,float delay_ms,float decay,float mod_speed,bool sinusoidal)1195 Status Phaser(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate, float gain_in,
1196 float gain_out, float delay_ms, float decay, float mod_speed, bool sinusoidal) {
1197 TensorShape input_shape = input->shape();
1198 // input convert to 2D (channels,time)
1199 auto channels = input->Size() / input_shape[-1];
1200 auto time = input_shape[-1];
1201 TensorShape to_shape({channels, time});
1202 RETURN_IF_NOT_OK(input->Reshape(to_shape));
1203 // input vector
1204 std::vector<std::vector<T>> input_vec(channels, std::vector<T>(time, 0));
1205 // output vector
1206 std::vector<std::vector<T>> out_vec(channels, std::vector<T>(time, 0));
1207 // input convert to vector
1208 auto input_itr = input->begin<T>();
1209 for (size_t i = 0; i < channels; i++) {
1210 for (size_t j = 0; j < time; j++) {
1211 input_vec[i][j] = *input_itr * gain_in;
1212 input_itr++;
1213 }
1214 }
1215 // compute
1216 // create delay buffer
1217 int delay_buf_nrow = channels;
1218 // calculate the length of the delay
1219 int delay_buf_len = static_cast<int>((delay_ms * 0.001 * sample_rate) + 0.5);
1220 std::vector<std::vector<T>> delay_buf(delay_buf_nrow, std::vector<T>(delay_buf_len, 0 * decay));
1221 // calculate the length after the momentum
1222 int mod_buf_len = static_cast<int>(sample_rate / mod_speed + 0.5);
1223 Modulation modulation = sinusoidal ? Modulation::kSinusoidal : Modulation::kTriangular;
1224 // create and compute mod buffer
1225 std::shared_ptr<Tensor> mod_buf_tensor;
1226 auto PI_factor = 2;
1227 RETURN_IF_NOT_OK(GenerateWaveTable(&mod_buf_tensor, DataType(DataType::DE_INT32), modulation, mod_buf_len,
1228 static_cast<float>(1.0f), static_cast<float>(delay_buf_len),
1229 static_cast<float>(PI / PI_factor)));
1230 // tensor mod_buf convert to vector
1231 std::vector<int> mod_buf;
1232 for (auto itr = mod_buf_tensor->begin<int>(); itr != mod_buf_tensor->end<int>(); itr++) {
1233 mod_buf.push_back(*itr);
1234 }
1235 dsize_t delay_pos = 0;
1236 dsize_t mod_pos = 0;
1237 // for every channal at the current time
1238 for (size_t i = 0; i < time; i++) {
1239 // calculate the delay data that should be added to each channal at this time
1240 int idx = static_cast<int>((delay_pos + mod_buf[mod_pos]) % delay_buf_len);
1241 mod_pos = (mod_pos + 1) % mod_buf_len;
1242 delay_pos = (delay_pos + 1) % delay_buf_len;
1243 // update the next delay data with the current result * decay
1244 for (size_t j = 0; j < channels; j++) {
1245 out_vec[j][i] = input_vec[j][i] + delay_buf[j][idx];
1246 delay_buf[j][delay_pos] = (input_vec[j][i] + delay_buf[j][idx]) * decay;
1247 }
1248 }
1249 std::vector<T> out_vec_one_d;
1250 for (size_t i = 0; i < channels; i++) {
1251 for (size_t j = 0; j < time; j++) {
1252 // gain_out on the output
1253 out_vec[i][j] *= gain_out;
1254 // clamp
1255 out_vec[i][j] = std::max<float>(-1.0f, std::min<float>(1.0f, out_vec[i][j]));
1256 // output vector is transformed from 2d to 1d
1257 out_vec_one_d.push_back(out_vec[i][j]);
1258 }
1259 }
1260 // move data to output tensor
1261 std::shared_ptr<Tensor> out;
1262 RETURN_IF_NOT_OK(Tensor::CreateFromVector(out_vec_one_d, input_shape, &out));
1263 *output = out;
1264 return Status::OK();
1265 }
1266
1267 /// \brief Flanger about interpolation effect.
1268 /// \param input: Tensor of shape <batch, channel, time>.
1269 /// \param int_delay: A dimensional vector about integer delay, subscript representing delay.
1270 /// \param frac_delay: A dimensional vector about delay obtained by using the frac function.
1271 /// \param interpolation: Interpolation of the input tensor.
1272 /// It can be one of Interpolation::kLinear or Interpolation::kQuadratic.
1273 /// \param delay_buf_pos: Minimum dimension length about delay_bufs.
1274 /// \Returns Flanger about interpolation effect.
1275 template <typename T>
FlangerInterpolation(const std::shared_ptr<Tensor> & input,std::vector<int> int_delay,const std::vector<T> & frac_delay,Interpolation interpolation,int delay_buf_pos)1276 std::vector<std::vector<T>> FlangerInterpolation(const std::shared_ptr<Tensor> &input, std::vector<int> int_delay,
1277 const std::vector<T> &frac_delay, Interpolation interpolation,
1278 int delay_buf_pos) {
1279 int n_batch = input->shape()[0];
1280 int n_channels = input->shape()[-2];
1281 int delay_buf_length = input->shape()[-1];
1282 const int32_t bias = 2;
1283
1284 std::vector<std::vector<T>> delayed_value_a(n_batch, std::vector<T>(n_channels, 0));
1285 std::vector<std::vector<T>> delayed_value_b(n_batch, std::vector<T>(n_channels, 0));
1286 for (int j = 0; j < n_batch; j++) {
1287 for (int k = 0; k < n_channels; k++) {
1288 // delay after obtaining the current number of channels
1289 auto iter_input = input->begin<T>();
1290 int it = j * n_channels * delay_buf_length + k * delay_buf_length;
1291 iter_input += it + (delay_buf_pos + int_delay[k]) % delay_buf_length;
1292 delayed_value_a[j][k] = *(iter_input);
1293 iter_input = input->begin<T>();
1294 iter_input += it + (delay_buf_pos + int_delay[k] + 1) % delay_buf_length;
1295 delayed_value_b[j][k] = *(iter_input);
1296 }
1297 }
1298 // delay subscript backward
1299 for (int j = 0; j < n_channels; j++) {
1300 int_delay[j] = int_delay[j] + bias;
1301 }
1302 std::vector<std::vector<T>> delayed(n_batch, std::vector<T>(n_channels, 0));
1303 std::vector<std::vector<T>> delayed_value_c(n_batch, std::vector<T>(n_channels, 0));
1304 if (interpolation == Interpolation::kLinear) {
1305 for (int j = 0; j < n_batch; j++) {
1306 for (int k = 0; k < n_channels; k++) {
1307 delayed[j][k] = delayed_value_a[j][k] + (delayed_value_b[j][k] - delayed_value_a[j][k]) * frac_delay[k];
1308 }
1309 }
1310 } else {
1311 for (int j = 0; j < n_batch; j++) {
1312 for (int k = 0; k < n_channels; k++) {
1313 auto iter_input = input->begin<T>();
1314 int it = j * n_channels * delay_buf_length + k * delay_buf_length;
1315 iter_input += it + (delay_buf_pos + int_delay[k]) % delay_buf_length;
1316 delayed_value_c[j][k] = *(iter_input);
1317 }
1318 }
1319 // delay subscript backward
1320 for (int j = 0; j < n_channels; j++) {
1321 int_delay[j] = int_delay[j] + 1;
1322 }
1323 std::vector<std::vector<T>> frac_delay_coefficient(n_batch, std::vector<T>(n_channels, 0));
1324 std::vector<std::vector<T>> frac_delay_value(n_batch, std::vector<T>(n_channels, 0));
1325 for (int j = 0; j < n_batch; j++) {
1326 for (int k = 0; k < n_channels; k++) {
1327 delayed_value_c[j][k] = delayed_value_c[j][k] - delayed_value_a[j][k];
1328 delayed_value_b[j][k] = delayed_value_b[j][k] - delayed_value_a[j][k];
1329 // delayed_value_c[j][k] * 0.5 is half of the delayed_value_c[j][k]
1330 frac_delay_coefficient[j][k] = delayed_value_c[j][k] * 0.5 - delayed_value_b[j][k];
1331 frac_delay_value[j][k] = delayed_value_b[j][k] * 2 - delayed_value_c[j][k] * 0.5;
1332 // the next delay is obtained by delaying the data in the buffer
1333 delayed[j][k] = delayed_value_a[j][k] +
1334 (frac_delay_coefficient[j][k] * frac_delay[k] + frac_delay_value[j][k]) * frac_delay[k];
1335 }
1336 }
1337 }
1338 return delayed;
1339 }
1340
1341 /// \brief Interval limiting function.
1342 /// \param output_waveform: Tensor of shape <..., time>.
1343 /// \param min: If value is less than min, min is returned.
1344 /// \param max: If value is greater than max, max is returned.
1345 /// \Returns Tensor at the same latitude.
1346 template <typename T>
Clamp(const std::shared_ptr<Tensor> & tensor,T min,T max)1347 std::shared_ptr<Tensor> Clamp(const std::shared_ptr<Tensor> &tensor, T min, T max) {
1348 for (auto itr = tensor->begin<T>(); itr != tensor->end<T>(); itr++) {
1349 if (*itr > max) {
1350 *itr = max;
1351 } else if (*itr < min) {
1352 *itr = min;
1353 }
1354 }
1355 return tensor;
1356 }
1357
1358 /// \brief Apply flanger effect.
1359 /// \param input/output: Tensor of shape <..., channel, time>.
1360 /// \param sample_rate: Sampling rate of the waveform, e.g. 44100 (Hz), the value can't be zero.
1361 /// \param delay: Desired delay in milliseconds (ms), range: [0, 30].
1362 /// \param depth: Desired delay depth in milliseconds (ms), range: [0, 10].
1363 /// \param regen: Desired regen (feedback gain) in dB., range: [-95, 95].
1364 /// \param width: Desired width (delay gain) in dB, range: [0, 100].
1365 /// \param speed: Modulation speed in Hz, range: [0.1, 10].
1366 /// \param phase: Percentage phase-shift for multi-channel, range: [0, 100].
1367 /// \param modulation: Modulation of the input tensor.
1368 /// It can be one of Modulation::kSinusoidal or Modulation::kTriangular.
1369 /// \param interpolation: Interpolation of the input tensor.
1370 /// It can be one of Interpolation::kLinear or Interpolation::kQuadratic.
1371 /// \return Status code.
1372 template <typename T>
Flanger(const std::shared_ptr<Tensor> input,std::shared_ptr<Tensor> * output,int32_t sample_rate,float delay,float depth,float regen,float width,float speed,float phase,Modulation modulation,Interpolation interpolation)1373 Status Flanger(const std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, int32_t sample_rate, float delay,
1374 float depth, float regen, float width, float speed, float phase, Modulation modulation,
1375 Interpolation interpolation) {
1376 std::shared_ptr<Tensor> waveform;
1377 if (input->type() == DataType::DE_FLOAT64) {
1378 waveform = input;
1379 } else {
1380 RETURN_IF_NOT_OK(TypeCast(input, &waveform, DataType(DataType::DE_FLOAT32)));
1381 }
1382 // convert to 3D (batch, channels, time)
1383 TensorShape actual_shape = waveform->shape();
1384 TensorShape toShape({waveform->Size() / actual_shape[-2] / actual_shape[-1], actual_shape[-2], actual_shape[-1]});
1385 RETURN_IF_NOT_OK(waveform->Reshape(toShape));
1386
1387 // scaling
1388 T feedback_gain = static_cast<T>(regen) / 100;
1389 T delay_gain = static_cast<T>(width) / 100;
1390 T channel_phase = static_cast<T>(phase) / 100;
1391 T delay_min = static_cast<T>(delay) / 1000;
1392 T delay_depth = static_cast<T>(depth) / 1000;
1393
1394 // balance output:
1395 T in_gain = 1.0 / (1 + delay_gain);
1396 delay_gain = delay_gain / (1 + delay_gain);
1397 // balance feedback loop:
1398 delay_gain = delay_gain * (1 - abs(feedback_gain));
1399
1400 int delay_buf_length = static_cast<int>((delay_min + delay_depth) * sample_rate + 0.5);
1401 auto delay_buf_length_factor = 2;
1402 delay_buf_length = delay_buf_length + delay_buf_length_factor;
1403
1404 int lfo_length = static_cast<int>(sample_rate / speed);
1405
1406 T table_min = floor(delay_min * sample_rate + 0.5);
1407 T table_max = delay_buf_length - 2.0;
1408 // generate wave table
1409 T lfo_phase = 3 * PI / 2;
1410 std::shared_ptr<Tensor> lfo;
1411 RETURN_IF_NOT_OK(GenerateWaveTable(&lfo, DataType(DataType::DE_FLOAT32), modulation, lfo_length,
1412 static_cast<float>(table_min), static_cast<float>(table_max),
1413 static_cast<float>(lfo_phase)));
1414 int n_batch = waveform->shape()[0];
1415 int n_channels = waveform->shape()[-2];
1416 int time = waveform->shape()[-1];
1417 std::vector<T> delay_tensor(n_channels, 0.0), frac_delay(n_channels, 0.0);
1418 std::vector<int> cur_channel_phase(n_channels, 0), int_delay(n_channels, 0);
1419 // next delay
1420 std::vector<std::vector<T>> delay_last(n_batch, std::vector<T>(n_channels, 0));
1421
1422 // initialization of delay_bufs
1423 TensorShape delay_bufs_shape({n_batch, n_channels, delay_buf_length});
1424 std::shared_ptr<Tensor> delay_bufs, output_waveform;
1425 RETURN_IF_NOT_OK(Tensor::CreateEmpty(delay_bufs_shape, waveform->type(), &delay_bufs));
1426 RETURN_IF_NOT_OK(delay_bufs->Zero());
1427 // initialization of output_waveform
1428 TensorShape output_waveform_shape({n_batch, n_channels, actual_shape[-1]});
1429 RETURN_IF_NOT_OK(Tensor::CreateEmpty(output_waveform_shape, waveform->type(), &output_waveform));
1430
1431 int delay_buf_pos = 0, lfo_pos = 0;
1432 for (int i = 0; i < time; i++) {
1433 delay_buf_pos = (delay_buf_pos + delay_buf_length - 1) % delay_buf_length;
1434 for (int j = 0; j < n_channels; j++) {
1435 auto channel_phase_factor = 0.5;
1436 // get current channel phase
1437 cur_channel_phase[j] = static_cast<int>(j * lfo_length * channel_phase + channel_phase_factor);
1438 // through the current channel phase and lfo arrays to get the delay
1439 auto iter_lfo = lfo->begin<float>();
1440 delay_tensor[j] = *(iter_lfo + static_cast<ptrdiff_t>((lfo_pos + cur_channel_phase[j]) % lfo_length));
1441 // the frac delay is obtained by using the frac function
1442 frac_delay[j] = delay_tensor[j] - static_cast<int>(delay_tensor[j]);
1443 delay_tensor[j] = floor(delay_tensor[j]);
1444 int_delay[j] = static_cast<int>(delay_tensor[j]);
1445 }
1446 // get the waveform of [:, :, i]
1447 std::shared_ptr<Tensor> temp;
1448 TensorShape temp_shape({n_batch, n_channels});
1449 RETURN_IF_NOT_OK(Tensor::CreateEmpty(temp_shape, waveform->type(), &temp));
1450 Slice ss1(0, n_batch), ss2(0, n_channels), ss3(i, i + 1);
1451 SliceOption sp1(ss1), sp2(ss2), sp3(ss3);
1452 std::vector<SliceOption> slice_option;
1453 slice_option.push_back(sp1), slice_option.push_back(sp2), slice_option.push_back(sp3);
1454 RETURN_IF_NOT_OK(waveform->Slice(&temp, slice_option));
1455
1456 auto iter_temp = temp->begin<T>();
1457 auto iter_delay_bufs = delay_bufs->begin<T>();
1458 for (int j = 0; j < n_batch; j++) {
1459 for (int k = 0; k < n_channels; k++) {
1460 iter_delay_bufs += delay_buf_pos;
1461 // the value of delay_bufs is processed by next delay
1462 *(iter_delay_bufs) = *iter_temp + delay_last[j][k] * feedback_gain;
1463 iter_delay_bufs -= (delay_buf_pos - delay_buf_length);
1464 iter_temp++;
1465 }
1466 }
1467 // different delayed values can be obtained by judging the type of interpolation
1468 std::vector<std::vector<T>> delayed(n_batch, std::vector<T>(n_channels, 0));
1469 delayed = FlangerInterpolation<T>(delay_bufs, int_delay, frac_delay, interpolation, delay_buf_pos);
1470
1471 for (int j = 0; j < n_channels; j++) {
1472 int_delay[j] = int_delay[j] + 1;
1473 }
1474 iter_temp = temp->begin<T>();
1475 for (int j = 0; j < n_batch; j++) {
1476 for (int k = 0; k < n_channels; k++) {
1477 auto iter_output_waveform = output_waveform->begin<T>();
1478 // update the next delay
1479 delay_last[j][k] = delayed[j][k];
1480 int it = j * n_channels * actual_shape[-1] + k * actual_shape[-1];
1481 iter_output_waveform += it + i;
1482 // the results are obtained by balancing the output and balancing the feedback loop
1483 *(iter_output_waveform) = *(iter_temp)*in_gain + delayed[j][k] * delay_gain;
1484 iter_temp++;
1485 }
1486 }
1487 // update lfo location
1488 lfo_pos = (lfo_pos + 1) % lfo_length;
1489 }
1490 // the output value is limited by the interval limit function
1491 output_waveform = Clamp<T>(output_waveform, -1, 1);
1492 // convert dimension to waveform dimension
1493 RETURN_IF_NOT_OK(output_waveform->Reshape(actual_shape));
1494 RETURN_IF_NOT_OK(TypeCast(output_waveform, output, input->type()));
1495 return Status::OK();
1496 }
1497
1498 // A brief structure of wave file header.
1499 struct WavHeader {
1500 int8_t chunk_id[4] = {0};
1501 int32_t chunk_size = 0;
1502 int8_t format[4] = {0};
1503 int8_t sub_chunk1_id[4] = {0};
1504 int32_t sub_chunk1_size = 0;
1505 int16_t audio_format = 0;
1506 int16_t num_channels = 0;
1507 int32_t sample_rate = 0;
1508 int32_t byte_rate = 0;
1509 int16_t byte_align = 0;
1510 int16_t bits_per_sample = 0;
1511 int8_t sub_chunk2_id[4] = {0};
1512 int32_t sub_chunk2_size = 0;
WavHeaderWavHeader1513 WavHeader() {}
1514 };
1515
1516 /// \brief Get an audio data from a wav file and store into a vector.
1517 /// \param wav_file_dir: wave file dir.
1518 /// \param waveform_vec: vector of waveform.
1519 /// \param sample_rate: sample rate.
1520 /// \return Status code.
1521 Status ReadWaveFile(const std::string &wav_file_dir, std::vector<float> *waveform_vec, int32_t *sample_rate);
1522
1523 /// \brief Apply sliding-window cepstral mean and variance (optional) normalization per utterance.
1524 /// \param input: Tensor of shape <..., freq, time>.
1525 /// \param output: Tensor of shape <..., frame>.
1526 /// \param cmn_window: Window in frames for running average CMN computation.
1527 /// \param min_cmn_window: Minimum CMN window used at start of decoding.
1528 /// \param center: If true, use a window centered on the current frame. If false, window is to the left.
1529 /// \param norm_vars: If true, normalize variance to one.
1530 /// \return Status code.
1531 Status SlidingWindowCmn(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t cmn_window,
1532 int32_t min_cmn_window, bool center, bool norm_vars);
1533 /// \brief Compute delta coefficients of a tensor, usually a spectrogram.
1534 /// \param input: Tensor of shape <...,freq,time>.
1535 /// \param output: Tensor of shape <...,freq,time>.
1536 /// \param win_length: The window length used for computing delta.
1537 /// \param mode: Padding mode.
1538 /// \return Status code.
1539 Status ComputeDeltas(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t win_length,
1540 const BorderType &mode);
1541
1542 template <typename T>
Mul(const std::shared_ptr<Tensor> input,std::shared_ptr<Tensor> * output,T value)1543 Status Mul(const std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, T value) {
1544 RETURN_UNEXPECTED_IF_NULL(output);
1545 RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), input->type(), output));
1546 auto iter_in = input->begin<T>();
1547 auto iter_out = (*output)->begin<T>();
1548 for (; iter_in != input->end<T>(); ++iter_in, ++iter_out) {
1549 *iter_out = (*iter_in) * value;
1550 }
1551 return Status::OK();
1552 }
1553
1554 template <typename T>
Div(const std::shared_ptr<Tensor> input,std::shared_ptr<Tensor> * output,T value)1555 Status Div(const std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, T value) {
1556 RETURN_UNEXPECTED_IF_NULL(output);
1557 RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), input->type(), output));
1558 CHECK_FAIL_RETURN_UNEXPECTED(value != 0, "Div: invalid parameter, 'value' can not be zero.");
1559 auto iter_in = input->begin<T>();
1560 auto iter_out = (*output)->begin<T>();
1561 for (; iter_in != input->end<T>(); ++iter_in, ++iter_out) {
1562 *iter_out = (*iter_in) / value;
1563 }
1564 return Status::OK();
1565 }
1566
1567 template <typename T>
Add(const std::shared_ptr<Tensor> input,std::shared_ptr<Tensor> * output,T value)1568 Status Add(const std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, T value) {
1569 RETURN_UNEXPECTED_IF_NULL(output);
1570 RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), input->type(), output));
1571 auto iter_in = input->begin<T>();
1572 auto iter_out = (*output)->begin<T>();
1573 for (; iter_in != input->end<T>(); ++iter_in, ++iter_out) {
1574 *iter_out = (*iter_in) + value;
1575 }
1576 return Status::OK();
1577 }
1578
1579 template <typename T>
SubTensor(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int len)1580 Status SubTensor(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int len) {
1581 RETURN_UNEXPECTED_IF_NULL(output);
1582 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({len}), input->type(), output));
1583 RETURN_IF_NOT_OK(
1584 ValidateNoGreaterThan("SubTensor", "len", len, "size of input tensor", static_cast<int>(input->Size())));
1585 auto iter_in = input->begin<T>();
1586 auto iter_out = (*output)->begin<T>();
1587 for (; iter_out != (*output)->end<T>(); ++iter_in, ++iter_out) {
1588 *iter_out = *iter_in;
1589 }
1590 return Status::OK();
1591 }
1592
1593 template <typename T>
TensorAdd(const std::shared_ptr<Tensor> & input,const std::shared_ptr<Tensor> & other,std::shared_ptr<Tensor> * output)1594 Status TensorAdd(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &other,
1595 std::shared_ptr<Tensor> *output) {
1596 RETURN_UNEXPECTED_IF_NULL(output);
1597 CHECK_FAIL_RETURN_UNEXPECTED(input->shape() == other->shape(), "TensorAdd: input tensor shape must be the same.");
1598 CHECK_FAIL_RETURN_UNEXPECTED(input->type() == other->type(), "TensorAdd: input tensor type must be the same.");
1599
1600 RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), input->type(), output));
1601 auto iter_in1 = input->begin<T>();
1602 auto iter_in2 = other->begin<T>();
1603 auto iter_out = (*output)->begin<T>();
1604 for (; iter_out != (*output)->end<T>(); ++iter_in1, ++iter_in2, ++iter_out) {
1605 *iter_out = (*iter_in1) + (*iter_in2);
1606 }
1607 return Status::OK();
1608 }
1609
1610 template <typename T>
TensorSub(const std::shared_ptr<Tensor> & input,const std::shared_ptr<Tensor> & other,std::shared_ptr<Tensor> * output)1611 Status TensorSub(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &other,
1612 std::shared_ptr<Tensor> *output) {
1613 RETURN_UNEXPECTED_IF_NULL(output);
1614 CHECK_FAIL_RETURN_UNEXPECTED(input->shape() == other->shape(), "TensorSub: input tensor shape must be the same.");
1615 CHECK_FAIL_RETURN_UNEXPECTED(input->type() == other->type(), "TensorSub: input tensor type must be the same.");
1616
1617 RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), input->type(), output));
1618 auto iter_in1 = input->begin<T>();
1619 auto iter_in2 = other->begin<T>();
1620 auto iter_out = (*output)->begin<T>();
1621 for (; iter_out != (*output)->end<T>(); ++iter_in1, ++iter_in2, ++iter_out) {
1622 *iter_out = (*iter_in1) - (*iter_in2);
1623 }
1624 return Status::OK();
1625 }
1626
1627 template <typename T>
TensorCat(const std::shared_ptr<Tensor> & input,const std::shared_ptr<Tensor> & other,std::shared_ptr<Tensor> * output)1628 Status TensorCat(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &other,
1629 std::shared_ptr<Tensor> *output) {
1630 RETURN_UNEXPECTED_IF_NULL(output);
1631 CHECK_FAIL_RETURN_UNEXPECTED(input->type() == other->type(), "TensorCat: input tensor type must be the same.");
1632 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({input->shape()[-1] + other->shape()[-1]}), input->type(), output));
1633 auto iter_in1 = input->begin<T>();
1634 auto iter_in2 = other->begin<T>();
1635 auto iter_out = (*output)->begin<T>();
1636 for (; iter_in1 != input->end<T>(); ++iter_in1, ++iter_out) {
1637 *iter_out = *iter_in1;
1638 }
1639 for (; iter_in2 != other->end<T>(); ++iter_in2, ++iter_out) {
1640 *iter_out = *iter_in2;
1641 }
1642 return Status::OK();
1643 }
1644
1645 template <typename T>
TensorRepeat(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int rank_repeat)1646 Status TensorRepeat(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int rank_repeat) {
1647 RETURN_UNEXPECTED_IF_NULL(output);
1648
1649 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({rank_repeat, (input->shape()[-1])}), input->type(), output));
1650 auto iter_in = input->begin<T>();
1651 auto iter_out = (*output)->begin<T>();
1652 for (int i = 0; i < rank_repeat; i++) {
1653 auto iter_in = input->begin<T>();
1654 for (; iter_in != input->end<T>(); ++iter_in, ++iter_out) {
1655 *iter_out = *iter_in;
1656 }
1657 }
1658 return Status::OK();
1659 }
1660
1661 template <typename T>
TensorRowReplace(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int row)1662 Status TensorRowReplace(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int row) {
1663 RETURN_UNEXPECTED_IF_NULL(output);
1664 auto iter_in = input->begin<T>();
1665 auto iter_out = (*output)->begin<T>() + static_cast<ptrdiff_t>((*output)->shape()[-1] * row);
1666 CHECK_FAIL_RETURN_UNEXPECTED(iter_out <= (*output)->end<T>(), "TensorRowReplace: pointer out of bounds");
1667 CHECK_FAIL_RETURN_UNEXPECTED(input->Size() <= (*output)->shape()[-1], "TensorRowReplace: pointer out of bounds");
1668 for (; iter_in != input->end<T>(); ++iter_in, ++iter_out) {
1669 *iter_out = *iter_in;
1670 }
1671 return Status::OK();
1672 }
1673
1674 template <typename T>
TensorRowAt(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int rank_index)1675 Status TensorRowAt(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int rank_index) {
1676 RETURN_UNEXPECTED_IF_NULL(output);
1677 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({input->shape()[-1]}), input->type(), output));
1678 auto iter_in = input->begin<T>() + static_cast<ptrdiff_t>(input->shape()[-1] * rank_index);
1679 auto iter_out = (*output)->begin<T>();
1680 CHECK_FAIL_RETURN_UNEXPECTED(iter_in <= input->end<T>(), "TensorRowAt: pointer out of bounds");
1681 for (; iter_out != (*output)->end<T>(); ++iter_in, ++iter_out) {
1682 *iter_out = *iter_in;
1683 }
1684 return Status::OK();
1685 }
1686
1687 template <typename T>
TensorRound(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)1688 Status TensorRound(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
1689 RETURN_UNEXPECTED_IF_NULL(output);
1690
1691 RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), input->type(), output));
1692 auto iter_in = input->begin<T>();
1693 auto iter_out = (*output)->begin<T>();
1694 for (; iter_in != input->end<T>(); ++iter_in, ++iter_out) {
1695 *iter_out = round(*iter_in);
1696 }
1697 return Status::OK();
1698 }
1699
1700 template <typename T>
ApplyProbabilityDistribution(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,DensityFunction density_function,std::mt19937 * rnd)1701 Status ApplyProbabilityDistribution(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output,
1702 DensityFunction density_function, std::mt19937 *rnd) {
1703 int channel_size = input->shape()[0] - 1;
1704 int time_size = input->shape()[-1] - 1;
1705 std::uniform_int_distribution<> dis_channel(0, channel_size);
1706 int random_channel = channel_size > 0 ? dis_channel(*rnd) : 0;
1707 std::uniform_int_distribution<> dis_time(0, time_size);
1708 int random_time = time_size > 0 ? dis_time(*rnd) : 0;
1709 int number_of_bits = 16;
1710 int up_scaling = static_cast<int>(pow(2, number_of_bits - 1) - 2);
1711 int down_scaling = static_cast<int>(pow(2, number_of_bits - 1));
1712
1713 std::shared_ptr<Tensor> signal_scaled;
1714 RETURN_IF_NOT_OK(Mul<T>(input, &signal_scaled, up_scaling));
1715
1716 std::shared_ptr<Tensor> signal_scaled_dis;
1717 RETURN_IF_NOT_OK(Tensor::CreateFromTensor(input, &signal_scaled_dis));
1718
1719 if (density_function == DensityFunction::kRPDF) {
1720 auto iter_in = input->begin<T>();
1721 iter_in += (time_size + 1) * random_channel + random_time;
1722 auto RPDF = *(iter_in);
1723 RETURN_IF_NOT_OK(Add<T>(signal_scaled, &signal_scaled_dis, RPDF));
1724 } else if (density_function == DensityFunction::kGPDF) {
1725 int num_rand_variables = 6;
1726 auto iter_in = input->begin<T>();
1727 iter_in += (time_size + 1) * random_channel + random_time;
1728 auto gaussian = *(iter_in);
1729 for (int i = 0; i < num_rand_variables; i++) {
1730 int rand_channel = channel_size > 0 ? dis_channel(*rnd) : 0;
1731 int rand_time = time_size > 0 ? dis_time(*rnd) : 0;
1732
1733 auto iter_in_rand = input->begin<T>();
1734 iter_in_rand += (time_size + 1) * rand_channel + rand_time;
1735 gaussian += *(iter_in_rand);
1736 *(iter_in_rand) = gaussian;
1737 }
1738 RETURN_IF_NOT_OK(Add<T>(signal_scaled, &signal_scaled_dis, gaussian));
1739 } else {
1740 int window_length = time_size + 1;
1741 std::shared_ptr<Tensor> float_bartlett;
1742 RETURN_IF_NOT_OK(Bartlett(&float_bartlett, window_length));
1743 std::shared_ptr<Tensor> type_convert_bartlett;
1744 RETURN_IF_NOT_OK(TypeCast(float_bartlett, &type_convert_bartlett, input->type()));
1745
1746 int rank_repeat = channel_size + 1;
1747 std::shared_ptr<Tensor> TPDF;
1748 RETURN_IF_NOT_OK(TensorRepeat<T>(type_convert_bartlett, &TPDF, rank_repeat));
1749 RETURN_IF_NOT_OK(TensorAdd<T>(signal_scaled, TPDF, &signal_scaled_dis));
1750 }
1751 std::shared_ptr<Tensor> quantised_signal_scaled;
1752 RETURN_IF_NOT_OK(TensorRound<T>(signal_scaled_dis, &quantised_signal_scaled));
1753
1754 std::shared_ptr<Tensor> quantised_signal;
1755 RETURN_IF_NOT_OK(Div<T>(quantised_signal_scaled, &quantised_signal, down_scaling));
1756 *output = quantised_signal;
1757 return Status::OK();
1758 }
1759
1760 template <typename T>
AddNoiseShaping(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)1761 Status AddNoiseShaping(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
1762 std::shared_ptr<Tensor> dithered_waveform;
1763 RETURN_IF_NOT_OK(Tensor::CreateFromTensor(*output, &dithered_waveform));
1764 std::shared_ptr<Tensor> waveform;
1765 RETURN_IF_NOT_OK(Tensor::CreateFromTensor(input, &waveform));
1766
1767 std::shared_ptr<Tensor> error;
1768 RETURN_IF_NOT_OK(TensorSub<T>(dithered_waveform, waveform, &error));
1769 for (int i = 0; i < error->shape()[0]; i++) {
1770 std::shared_ptr<Tensor> err;
1771 RETURN_IF_NOT_OK(TensorRowAt<T>(error, &err, i));
1772 std::shared_ptr<Tensor> tensor_zero;
1773 std::vector<T> vector_zero(1, 0);
1774 RETURN_IF_NOT_OK(Tensor::CreateFromVector(vector_zero, TensorShape({1}), &tensor_zero));
1775 std::shared_ptr<Tensor> error_offset;
1776 RETURN_IF_NOT_OK(TensorCat<T>(tensor_zero, err, &error_offset));
1777 int k = error->shape()[-1];
1778 std::shared_ptr<Tensor> fresh_error_offset;
1779 RETURN_IF_NOT_OK(SubTensor<T>(error_offset, &fresh_error_offset, k));
1780 RETURN_IF_NOT_OK(TensorRowReplace<T>(fresh_error_offset, &error, i));
1781 }
1782 std::shared_ptr<Tensor> noise_shaped;
1783 RETURN_IF_NOT_OK(TensorAdd<T>(dithered_waveform, error, &noise_shaped));
1784 *output = noise_shaped;
1785 return Status::OK();
1786 }
1787
1788 /// \brief Apply dither effect.
1789 /// \param input/output: Tensor of shape <..., time>.
1790 /// \param density_function: The density function of a continuous random variable.
1791 /// \param noise_shaing: A filtering process that shapes the spectral energy of quantisation error.
1792 /// \return Status code.
1793 template <typename T>
Dither(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,DensityFunction density_function,bool noise_shaping,std::mt19937 * rnd)1794 Status Dither(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, DensityFunction density_function,
1795 bool noise_shaping, std::mt19937 *rnd) {
1796 TensorShape shape = input->shape();
1797 TensorShape new_shape({input->Size() / shape[-1], shape[-1]});
1798 RETURN_IF_NOT_OK(input->Reshape(new_shape));
1799
1800 RETURN_IF_NOT_OK(ApplyProbabilityDistribution<T>(input, output, density_function, rnd));
1801 if (noise_shaping) {
1802 RETURN_IF_NOT_OK(AddNoiseShaping<T>(input, output));
1803 }
1804
1805 RETURN_IF_NOT_OK((*output)->Reshape(shape));
1806 RETURN_IF_NOT_OK(input->Reshape(shape));
1807 return Status::OK();
1808 }
1809
1810 /// \brief Apply GriffinLim to calculate waveform from linear scalar amplitude spectrogram.
1811 /// \param input Tensor of shape <..., freq, time>.
1812 /// \param output Tensor of shape <..., time>.
1813 /// \param n_fft Size of FFT.
1814 /// \param n_iter Number of iteration for phase recovery.
1815 /// \param win_length Window size for GriffinLim.
1816 /// \param hop_length Length of hop between STFT windows.
1817 /// \param window_type Window type for GriffinLim.
1818 /// \param power Exponent for the magnitude spectrogram.
1819 /// \param momentum The momentum for fast GriffinLim.
1820 /// \param length Length of the expected output waveform.
1821 /// \param rand_init Flag for random phase initialization or all-zero phase initialization.
1822 /// \param rnd Random generator.
1823 /// \return Status code.
1824 Status GriffinLim(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_fft, int32_t n_iter,
1825 int32_t win_length, int32_t hop_length, WindowType window_type, float power, float momentum,
1826 int32_t length, bool rand_init, std::mt19937 *rnd);
1827
1828 /// \brief Calculate measure for VAD.
1829 template <typename T>
Measure(int32_t measure_len_ws,int32_t index,const Eigen::MatrixXd & samples,Eigen::MatrixXd * spectrum,Eigen::MatrixXd * noise_spectrum,const std::vector<T> & spectrum_window,int32_t spectrum_start,int32_t spectrum_end,const std::vector<T> & cepstrum_window,int32_t cepstrum_start,int32_t cepstrum_end,T noise_reduction_amount,T measure_smooth_time_mult,T noise_up_time_mult,T noise_down_time_mult,int32_t index_ns,int32_t boot_count,float * meas)1830 Status Measure(int32_t measure_len_ws, int32_t index, const Eigen::MatrixXd &samples, Eigen::MatrixXd *spectrum,
1831 Eigen::MatrixXd *noise_spectrum, const std::vector<T> &spectrum_window, int32_t spectrum_start,
1832 int32_t spectrum_end, const std::vector<T> &cepstrum_window, int32_t cepstrum_start,
1833 int32_t cepstrum_end, T noise_reduction_amount, T measure_smooth_time_mult, T noise_up_time_mult,
1834 T noise_down_time_mult, int32_t index_ns, int32_t boot_count, float *meas) {
1835 RETURN_UNEXPECTED_IF_NULL(spectrum);
1836 RETURN_UNEXPECTED_IF_NULL(noise_spectrum);
1837 RETURN_UNEXPECTED_IF_NULL(meas);
1838 CHECK_FAIL_RETURN_UNEXPECTED(spectrum->cols() == noise_spectrum->cols(),
1839 "Measure: the number of columns of spectrum must be equal to noise_spectrum.");
1840 int samples_len_ns = samples.cols();
1841 CHECK_FAIL_RETURN_UNEXPECTED(samples_len_ns != 0, "Measure: the number of columns of samples cannot be zero.");
1842 int dft_len_ws = spectrum->cols();
1843 std::vector<float> dft_buf(dft_len_ws, 0);
1844 for (int ind = 0; ind < measure_len_ws; ind++) {
1845 int index_new = (ind == 0) ? index_ns : ((index_ns + ind) % samples_len_ns);
1846 dft_buf[ind] = samples(index, index_new) * spectrum_window[ind];
1847 }
1848
1849 // use fft from eigen to calculate rfft
1850 Eigen::FFT<float> fft;
1851 std::vector<std::complex<float>> rfft_res;
1852 fft.fwd(rfft_res, dft_buf);
1853 // truncate redundant information in fft
1854 int rfft_len = rfft_res.size() - (rfft_res.size() - 1) / 2;
1855 rfft_res.resize(rfft_len);
1856
1857 float mult = (boot_count >= 0) ? (static_cast<float>(boot_count) / (1.0 + boot_count))
1858 : static_cast<float>(measure_smooth_time_mult);
1859 std::vector<float> dft_buf_abs(spectrum_end - spectrum_start);
1860 for (int i = 0; i < spectrum_end - spectrum_start; i++) {
1861 auto dba_i = std::abs(rfft_res[i + spectrum_start]);
1862 // inplace revise spectrum
1863 (*spectrum)(index, spectrum_start + i) *= mult;
1864 (*spectrum)(index, spectrum_start + i) += dba_i * (1 - mult);
1865
1866 dba_i = std::pow((*spectrum)(index, spectrum_start + i), TWO);
1867
1868 float mult2 = 0;
1869 // new mult
1870 if (boot_count >= 0) {
1871 mult2 = 0;
1872 } else {
1873 if (dba_i > (*noise_spectrum)(index, spectrum_start + i)) {
1874 mult2 = noise_up_time_mult;
1875 } else {
1876 mult2 = noise_down_time_mult;
1877 }
1878 }
1879 // inplace revise noise spectrum
1880 (*noise_spectrum)(index, spectrum_start + i) *= mult2;
1881 (*noise_spectrum)(index, spectrum_start + i) += dba_i * (1 - mult2);
1882
1883 dba_i = dba_i - noise_reduction_amount * (*noise_spectrum)(index, spectrum_start + i);
1884 dba_i = dba_i <= 0 ? 0 : std::sqrt(dba_i);
1885
1886 dft_buf_abs.push_back(dba_i);
1887 }
1888
1889 // cepstrum_buf
1890 std::vector<float> cepstrum_buf(dft_len_ws >> 1, 0);
1891 for (int i = 0; i < spectrum_end - spectrum_start; i++) {
1892 cepstrum_buf[spectrum_start + i] = dft_buf_abs[i] * cepstrum_window[i];
1893 }
1894 std::vector<std::complex<float>> rfft_res2;
1895 fft.fwd(rfft_res2, cepstrum_buf);
1896 rfft_len = rfft_res2.size() - (rfft_res.size() - 1) / TWO;
1897 rfft_res2.resize(rfft_len);
1898
1899 float result = 0;
1900 for (int i = cepstrum_start; i < cepstrum_end; i++) {
1901 result += std::pow(std::abs(rfft_res2[i]), TWO);
1902 }
1903
1904 result = result > 0 ? std::log(result / (cepstrum_end - cepstrum_start)) : INT_MIN;
1905 int base = 21;
1906 *meas = static_cast<float>(std::max(0.0f, base + result));
1907 return Status::OK();
1908 }
1909
1910 /// \brief Update parameters in VAD calculation.
UpdateVadParams(int32_t * samples_index_ns,int32_t * pos,const int32_t samples_len_ns,int32_t * measure_timer_ns,const int32_t measure_period_ns,int32_t * measures_index,const int32_t measures_len,int32_t * boot_count,const int32_t boot_count_max,bool has_triggered,const int32_t num_measures_to_flush,int32_t * flushed_len_ns)1911 inline Status UpdateVadParams(int32_t *samples_index_ns, int32_t *pos, const int32_t samples_len_ns,
1912 int32_t *measure_timer_ns, const int32_t measure_period_ns, int32_t *measures_index,
1913 const int32_t measures_len, int32_t *boot_count, const int32_t boot_count_max,
1914 bool has_triggered, const int32_t num_measures_to_flush, int32_t *flushed_len_ns) {
1915 RETURN_UNEXPECTED_IF_NULL(samples_index_ns);
1916 RETURN_UNEXPECTED_IF_NULL(pos);
1917 RETURN_UNEXPECTED_IF_NULL(measure_timer_ns);
1918 RETURN_UNEXPECTED_IF_NULL(measures_index);
1919 RETURN_UNEXPECTED_IF_NULL(boot_count);
1920 RETURN_UNEXPECTED_IF_NULL(flushed_len_ns);
1921
1922 *samples_index_ns = *samples_index_ns + 1;
1923 *pos += 1;
1924 CHECK_FAIL_RETURN_UNEXPECTED(measures_len != 0, "UpdateVadParams: the length of measures cannot be zero.");
1925 CHECK_FAIL_RETURN_UNEXPECTED(samples_len_ns != 0,
1926 "UpdateVadParams: the number of columns of samples cannot be zero.");
1927 *samples_index_ns = *samples_index_ns == samples_len_ns ? 0 : *samples_index_ns;
1928 if (*measure_timer_ns == 0) {
1929 *measure_timer_ns = measure_period_ns;
1930 *measures_index += 1;
1931 *measures_index = *measures_index % measures_len;
1932 *boot_count = (*boot_count >= 0) && (*boot_count == boot_count_max) ? -1 : *boot_count + 1;
1933 }
1934 if (has_triggered) {
1935 *flushed_len_ns = (measures_len - num_measures_to_flush) * measure_period_ns;
1936 *samples_index_ns = (*samples_index_ns + *flushed_len_ns) % samples_len_ns;
1937 }
1938 return Status::OK();
1939 }
1940
1941 /// \brief Init spectrum window and cepstrum window for VAD.
FlushMeasures(int measures_len,int measures_index,const int row_ind,const Eigen::MatrixXd & measures,float trigger_level,int gap_len,int * num_measures_to_flush)1942 inline Status FlushMeasures(int measures_len, int measures_index, const int row_ind, const Eigen::MatrixXd &measures,
1943 float trigger_level, int gap_len, int *num_measures_to_flush) {
1944 RETURN_UNEXPECTED_IF_NULL(num_measures_to_flush);
1945
1946 int n = measures_len, k = measures_index;
1947 int j_trigger = n, j_zero = n, j = 0;
1948 for (int j = 0; j < n; j++) {
1949 if ((measures(row_ind, k) >= trigger_level) && (j <= (j_trigger + gap_len))) {
1950 j_trigger = j;
1951 j_zero = j_trigger;
1952 } else if ((measures(row_ind, k) == 0) && (j_trigger >= j_zero)) {
1953 j_zero = j;
1954 }
1955 k = (k + n - 1) % n;
1956 }
1957 j = std::min(j, j_zero);
1958 *num_measures_to_flush = std::min(std::max((*num_measures_to_flush), j), n);
1959
1960 return Status::OK();
1961 }
1962
1963 /// \brief Init spectrum window and cepstrum window for Vad.
1964 template <typename T>
InitWindows(std::vector<T> * spectrum_window,std::vector<T> * cepstrum_window)1965 Status InitWindows(std::vector<T> *spectrum_window, std::vector<T> *cepstrum_window) {
1966 RETURN_UNEXPECTED_IF_NULL(spectrum_window);
1967 RETURN_UNEXPECTED_IF_NULL(cepstrum_window);
1968
1969 float half = 0.5;
1970 for (int i = 0; i < spectrum_window->size(); i++) {
1971 auto hann = half - half * std::cos(static_cast<T>(i) / spectrum_window->size() * PI * TWO);
1972 (*spectrum_window)[i] *= hann;
1973 }
1974
1975 for (int i = 0; i < cepstrum_window->size(); i++) {
1976 auto hann = half - half * std::cos(static_cast<T>(i) / cepstrum_window->size() * PI * TWO);
1977 (*cepstrum_window)[i] *= hann;
1978 }
1979 return Status::OK();
1980 }
1981
1982 /// \brief Voice activity detector.
1983 /// \param input/output Tensor of shape <..., time>.
1984 /// \param sample_rate Sample rate of audio signal.
1985 /// \param trigger_level The measurement level used to trigger activity detection.
1986 /// \param trigger_time The time constant (in seconds) used to help ignore short sounds.
1987 /// \param search_time The amount of audio (in seconds) to search for quieter/shorter sounds to include prior to
1988 /// the detected trigger point.
1989 /// \param allowed_gap The allowed gap (in seconds) between quiteter/shorter sounds to include prior to the
1990 /// detected trigger point.
1991 /// \param pre_trigger_time The amount of audio (in seconds) to preserve before the trigger point and any found
1992 /// quieter/shorter bursts.
1993 /// \param boot_time The time for the initial noise estimate.
1994 /// \param noise_up_time Time constant used by the adaptive noise estimator, when the noise level is increasing.
1995 /// \param noise_down_time Time constant used by the adaptive noise estimator, when the noise level is decreasing.
1996 /// \param noise_reduction_amount The amount of noise reduction used in the detection algorithm.
1997 /// \param measure_freq The frequency of the algorithm’s processing.
1998 /// \param measure_duration The duration of measurement.
1999 /// \param measure_smooth_time The time constant used to smooth spectral measurements.
2000 /// \param hp_filter_freq The "Brick-wall" frequency of high-pass filter applied at the input to the detector
2001 /// algorithm.
2002 /// \param lp_filter_freq The "Brick-wall" frequency of low-pass filter applied at the input to the detector
2003 /// algorithm.
2004 /// \param hp_lifter_freq The "Brick-wall" frequency of high-pass lifter applied at the input to the detector
2005 /// algorithm.
2006 /// \param lp_lifter_freq The "Brick-wall" frequency of low-pass lifter applied at the input to the detector
2007 /// algorithm.
2008 /// \return Status code.
2009 template <typename T>
Vad(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int sample_rate,T trigger_level,T trigger_time,T search_time,T allowed_gap,T pre_trigger_time,T boot_time,T noise_up_time,T noise_down_time,T noise_reduction_amount,T measure_freq,T measure_duration,T measure_smooth_time,T hp_filter_freq,T lp_filter_freq,T hp_lifter_freq,T lp_lifter_freq)2010 Status Vad(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int sample_rate, T trigger_level,
2011 T trigger_time, T search_time, T allowed_gap, T pre_trigger_time, T boot_time, T noise_up_time,
2012 T noise_down_time, T noise_reduction_amount, T measure_freq, T measure_duration, T measure_smooth_time,
2013 T hp_filter_freq, T lp_filter_freq, T hp_lifter_freq, T lp_lifter_freq) {
2014 const int measure_len_ws = static_cast<int>(sample_rate * measure_duration + 0.5);
2015 const int measure_len_ns = measure_len_ws;
2016
2017 int dft_len_ws = 16;
2018 for (; dft_len_ws < measure_len_ws;) {
2019 dft_len_ws *= TWO;
2020 }
2021
2022 CHECK_FAIL_RETURN_UNEXPECTED(measure_freq != 0, "Vad: measure_freq cannot be zero.");
2023 const int measure_period_ns = static_cast<int>(sample_rate / measure_freq + 0.5);
2024 const int measures_len = std::ceil(search_time * measure_freq);
2025 const int search_pre_trigger_len_ns = static_cast<int>(measures_len * measure_period_ns);
2026 const int gap_len = static_cast<int>(allowed_gap * measure_freq + 0.5);
2027 const int fixed_pre_trigger_len_ns = static_cast<int>(pre_trigger_time * sample_rate + 0.5);
2028 const int samples_len_ns = fixed_pre_trigger_len_ns + search_pre_trigger_len_ns + measure_len_ns;
2029
2030 CHECK_FAIL_RETURN_UNEXPECTED(sample_rate != 0, "Vad: sample_rate cannot be zero.");
2031 auto spectrum_start = static_cast<int>(hp_filter_freq / sample_rate * dft_len_ws + 0.5);
2032 spectrum_start = std::max(spectrum_start, 1);
2033 auto spectrum_end = static_cast<int>(lp_filter_freq / sample_rate * dft_len_ws + 0.5);
2034 spectrum_end = std::min(spectrum_end, dft_len_ws / TWO);
2035 CHECK_FAIL_RETURN_UNEXPECTED(spectrum_end > spectrum_start,
2036 "Vad: the end of spectrum must be greater than the start. Check if `hp_filter_freq` is "
2037 "too large or `lp_filter_freq` is too small.");
2038
2039 CHECK_FAIL_RETURN_UNEXPECTED(lp_lifter_freq != 0, "Vad: lp_lifter_freq cannot be zero.");
2040 CHECK_FAIL_RETURN_UNEXPECTED(hp_lifter_freq != 0, "Vad: hp_lifter_freq cannot be zero.");
2041 int cepstrum_start = std::ceil(sample_rate * 0.5 / lp_lifter_freq);
2042 int cepstrum_end = std::floor(sample_rate * 0.5 / hp_lifter_freq);
2043 cepstrum_end = std::min(cepstrum_end, dft_len_ws / (TWO * TWO));
2044 CHECK_FAIL_RETURN_UNEXPECTED(cepstrum_end > cepstrum_start,
2045 "Vad: the end of cepstrum must be greater than the start. Check if `hp_lifter_freq` is "
2046 "too large or `lp_lifter_freq` is too small.");
2047 // init spectrum & cepstrum window
2048 std::vector<T> spectrum_window(measure_len_ws, static_cast<T>(TWO / std::sqrt(static_cast<T>(measure_len_ws))));
2049 std::vector<T> cepstrum_window(spectrum_end - spectrum_start,
2050 static_cast<T>(TWO / std::sqrt(static_cast<T>(spectrum_end) - spectrum_start)));
2051 RETURN_IF_NOT_OK(InitWindows<T>(&spectrum_window, &cepstrum_window));
2052
2053 T noise_up_time_mult = std::exp(-1.0 / (noise_up_time * measure_freq));
2054 T noise_down_time_mult = std::exp(-1.0 / (noise_down_time * measure_freq));
2055 T measure_smooth_time_mult = std::exp(-1.0 / (measure_smooth_time * measure_freq));
2056 T trigger_meas_time_mult = std::exp(-1.0 / (trigger_time * measure_freq));
2057
2058 auto boot_count_max = static_cast<int>(boot_time * measure_freq - 0.5);
2059 auto measure_timer_ns = measure_len_ns;
2060 int boot_count = 0, measures_index = 0, flushed_len_ns = 0, samples_index_ns = 0;
2061
2062 // pack batch
2063 TensorShape input_shape = input->shape();
2064 TensorShape to_shape({input->Size() / input_shape[-1], input_shape[-1]});
2065 RETURN_IF_NOT_OK(input->Reshape(to_shape));
2066 int n_channels = to_shape[0], ilen = to_shape[1];
2067 std::vector<T> mean_meas(n_channels, 0);
2068 Eigen::MatrixXd samples = Eigen::MatrixXd::Zero(n_channels, samples_len_ns);
2069 Eigen::MatrixXd spectrum = Eigen::MatrixXd::Zero(n_channels, dft_len_ws);
2070 Eigen::MatrixXd noise_spectrum = Eigen::MatrixXd::Zero(n_channels, dft_len_ws);
2071 Eigen::MatrixXd measures = Eigen::MatrixXd::Zero(n_channels, measures_len);
2072
2073 bool has_triggered = false;
2074 int num_measures_to_flush = 0, pos = 0;
2075
2076 // convert input to eigen mat
2077 auto wave_ptr = &*input->begin<T>();
2078 Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> waveform_t(wave_ptr, ilen, n_channels);
2079 auto waveform = waveform_t.transpose();
2080 while ((pos < ilen) && (!has_triggered)) {
2081 measure_timer_ns -= 1;
2082 for (int i = 0; i < n_channels; i++) {
2083 samples(i, samples_index_ns) = waveform(i, pos);
2084
2085 if (measure_timer_ns == 0) {
2086 int index_ns = (samples_index_ns + samples_len_ns - measure_len_ns) % samples_len_ns;
2087 // measure
2088 float meas = 0;
2089 RETURN_IF_NOT_OK(Measure(measure_len_ws, i, samples, &spectrum, &noise_spectrum, spectrum_window,
2090 spectrum_start, spectrum_end, cepstrum_window, cepstrum_start, cepstrum_end,
2091 noise_reduction_amount, measure_smooth_time_mult, noise_up_time_mult,
2092 noise_down_time_mult, index_ns, boot_count, &meas));
2093 measures(i, measures_index) = meas;
2094 mean_meas[i] = mean_meas[i] * trigger_meas_time_mult + meas * (1.0 - trigger_meas_time_mult);
2095
2096 has_triggered = has_triggered || (mean_meas[i] >= trigger_level);
2097 if (has_triggered) {
2098 RETURN_IF_NOT_OK(
2099 FlushMeasures(measures_len, measures_index, i, measures, trigger_level, gap_len, &num_measures_to_flush));
2100 }
2101 }
2102 }
2103 RETURN_IF_NOT_OK(UpdateVadParams(&samples_index_ns, &pos, samples_len_ns, &measure_timer_ns, measure_period_ns,
2104 &measures_index, measures_len, &boot_count, boot_count_max, has_triggered,
2105 num_measures_to_flush, &flushed_len_ns));
2106 }
2107
2108 // results truncate
2109 int new_col_ind = pos - samples_len_ns + flushed_len_ns;
2110 if (new_col_ind < (-1 * waveform.cols())) {
2111 new_col_ind = 0;
2112 } else if (new_col_ind < 0) {
2113 new_col_ind += ilen;
2114 }
2115 int new_cols = ilen - new_col_ind;
2116 auto res = waveform.rightCols(new_cols);
2117 // unpack
2118 std::shared_ptr<Tensor> res_tensor;
2119 Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> mat_res = res.transpose();
2120 std::vector<T> res_vec(mat_res.data(), mat_res.data() + mat_res.size());
2121 RETURN_IF_NOT_OK(Tensor::CreateFromVector(res_vec, TensorShape({n_channels, new_cols}), &res_tensor));
2122 auto reshape_vec = input_shape.AsVector();
2123 reshape_vec[input_shape.Size() - 1] = new_cols;
2124 RETURN_IF_NOT_OK(res_tensor->Reshape(TensorShape(reshape_vec)));
2125 *output = res_tensor;
2126 return Status::OK();
2127 }
2128
2129 /// \brief Flip tensor in last dimension.
2130 /// \param input: Tensor of shape <..., time>
2131 /// \Returns Status code.
2132 template <typename T>
FlipLastDim(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)2133 Status FlipLastDim(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
2134 // Create input copy
2135 std::shared_ptr<Tensor> res_tensor;
2136 RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), input->type(), &res_tensor));
2137 int64_t step_length = input->shape()[-1];
2138 auto itr = input->begin<T>();
2139 auto tar_itr = res_tensor->begin<T>();
2140
2141 // Flip
2142 while (itr != input->end<T>()) {
2143 auto axis_begin = itr;
2144 auto axis_end = itr + static_cast<ptrdiff_t>(step_length);
2145 itr = axis_end;
2146
2147 // Reversed copy input value T to res_tensor cache
2148 while (axis_begin != axis_end) {
2149 axis_end--;
2150 *tar_itr = *axis_end;
2151 tar_itr++;
2152 }
2153 }
2154 *output = res_tensor;
2155 return Status::OK();
2156 }
2157
2158 /// \brief Perform an IIR filter forward and backward to a waveform.
2159 /// \param input/output: Tensor of shape <..., time>
2160 /// \param a_coeffs: denominator coefficients of difference equation of dimension of (n_order + 1).
2161 /// \param b_coeffs: numerator coefficients of difference equation of dimension of (n_order + 1).
2162 /// \param clamp: If True, clamp the output signal to be in the range [-1, 1]. Default: True.
2163 /// \return Status code
2164 template <typename T>
Filtfilt(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,std::vector<T> a_coeffs,std::vector<T> b_coeffs,bool clamp)2165 Status Filtfilt(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, std::vector<T> a_coeffs,
2166 std::vector<T> b_coeffs, bool clamp) {
2167 std::shared_ptr<Tensor> forward;
2168 std::shared_ptr<Tensor> reversed_forward;
2169 std::shared_ptr<Tensor> backward;
2170 std::shared_ptr<Tensor> reversed_backward;
2171
2172 RETURN_IF_NOT_OK(LFilter(input, &forward, a_coeffs, b_coeffs, false));
2173 RETURN_IF_NOT_OK(FlipLastDim<T>(forward, &reversed_forward));
2174 RETURN_IF_NOT_OK(LFilter(reversed_forward, &backward, a_coeffs, b_coeffs, clamp));
2175 RETURN_IF_NOT_OK(FlipLastDim<T>(backward, &reversed_backward));
2176
2177 *output = reversed_backward;
2178
2179 return Status::OK();
2180 }
2181
2182 /// \brief Resample a signal from one frequency to another. A resampling method can be given.
2183 /// \param[in] input Input tensor.
2184 /// \param[out] output Output tensor.
2185 /// \param[in] orig_freq The original frequency of the signal.
2186 /// \param[in] des_freq The desired frequency.
2187 /// \param[in] resample_method The resample method.
2188 /// \param[in] lowpass_filter_width Controls the sharpness of the filter, more means sharper but less efficient.
2189 /// \param[in] rolloff The roll-off frequency of the filter, as a fraction of the Nyquist. Lower values reduce
2190 /// anti-aliasing, but also reduce some of the highest frequencies.
2191 /// \param[in] beta The shape parameter used for kaiser window.
2192 /// \return Status return code.
2193 Status Resample(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float orig_freq, float des_freq,
2194 ResampleMethod resample_method, int32_t lowpass_filter_width, float rolloff, float beta);
2195
2196 /// \brief Create LFCC for a raw audio signal.
2197 /// \param[in] input Input tensor.
2198 /// \param[out] output Output tensor.
2199 /// \param[in] sample_rate Sample rate of audio signal.
2200 /// \param[in] n_filter Number of linear filters to apply.
2201 /// \param[in] n_lfcc Number of lfc coefficients to retain.
2202 /// \param[in] dct_type Type of DCT (discrete cosine transform) to use.
2203 /// \param[in] log_lf Whether to use log-lf spectrograms instead of db-scaled.
2204 /// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins.
2205 /// \param[in] win_length Window size.
2206 /// \param[in] hop_length Length of hop between STFT windows.
2207 /// \param[in] f_min Minimum frequency.
2208 /// \param[in] f_max Maximum frequency.
2209 /// \param[in] pad Two sided padding of signal.
2210 /// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
2211 /// \param[in] power Exponent for the magnitude spectrogram, (must be > 0) e.g., 1 for energy, 2 for power, etc.
2212 /// \param[in] normalized Whether to normalize by magnitude after stft.
2213 /// \param[in] center Whether to pad waveform on both sides so that the tt-th frame is centered at time t
2214 /// t*hop_length.
2215 /// \param[in] pad_mode Controls the padding method used when center is True.
2216 /// \param[in] onesided Controls whether to return half of results to avoid redundancy.
2217 /// \param[in] norm Norm to use.
2218 /// \return Status code.
2219 Status LFCC(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
2220 int32_t n_filter, int32_t n_lfcc, int32_t dct_type, bool log_lf, int32_t n_fft, int32_t win_length,
2221 int32_t hop_length, float f_min, float f_max, int32_t pad, WindowType window, float power, bool normalized,
2222 bool center, BorderType pad_mode, bool onesided, NormMode norm);
2223
2224 /// \brief Create MelSpectrogram for a raw audio signal.
2225 /// \param[in] input Input tensor.
2226 /// \param[out] output Output tensor.
2227 /// \param[in] sample_rate Sample rate of audio signal.
2228 /// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins.
2229 /// \param[in] win_length Window size.
2230 /// \param[in] hop_length Length of hop between STFT windows.
2231 /// \param[in] f_min Minimum frequency, which must be non negative.
2232 /// \param[in] f_max Maximum frequency, which must be positive.
2233 /// \param[in] pad Two sided padding of signal.
2234 /// \param[in] n_mels Number of mel filter, which must be positive.
2235 /// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
2236 /// \param[in] power Exponent for the magnitude spectrogram, (must be > 0) e.g., 1 for energy, 2 for power, etc.
2237 /// \param[in] normalized Whether to normalize by magnitude after stft.
2238 /// \param[in] center Whether to pad waveform on both sides.
2239 /// \param[in] pad_mode controls the padding method used when center is True.
2240 /// \param[in] onesided controls whether to return half of results to avoid redundancy.
2241 /// \param[in] norm If 'slaney', divide the triangular mel weights by the width of the mel band (area normalization).
2242 /// \param[in] mel_scale Scale to use: htk or slaney.
2243 /// \return Status return code.
2244 Status MelSpectrogram(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
2245 int32_t n_fft, int32_t win_length, int32_t hop_length, float f_min, float f_max, int32_t pad,
2246 int32_t n_mels, WindowType window, float power, bool normalized, bool center, BorderType pad_mode,
2247 bool onesided, NormType norm, MelType mel_scale);
2248
2249 /// \brief Create MFCC for a raw audio signal.
2250 /// \param[in] input Input tensor.
2251 /// \param[out] output Output tensor.
2252 /// \param[in] sample_rate Sample rate of audio signal.
2253 /// \param[in] n_mfcc Number of mfc coefficients to retain.
2254 /// \param[in] dct_type Type of DCT (discrete cosine transform) to use.
2255 /// \param[in] log_mels Whether to use log-mel spectrograms instead of db-scaled.
2256 /// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins.
2257 /// \param[in] win_length Window size.
2258 /// \param[in] hop_length Length of hop between STFT windows.
2259 /// \param[in] f_min Minimum frequency.
2260 /// \param[in] f_max Maximum frequency.
2261 /// \param[in] pad Two sided padding of signal.
2262 /// \param[in] n_mels Number of mel filterbanks.
2263 /// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
2264 /// \param[in] power Exponent for the magnitude spectrogram, (must be > 0) e.g., 1 for energy, 2 for power, etc.
2265 /// \param[in] normalized Whether to normalize by magnitude after stft.
2266 /// \param[in] center Whether to pad waveform on both sides.
2267 /// \param[in] pad_mode Controls the padding method used when center is True.
2268 /// \param[in] onesided Controls whether to return half of results to avoid redundancy.
2269 /// \param[in] norm Norm to use.
2270 /// \param[in] norm_M If 'slaney', divide the triangular mel weights by the width of the mel band (area normalization).
2271 /// \param[in] mel_scale Scale to use: htk or slaney.
2272 /// \return Status return code.
2273 Status MFCC(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate, int32_t n_mfcc,
2274 int32_t dct_type, bool log_mels, int32_t n_fft, int32_t win_length, int32_t hop_length, float f_min,
2275 float f_max, int32_t pad, int32_t n_mels, WindowType window, float power, bool normalized, bool center,
2276 BorderType pad_mode, bool onesided, NormType norm, NormMode norm_M, MelType mel_scale);
2277
2278 /// \brief Shift the pitch of a waveform by steps.
2279 /// \param[in] input Input tensor.
2280 /// \param[out] output Output tensor.
2281 /// \param[in] sample_rate Sample rate of audio signal.
2282 /// \param[in] n_steps The steps to shift audio signal.
2283 /// \param[in] bins_per_octave The number of steps per octave
2284 /// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins.
2285 /// \param[in] win_length Window size.
2286 /// \param[in] hop_length Length of hop between STFT windows.
2287 /// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
2288 /// \return Status return code.
2289 Status PitchShift(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
2290 int32_t n_steps, int32_t bins_per_octave, int32_t n_fft, int32_t win_length, int32_t hop_length,
2291 WindowType window);
2292 } // namespace dataset
2293 } // namespace mindspore
2294 #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_AUDIO_UTILS_H_
2295