• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2021-2023 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_AUDIO_UTILS_H_
17 #define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_AUDIO_UTILS_H_
18 
19 #include <Eigen/Dense>
20 #include <unsupported/Eigen/FFT>
21 
22 #include <algorithm>
23 #include <cmath>
24 #include <complex>
25 #include <limits>
26 #include <memory>
27 #include <random>
28 #include <string>
29 #include <vector>
30 
31 #include "minddata/dataset/core/tensor.h"
32 #include "minddata/dataset/kernels/data/data_utils.h"
33 #include "minddata/dataset/kernels/tensor_op.h"
34 #include "minddata/dataset/util/status.h"
35 #include "minddata/dataset/util/validators.h"
36 
37 constexpr double PI = 3.141592653589793;
38 constexpr int kMinAudioDim = 1;
39 constexpr int kDefaultAudioDim = 2;
40 constexpr int TWO = 2;
41 constexpr float HALF = 0.5;
42 
43 namespace mindspore {
44 namespace dataset {
45 /// \brief Turn a tensor from the power/amplitude scale to the decibel scale.
46 /// \param input/output: Tensor of shape <..., freq, time>.
47 /// \param multiplier: power - 10, amplitude - 20.
48 /// \param amin: lower bound.
49 /// \param db_multiplier: multiplier for decibels.
50 /// \param top_db: the lower bound for decibels cut-off.
51 /// \return Status code.
52 template <typename T>
AmplitudeToDB(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,T multiplier,T amin,T db_multiplier,T top_db)53 Status AmplitudeToDB(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T multiplier, T amin,
54                      T db_multiplier, T top_db) {
55   TensorShape input_shape = input->shape();
56   TensorShape to_shape = input_shape.Rank() == 2
57                            ? TensorShape({1, 1, input_shape[-2], input_shape[-1]})
58                            : TensorShape({input->Size() / (input_shape[-3] * input_shape[-2] * input_shape[-1]),
59                                           input_shape[-3], input_shape[-2], input_shape[-1]});
60   RETURN_IF_NOT_OK(Tensor::CreateFromTensor(input, output));
61   RETURN_IF_NOT_OK((*output)->Reshape(to_shape));
62 
63   std::vector<T> max_val;
64   uint64_t step = to_shape[-3] * input_shape[-2] * input_shape[-1];
65   uint64_t cnt = 0;
66   T temp_max = std::numeric_limits<T>::lowest();
67   for (auto itr = (*output)->begin<T>(); itr != (*output)->end<T>(); itr++) {
68     // do clamp
69     *itr = *itr < amin ? log10(amin) * multiplier : log10(*itr) * multiplier;
70     *itr -= multiplier * db_multiplier;
71     // calculate max by axis
72     cnt++;
73     if ((*itr) > temp_max) {
74       temp_max = *itr;
75     }
76     if (cnt % step == 0) {
77       max_val.push_back(temp_max);
78       temp_max = std::numeric_limits<T>::lowest();
79     }
80   }
81 
82   if (!std::isnan(top_db)) {
83     uint64_t ind = 0;
84     for (auto itr = (*output)->begin<T>(); itr != (*output)->end<T>(); itr++, ind++) {
85       T lower_bound = max_val[ind / step] - top_db;
86       *itr = std::max((*itr), lower_bound);
87     }
88   }
89   RETURN_IF_NOT_OK((*output)->Reshape(input_shape));
90   return Status::OK();
91 }
92 
93 /// \brief Use custom thread pools.
94 /// \param task: Lambda functions to be processed.
95 /// \param input_size: Size of the input.
96 /// \param block_size: Customized size for each thread processing.
97 /// \return Status code.
98 Status AudioParallelLaunch(const std::function<void(size_t, size_t, size_t, size_t)> &task, size_t input_size,
99                            float block_size);
100 
101 /// \brief Compute the thread nums.
102 /// \param input_size: Size of the input.
103 /// \param block_size: Customized size for each thread processing.
104 /// \param task_num: The final calculated number of threads.
105 /// \param once_compute_size: The final calculated size for each thread processing.
106 /// \return Status code.
107 Status CountThreadNums(size_t input_size, float block_size, size_t *task_num, size_t *once_compute_size);
108 
109 /// \brief Calculate the angles of the complex numbers.
110 /// \param input/output: Tensor of shape <..., time>.
111 template <typename T>
Angle(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)112 Status Angle(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
113   TensorShape shape = input->shape();
114   std::vector output_shape = shape.AsVector();
115   output_shape.pop_back();
116   std::shared_ptr<Tensor> out;
117   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape(output_shape), input->type(), &out));
118 
119   std::vector<std::thread> threads;
120   size_t input_size = input->Size();
121   float block_size = 30000;
122   auto task = [&input, &out, &input_size](size_t start, size_t end, size_t num, size_t task_num) {
123     T o;
124     T x;
125     T y;
126 
127     if ((task_num - num) == 1) {
128       end = input_size;
129     }
130 
131     auto itr_start = input->begin<T>();
132     itr_start += start;
133     auto itr_end = input->begin<T>();
134     itr_end += end;
135     auto itr_out = out->begin<T>();
136     size_t offset = start / TWO;
137     itr_out += offset;
138 
139     // calculate norm, using: .pow(2.).sum(-1).pow(0.5 * power)
140     for (auto itr = itr_start; itr != itr_end; itr++) {
141       x = *itr;
142       itr++;
143       y = *itr;
144       o = std::atan2(y, x);
145       *itr_out = o;
146       itr_out++;
147     }
148   };
149   RETURN_IF_NOT_OK(AudioParallelLaunch(task, input_size, block_size));
150   *output = out;
151   return Status::OK();
152 }
153 
154 Status Bartlett(std::shared_ptr<Tensor> *output, int len);
155 
156 /// \brief Perform a biquad filter of input tensor.
157 /// \param input/output: Tensor of shape <..., time>.
158 /// \param a0: denominator coefficient of current output y[n], typically 1.
159 /// \param a1: denominator coefficient of current output y[n-1].
160 /// \param a2: denominator coefficient of current output y[n-2].
161 /// \param b0: numerator coefficient of current input, x[n].
162 /// \param b1: numerator coefficient of input one time step ago x[n-1].
163 /// \param b2: numerator coefficient of input two time steps ago x[n-2].
164 /// \return Status code.
165 template <typename T>
Biquad(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,T b0,T b1,T b2,T a0,T a1,T a2)166 Status Biquad(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T b0, T b1, T b2, T a0, T a1,
167               T a2) {
168   std::vector<T> a_coeffs;
169   std::vector<T> b_coeffs;
170   a_coeffs.push_back(a0);
171   a_coeffs.push_back(a1);
172   a_coeffs.push_back(a2);
173   b_coeffs.push_back(b0);
174   b_coeffs.push_back(b1);
175   b_coeffs.push_back(b2);
176   return LFilter(input, output, a_coeffs, b_coeffs, true);
177 }
178 
179 /// \brief Apply contrast effect.
180 /// \param input/output: Tensor of shape <..., time>.
181 /// \param enhancement_amount: controls the amount of the enhancement.
182 /// \return Status code.
183 template <typename T>
Contrast(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,T enhancement_amount)184 Status Contrast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T enhancement_amount) {
185   const float enhancement_zoom = 750.0;
186   T enhancement_amount_value = enhancement_amount / enhancement_zoom;
187   TensorShape output_shape{input->shape()};
188   std::shared_ptr<Tensor> out;
189   RETURN_IF_NOT_OK(Tensor::CreateEmpty(output_shape, input->type(), &out));
190 
191   // Get the thread num and once compute size.
192   float block_size = 6000;
193   size_t input_size = input->Size();
194 
195   auto task = [&input, &out, &input_size, &enhancement_amount_value](size_t start, size_t end, size_t num,
196                                                                      size_t task_num) {
197     if ((task_num - num) == 1) {
198       end = input_size;
199     }
200 
201     auto itr_out = out->begin<T>();
202     itr_out += start;
203     auto tmp_start = input->begin<T>();
204     tmp_start += start;
205     auto tmp_end = input->begin<T>();
206     tmp_end += end;
207 
208     for (auto itr_in = tmp_start; itr_in != tmp_end; itr_in++) {
209       // PI / 2 is half of the constant PI
210       T temp1 = (*itr_in) * (PI / TWO);
211       T temp2 = enhancement_amount_value * std::sin(temp1 * 4);
212       *itr_out = std::sin(temp1 + temp2);
213       itr_out++;
214     }
215   };
216   RETURN_IF_NOT_OK(AudioParallelLaunch(task, input_size, block_size));
217   *output = out;
218   return Status::OK();
219 }
220 
221 /// \brief Apply DBToAmplitude effect.
222 /// \param input/output: Tensor of shape <...,time>
223 /// \param ref: Reference which the output will be scaled by.
224 /// \param power: If power equals 1, will compute DB to power. If 0.5, will compute DB to amplitude.
225 /// \return Status code
226 template <typename T>
DBToAmplitude(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,T ref,T power)227 Status DBToAmplitude(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T ref, T power) {
228   std::shared_ptr<Tensor> out;
229   RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), input->type(), &out));
230   auto itr_out = out->begin<T>();
231   constexpr int64_t pow_factor_x = 10;
232   constexpr double pow_factor_y = 0.1;
233   for (auto itr_in = input->begin<T>(); itr_in != input->end<T>(); itr_in++) {
234     *itr_out = ref * pow(pow(pow_factor_x, (*itr_in) * pow_factor_y), power);
235     itr_out++;
236   }
237   *output = out;
238   return Status::OK();
239 }
240 
241 /// \brief Apply a DC shift to the audio.
242 /// \param input/output: Tensor of shape <...,time>.
243 /// \param shift: the amount to shift the audio.
244 /// \param limiter_gain: used only on peaks to prevent clipping.
245 /// \return Status code.
246 template <typename T>
DCShift(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,float shift,float limiter_gain)247 Status DCShift(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float shift, float limiter_gain) {
248   RETURN_IF_NOT_OK(Tensor::CreateFromTensor(input, output));
249   float limiter_threshold = 0.0;
250   if (std::fabs(shift - limiter_gain) > std::numeric_limits<float>::epsilon() &&
251       std::fabs(shift) > std::numeric_limits<float>::epsilon()) {
252     limiter_threshold = 1.0 - (std::abs(shift) - limiter_gain);
253     for (auto itr = (*output)->begin<T>(); itr != (*output)->end<T>(); itr++) {
254       if (*itr > limiter_threshold && shift > 0) {
255         T peak = (*itr - limiter_threshold) * limiter_gain / (1 - limiter_threshold);
256         T sample = (peak + limiter_threshold + shift);
257         *itr = sample > limiter_threshold ? limiter_threshold : sample;
258       } else if (*itr < -limiter_threshold && shift < 0) {
259         T peak = (*itr + limiter_threshold) * limiter_gain / (1 - limiter_threshold);
260         T sample = (peak + limiter_threshold + shift);
261         *itr = sample < -limiter_threshold ? -limiter_threshold : sample;
262       } else {
263         T sample = (*itr + shift);
264         *itr = (sample > 1 || sample < -1) ? (sample > 1 ? 1 : -1) : sample;
265       }
266     }
267   } else {
268     for (auto itr = (*output)->begin<T>(); itr != (*output)->end<T>(); itr++) {
269       T sample = (*itr + shift);
270       *itr = sample > 1 || sample < -1 ? (sample > 1 ? 1 : -1) : sample;
271     }
272   }
273   return Status::OK();
274 }
275 
276 /// \brief Apply amplification or attenuation to the whole waveform.
277 /// \param input/output: Tensor of shape <..., time>.
278 /// \param gain_db: Gain adjustment in decibels (dB).
279 /// \return Status code.
280 template <typename T>
Gain(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,T gain_db)281 Status Gain(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T gain_db) {
282   RETURN_IF_NOT_OK(Tensor::CreateFromTensor(input, output));
283   if (gain_db == 0) {
284     return Status::OK();
285   }
286 
287   T radio = pow(10, gain_db / 20);
288   for (auto itr = (*output)->begin<T>(); itr != (*output)->end<T>(); ++itr) {
289     *itr = (*itr) * radio;
290   }
291   return Status::OK();
292 }
293 
294 /// \brief Perform an IIR filter by evaluating difference equation.
295 /// \param input/output: Tensor of shape <..., time>
296 /// \param a_coeffs: denominator coefficients of difference equation of dimension of (n_order + 1).
297 /// \param b_coeffs: numerator coefficients of difference equation of dimension of (n_order + 1).
298 /// \param clamp: If True, clamp the output signal to be in the range [-1, 1] (Default: True).
299 /// \return Status code
300 template <typename T>
LFilter(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,std::vector<T> a_coeffs,std::vector<T> b_coeffs,bool clamp)301 Status LFilter(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, std::vector<T> a_coeffs,
302                std::vector<T> b_coeffs, bool clamp) {
303   //  pack batch
304   TensorShape input_shape = input->shape();
305   TensorShape toShape({input->Size() / input_shape[-1], input_shape[-1]});
306   input->Reshape(toShape);
307   auto shape_0 = static_cast<size_t>(input->shape()[0]);
308   auto shape_1 = static_cast<size_t>(input->shape()[1]);
309   std::vector<T> signal;
310   std::shared_ptr<Tensor> out;
311   std::vector<T> out_vect(shape_0 * shape_1);
312   size_t x_idx = 0;
313   size_t channel_idx = 1;
314   size_t m_num_order = b_coeffs.size() - 1;
315   size_t m_den_order = a_coeffs.size() - 1;
316   CHECK_FAIL_RETURN_UNEXPECTED(a_coeffs[0] != static_cast<T>(0),
317                                "Invalid data, the first value of 'a_coeffs' should not be 0, but got 0.");
318   // init A_coeffs and B_coeffs by div(a0)
319   for (size_t i = 1; i < a_coeffs.size(); i++) {
320     a_coeffs[i] /= a_coeffs[0];
321   }
322   for (size_t i = 0; i < b_coeffs.size(); i++) {
323     b_coeffs[i] /= a_coeffs[0];
324   }
325   // Sliding window
326   std::vector<T> m_px(m_num_order + 1, static_cast<T>(0));
327   std::vector<T> m_py(m_den_order + 1, static_cast<T>(0));
328 
329   // Tensor -> vector
330   for (auto itr = input->begin<T>(); itr != input->end<T>();) {
331     while (x_idx < shape_1 * channel_idx) {
332       signal.push_back(*itr);
333       itr++;
334       x_idx++;
335     }
336     // Sliding window
337     for (size_t j = 0; j < m_den_order; j++) {
338       m_px[j] = static_cast<T>(0);
339     }
340     for (size_t j = 0; j <= m_den_order; j++) {
341       m_py[j] = static_cast<T>(0);
342     }
343     // Each channel is processed with the sliding window
344     for (size_t i = x_idx - shape_1; i < x_idx; i++) {
345       m_px[m_num_order] = signal[i];
346       for (size_t j = 0; j < m_num_order + 1; j++) {
347         m_py[m_num_order] += b_coeffs[j] * m_px[m_num_order - j];
348       }
349       for (size_t j = 1; j < m_den_order + 1; j++) {
350         m_py[m_num_order] -= a_coeffs[j] * m_py[m_num_order - j];
351       }
352       if (clamp) {
353         if (m_py[m_num_order] > static_cast<T>(1))
354           out_vect[i] = static_cast<T>(1);
355         else if (m_py[m_num_order] < static_cast<T>(-1))
356           out_vect[i] = static_cast<T>(-1);
357         else
358           out_vect[i] = m_py[m_num_order];
359       } else {
360         out_vect[i] = m_py[m_num_order];
361       }
362       if (i + 1 == x_idx) {
363         continue;
364       }
365       for (size_t j = 0; j < m_num_order; j++) {
366         m_px[j] = m_px[j + 1];
367       }
368       for (size_t j = 0; j < m_num_order; j++) {
369         m_py[j] = m_py[j + 1];
370       }
371       m_py[m_num_order] = static_cast<T>(0);
372     }
373     if (x_idx % shape_1 == 0) {
374       ++channel_idx;
375     }
376   }
377   // unpack batch
378   RETURN_IF_NOT_OK(Tensor::CreateFromVector(out_vect, input_shape, &out));
379   *output = out;
380   return Status::OK();
381 }
382 
383 /// \brief Generate linearly spaced vector.
384 /// \param[in] start Value of the startpoint.
385 /// \param[in] end Value of the endpoint.
386 /// \param[in] n N points in the output tensor.
387 /// \param[out] output Tensor has n points with linearly space.
388 ///     The spacing between the points is (end - start) / (n - 1).
389 /// \return Status return code.
390 template <typename T>
Linspace(std::shared_ptr<Tensor> * output,T start,T end,int32_t n)391 Status Linspace(std::shared_ptr<Tensor> *output, T start, T end, int32_t n) {
392   RETURN_IF_NOT_OK(ValidateNoGreaterThan("Linspace", "start", start, "end", end));
393   int hundred = 100;
394   n = std::isnan(static_cast<double>(n * 1.0)) ? hundred : n;
395   CHECK_FAIL_RETURN_UNEXPECTED(n >= 0, "Linspace: input param n must be non-negative.");
396 
397   TensorShape out_shape({n});
398   std::vector<T> linear_vect(n);
399   T interval = (n == 1) ? 0 : ((end - start) / (n - 1));
400   for (auto i = 0; i < linear_vect.size(); ++i) {
401     linear_vect[i] = start + i * interval;
402   }
403   std::shared_ptr<Tensor> out_t;
404   RETURN_IF_NOT_OK(Tensor::CreateFromVector(linear_vect, out_shape, &out_t));
405   linear_vect.clear();
406   linear_vect.shrink_to_fit();
407   *output = out_t;
408   return Status::OK();
409 }
410 
411 template <typename T>
CreateTriangularFilterbank(std::shared_ptr<Tensor> * output,const std::shared_ptr<Tensor> & all_freqs,const std::shared_ptr<Tensor> & f_pts)412 Status CreateTriangularFilterbank(std::shared_ptr<Tensor> *output, const std::shared_ptr<Tensor> &all_freqs,
413                                   const std::shared_ptr<Tensor> &f_pts) {
414   // calculate the difference between each mel point and each stft freq point in hertz.
415   std::vector<T> f_diff;
416   auto iter_fpts1 = f_pts->begin<T>();
417   auto iter_fpts2 = f_pts->begin<T>();
418   ++iter_fpts2;
419   for (size_t i = 1; i < f_pts->Size(); i++) {
420     f_diff.push_back(*iter_fpts2 - *iter_fpts1);
421     ++iter_fpts2;
422     ++iter_fpts1;
423   }
424 
425   std::vector<T> slopes;
426   TensorShape slopes_shape({all_freqs->Size(), f_pts->Size()});
427   auto iter_all_freq = all_freqs->begin<T>();
428   for (; iter_all_freq != all_freqs->end<T>(); ++iter_all_freq) {
429     auto iter_f_pts = f_pts->begin<T>();
430     for (; iter_f_pts != f_pts->end<T>(); ++iter_f_pts) {
431       slopes.push_back(*iter_f_pts - *iter_all_freq);
432     }
433   }
434 
435   // calculate up and down slopes for creating overlapping triangles.
436   std::vector<T> down_slopes;
437   TensorShape down_slopes_shape({all_freqs->Size(), f_pts->Size() - 2});
438   for (size_t row = 0; row < down_slopes_shape[0]; row++) {
439     for (size_t col = 0; col < down_slopes_shape[1]; col++) {
440       down_slopes.push_back(-slopes[col + row * f_pts->Size()] / f_diff[col]);
441     }
442   }
443 
444   std::vector<T> up_slopes;
445   TensorShape up_slopes_shape({all_freqs->Size(), f_pts->Size() - 2});
446   for (size_t row = 0; row < up_slopes_shape[0]; row++) {
447     for (size_t col = 2; col < f_pts->Size(); col++) {
448       up_slopes.push_back(slopes[col + row * f_pts->Size()] / f_diff[col - 1]);
449     }
450   }
451 
452   // clip the value of triangles and save into fb.
453   std::vector<T> fb;
454   T zero = 0;
455   TensorShape fb_shape({all_freqs->Size(), f_pts->Size() - 2});
456   for (size_t i = 0; i < down_slopes.size(); i++) {
457     fb.push_back(std::max(zero, std::min(down_slopes[i], up_slopes[i])));
458   }
459 
460   std::shared_ptr<Tensor> fb_tensor;
461   RETURN_IF_NOT_OK(Tensor::CreateFromVector(fb, fb_shape, &fb_tensor));
462   *output = fb_tensor;
463   return Status::OK();
464 }
465 
466 /// \brief Create a frequency transformation matrix with shape (n_freqs, n_mels).
467 /// \param output Tensor of the frequency transformation matrix.
468 /// \param n_freqs: Number of frequency.
469 /// \param f_min: Minimum of frequency in Hz.
470 /// \param f_max: Maximum of frequency in Hz.
471 /// \param n_mels: Number of mel filterbanks.
472 /// \param sample_rate: Sample rate.
473 /// \param norm: Norm to use, can be NormTyppe::kSlaney or NormTyppe::kNone.
474 /// \param mel_type: Scale to use, can be MelTyppe::kSlaney or MelTyppe::kHtk.
475 /// \return Status code.
476 template <typename T>
CreateFbanks(std::shared_ptr<Tensor> * output,int32_t n_freqs,float f_min,float f_max,int32_t n_mels,int32_t sample_rate,NormType norm,MelType mel_type)477 Status CreateFbanks(std::shared_ptr<Tensor> *output, int32_t n_freqs, float f_min, float f_max, int32_t n_mels,
478                     int32_t sample_rate, NormType norm, MelType mel_type) {
479   // min_log_hz, min_log_mel, logstep and f_sp are the const of the mel value equation.
480   const double min_log_hz = 1000.0;
481   const double min_log_mel = 1000 / (200.0 / 3);
482   const double logstep = log(6.4) / 27.0;
483   const double f_sp = 200.0 / 3;
484 
485   // hez_to_mel_c and mel_to_hz_c are the const coefficient of mel frequency cepstrum.
486   const double hz_to_mel_c = 2595.0;
487   const double mel_to_hz_c = 700.0;
488 
489   // all_freqs is equivalent filterbank construction.
490   std::shared_ptr<Tensor> all_freqs;
491   // the sampling frequency is at least twice the highest frequency of the signal.
492   const double signal_times = 2;
493   RETURN_IF_NOT_OK(Linspace<T>(&all_freqs, 0, sample_rate / signal_times, n_freqs));
494 
495   // calculate mel value by f_min and f_max.
496   double m_min = 0.0;
497   double m_max = 0.0;
498   if (mel_type == MelType::kHtk) {
499     m_min = hz_to_mel_c * log10(1.0 + (f_min / mel_to_hz_c));
500     m_max = hz_to_mel_c * log10(1.0 + (f_max / mel_to_hz_c));
501   } else {
502     m_min = (f_min - 0.0) / f_sp;
503     m_max = (f_max - 0.0) / f_sp;
504     if (f_min >= min_log_hz) {
505       m_min = min_log_mel + log(f_min / min_log_hz) / logstep;
506     }
507     if (f_max >= min_log_hz) {
508       m_max = min_log_mel + log(f_max / min_log_hz) / logstep;
509     }
510   }
511 
512   // m_pts is mel value sequence in linspace of  (m_min, m_max).
513   std::shared_ptr<Tensor> m_pts;
514   const int32_t bias = 2;
515   RETURN_IF_NOT_OK(Linspace<T>(&m_pts, m_min, m_max, n_mels + bias));
516 
517   // f_pts saves hertz(mel) though 700.0 * (10.0 **(mel/ 2595.0) - 1.).
518   std::shared_ptr<Tensor> f_pts;
519   const double htk_mel_c = 10.0;
520   RETURN_IF_NOT_OK(Tensor::CreateEmpty(m_pts->shape(), m_pts->type(), &f_pts));
521 
522   if (mel_type == MelType::kHtk) {
523     auto iter_f = f_pts->begin<T>();
524     auto iter_m = m_pts->begin<T>();
525     for (; iter_m != m_pts->end<T>(); ++iter_m) {
526       *iter_f = mel_to_hz_c * (pow(htk_mel_c, *iter_m / hz_to_mel_c) - 1.0);
527       ++iter_f;
528     }
529   } else {
530     auto iter_f = f_pts->begin<T>();
531     auto iter_m = m_pts->begin<T>();
532     for (; iter_m != m_pts->end<T>(); iter_m++, iter_f++) {
533       *iter_f = f_sp * (*iter_m);
534     }
535     iter_f = f_pts->begin<T>();
536     iter_m = m_pts->begin<T>();
537     for (; iter_m != m_pts->end<T>(); iter_m++, iter_f++) {
538       if (*iter_m >= min_log_mel) {
539         *iter_f = min_log_hz * exp(logstep * (*iter_m - min_log_mel));
540       }
541     }
542   }
543 
544   // create filterbank
545   TensorShape fb_shape({all_freqs->Size(), f_pts->Size() - 2});
546   std::shared_ptr<Tensor> fb;
547   RETURN_IF_NOT_OK(CreateTriangularFilterbank<T>(&fb, all_freqs, f_pts));
548 
549   // normalize with Slaney
550   std::vector<T> enorm;
551   if (norm == NormType::kSlaney) {
552     auto iter_f_pts_0 = f_pts->begin<T>();
553     auto iter_f_pts_2 = f_pts->begin<T>();
554     iter_f_pts_2++;
555     iter_f_pts_2++;
556     for (; iter_f_pts_2 != f_pts->end<T>(); iter_f_pts_0++, iter_f_pts_2++) {
557       enorm.push_back(2.0f / (*iter_f_pts_2 - *iter_f_pts_0));
558     }
559     auto iter_fb = fb->begin<T>();
560     for (size_t row = 0; row < fb_shape[0]; row++) {
561       for (size_t col = 0; col < fb_shape[1]; col++) {
562         *iter_fb = (*iter_fb) * enorm[col];
563         iter_fb++;
564       }
565     }
566     enorm.clear();
567   }
568 
569   // anomaly detection.
570   auto iter_fb = fb->begin<T>();
571   std::vector<T> max_val(fb_shape[1], 0);
572   for (size_t row = 0; row < fb_shape[0]; row++) {
573     for (size_t col = 0; col < fb_shape[1]; col++) {
574       max_val[col] = std::max(max_val[col], *iter_fb);
575       iter_fb++;
576     }
577   }
578   for (size_t col = 0; col < fb_shape[1]; col++) {
579     if (max_val[col] < 1e-8) {
580       MS_LOG(WARNING) << "MelscaleFbanks: at least one mel filterbank is all zeros, check if the value for 'n_mels' " +
581                            std::to_string(n_mels) + " is set too high or the value for 'n_freqs' " +
582                            std::to_string(n_freqs) + " is set too low.";
583       break;
584     }
585   }
586   *output = fb;
587   return Status::OK();
588 }
589 
590 /// \brief Creates a linear triangular filterbank.
591 /// \param output Tensor of a linear triangular filterbank.
592 /// \param n_freqs: Number of frequency.
593 /// \param f_min: Minimum of frequency in Hz.
594 /// \param f_max: Maximum of frequency in Hz.
595 /// \param n_filter: Number of (linear) triangular filter.
596 /// \param sample_rate: Sample rate.
597 /// \return Status code.
598 Status CreateLinearFbanks(std::shared_ptr<Tensor> *output, int32_t n_freqs, float f_min, float f_max, int32_t n_filter,
599                           int32_t sample_rate);
600 
601 /// \brief Convert normal STFT to STFT at the Mel scale.
602 /// \param input: Input audio tensor.
603 /// \param output: Mel scale audio tensor.
604 /// \param n_mels: Number of mel filter.
605 /// \param sample_rate: Sample rate of the signal.
606 /// \param f_min: Minimum frequency.
607 /// \param f_max: Maximum frequency.
608 /// \param n_stft: Number of bins in STFT.
609 /// \param norm: Enum, NormType::kSlaney or NormType::kNone. If norm is NormType::kSlaney, divide the triangle mel
610 ///     weight by the width of the mel band.
611 /// \param mel_type: Type of calculate mel type, value should be MelType::kHtk or MelType::kSlaney.
612 /// \return Status code.
613 template <typename T>
MelScale(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t n_mels,int32_t sample_rate,T f_min,T f_max,int32_t n_stft,NormType norm,MelType mel_type)614 Status MelScale(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_mels,
615                 int32_t sample_rate, T f_min, T f_max, int32_t n_stft, NormType norm, MelType mel_type) {
616   // pack
617   TensorShape input_shape = input->shape();
618   TensorShape input_reshape({input->Size() / input_shape[-1] / input_shape[-2], input_shape[-2], input_shape[-1]});
619   RETURN_IF_NOT_OK(input->Reshape(input_reshape));
620 
621   int rows = input_reshape[1];
622   int cols = input_reshape[2];
623 
624   // unpack
625   std::vector<int64_t> out_shape_vec = input_shape.AsVector();
626   out_shape_vec[input_shape.Size() - 1] = cols;
627   out_shape_vec[input_shape.Size() - TWO] = n_mels;
628   TensorShape output_shape(out_shape_vec);
629 
630   RETURN_IF_NOT_OK(Tensor::CreateEmpty(output_shape, input->type(), output));
631   if (n_mels == 0) {
632     return Status::OK();
633   }
634   auto out_in = (*output)->GetMutableBuffer();
635   size_t t_size = sizeof(T);
636 
637   // gen freq bin mat
638   std::shared_ptr<Tensor> freq_bin_mat;
639   RETURN_IF_NOT_OK(CreateFbanks<T>(&freq_bin_mat, n_stft, f_min, f_max, n_mels, sample_rate, norm, mel_type));
640   auto data_ptr = &*freq_bin_mat->begin<T>();
641   Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> matrix_fb(data_ptr, n_mels, n_stft);
642   auto matrix_fb_t = matrix_fb.transpose();
643 
644   for (size_t c = 0; c < input_reshape[0]; c++) {
645     Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> matrix_c(
646       static_cast<T *>(&*input->begin<T>() + rows * cols * c), cols, rows);
647     auto mat_res = matrix_c * matrix_fb_t;
648     size_t mat_res_size = mat_res.size() * t_size;
649     size_t offset = mat_res_size * c;
650     int ret_code =
651       memcpy_s(reinterpret_cast<void *>(out_in + offset), mat_res_size, mat_res.eval().data(), mat_res_size);
652     CHECK_FAIL_RETURN_UNEXPECTED(ret_code == EOK,
653                                  "Failed to copy data into std::vector, ret code: " + std::to_string(ret_code) + ".");
654   }
655 
656   return Status::OK();
657 }
658 
659 /// \brief Transform audio signal into spectrogram.
660 /// \param[in] n_fft Size of FFT, creates n_fft / 2 + 1 bins.
661 /// \param[in] win_length Window size.
662 /// \param[in] hop_length Length of hop between STFT windows.
663 /// \param[in] pad Two sided padding of signal.
664 /// \param[in] window A function to create a window tensor
665 ///     that is applied/multiplied to each frame/window.
666 /// \param[in] power Exponent for the magnitude spectrogram.
667 /// \param[in] normalized Whether to normalize by magnitude after stft.
668 /// \param[in] center Whether to pad waveform on both sides.
669 /// \param[in] pad_mode Controls the padding method used when center is true.
670 /// \param[in] onesided Controls whether to return half of results to avoid redundancy.
671 /// \return Status code.
672 Status Spectrogram(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int pad, WindowType window,
673                    int n_fft, int hop_length, int win_length, float power, bool normalized, bool center,
674                    BorderType pad_mode, bool onesided);
675 
676 /// \brief Transform audio signal into spectrogram.
677 /// \param[in] input Tensor of shape <..., time>.
678 /// \param[out] output Tensor of shape <..., time>.
679 /// \param[in] sample_rate The sample rate of input tensor.
680 /// \param[in] n_fft Size of FFT, creates n_fft / 2 + 1 bins.
681 /// \param[in] win_length Window size.
682 /// \param[in] hop_length Length of hop between STFT windows.
683 /// \param[in] pad Two sided padding of signal.
684 /// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
685 /// \return Status code.
686 Status SpectralCentroid(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int sample_rate,
687                         int n_fft, int win_length, int hop_length, int pad, WindowType window);
688 
689 /// \brief Stretch STFT in time at a given rate, without changing the pitch.
690 /// \param input: Tensor of shape <..., freq, time>.
691 /// \param rate: Stretch factor.
692 /// \param phase_advance: Expected phase advance in each bin.
693 /// \param output: Tensor after stretch in time domain.
694 /// \return Status code.
695 Status TimeStretch(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float rate, float hop_length,
696                    int32_t n_freq);
697 
698 /// \brief Stretch STFT in time at a given rate, without changing the pitch.
699 /// \param[in] input Tensor of shape <..., freq, time, 2>.
700 /// \param[in] output Tensor of shape <..., freq, ceil(time/rate), 2>.
701 /// \param[in] rate Speed-up factor.
702 /// \param[in] phase_advance Expected phase advance in each bin in shape of (freq, 1).
703 /// \return Status code.
704 Status PhaseVocoder(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float rate,
705                     const std::shared_ptr<Tensor> &phase_advance);
706 
707 /// \brief Apply a mask along axis.
708 /// \param input: Tensor of shape <..., freq, time>.
709 /// \param output: Tensor of shape <..., freq, time>.
710 /// \param mask_param: Number of columns to be masked will be uniformly sampled from [0, mask_param].
711 /// \param mask_value: Value to assign to the masked columns.
712 /// \param axis: Axis to apply masking on (1 -> frequency, 2 -> time).
713 /// \param rnd: Number generator.
714 /// \return Status code.
715 Status RandomMaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t mask_param,
716                            float mask_value, int axis, std::mt19937 *rnd);
717 
718 /// \brief Apply a mask along axis. All examples will have the same mask interval.
719 /// \param input: Tensor of shape <..., freq, time>.
720 /// \param output: Tensor of shape <..., freq, time>.
721 /// \param mask_width: The width of the mask.
722 /// \param mask_start: Starting position of the mask.
723 ///     Mask will be applied from indices [mask_start, mask_start + mask_width).
724 /// \param mask_value: Value to assign to the masked columns.
725 /// \param axis: Axis to apply masking on (1 -> frequency, 2 -> time).
726 /// \return Status code.
727 Status MaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t mask_width,
728                      int32_t mask_start, float mask_value, int32_t axis);
729 
730 /// \brief Create a DCT transformation matrix with shape (n_mels, n_mfcc), normalized depending on norm.
731 /// \param n_mfcc: Number of mfc coefficients to retain, the value must be greater than 0.
732 /// \param n_mels: Number of mel filterbanks, the value must be greater than 0.
733 /// \param norm: Norm to use, can be NormMode::kNone or NormMode::kOrtho.
734 /// \return Status code.
735 Status Dct(std::shared_ptr<Tensor> *output, int32_t n_mfcc, int32_t n_mels, NormMode norm);
736 
737 /// \brief Compute the norm of complex tensor input.
738 /// \param power Power of the norm description (optional).
739 /// \param input Tensor shape of <..., complex=2>.
740 /// \param output Tensor shape of <..., >.
741 /// \return Status code.
742 Status ComplexNorm(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float power);
743 
744 /// \brief Stochastic gradient descent.
745 /// \param[in] input Input tensor.
746 /// \param[out] output Output tensor.
747 /// \param[in] grad Input grad for params.
748 /// \param[in] lr Learning rate.
749 /// \param[in] momentum Momentum factor.
750 /// \param[in] dampening Dampening for momentum.
751 /// \param[in] weight_decay Weight decay.
752 /// \param[in] nesterov Whether enable nesterov momentum.
753 /// \param[in] stat Stat.
754 /// \return Status code.
755 template <typename T>
756 Status SGD(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, const std::shared_ptr<Tensor> &grad,
757            float lr, float momentum = 0.0, float dampening = 0.0, float weight_decay = 0.0, bool nesterov = false,
758            float stat = 0.0) {
759   size_t elem_num = input->Size();
760   std::vector<T> accum(elem_num);
761   std::shared_ptr<Tensor> output_param;
762   std::vector<T> out_param(elem_num);
763   int ind = 0;
764   auto itr_inp = input->begin<T>();
765   auto itr_grad = grad->begin<T>();
766   while (itr_inp != input->end<T>() && itr_grad != grad->end<T>()) {
767     T grad_new = (*itr_grad);
768     if (weight_decay > static_cast<float>(0.0)) {
769       grad_new += (*itr_inp) * static_cast<T>(weight_decay);
770     }
771     if (momentum > 0) {
772       if (stat > 0) {
773         accum[ind] = grad_new;
774         stat = 0;
775       } else {
776         accum[ind] = accum[ind] * momentum + (1 - static_cast<T>(dampening)) * grad_new;
777       }
778       if (nesterov) {
779         grad_new += accum[ind] * momentum;
780       } else {
781         grad_new = accum[ind];
782       }
783     }
784     out_param[ind] = (*itr_inp) - lr * grad_new;
785     itr_inp++;
786     itr_grad++;
787     ind++;
788   }
789 
790   RETURN_IF_NOT_OK(Tensor::CreateFromVector(out_param, TensorShape({input->Size()}), &output_param));
791   *output = output_param;
792   return Status::OK();
793 }
794 
795 /// \brief Use conversion matrix to solve normal STFT from mel frequency STFT.
796 /// \param input Tensor of shape <..., n_mels, time>.
797 /// \param output Tensor of shape <..., freq, time>.
798 /// \param n_stft Number of bins in STFT, the value must be greater than 0.
799 /// \param n_mels Number of mel filter, the value must be greater than 0.
800 /// \param sample_rate Sample rate of the signal, the value can't be zero.
801 /// \param f_min Minimum frequency, the value must be greater than or equal to 0.
802 /// \param f_max Maximum frequency, the value must be greater than 0.
803 /// \param max_iter Maximum number of optimization iterations, the value must be greater than 0.
804 /// \param tolerance_loss Value of loss to stop optimization at, the value must be greater than or equal to 0.
805 /// \param tolerance_change Difference in losses to stop optimization at, the value must be greater than or equal to 0.
806 /// \param sgd_lr Learning rate for SGD optimizer, the value must be greater than or equal to 0.
807 /// \param sgd_momentum Momentum factor for SGD optimizer, the value must be greater than or equal to 0.
808 /// \param norm Type of norm, value should be NormType::kSlaney or NormType::kNone. If norm is NormType::kSlaney,
809 ///     divide the triangle mel weight by the width of the mel band.
810 /// \param mel_type Type of mel, value should be MelType::kHtk or MelType::kSlaney.
811 /// \param rnd Random generator.
812 /// \return Status code.
813 Status InverseMelScale(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_stft,
814                        int32_t n_mels, int32_t sample_rate, float f_min, float f_max, int32_t max_iter,
815                        float tolerance_loss, float tolerance_change, float sgd_lr, float sgd_momentum, NormType norm,
816                        MelType mel_type, std::mt19937 *rnd);
817 
818 /// \brief Create InverseSpectrogram for a raw audio signal.
819 /// \param[in] input Input tensor.
820 /// \param[out] output Output tensor.
821 /// \param[in] length The output length of the waveform.
822 /// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins.
823 /// \param[in] win_length Window size.
824 /// \param[in] hop_length Length of hop between STFT windows.
825 /// \param[in] pad Two sided padding of signal.
826 /// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
827 /// \param[in] normalized Whether to normalize by magnitude after stft.
828 /// \param[in] center Whether the signal in spectrogram was padded on both sides.
829 /// \param[in] pad_mode Controls the padding method used when center is True.
830 /// \param[in] onesided Controls whether spectrogram was used to return half of results to avoid redundancy.
831 /// \return Status return code.
832 Status InverseSpectrogram(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t length,
833                           int32_t n_fft, int32_t win_length, int32_t hop_length, int32_t pad, WindowType window,
834                           bool normalized, bool center, BorderType pad_mode, bool onesided);
835 
836 /// \brief Decode mu-law encoded signal.
837 /// \param input Tensor of shape <..., time>.
838 /// \param output Tensor of shape <..., time>.
839 /// \param quantization_channels Number of channels.
840 /// \return Status code.
841 Status MuLawDecoding(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output,
842                      int32_t quantization_channels);
843 
844 /// \brief Encode signal based on mu-law companding.
845 /// \param input Tensor of shape <..., time>.
846 /// \param output Tensor of shape <..., time>.
847 /// \param quantization_channels Number of channels.
848 /// \return Status code.
849 Status MuLawEncoding(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output,
850                      int32_t quantization_channels);
851 
852 /// \brief Apply a overdrive effect to the audio.
853 /// \param input Tensor of shape <..., time>.
854 /// \param output Tensor of shape <..., time>.
855 /// \param gain Coefficient of overload in dB.
856 /// \param color Coefficient of translation.
857 /// \return Status code.
858 template <typename T>
Overdrive(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,float gain,float color)859 Status Overdrive(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float gain, float color) {
860   TensorShape input_shape = input->shape();
861   // input->2D.
862   auto rows = input->Size() / input_shape[-1];
863   auto cols = input_shape[-1];
864   TensorShape to_shape({rows, cols});
865   RETURN_IF_NOT_OK(input->Reshape(to_shape));
866   // apply dB2Linear on gain, 20dB is expect to gain.
867   float gain_ex = exp(gain * log(10) / 20.0);
868   constexpr int64_t translation_factor = 200;
869   color = color / translation_factor;
870   // declare the array used to store the input.
871   std::vector<T> input_vec;
872   // out_vec is used to save the result of applying overdrive.
873   std::vector<T> out_vec;
874   // store intermediate results of input.
875   std::vector<T> temp;
876   constexpr double temp_factor = 3.0;
877   // scale and pan the input two-dimensional sound wave array to a certain extent.
878   for (auto itr = input->begin<T>(); itr != input->end<T>(); itr++) {
879     // store the value of traverse the input.
880     T temp_fp = *itr;
881     input_vec.push_back(temp_fp);
882     // use 0 to initialize out_vec.
883     out_vec.push_back(0);
884     T temp_fp2 = temp_fp * gain_ex + color;
885     // 0.5 + 2/3 * 0.75 = 1, zoom and shift the sound.
886     if (temp_fp2 < -1) {
887       // -2.0 / 3.0 is -2/3 in the formula.
888       temp.push_back(-2.0 / temp_factor);
889     } else if (temp_fp2 > 1) {
890       // 2.0 / 3.0 is 2/3 in the formula.
891       temp.push_back(2.0 / temp_factor);
892     } else {
893       temp.push_back(temp_fp2 - temp_fp2 * temp_fp2 * temp_fp2 / temp_factor);
894     }
895   }
896   // last_in and last_out are the intermediate values for processing each moment.
897   std::vector<T> last_in;
898   std::vector<T> last_out;
899   for (size_t i = 0; i < cols; i++) {
900     last_in.push_back(0.0);
901     last_out.push_back(0.0);
902   }
903   // overdrive core loop.
904   for (size_t i = 0; i < cols; i++) {
905     size_t index = 0;
906     // calculate the value of each moment according to the rules of overdrive.
907     for (size_t j = i; j < rows * cols; j += cols, index++) {
908       // 0.995 is the preservation ratio of sound waves.
909       last_out[index] = temp[j] - last_in[index] + last_out[index] * 0.995;
910       last_in[index] = temp[j];
911       // 0.5 + 2/3 * 0.75 = 1, zoom and shift the sound.
912       T temp_fp = input_vec[j] * 0.5 + last_out[index] * 0.75;
913       // clamp min=-1, max=1.
914       if (temp_fp < -1) {
915         out_vec[j] = -1.0;
916       } else if (temp_fp > 1) {
917         out_vec[j] = 1.0;
918       } else {
919         out_vec[j] = temp_fp;
920       }
921     }
922   }
923   // move data to output tensor.
924   std::shared_ptr<Tensor> out;
925   RETURN_IF_NOT_OK(Tensor::CreateFromVector(out_vec, input_shape, &out));
926   *output = out;
927   return Status::OK();
928 }
929 
930 /// \brief Add a fade in and/or fade out to an input.
931 /// \param[in] input: The input tensor.
932 /// \param[out] output: Added fade in and/or fade out audio with the same shape.
933 /// \param[in] fade_in_len: Length of fade-in (time frames).
934 /// \param[in] fade_out_len: Length of fade-out (time frames).
935 /// \param[in] fade_shape: Shape of fade.
936 Status Fade(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t fade_in_len,
937             int32_t fade_out_len, FadeShape fade_shape);
938 
939 /// \brief Add a volume to an waveform.
940 /// \param input/output: Tensor of shape <..., time>.
941 /// \param gain: Gain value, varies according to the value of gain_type.
942 /// \param gain_type: Type of gain, should be one of [GainType::kAmplitude, GainType::kDb, GainType::kPower].
943 /// \return Status code.
944 template <typename T>
Vol(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,T gain,GainType gain_type)945 Status Vol(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T gain, GainType gain_type) {
946   RETURN_IF_NOT_OK(Tensor::CreateFromTensor(input, output));
947   const T lower_bound = -1;
948   const T upper_bound = 1;
949 
950   // DB is a unit which converts a numeric value into decibel scale and for conversion, we have to use log10
951   // A(in dB) = 20log10(A in amplitude)
952   // When referring to measurements of power quantities, a ratio can be expressed as a level in decibels by evaluating
953   // ten times the base-10 logarithm of the ratio of the measured quantity to reference value
954   // A(in dB) = 10log10(A in power)
955   const int power_factor_div = 20;
956   const int power_factor_mul = 10;
957   const int base = 10;
958 
959   if (gain_type == GainType::kDb) {
960     if (gain != 0) {
961       gain = std::pow(base, (gain / power_factor_div));
962     }
963   } else if (gain_type == GainType::kPower) {
964     gain = power_factor_mul * std::log10(gain);
965     gain = std::pow(base, (gain / power_factor_div));
966   }
967 
968   for (auto itr = (*output)->begin<T>(); itr != (*output)->end<T>(); itr++) {
969     if (gain != 0 || gain_type == GainType::kAmplitude) {
970       *itr = (*itr) * gain;
971     }
972     *itr = std::min(std::max((*itr), lower_bound), upper_bound);
973   }
974 
975   return Status::OK();
976 }
977 
978 /// \brief Separate a complex-valued spectrogram with shape (…, 2) into its magnitude and phase.
979 /// \param input: Complex tensor.
980 /// \param output: The magnitude and phase of the complex tensor.
981 /// \param power: Power of the norm.
982 Status Magphase(const TensorRow &input, TensorRow *output, float power);
983 
984 /// \brief Compute Normalized Cross-Correlation Function (NCCF).
985 /// \param input: Tensor of shape <channel,waveform_length>.
986 /// \param output: Tensor of shape <channel, num_of_frames, lags>.
987 /// \param sample_rate: The sample rate of the waveform (Hz).
988 /// \param frame_time: Duration of a frame.
989 /// \param freq_low: Lowest frequency that can be detected (Hz).
990 /// \return Status code.
991 template <typename T>
ComputeNccf(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t sample_rate,float frame_time,int32_t freq_low)992 Status ComputeNccf(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
993                    float frame_time, int32_t freq_low) {
994   auto channel = input->shape()[0];
995   auto waveform_length = input->shape()[1];
996   size_t idx = 0;
997   size_t channel_idx = 1;
998   int32_t lags = static_cast<int32_t>(ceil(static_cast<float>(sample_rate) / freq_low));
999   int32_t frame_size = static_cast<int32_t>(ceil(sample_rate * frame_time));
1000   int32_t num_of_frames = static_cast<int32_t>(ceil(static_cast<float>(waveform_length) / frame_size));
1001   int32_t p = lags + num_of_frames * frame_size - waveform_length;
1002   TensorShape output_shape({channel, num_of_frames, lags});
1003   DataType intput_type = input->type();
1004   RETURN_IF_NOT_OK(Tensor::CreateEmpty(output_shape, intput_type, output));
1005   // pad p 0 in -1 dimension
1006   std::vector<T> signal;
1007   // Tensor -> vector
1008   for (auto itr = input->begin<T>(); itr != input->end<T>();) {
1009     while (idx < waveform_length * channel_idx) {
1010       signal.push_back(*itr);
1011       ++itr;
1012       ++idx;
1013     }
1014     // Each channel is processed with the sliding window
1015     // waveform:[channel, time] -->  waveform:[channel, time+p]
1016     for (size_t i = 0; i < p; ++i) {
1017       signal.push_back(static_cast<T>(0.0));
1018     }
1019     if (idx % waveform_length == 0) {
1020       ++channel_idx;
1021     }
1022   }
1023   // compute ncc
1024   for (dsize_t lag = 1; lag <= lags; ++lag) {
1025     // compute one ncc
1026     // one ncc out
1027     std::vector<T> out;
1028     channel_idx = 1;
1029     idx = 0;
1030     size_t win_idx = 0;
1031     size_t waveform_length_p = waveform_length + p;
1032     // Traversal signal
1033     for (auto itr = signal.begin(); itr != signal.end();) {
1034       // Each channel is processed with the sliding window
1035       size_t s1 = idx;
1036       size_t s2 = idx + lag;
1037       size_t frame_count = 0;
1038       T s1_norm = static_cast<T>(0);
1039       T s2_norm = static_cast<T>(0);
1040       T ncc_umerator = static_cast<T>(0);
1041       T ncc = static_cast<T>(0);
1042       while (idx < waveform_length_p * channel_idx) {
1043         // Sliding window
1044         if (frame_count == num_of_frames) {
1045           ++itr;
1046           ++idx;
1047           continue;
1048         }
1049         if (win_idx < frame_size) {
1050           ncc_umerator += signal[s1] * signal[s2];
1051           s1_norm += signal[s1] * signal[s1];
1052           s2_norm += signal[s2] * signal[s2];
1053           ++win_idx;
1054           ++s1;
1055           ++s2;
1056         }
1057         if (win_idx == frame_size) {
1058           if (s1_norm != static_cast<T>(0.0) && s2_norm != static_cast<T>(0.0)) {
1059             ncc = ncc_umerator / s1_norm / s2_norm;
1060           } else {
1061             ncc = static_cast<T>(0.0);
1062           }
1063           out.push_back(ncc);
1064           ncc_umerator = static_cast<T>(0.0);
1065           s1_norm = static_cast<T>(0.0);
1066           s2_norm = static_cast<T>(0.0);
1067           ++frame_count;
1068           win_idx = 0;
1069         }
1070         ++itr;
1071         ++idx;
1072       }
1073       if (idx % waveform_length_p == 0) {
1074         ++channel_idx;
1075       }
1076     }  // compute one ncc
1077     // cat tensor
1078     auto itr_out = out.begin();
1079     for (dsize_t row_idx = 0; row_idx < channel; ++row_idx) {
1080       for (dsize_t frame_idx = 0; frame_idx < num_of_frames; ++frame_idx) {
1081         RETURN_IF_NOT_OK((*output)->SetItemAt({row_idx, frame_idx, lag - 1}, *itr_out));
1082         ++itr_out;
1083       }
1084     }
1085   }  // compute ncc
1086   return Status::OK();
1087 }
1088 
1089 /// \brief For each frame, take the highest value of NCCF.
1090 /// \param input: Tensor of shape <channel, num_of_frames, lags>.
1091 /// \param output: Tensor of shape <channel, num_of_frames>.
1092 /// \param sample_rate: The sample rate of the waveform (Hz).
1093 /// \param freq_high: Highest frequency that can be detected (Hz).
1094 /// \return Status code.
1095 template <typename T>
FindMaxPerFrame(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t sample_rate,int32_t freq_high)1096 Status FindMaxPerFrame(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
1097                        int32_t freq_high) {
1098   std::vector<T> signal;
1099   std::vector<int> out;
1100   auto channel = input->shape()[0];
1101   auto num_of_frames = input->shape()[1];
1102   auto lags = input->shape()[2];
1103   CHECK_FAIL_RETURN_UNEXPECTED(freq_high != 0, "DetectPitchFrequency: freq_high can not be zero.");
1104   auto lag_min = static_cast<int32_t>(ceil(static_cast<float>(sample_rate) / freq_high));
1105   TensorShape out_shape({channel, num_of_frames});
1106   // pack batch
1107   for (auto itr = input->begin<T>(); itr != input->end<T>(); ++itr) {
1108     signal.push_back(*itr);
1109   }
1110   // find the best nccf
1111   T best_max_value = static_cast<T>(0.0);
1112   T half_max_value = static_cast<T>(0.0);
1113   int32_t best_max_indices = 0;
1114   int32_t half_max_indices = 0;
1115   auto thresh = static_cast<T>(0.99);
1116   auto lags_half = lags / 2;
1117   for (dsize_t channel_idx = 0; channel_idx < channel; ++channel_idx) {
1118     for (dsize_t frame_idx = 0; frame_idx < num_of_frames; ++frame_idx) {
1119       auto index_01 = channel_idx * num_of_frames * lags + frame_idx * lags + lag_min;
1120       best_max_value = signal[index_01];
1121       half_max_value = signal[index_01];
1122       best_max_indices = lag_min;
1123       half_max_indices = lag_min;
1124       for (dsize_t lag_idx = 0; lag_idx < lags; ++lag_idx) {
1125         if (lag_idx > lag_min) {
1126           auto index_02 = channel_idx * num_of_frames * lags + frame_idx * lags + lag_idx;
1127           if (signal[index_02] > best_max_value) {
1128             best_max_value = signal[index_02];
1129             best_max_indices = lag_idx;
1130             if (lag_idx < lags_half) {
1131               half_max_value = signal[index_02];
1132               half_max_indices = lag_idx;
1133             }
1134           }
1135         }
1136       }
1137       // Add back minimal lag
1138       // Add 1 empirical calibration offset
1139       if (half_max_value > best_max_value * thresh) {
1140         out.push_back(half_max_indices + 1);
1141       } else {
1142         out.push_back(best_max_indices + 1);
1143       }
1144     }
1145   }
1146   // unpack batch
1147   RETURN_IF_NOT_OK(Tensor::CreateFromVector(out, out_shape, output));
1148   return Status::OK();
1149 }
1150 
1151 /// \brief Apply median smoothing to the 1D tensor over the given window.
1152 /// \param input: Tensor of shape<channel, num_of_frames>.
1153 /// \param output: Tensor of shape <channel, num_of_window>.
1154 /// \param win_length: The window length for median smoothing (in number of frames).
1155 /// \return Status code.
1156 Status MedianSmoothing(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t win_length);
1157 
1158 /// \brief Detect pitch frequency.
1159 /// \param input: Tensor of shape <channel,waveform_length>.
1160 /// \param output: Tensor of shape <channel, num_of_frames, lags>.
1161 /// \param sample_rate: The sample rate of the waveform (Hz).
1162 /// \param frame_time: Duration of a frame.
1163 /// \param win_length: The window length for median smoothing (in number of frames).
1164 /// \param freq_low: Lowest frequency that can be detected (Hz).
1165 /// \param freq_high: Highest frequency that can be detected (Hz).
1166 /// \return Status code.
1167 Status DetectPitchFrequency(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
1168                             float frame_time, int32_t win_length, int32_t freq_low, int32_t freq_high);
1169 
1170 /// \brief A helper function for phaser, generates a table with given parameters.
1171 /// \param output: Tensor of shape <time>.
1172 /// \param type: can choose DataType::DE_FLOAT32 or DataType::DE_INT32.
1173 /// \param modulation: Modulation of the input tensor.
1174 ///     It can be one of Modulation.kSinusoidal or Modulation.kTriangular.
1175 /// \param table_size: The length of table.
1176 /// \param min: Calculate the sampling rate within the delay time.
1177 /// \param max: Calculate the sampling rate within the delay and delay depth time.
1178 /// \param phase: Phase offset of function.
1179 /// \return Status code.
1180 Status GenerateWaveTable(std::shared_ptr<Tensor> *output, const DataType &type, Modulation modulation,
1181                          int32_t table_size, float min, float max, float phase);
1182 
1183 /// \brief Apply a phaser effect to the audio.
1184 /// \param input Tensor of shape <..., time>.
1185 /// \param output Tensor of shape <..., time>.
1186 /// \param sample_rate Sampling rate of the waveform.
1187 /// \param gain_in Desired input gain at the boost (or attenuation) in dB.
1188 /// \param gain_out Desired output gain at the boost (or attenuation) in dB.
1189 /// \param delay_ms Desired delay in milli seconds.
1190 /// \param decay Desired decay relative to gain-in.
1191 /// \param mod_speed Modulation speed in Hz.
1192 /// \param sinusoidal If true, use sinusoidal modulation. If false, use triangular modulation.
1193 /// \return Status code.
1194 template <typename T>
Phaser(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t sample_rate,float gain_in,float gain_out,float delay_ms,float decay,float mod_speed,bool sinusoidal)1195 Status Phaser(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate, float gain_in,
1196               float gain_out, float delay_ms, float decay, float mod_speed, bool sinusoidal) {
1197   TensorShape input_shape = input->shape();
1198   // input convert to 2D (channels,time)
1199   auto channels = input->Size() / input_shape[-1];
1200   auto time = input_shape[-1];
1201   TensorShape to_shape({channels, time});
1202   RETURN_IF_NOT_OK(input->Reshape(to_shape));
1203   // input vector
1204   std::vector<std::vector<T>> input_vec(channels, std::vector<T>(time, 0));
1205   // output vector
1206   std::vector<std::vector<T>> out_vec(channels, std::vector<T>(time, 0));
1207   // input convert to vector
1208   auto input_itr = input->begin<T>();
1209   for (size_t i = 0; i < channels; i++) {
1210     for (size_t j = 0; j < time; j++) {
1211       input_vec[i][j] = *input_itr * gain_in;
1212       input_itr++;
1213     }
1214   }
1215   // compute
1216   // create delay buffer
1217   int delay_buf_nrow = channels;
1218   // calculate the length of the delay
1219   int delay_buf_len = static_cast<int>((delay_ms * 0.001 * sample_rate) + 0.5);
1220   std::vector<std::vector<T>> delay_buf(delay_buf_nrow, std::vector<T>(delay_buf_len, 0 * decay));
1221   // calculate the length after the momentum
1222   int mod_buf_len = static_cast<int>(sample_rate / mod_speed + 0.5);
1223   Modulation modulation = sinusoidal ? Modulation::kSinusoidal : Modulation::kTriangular;
1224   // create and compute mod buffer
1225   std::shared_ptr<Tensor> mod_buf_tensor;
1226   auto PI_factor = 2;
1227   RETURN_IF_NOT_OK(GenerateWaveTable(&mod_buf_tensor, DataType(DataType::DE_INT32), modulation, mod_buf_len,
1228                                      static_cast<float>(1.0f), static_cast<float>(delay_buf_len),
1229                                      static_cast<float>(PI / PI_factor)));
1230   // tensor mod_buf convert to vector
1231   std::vector<int> mod_buf;
1232   for (auto itr = mod_buf_tensor->begin<int>(); itr != mod_buf_tensor->end<int>(); itr++) {
1233     mod_buf.push_back(*itr);
1234   }
1235   dsize_t delay_pos = 0;
1236   dsize_t mod_pos = 0;
1237   // for every channal at the current time
1238   for (size_t i = 0; i < time; i++) {
1239     // calculate the delay data that should be added to each channal at this time
1240     int idx = static_cast<int>((delay_pos + mod_buf[mod_pos]) % delay_buf_len);
1241     mod_pos = (mod_pos + 1) % mod_buf_len;
1242     delay_pos = (delay_pos + 1) % delay_buf_len;
1243     // update the next delay data with the current result * decay
1244     for (size_t j = 0; j < channels; j++) {
1245       out_vec[j][i] = input_vec[j][i] + delay_buf[j][idx];
1246       delay_buf[j][delay_pos] = (input_vec[j][i] + delay_buf[j][idx]) * decay;
1247     }
1248   }
1249   std::vector<T> out_vec_one_d;
1250   for (size_t i = 0; i < channels; i++) {
1251     for (size_t j = 0; j < time; j++) {
1252       // gain_out on the output
1253       out_vec[i][j] *= gain_out;
1254       // clamp
1255       out_vec[i][j] = std::max<float>(-1.0f, std::min<float>(1.0f, out_vec[i][j]));
1256       // output vector is transformed from 2d to 1d
1257       out_vec_one_d.push_back(out_vec[i][j]);
1258     }
1259   }
1260   // move data to output tensor
1261   std::shared_ptr<Tensor> out;
1262   RETURN_IF_NOT_OK(Tensor::CreateFromVector(out_vec_one_d, input_shape, &out));
1263   *output = out;
1264   return Status::OK();
1265 }
1266 
1267 /// \brief Flanger about interpolation effect.
1268 /// \param input: Tensor of shape <batch, channel, time>.
1269 /// \param int_delay: A dimensional vector about integer delay, subscript representing delay.
1270 /// \param frac_delay: A dimensional vector about delay obtained by using the frac function.
1271 /// \param interpolation: Interpolation of the input tensor.
1272 ///     It can be one of Interpolation::kLinear or Interpolation::kQuadratic.
1273 /// \param delay_buf_pos: Minimum dimension length about delay_bufs.
1274 /// \Returns Flanger about interpolation effect.
1275 template <typename T>
FlangerInterpolation(const std::shared_ptr<Tensor> & input,std::vector<int> int_delay,const std::vector<T> & frac_delay,Interpolation interpolation,int delay_buf_pos)1276 std::vector<std::vector<T>> FlangerInterpolation(const std::shared_ptr<Tensor> &input, std::vector<int> int_delay,
1277                                                  const std::vector<T> &frac_delay, Interpolation interpolation,
1278                                                  int delay_buf_pos) {
1279   int n_batch = input->shape()[0];
1280   int n_channels = input->shape()[-2];
1281   int delay_buf_length = input->shape()[-1];
1282   const int32_t bias = 2;
1283 
1284   std::vector<std::vector<T>> delayed_value_a(n_batch, std::vector<T>(n_channels, 0));
1285   std::vector<std::vector<T>> delayed_value_b(n_batch, std::vector<T>(n_channels, 0));
1286   for (int j = 0; j < n_batch; j++) {
1287     for (int k = 0; k < n_channels; k++) {
1288       // delay after obtaining the current number of channels
1289       auto iter_input = input->begin<T>();
1290       int it = j * n_channels * delay_buf_length + k * delay_buf_length;
1291       iter_input += it + (delay_buf_pos + int_delay[k]) % delay_buf_length;
1292       delayed_value_a[j][k] = *(iter_input);
1293       iter_input = input->begin<T>();
1294       iter_input += it + (delay_buf_pos + int_delay[k] + 1) % delay_buf_length;
1295       delayed_value_b[j][k] = *(iter_input);
1296     }
1297   }
1298   // delay subscript backward
1299   for (int j = 0; j < n_channels; j++) {
1300     int_delay[j] = int_delay[j] + bias;
1301   }
1302   std::vector<std::vector<T>> delayed(n_batch, std::vector<T>(n_channels, 0));
1303   std::vector<std::vector<T>> delayed_value_c(n_batch, std::vector<T>(n_channels, 0));
1304   if (interpolation == Interpolation::kLinear) {
1305     for (int j = 0; j < n_batch; j++) {
1306       for (int k = 0; k < n_channels; k++) {
1307         delayed[j][k] = delayed_value_a[j][k] + (delayed_value_b[j][k] - delayed_value_a[j][k]) * frac_delay[k];
1308       }
1309     }
1310   } else {
1311     for (int j = 0; j < n_batch; j++) {
1312       for (int k = 0; k < n_channels; k++) {
1313         auto iter_input = input->begin<T>();
1314         int it = j * n_channels * delay_buf_length + k * delay_buf_length;
1315         iter_input += it + (delay_buf_pos + int_delay[k]) % delay_buf_length;
1316         delayed_value_c[j][k] = *(iter_input);
1317       }
1318     }
1319     // delay subscript backward
1320     for (int j = 0; j < n_channels; j++) {
1321       int_delay[j] = int_delay[j] + 1;
1322     }
1323     std::vector<std::vector<T>> frac_delay_coefficient(n_batch, std::vector<T>(n_channels, 0));
1324     std::vector<std::vector<T>> frac_delay_value(n_batch, std::vector<T>(n_channels, 0));
1325     for (int j = 0; j < n_batch; j++) {
1326       for (int k = 0; k < n_channels; k++) {
1327         delayed_value_c[j][k] = delayed_value_c[j][k] - delayed_value_a[j][k];
1328         delayed_value_b[j][k] = delayed_value_b[j][k] - delayed_value_a[j][k];
1329         // delayed_value_c[j][k] * 0.5 is half of the delayed_value_c[j][k]
1330         frac_delay_coefficient[j][k] = delayed_value_c[j][k] * 0.5 - delayed_value_b[j][k];
1331         frac_delay_value[j][k] = delayed_value_b[j][k] * 2 - delayed_value_c[j][k] * 0.5;
1332         // the next delay is obtained by delaying the data in the buffer
1333         delayed[j][k] = delayed_value_a[j][k] +
1334                         (frac_delay_coefficient[j][k] * frac_delay[k] + frac_delay_value[j][k]) * frac_delay[k];
1335       }
1336     }
1337   }
1338   return delayed;
1339 }
1340 
1341 /// \brief Interval limiting function.
1342 /// \param output_waveform: Tensor of shape <..., time>.
1343 /// \param min: If value is less than min, min is returned.
1344 /// \param max: If value is greater than max, max is returned.
1345 /// \Returns Tensor at the same latitude.
1346 template <typename T>
Clamp(const std::shared_ptr<Tensor> & tensor,T min,T max)1347 std::shared_ptr<Tensor> Clamp(const std::shared_ptr<Tensor> &tensor, T min, T max) {
1348   for (auto itr = tensor->begin<T>(); itr != tensor->end<T>(); itr++) {
1349     if (*itr > max) {
1350       *itr = max;
1351     } else if (*itr < min) {
1352       *itr = min;
1353     }
1354   }
1355   return tensor;
1356 }
1357 
1358 /// \brief Apply flanger effect.
1359 /// \param input/output: Tensor of shape <..., channel, time>.
1360 /// \param sample_rate: Sampling rate of the waveform, e.g. 44100 (Hz), the value can't be zero.
1361 /// \param delay: Desired delay in milliseconds (ms), range: [0, 30].
1362 /// \param depth: Desired delay depth in milliseconds (ms), range: [0, 10].
1363 /// \param regen: Desired regen (feedback gain) in dB., range: [-95, 95].
1364 /// \param width: Desired width (delay gain) in dB, range: [0, 100].
1365 /// \param speed: Modulation speed in Hz, range: [0.1, 10].
1366 /// \param phase: Percentage phase-shift for multi-channel, range: [0, 100].
1367 /// \param modulation: Modulation of the input tensor.
1368 ///     It can be one of Modulation::kSinusoidal or Modulation::kTriangular.
1369 /// \param interpolation: Interpolation of the input tensor.
1370 ///     It can be one of Interpolation::kLinear or Interpolation::kQuadratic.
1371 /// \return Status code.
1372 template <typename T>
Flanger(const std::shared_ptr<Tensor> input,std::shared_ptr<Tensor> * output,int32_t sample_rate,float delay,float depth,float regen,float width,float speed,float phase,Modulation modulation,Interpolation interpolation)1373 Status Flanger(const std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, int32_t sample_rate, float delay,
1374                float depth, float regen, float width, float speed, float phase, Modulation modulation,
1375                Interpolation interpolation) {
1376   std::shared_ptr<Tensor> waveform;
1377   if (input->type() == DataType::DE_FLOAT64) {
1378     waveform = input;
1379   } else {
1380     RETURN_IF_NOT_OK(TypeCast(input, &waveform, DataType(DataType::DE_FLOAT32)));
1381   }
1382   // convert to 3D (batch, channels, time)
1383   TensorShape actual_shape = waveform->shape();
1384   TensorShape toShape({waveform->Size() / actual_shape[-2] / actual_shape[-1], actual_shape[-2], actual_shape[-1]});
1385   RETURN_IF_NOT_OK(waveform->Reshape(toShape));
1386 
1387   // scaling
1388   T feedback_gain = static_cast<T>(regen) / 100;
1389   T delay_gain = static_cast<T>(width) / 100;
1390   T channel_phase = static_cast<T>(phase) / 100;
1391   T delay_min = static_cast<T>(delay) / 1000;
1392   T delay_depth = static_cast<T>(depth) / 1000;
1393 
1394   // balance output:
1395   T in_gain = 1.0 / (1 + delay_gain);
1396   delay_gain = delay_gain / (1 + delay_gain);
1397   // balance feedback loop:
1398   delay_gain = delay_gain * (1 - abs(feedback_gain));
1399 
1400   int delay_buf_length = static_cast<int>((delay_min + delay_depth) * sample_rate + 0.5);
1401   auto delay_buf_length_factor = 2;
1402   delay_buf_length = delay_buf_length + delay_buf_length_factor;
1403 
1404   int lfo_length = static_cast<int>(sample_rate / speed);
1405 
1406   T table_min = floor(delay_min * sample_rate + 0.5);
1407   T table_max = delay_buf_length - 2.0;
1408   // generate wave table
1409   T lfo_phase = 3 * PI / 2;
1410   std::shared_ptr<Tensor> lfo;
1411   RETURN_IF_NOT_OK(GenerateWaveTable(&lfo, DataType(DataType::DE_FLOAT32), modulation, lfo_length,
1412                                      static_cast<float>(table_min), static_cast<float>(table_max),
1413                                      static_cast<float>(lfo_phase)));
1414   int n_batch = waveform->shape()[0];
1415   int n_channels = waveform->shape()[-2];
1416   int time = waveform->shape()[-1];
1417   std::vector<T> delay_tensor(n_channels, 0.0), frac_delay(n_channels, 0.0);
1418   std::vector<int> cur_channel_phase(n_channels, 0), int_delay(n_channels, 0);
1419   // next delay
1420   std::vector<std::vector<T>> delay_last(n_batch, std::vector<T>(n_channels, 0));
1421 
1422   // initialization of delay_bufs
1423   TensorShape delay_bufs_shape({n_batch, n_channels, delay_buf_length});
1424   std::shared_ptr<Tensor> delay_bufs, output_waveform;
1425   RETURN_IF_NOT_OK(Tensor::CreateEmpty(delay_bufs_shape, waveform->type(), &delay_bufs));
1426   RETURN_IF_NOT_OK(delay_bufs->Zero());
1427   // initialization of output_waveform
1428   TensorShape output_waveform_shape({n_batch, n_channels, actual_shape[-1]});
1429   RETURN_IF_NOT_OK(Tensor::CreateEmpty(output_waveform_shape, waveform->type(), &output_waveform));
1430 
1431   int delay_buf_pos = 0, lfo_pos = 0;
1432   for (int i = 0; i < time; i++) {
1433     delay_buf_pos = (delay_buf_pos + delay_buf_length - 1) % delay_buf_length;
1434     for (int j = 0; j < n_channels; j++) {
1435       auto channel_phase_factor = 0.5;
1436       // get current channel phase
1437       cur_channel_phase[j] = static_cast<int>(j * lfo_length * channel_phase + channel_phase_factor);
1438       // through the current channel phase and lfo arrays to get the delay
1439       auto iter_lfo = lfo->begin<float>();
1440       delay_tensor[j] = *(iter_lfo + static_cast<ptrdiff_t>((lfo_pos + cur_channel_phase[j]) % lfo_length));
1441       // the frac delay is obtained by using the frac function
1442       frac_delay[j] = delay_tensor[j] - static_cast<int>(delay_tensor[j]);
1443       delay_tensor[j] = floor(delay_tensor[j]);
1444       int_delay[j] = static_cast<int>(delay_tensor[j]);
1445     }
1446     // get the waveform of [:, :, i]
1447     std::shared_ptr<Tensor> temp;
1448     TensorShape temp_shape({n_batch, n_channels});
1449     RETURN_IF_NOT_OK(Tensor::CreateEmpty(temp_shape, waveform->type(), &temp));
1450     Slice ss1(0, n_batch), ss2(0, n_channels), ss3(i, i + 1);
1451     SliceOption sp1(ss1), sp2(ss2), sp3(ss3);
1452     std::vector<SliceOption> slice_option;
1453     slice_option.push_back(sp1), slice_option.push_back(sp2), slice_option.push_back(sp3);
1454     RETURN_IF_NOT_OK(waveform->Slice(&temp, slice_option));
1455 
1456     auto iter_temp = temp->begin<T>();
1457     auto iter_delay_bufs = delay_bufs->begin<T>();
1458     for (int j = 0; j < n_batch; j++) {
1459       for (int k = 0; k < n_channels; k++) {
1460         iter_delay_bufs += delay_buf_pos;
1461         // the value of delay_bufs is processed by next delay
1462         *(iter_delay_bufs) = *iter_temp + delay_last[j][k] * feedback_gain;
1463         iter_delay_bufs -= (delay_buf_pos - delay_buf_length);
1464         iter_temp++;
1465       }
1466     }
1467     // different delayed values can be obtained by judging the type of interpolation
1468     std::vector<std::vector<T>> delayed(n_batch, std::vector<T>(n_channels, 0));
1469     delayed = FlangerInterpolation<T>(delay_bufs, int_delay, frac_delay, interpolation, delay_buf_pos);
1470 
1471     for (int j = 0; j < n_channels; j++) {
1472       int_delay[j] = int_delay[j] + 1;
1473     }
1474     iter_temp = temp->begin<T>();
1475     for (int j = 0; j < n_batch; j++) {
1476       for (int k = 0; k < n_channels; k++) {
1477         auto iter_output_waveform = output_waveform->begin<T>();
1478         // update the next delay
1479         delay_last[j][k] = delayed[j][k];
1480         int it = j * n_channels * actual_shape[-1] + k * actual_shape[-1];
1481         iter_output_waveform += it + i;
1482         // the results are obtained by balancing the output and balancing the feedback loop
1483         *(iter_output_waveform) = *(iter_temp)*in_gain + delayed[j][k] * delay_gain;
1484         iter_temp++;
1485       }
1486     }
1487     // update lfo location
1488     lfo_pos = (lfo_pos + 1) % lfo_length;
1489   }
1490   // the output value is limited by the interval limit function
1491   output_waveform = Clamp<T>(output_waveform, -1, 1);
1492   // convert dimension to waveform dimension
1493   RETURN_IF_NOT_OK(output_waveform->Reshape(actual_shape));
1494   RETURN_IF_NOT_OK(TypeCast(output_waveform, output, input->type()));
1495   return Status::OK();
1496 }
1497 
1498 // A brief structure of wave file header.
1499 struct WavHeader {
1500   int8_t chunk_id[4] = {0};
1501   int32_t chunk_size = 0;
1502   int8_t format[4] = {0};
1503   int8_t sub_chunk1_id[4] = {0};
1504   int32_t sub_chunk1_size = 0;
1505   int16_t audio_format = 0;
1506   int16_t num_channels = 0;
1507   int32_t sample_rate = 0;
1508   int32_t byte_rate = 0;
1509   int16_t byte_align = 0;
1510   int16_t bits_per_sample = 0;
1511   int8_t sub_chunk2_id[4] = {0};
1512   int32_t sub_chunk2_size = 0;
WavHeaderWavHeader1513   WavHeader() {}
1514 };
1515 
1516 /// \brief Get an audio data from a wav file and store into a vector.
1517 /// \param wav_file_dir: wave file dir.
1518 /// \param waveform_vec: vector of waveform.
1519 /// \param sample_rate: sample rate.
1520 /// \return Status code.
1521 Status ReadWaveFile(const std::string &wav_file_dir, std::vector<float> *waveform_vec, int32_t *sample_rate);
1522 
1523 /// \brief Apply sliding-window cepstral mean and variance (optional) normalization per utterance.
1524 /// \param input: Tensor of shape <..., freq, time>.
1525 /// \param output: Tensor of shape <..., frame>.
1526 /// \param cmn_window: Window in frames for running average CMN computation.
1527 /// \param min_cmn_window: Minimum CMN window used at start of decoding.
1528 /// \param center: If true, use a window centered on the current frame. If false, window is to the left.
1529 /// \param norm_vars: If true, normalize variance to one.
1530 /// \return Status code.
1531 Status SlidingWindowCmn(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t cmn_window,
1532                         int32_t min_cmn_window, bool center, bool norm_vars);
1533 /// \brief Compute delta coefficients of a tensor, usually a spectrogram.
1534 /// \param input: Tensor of shape <...,freq,time>.
1535 /// \param output: Tensor of shape <...,freq,time>.
1536 /// \param win_length: The window length used for computing delta.
1537 /// \param mode: Padding mode.
1538 /// \return Status code.
1539 Status ComputeDeltas(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t win_length,
1540                      const BorderType &mode);
1541 
1542 template <typename T>
Mul(const std::shared_ptr<Tensor> input,std::shared_ptr<Tensor> * output,T value)1543 Status Mul(const std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, T value) {
1544   RETURN_UNEXPECTED_IF_NULL(output);
1545   RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), input->type(), output));
1546   auto iter_in = input->begin<T>();
1547   auto iter_out = (*output)->begin<T>();
1548   for (; iter_in != input->end<T>(); ++iter_in, ++iter_out) {
1549     *iter_out = (*iter_in) * value;
1550   }
1551   return Status::OK();
1552 }
1553 
1554 template <typename T>
Div(const std::shared_ptr<Tensor> input,std::shared_ptr<Tensor> * output,T value)1555 Status Div(const std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, T value) {
1556   RETURN_UNEXPECTED_IF_NULL(output);
1557   RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), input->type(), output));
1558   CHECK_FAIL_RETURN_UNEXPECTED(value != 0, "Div: invalid parameter, 'value' can not be zero.");
1559   auto iter_in = input->begin<T>();
1560   auto iter_out = (*output)->begin<T>();
1561   for (; iter_in != input->end<T>(); ++iter_in, ++iter_out) {
1562     *iter_out = (*iter_in) / value;
1563   }
1564   return Status::OK();
1565 }
1566 
1567 template <typename T>
Add(const std::shared_ptr<Tensor> input,std::shared_ptr<Tensor> * output,T value)1568 Status Add(const std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, T value) {
1569   RETURN_UNEXPECTED_IF_NULL(output);
1570   RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), input->type(), output));
1571   auto iter_in = input->begin<T>();
1572   auto iter_out = (*output)->begin<T>();
1573   for (; iter_in != input->end<T>(); ++iter_in, ++iter_out) {
1574     *iter_out = (*iter_in) + value;
1575   }
1576   return Status::OK();
1577 }
1578 
1579 template <typename T>
SubTensor(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int len)1580 Status SubTensor(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int len) {
1581   RETURN_UNEXPECTED_IF_NULL(output);
1582   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({len}), input->type(), output));
1583   RETURN_IF_NOT_OK(
1584     ValidateNoGreaterThan("SubTensor", "len", len, "size of input tensor", static_cast<int>(input->Size())));
1585   auto iter_in = input->begin<T>();
1586   auto iter_out = (*output)->begin<T>();
1587   for (; iter_out != (*output)->end<T>(); ++iter_in, ++iter_out) {
1588     *iter_out = *iter_in;
1589   }
1590   return Status::OK();
1591 }
1592 
1593 template <typename T>
TensorAdd(const std::shared_ptr<Tensor> & input,const std::shared_ptr<Tensor> & other,std::shared_ptr<Tensor> * output)1594 Status TensorAdd(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &other,
1595                  std::shared_ptr<Tensor> *output) {
1596   RETURN_UNEXPECTED_IF_NULL(output);
1597   CHECK_FAIL_RETURN_UNEXPECTED(input->shape() == other->shape(), "TensorAdd: input tensor shape must be the same.");
1598   CHECK_FAIL_RETURN_UNEXPECTED(input->type() == other->type(), "TensorAdd: input tensor type must be the same.");
1599 
1600   RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), input->type(), output));
1601   auto iter_in1 = input->begin<T>();
1602   auto iter_in2 = other->begin<T>();
1603   auto iter_out = (*output)->begin<T>();
1604   for (; iter_out != (*output)->end<T>(); ++iter_in1, ++iter_in2, ++iter_out) {
1605     *iter_out = (*iter_in1) + (*iter_in2);
1606   }
1607   return Status::OK();
1608 }
1609 
1610 template <typename T>
TensorSub(const std::shared_ptr<Tensor> & input,const std::shared_ptr<Tensor> & other,std::shared_ptr<Tensor> * output)1611 Status TensorSub(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &other,
1612                  std::shared_ptr<Tensor> *output) {
1613   RETURN_UNEXPECTED_IF_NULL(output);
1614   CHECK_FAIL_RETURN_UNEXPECTED(input->shape() == other->shape(), "TensorSub: input tensor shape must be the same.");
1615   CHECK_FAIL_RETURN_UNEXPECTED(input->type() == other->type(), "TensorSub: input tensor type must be the same.");
1616 
1617   RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), input->type(), output));
1618   auto iter_in1 = input->begin<T>();
1619   auto iter_in2 = other->begin<T>();
1620   auto iter_out = (*output)->begin<T>();
1621   for (; iter_out != (*output)->end<T>(); ++iter_in1, ++iter_in2, ++iter_out) {
1622     *iter_out = (*iter_in1) - (*iter_in2);
1623   }
1624   return Status::OK();
1625 }
1626 
1627 template <typename T>
TensorCat(const std::shared_ptr<Tensor> & input,const std::shared_ptr<Tensor> & other,std::shared_ptr<Tensor> * output)1628 Status TensorCat(const std::shared_ptr<Tensor> &input, const std::shared_ptr<Tensor> &other,
1629                  std::shared_ptr<Tensor> *output) {
1630   RETURN_UNEXPECTED_IF_NULL(output);
1631   CHECK_FAIL_RETURN_UNEXPECTED(input->type() == other->type(), "TensorCat: input tensor type must be the same.");
1632   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({input->shape()[-1] + other->shape()[-1]}), input->type(), output));
1633   auto iter_in1 = input->begin<T>();
1634   auto iter_in2 = other->begin<T>();
1635   auto iter_out = (*output)->begin<T>();
1636   for (; iter_in1 != input->end<T>(); ++iter_in1, ++iter_out) {
1637     *iter_out = *iter_in1;
1638   }
1639   for (; iter_in2 != other->end<T>(); ++iter_in2, ++iter_out) {
1640     *iter_out = *iter_in2;
1641   }
1642   return Status::OK();
1643 }
1644 
1645 template <typename T>
TensorRepeat(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int rank_repeat)1646 Status TensorRepeat(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int rank_repeat) {
1647   RETURN_UNEXPECTED_IF_NULL(output);
1648 
1649   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({rank_repeat, (input->shape()[-1])}), input->type(), output));
1650   auto iter_in = input->begin<T>();
1651   auto iter_out = (*output)->begin<T>();
1652   for (int i = 0; i < rank_repeat; i++) {
1653     auto iter_in = input->begin<T>();
1654     for (; iter_in != input->end<T>(); ++iter_in, ++iter_out) {
1655       *iter_out = *iter_in;
1656     }
1657   }
1658   return Status::OK();
1659 }
1660 
1661 template <typename T>
TensorRowReplace(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int row)1662 Status TensorRowReplace(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int row) {
1663   RETURN_UNEXPECTED_IF_NULL(output);
1664   auto iter_in = input->begin<T>();
1665   auto iter_out = (*output)->begin<T>() + static_cast<ptrdiff_t>((*output)->shape()[-1] * row);
1666   CHECK_FAIL_RETURN_UNEXPECTED(iter_out <= (*output)->end<T>(), "TensorRowReplace: pointer out of bounds");
1667   CHECK_FAIL_RETURN_UNEXPECTED(input->Size() <= (*output)->shape()[-1], "TensorRowReplace: pointer out of bounds");
1668   for (; iter_in != input->end<T>(); ++iter_in, ++iter_out) {
1669     *iter_out = *iter_in;
1670   }
1671   return Status::OK();
1672 }
1673 
1674 template <typename T>
TensorRowAt(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int rank_index)1675 Status TensorRowAt(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int rank_index) {
1676   RETURN_UNEXPECTED_IF_NULL(output);
1677   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({input->shape()[-1]}), input->type(), output));
1678   auto iter_in = input->begin<T>() + static_cast<ptrdiff_t>(input->shape()[-1] * rank_index);
1679   auto iter_out = (*output)->begin<T>();
1680   CHECK_FAIL_RETURN_UNEXPECTED(iter_in <= input->end<T>(), "TensorRowAt: pointer out of bounds");
1681   for (; iter_out != (*output)->end<T>(); ++iter_in, ++iter_out) {
1682     *iter_out = *iter_in;
1683   }
1684   return Status::OK();
1685 }
1686 
1687 template <typename T>
TensorRound(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)1688 Status TensorRound(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
1689   RETURN_UNEXPECTED_IF_NULL(output);
1690 
1691   RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), input->type(), output));
1692   auto iter_in = input->begin<T>();
1693   auto iter_out = (*output)->begin<T>();
1694   for (; iter_in != input->end<T>(); ++iter_in, ++iter_out) {
1695     *iter_out = round(*iter_in);
1696   }
1697   return Status::OK();
1698 }
1699 
1700 template <typename T>
ApplyProbabilityDistribution(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,DensityFunction density_function,std::mt19937 * rnd)1701 Status ApplyProbabilityDistribution(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output,
1702                                     DensityFunction density_function, std::mt19937 *rnd) {
1703   int channel_size = input->shape()[0] - 1;
1704   int time_size = input->shape()[-1] - 1;
1705   std::uniform_int_distribution<> dis_channel(0, channel_size);
1706   int random_channel = channel_size > 0 ? dis_channel(*rnd) : 0;
1707   std::uniform_int_distribution<> dis_time(0, time_size);
1708   int random_time = time_size > 0 ? dis_time(*rnd) : 0;
1709   int number_of_bits = 16;
1710   int up_scaling = static_cast<int>(pow(2, number_of_bits - 1) - 2);
1711   int down_scaling = static_cast<int>(pow(2, number_of_bits - 1));
1712 
1713   std::shared_ptr<Tensor> signal_scaled;
1714   RETURN_IF_NOT_OK(Mul<T>(input, &signal_scaled, up_scaling));
1715 
1716   std::shared_ptr<Tensor> signal_scaled_dis;
1717   RETURN_IF_NOT_OK(Tensor::CreateFromTensor(input, &signal_scaled_dis));
1718 
1719   if (density_function == DensityFunction::kRPDF) {
1720     auto iter_in = input->begin<T>();
1721     iter_in += (time_size + 1) * random_channel + random_time;
1722     auto RPDF = *(iter_in);
1723     RETURN_IF_NOT_OK(Add<T>(signal_scaled, &signal_scaled_dis, RPDF));
1724   } else if (density_function == DensityFunction::kGPDF) {
1725     int num_rand_variables = 6;
1726     auto iter_in = input->begin<T>();
1727     iter_in += (time_size + 1) * random_channel + random_time;
1728     auto gaussian = *(iter_in);
1729     for (int i = 0; i < num_rand_variables; i++) {
1730       int rand_channel = channel_size > 0 ? dis_channel(*rnd) : 0;
1731       int rand_time = time_size > 0 ? dis_time(*rnd) : 0;
1732 
1733       auto iter_in_rand = input->begin<T>();
1734       iter_in_rand += (time_size + 1) * rand_channel + rand_time;
1735       gaussian += *(iter_in_rand);
1736       *(iter_in_rand) = gaussian;
1737     }
1738     RETURN_IF_NOT_OK(Add<T>(signal_scaled, &signal_scaled_dis, gaussian));
1739   } else {
1740     int window_length = time_size + 1;
1741     std::shared_ptr<Tensor> float_bartlett;
1742     RETURN_IF_NOT_OK(Bartlett(&float_bartlett, window_length));
1743     std::shared_ptr<Tensor> type_convert_bartlett;
1744     RETURN_IF_NOT_OK(TypeCast(float_bartlett, &type_convert_bartlett, input->type()));
1745 
1746     int rank_repeat = channel_size + 1;
1747     std::shared_ptr<Tensor> TPDF;
1748     RETURN_IF_NOT_OK(TensorRepeat<T>(type_convert_bartlett, &TPDF, rank_repeat));
1749     RETURN_IF_NOT_OK(TensorAdd<T>(signal_scaled, TPDF, &signal_scaled_dis));
1750   }
1751   std::shared_ptr<Tensor> quantised_signal_scaled;
1752   RETURN_IF_NOT_OK(TensorRound<T>(signal_scaled_dis, &quantised_signal_scaled));
1753 
1754   std::shared_ptr<Tensor> quantised_signal;
1755   RETURN_IF_NOT_OK(Div<T>(quantised_signal_scaled, &quantised_signal, down_scaling));
1756   *output = quantised_signal;
1757   return Status::OK();
1758 }
1759 
1760 template <typename T>
AddNoiseShaping(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)1761 Status AddNoiseShaping(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
1762   std::shared_ptr<Tensor> dithered_waveform;
1763   RETURN_IF_NOT_OK(Tensor::CreateFromTensor(*output, &dithered_waveform));
1764   std::shared_ptr<Tensor> waveform;
1765   RETURN_IF_NOT_OK(Tensor::CreateFromTensor(input, &waveform));
1766 
1767   std::shared_ptr<Tensor> error;
1768   RETURN_IF_NOT_OK(TensorSub<T>(dithered_waveform, waveform, &error));
1769   for (int i = 0; i < error->shape()[0]; i++) {
1770     std::shared_ptr<Tensor> err;
1771     RETURN_IF_NOT_OK(TensorRowAt<T>(error, &err, i));
1772     std::shared_ptr<Tensor> tensor_zero;
1773     std::vector<T> vector_zero(1, 0);
1774     RETURN_IF_NOT_OK(Tensor::CreateFromVector(vector_zero, TensorShape({1}), &tensor_zero));
1775     std::shared_ptr<Tensor> error_offset;
1776     RETURN_IF_NOT_OK(TensorCat<T>(tensor_zero, err, &error_offset));
1777     int k = error->shape()[-1];
1778     std::shared_ptr<Tensor> fresh_error_offset;
1779     RETURN_IF_NOT_OK(SubTensor<T>(error_offset, &fresh_error_offset, k));
1780     RETURN_IF_NOT_OK(TensorRowReplace<T>(fresh_error_offset, &error, i));
1781   }
1782   std::shared_ptr<Tensor> noise_shaped;
1783   RETURN_IF_NOT_OK(TensorAdd<T>(dithered_waveform, error, &noise_shaped));
1784   *output = noise_shaped;
1785   return Status::OK();
1786 }
1787 
1788 /// \brief Apply dither effect.
1789 /// \param input/output: Tensor of shape <..., time>.
1790 /// \param density_function: The density function of a continuous random variable.
1791 /// \param noise_shaing: A filtering process that shapes the spectral energy of quantisation error.
1792 /// \return Status code.
1793 template <typename T>
Dither(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,DensityFunction density_function,bool noise_shaping,std::mt19937 * rnd)1794 Status Dither(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, DensityFunction density_function,
1795               bool noise_shaping, std::mt19937 *rnd) {
1796   TensorShape shape = input->shape();
1797   TensorShape new_shape({input->Size() / shape[-1], shape[-1]});
1798   RETURN_IF_NOT_OK(input->Reshape(new_shape));
1799 
1800   RETURN_IF_NOT_OK(ApplyProbabilityDistribution<T>(input, output, density_function, rnd));
1801   if (noise_shaping) {
1802     RETURN_IF_NOT_OK(AddNoiseShaping<T>(input, output));
1803   }
1804 
1805   RETURN_IF_NOT_OK((*output)->Reshape(shape));
1806   RETURN_IF_NOT_OK(input->Reshape(shape));
1807   return Status::OK();
1808 }
1809 
1810 /// \brief Apply GriffinLim to calculate waveform from linear scalar amplitude spectrogram.
1811 /// \param input Tensor of shape <..., freq, time>.
1812 /// \param output Tensor of shape <..., time>.
1813 /// \param n_fft Size of FFT.
1814 /// \param n_iter Number of iteration for phase recovery.
1815 /// \param win_length Window size for GriffinLim.
1816 /// \param hop_length Length of hop between STFT windows.
1817 /// \param window_type Window type for GriffinLim.
1818 /// \param power Exponent for the magnitude spectrogram.
1819 /// \param momentum The momentum for fast GriffinLim.
1820 /// \param length Length of the expected output waveform.
1821 /// \param rand_init Flag for random phase initialization or all-zero phase initialization.
1822 /// \param rnd Random generator.
1823 /// \return Status code.
1824 Status GriffinLim(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_fft, int32_t n_iter,
1825                   int32_t win_length, int32_t hop_length, WindowType window_type, float power, float momentum,
1826                   int32_t length, bool rand_init, std::mt19937 *rnd);
1827 
1828 /// \brief Calculate measure for VAD.
1829 template <typename T>
Measure(int32_t measure_len_ws,int32_t index,const Eigen::MatrixXd & samples,Eigen::MatrixXd * spectrum,Eigen::MatrixXd * noise_spectrum,const std::vector<T> & spectrum_window,int32_t spectrum_start,int32_t spectrum_end,const std::vector<T> & cepstrum_window,int32_t cepstrum_start,int32_t cepstrum_end,T noise_reduction_amount,T measure_smooth_time_mult,T noise_up_time_mult,T noise_down_time_mult,int32_t index_ns,int32_t boot_count,float * meas)1830 Status Measure(int32_t measure_len_ws, int32_t index, const Eigen::MatrixXd &samples, Eigen::MatrixXd *spectrum,
1831                Eigen::MatrixXd *noise_spectrum, const std::vector<T> &spectrum_window, int32_t spectrum_start,
1832                int32_t spectrum_end, const std::vector<T> &cepstrum_window, int32_t cepstrum_start,
1833                int32_t cepstrum_end, T noise_reduction_amount, T measure_smooth_time_mult, T noise_up_time_mult,
1834                T noise_down_time_mult, int32_t index_ns, int32_t boot_count, float *meas) {
1835   RETURN_UNEXPECTED_IF_NULL(spectrum);
1836   RETURN_UNEXPECTED_IF_NULL(noise_spectrum);
1837   RETURN_UNEXPECTED_IF_NULL(meas);
1838   CHECK_FAIL_RETURN_UNEXPECTED(spectrum->cols() == noise_spectrum->cols(),
1839                                "Measure: the number of columns of spectrum must be equal to noise_spectrum.");
1840   int samples_len_ns = samples.cols();
1841   CHECK_FAIL_RETURN_UNEXPECTED(samples_len_ns != 0, "Measure: the number of columns of samples cannot be zero.");
1842   int dft_len_ws = spectrum->cols();
1843   std::vector<float> dft_buf(dft_len_ws, 0);
1844   for (int ind = 0; ind < measure_len_ws; ind++) {
1845     int index_new = (ind == 0) ? index_ns : ((index_ns + ind) % samples_len_ns);
1846     dft_buf[ind] = samples(index, index_new) * spectrum_window[ind];
1847   }
1848 
1849   // use fft from eigen to calculate rfft
1850   Eigen::FFT<float> fft;
1851   std::vector<std::complex<float>> rfft_res;
1852   fft.fwd(rfft_res, dft_buf);
1853   // truncate redundant information in fft
1854   int rfft_len = rfft_res.size() - (rfft_res.size() - 1) / 2;
1855   rfft_res.resize(rfft_len);
1856 
1857   float mult = (boot_count >= 0) ? (static_cast<float>(boot_count) / (1.0 + boot_count))
1858                                  : static_cast<float>(measure_smooth_time_mult);
1859   std::vector<float> dft_buf_abs(spectrum_end - spectrum_start);
1860   for (int i = 0; i < spectrum_end - spectrum_start; i++) {
1861     auto dba_i = std::abs(rfft_res[i + spectrum_start]);
1862     // inplace revise spectrum
1863     (*spectrum)(index, spectrum_start + i) *= mult;
1864     (*spectrum)(index, spectrum_start + i) += dba_i * (1 - mult);
1865 
1866     dba_i = std::pow((*spectrum)(index, spectrum_start + i), TWO);
1867 
1868     float mult2 = 0;
1869     // new mult
1870     if (boot_count >= 0) {
1871       mult2 = 0;
1872     } else {
1873       if (dba_i > (*noise_spectrum)(index, spectrum_start + i)) {
1874         mult2 = noise_up_time_mult;
1875       } else {
1876         mult2 = noise_down_time_mult;
1877       }
1878     }
1879     // inplace revise noise spectrum
1880     (*noise_spectrum)(index, spectrum_start + i) *= mult2;
1881     (*noise_spectrum)(index, spectrum_start + i) += dba_i * (1 - mult2);
1882 
1883     dba_i = dba_i - noise_reduction_amount * (*noise_spectrum)(index, spectrum_start + i);
1884     dba_i = dba_i <= 0 ? 0 : std::sqrt(dba_i);
1885 
1886     dft_buf_abs.push_back(dba_i);
1887   }
1888 
1889   // cepstrum_buf
1890   std::vector<float> cepstrum_buf(dft_len_ws >> 1, 0);
1891   for (int i = 0; i < spectrum_end - spectrum_start; i++) {
1892     cepstrum_buf[spectrum_start + i] = dft_buf_abs[i] * cepstrum_window[i];
1893   }
1894   std::vector<std::complex<float>> rfft_res2;
1895   fft.fwd(rfft_res2, cepstrum_buf);
1896   rfft_len = rfft_res2.size() - (rfft_res.size() - 1) / TWO;
1897   rfft_res2.resize(rfft_len);
1898 
1899   float result = 0;
1900   for (int i = cepstrum_start; i < cepstrum_end; i++) {
1901     result += std::pow(std::abs(rfft_res2[i]), TWO);
1902   }
1903 
1904   result = result > 0 ? std::log(result / (cepstrum_end - cepstrum_start)) : INT_MIN;
1905   int base = 21;
1906   *meas = static_cast<float>(std::max(0.0f, base + result));
1907   return Status::OK();
1908 }
1909 
1910 /// \brief Update parameters in VAD calculation.
UpdateVadParams(int32_t * samples_index_ns,int32_t * pos,const int32_t samples_len_ns,int32_t * measure_timer_ns,const int32_t measure_period_ns,int32_t * measures_index,const int32_t measures_len,int32_t * boot_count,const int32_t boot_count_max,bool has_triggered,const int32_t num_measures_to_flush,int32_t * flushed_len_ns)1911 inline Status UpdateVadParams(int32_t *samples_index_ns, int32_t *pos, const int32_t samples_len_ns,
1912                               int32_t *measure_timer_ns, const int32_t measure_period_ns, int32_t *measures_index,
1913                               const int32_t measures_len, int32_t *boot_count, const int32_t boot_count_max,
1914                               bool has_triggered, const int32_t num_measures_to_flush, int32_t *flushed_len_ns) {
1915   RETURN_UNEXPECTED_IF_NULL(samples_index_ns);
1916   RETURN_UNEXPECTED_IF_NULL(pos);
1917   RETURN_UNEXPECTED_IF_NULL(measure_timer_ns);
1918   RETURN_UNEXPECTED_IF_NULL(measures_index);
1919   RETURN_UNEXPECTED_IF_NULL(boot_count);
1920   RETURN_UNEXPECTED_IF_NULL(flushed_len_ns);
1921 
1922   *samples_index_ns = *samples_index_ns + 1;
1923   *pos += 1;
1924   CHECK_FAIL_RETURN_UNEXPECTED(measures_len != 0, "UpdateVadParams: the length of measures cannot be zero.");
1925   CHECK_FAIL_RETURN_UNEXPECTED(samples_len_ns != 0,
1926                                "UpdateVadParams: the number of columns of samples cannot be zero.");
1927   *samples_index_ns = *samples_index_ns == samples_len_ns ? 0 : *samples_index_ns;
1928   if (*measure_timer_ns == 0) {
1929     *measure_timer_ns = measure_period_ns;
1930     *measures_index += 1;
1931     *measures_index = *measures_index % measures_len;
1932     *boot_count = (*boot_count >= 0) && (*boot_count == boot_count_max) ? -1 : *boot_count + 1;
1933   }
1934   if (has_triggered) {
1935     *flushed_len_ns = (measures_len - num_measures_to_flush) * measure_period_ns;
1936     *samples_index_ns = (*samples_index_ns + *flushed_len_ns) % samples_len_ns;
1937   }
1938   return Status::OK();
1939 }
1940 
1941 /// \brief Init spectrum window and cepstrum window for VAD.
FlushMeasures(int measures_len,int measures_index,const int row_ind,const Eigen::MatrixXd & measures,float trigger_level,int gap_len,int * num_measures_to_flush)1942 inline Status FlushMeasures(int measures_len, int measures_index, const int row_ind, const Eigen::MatrixXd &measures,
1943                             float trigger_level, int gap_len, int *num_measures_to_flush) {
1944   RETURN_UNEXPECTED_IF_NULL(num_measures_to_flush);
1945 
1946   int n = measures_len, k = measures_index;
1947   int j_trigger = n, j_zero = n, j = 0;
1948   for (int j = 0; j < n; j++) {
1949     if ((measures(row_ind, k) >= trigger_level) && (j <= (j_trigger + gap_len))) {
1950       j_trigger = j;
1951       j_zero = j_trigger;
1952     } else if ((measures(row_ind, k) == 0) && (j_trigger >= j_zero)) {
1953       j_zero = j;
1954     }
1955     k = (k + n - 1) % n;
1956   }
1957   j = std::min(j, j_zero);
1958   *num_measures_to_flush = std::min(std::max((*num_measures_to_flush), j), n);
1959 
1960   return Status::OK();
1961 }
1962 
1963 /// \brief Init spectrum window and cepstrum window for Vad.
1964 template <typename T>
InitWindows(std::vector<T> * spectrum_window,std::vector<T> * cepstrum_window)1965 Status InitWindows(std::vector<T> *spectrum_window, std::vector<T> *cepstrum_window) {
1966   RETURN_UNEXPECTED_IF_NULL(spectrum_window);
1967   RETURN_UNEXPECTED_IF_NULL(cepstrum_window);
1968 
1969   float half = 0.5;
1970   for (int i = 0; i < spectrum_window->size(); i++) {
1971     auto hann = half - half * std::cos(static_cast<T>(i) / spectrum_window->size() * PI * TWO);
1972     (*spectrum_window)[i] *= hann;
1973   }
1974 
1975   for (int i = 0; i < cepstrum_window->size(); i++) {
1976     auto hann = half - half * std::cos(static_cast<T>(i) / cepstrum_window->size() * PI * TWO);
1977     (*cepstrum_window)[i] *= hann;
1978   }
1979   return Status::OK();
1980 }
1981 
1982 /// \brief Voice activity detector.
1983 /// \param input/output Tensor of shape <..., time>.
1984 /// \param sample_rate Sample rate of audio signal.
1985 /// \param trigger_level The measurement level used to trigger activity detection.
1986 /// \param trigger_time The time constant (in seconds) used to help ignore short sounds.
1987 /// \param search_time The amount of audio (in seconds) to search for quieter/shorter sounds to include prior to
1988 ///     the detected trigger point.
1989 /// \param allowed_gap The allowed gap (in seconds) between quiteter/shorter sounds to include prior to the
1990 ///     detected trigger point.
1991 /// \param pre_trigger_time The amount of audio (in seconds) to preserve before the trigger point and any found
1992 ///     quieter/shorter bursts.
1993 /// \param boot_time The time for the initial noise estimate.
1994 /// \param noise_up_time Time constant used by the adaptive noise estimator, when the noise level is increasing.
1995 /// \param noise_down_time Time constant used by the adaptive noise estimator, when the noise level is decreasing.
1996 /// \param noise_reduction_amount The amount of noise reduction used in the detection algorithm.
1997 /// \param measure_freq The frequency of the algorithm’s processing.
1998 /// \param measure_duration The duration of measurement.
1999 /// \param measure_smooth_time The time constant used to smooth spectral measurements.
2000 /// \param hp_filter_freq The "Brick-wall" frequency of high-pass filter applied at the input to the detector
2001 ///     algorithm.
2002 /// \param lp_filter_freq The "Brick-wall" frequency of low-pass filter applied at the input to the detector
2003 ///     algorithm.
2004 /// \param hp_lifter_freq The "Brick-wall" frequency of high-pass lifter applied at the input to the detector
2005 ///     algorithm.
2006 /// \param lp_lifter_freq The "Brick-wall" frequency of low-pass lifter applied at the input to the detector
2007 ///     algorithm.
2008 /// \return Status code.
2009 template <typename T>
Vad(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int sample_rate,T trigger_level,T trigger_time,T search_time,T allowed_gap,T pre_trigger_time,T boot_time,T noise_up_time,T noise_down_time,T noise_reduction_amount,T measure_freq,T measure_duration,T measure_smooth_time,T hp_filter_freq,T lp_filter_freq,T hp_lifter_freq,T lp_lifter_freq)2010 Status Vad(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int sample_rate, T trigger_level,
2011            T trigger_time, T search_time, T allowed_gap, T pre_trigger_time, T boot_time, T noise_up_time,
2012            T noise_down_time, T noise_reduction_amount, T measure_freq, T measure_duration, T measure_smooth_time,
2013            T hp_filter_freq, T lp_filter_freq, T hp_lifter_freq, T lp_lifter_freq) {
2014   const int measure_len_ws = static_cast<int>(sample_rate * measure_duration + 0.5);
2015   const int measure_len_ns = measure_len_ws;
2016 
2017   int dft_len_ws = 16;
2018   for (; dft_len_ws < measure_len_ws;) {
2019     dft_len_ws *= TWO;
2020   }
2021 
2022   CHECK_FAIL_RETURN_UNEXPECTED(measure_freq != 0, "Vad: measure_freq cannot be zero.");
2023   const int measure_period_ns = static_cast<int>(sample_rate / measure_freq + 0.5);
2024   const int measures_len = std::ceil(search_time * measure_freq);
2025   const int search_pre_trigger_len_ns = static_cast<int>(measures_len * measure_period_ns);
2026   const int gap_len = static_cast<int>(allowed_gap * measure_freq + 0.5);
2027   const int fixed_pre_trigger_len_ns = static_cast<int>(pre_trigger_time * sample_rate + 0.5);
2028   const int samples_len_ns = fixed_pre_trigger_len_ns + search_pre_trigger_len_ns + measure_len_ns;
2029 
2030   CHECK_FAIL_RETURN_UNEXPECTED(sample_rate != 0, "Vad: sample_rate cannot be zero.");
2031   auto spectrum_start = static_cast<int>(hp_filter_freq / sample_rate * dft_len_ws + 0.5);
2032   spectrum_start = std::max(spectrum_start, 1);
2033   auto spectrum_end = static_cast<int>(lp_filter_freq / sample_rate * dft_len_ws + 0.5);
2034   spectrum_end = std::min(spectrum_end, dft_len_ws / TWO);
2035   CHECK_FAIL_RETURN_UNEXPECTED(spectrum_end > spectrum_start,
2036                                "Vad: the end of spectrum must be greater than the start. Check if `hp_filter_freq` is "
2037                                "too large or `lp_filter_freq` is too small.");
2038 
2039   CHECK_FAIL_RETURN_UNEXPECTED(lp_lifter_freq != 0, "Vad: lp_lifter_freq cannot be zero.");
2040   CHECK_FAIL_RETURN_UNEXPECTED(hp_lifter_freq != 0, "Vad: hp_lifter_freq cannot be zero.");
2041   int cepstrum_start = std::ceil(sample_rate * 0.5 / lp_lifter_freq);
2042   int cepstrum_end = std::floor(sample_rate * 0.5 / hp_lifter_freq);
2043   cepstrum_end = std::min(cepstrum_end, dft_len_ws / (TWO * TWO));
2044   CHECK_FAIL_RETURN_UNEXPECTED(cepstrum_end > cepstrum_start,
2045                                "Vad: the end of cepstrum must be greater than the start. Check if `hp_lifter_freq` is "
2046                                "too large or `lp_lifter_freq` is too small.");
2047   // init spectrum & cepstrum window
2048   std::vector<T> spectrum_window(measure_len_ws, static_cast<T>(TWO / std::sqrt(static_cast<T>(measure_len_ws))));
2049   std::vector<T> cepstrum_window(spectrum_end - spectrum_start,
2050                                  static_cast<T>(TWO / std::sqrt(static_cast<T>(spectrum_end) - spectrum_start)));
2051   RETURN_IF_NOT_OK(InitWindows<T>(&spectrum_window, &cepstrum_window));
2052 
2053   T noise_up_time_mult = std::exp(-1.0 / (noise_up_time * measure_freq));
2054   T noise_down_time_mult = std::exp(-1.0 / (noise_down_time * measure_freq));
2055   T measure_smooth_time_mult = std::exp(-1.0 / (measure_smooth_time * measure_freq));
2056   T trigger_meas_time_mult = std::exp(-1.0 / (trigger_time * measure_freq));
2057 
2058   auto boot_count_max = static_cast<int>(boot_time * measure_freq - 0.5);
2059   auto measure_timer_ns = measure_len_ns;
2060   int boot_count = 0, measures_index = 0, flushed_len_ns = 0, samples_index_ns = 0;
2061 
2062   // pack batch
2063   TensorShape input_shape = input->shape();
2064   TensorShape to_shape({input->Size() / input_shape[-1], input_shape[-1]});
2065   RETURN_IF_NOT_OK(input->Reshape(to_shape));
2066   int n_channels = to_shape[0], ilen = to_shape[1];
2067   std::vector<T> mean_meas(n_channels, 0);
2068   Eigen::MatrixXd samples = Eigen::MatrixXd::Zero(n_channels, samples_len_ns);
2069   Eigen::MatrixXd spectrum = Eigen::MatrixXd::Zero(n_channels, dft_len_ws);
2070   Eigen::MatrixXd noise_spectrum = Eigen::MatrixXd::Zero(n_channels, dft_len_ws);
2071   Eigen::MatrixXd measures = Eigen::MatrixXd::Zero(n_channels, measures_len);
2072 
2073   bool has_triggered = false;
2074   int num_measures_to_flush = 0, pos = 0;
2075 
2076   // convert input to eigen mat
2077   auto wave_ptr = &*input->begin<T>();
2078   Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> waveform_t(wave_ptr, ilen, n_channels);
2079   auto waveform = waveform_t.transpose();
2080   while ((pos < ilen) && (!has_triggered)) {
2081     measure_timer_ns -= 1;
2082     for (int i = 0; i < n_channels; i++) {
2083       samples(i, samples_index_ns) = waveform(i, pos);
2084 
2085       if (measure_timer_ns == 0) {
2086         int index_ns = (samples_index_ns + samples_len_ns - measure_len_ns) % samples_len_ns;
2087         // measure
2088         float meas = 0;
2089         RETURN_IF_NOT_OK(Measure(measure_len_ws, i, samples, &spectrum, &noise_spectrum, spectrum_window,
2090                                  spectrum_start, spectrum_end, cepstrum_window, cepstrum_start, cepstrum_end,
2091                                  noise_reduction_amount, measure_smooth_time_mult, noise_up_time_mult,
2092                                  noise_down_time_mult, index_ns, boot_count, &meas));
2093         measures(i, measures_index) = meas;
2094         mean_meas[i] = mean_meas[i] * trigger_meas_time_mult + meas * (1.0 - trigger_meas_time_mult);
2095 
2096         has_triggered = has_triggered || (mean_meas[i] >= trigger_level);
2097         if (has_triggered) {
2098           RETURN_IF_NOT_OK(
2099             FlushMeasures(measures_len, measures_index, i, measures, trigger_level, gap_len, &num_measures_to_flush));
2100         }
2101       }
2102     }
2103     RETURN_IF_NOT_OK(UpdateVadParams(&samples_index_ns, &pos, samples_len_ns, &measure_timer_ns, measure_period_ns,
2104                                      &measures_index, measures_len, &boot_count, boot_count_max, has_triggered,
2105                                      num_measures_to_flush, &flushed_len_ns));
2106   }
2107 
2108   // results truncate
2109   int new_col_ind = pos - samples_len_ns + flushed_len_ns;
2110   if (new_col_ind < (-1 * waveform.cols())) {
2111     new_col_ind = 0;
2112   } else if (new_col_ind < 0) {
2113     new_col_ind += ilen;
2114   }
2115   int new_cols = ilen - new_col_ind;
2116   auto res = waveform.rightCols(new_cols);
2117   // unpack
2118   std::shared_ptr<Tensor> res_tensor;
2119   Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> mat_res = res.transpose();
2120   std::vector<T> res_vec(mat_res.data(), mat_res.data() + mat_res.size());
2121   RETURN_IF_NOT_OK(Tensor::CreateFromVector(res_vec, TensorShape({n_channels, new_cols}), &res_tensor));
2122   auto reshape_vec = input_shape.AsVector();
2123   reshape_vec[input_shape.Size() - 1] = new_cols;
2124   RETURN_IF_NOT_OK(res_tensor->Reshape(TensorShape(reshape_vec)));
2125   *output = res_tensor;
2126   return Status::OK();
2127 }
2128 
2129 /// \brief Flip tensor in last dimension.
2130 /// \param input: Tensor of shape <..., time>
2131 /// \Returns Status code.
2132 template <typename T>
FlipLastDim(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)2133 Status FlipLastDim(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
2134   // Create input copy
2135   std::shared_ptr<Tensor> res_tensor;
2136   RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), input->type(), &res_tensor));
2137   int64_t step_length = input->shape()[-1];
2138   auto itr = input->begin<T>();
2139   auto tar_itr = res_tensor->begin<T>();
2140 
2141   // Flip
2142   while (itr != input->end<T>()) {
2143     auto axis_begin = itr;
2144     auto axis_end = itr + static_cast<ptrdiff_t>(step_length);
2145     itr = axis_end;
2146 
2147     // Reversed copy input value T to res_tensor cache
2148     while (axis_begin != axis_end) {
2149       axis_end--;
2150       *tar_itr = *axis_end;
2151       tar_itr++;
2152     }
2153   }
2154   *output = res_tensor;
2155   return Status::OK();
2156 }
2157 
2158 /// \brief Perform an IIR filter forward and backward to a waveform.
2159 /// \param input/output: Tensor of shape <..., time>
2160 /// \param a_coeffs: denominator coefficients of difference equation of dimension of (n_order + 1).
2161 /// \param b_coeffs: numerator coefficients of difference equation of dimension of (n_order + 1).
2162 /// \param clamp: If True, clamp the output signal to be in the range [-1, 1]. Default: True.
2163 /// \return Status code
2164 template <typename T>
Filtfilt(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,std::vector<T> a_coeffs,std::vector<T> b_coeffs,bool clamp)2165 Status Filtfilt(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, std::vector<T> a_coeffs,
2166                 std::vector<T> b_coeffs, bool clamp) {
2167   std::shared_ptr<Tensor> forward;
2168   std::shared_ptr<Tensor> reversed_forward;
2169   std::shared_ptr<Tensor> backward;
2170   std::shared_ptr<Tensor> reversed_backward;
2171 
2172   RETURN_IF_NOT_OK(LFilter(input, &forward, a_coeffs, b_coeffs, false));
2173   RETURN_IF_NOT_OK(FlipLastDim<T>(forward, &reversed_forward));
2174   RETURN_IF_NOT_OK(LFilter(reversed_forward, &backward, a_coeffs, b_coeffs, clamp));
2175   RETURN_IF_NOT_OK(FlipLastDim<T>(backward, &reversed_backward));
2176 
2177   *output = reversed_backward;
2178 
2179   return Status::OK();
2180 }
2181 
2182 /// \brief Resample a signal from one frequency to another. A resampling method can be given.
2183 /// \param[in] input Input tensor.
2184 /// \param[out] output Output tensor.
2185 /// \param[in] orig_freq The original frequency of the signal.
2186 /// \param[in] des_freq The desired frequency.
2187 /// \param[in] resample_method The resample method.
2188 /// \param[in] lowpass_filter_width Controls the sharpness of the filter, more means sharper but less efficient.
2189 /// \param[in] rolloff The roll-off frequency of the filter, as a fraction of the Nyquist. Lower values reduce
2190 ///     anti-aliasing, but also reduce some of the highest frequencies.
2191 /// \param[in] beta The shape parameter used for kaiser window.
2192 /// \return Status return code.
2193 Status Resample(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float orig_freq, float des_freq,
2194                 ResampleMethod resample_method, int32_t lowpass_filter_width, float rolloff, float beta);
2195 
2196 /// \brief Create LFCC for a raw audio signal.
2197 /// \param[in] input Input tensor.
2198 /// \param[out] output Output tensor.
2199 /// \param[in] sample_rate Sample rate of audio signal.
2200 /// \param[in] n_filter Number of linear filters to apply.
2201 /// \param[in] n_lfcc Number of lfc coefficients to retain.
2202 /// \param[in] dct_type Type of DCT (discrete cosine transform) to use.
2203 /// \param[in] log_lf Whether to use log-lf spectrograms instead of db-scaled.
2204 /// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins.
2205 /// \param[in] win_length Window size.
2206 /// \param[in] hop_length Length of hop between STFT windows.
2207 /// \param[in] f_min Minimum frequency.
2208 /// \param[in] f_max Maximum frequency.
2209 /// \param[in] pad Two sided padding of signal.
2210 /// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
2211 /// \param[in] power Exponent for the magnitude spectrogram, (must be > 0) e.g., 1 for energy, 2 for power, etc.
2212 /// \param[in] normalized Whether to normalize by magnitude after stft.
2213 /// \param[in] center Whether to pad waveform on both sides so that the tt-th frame is centered at time t
2214 ///     t*hop_length.
2215 /// \param[in] pad_mode Controls the padding method used when center is True.
2216 /// \param[in] onesided Controls whether to return half of results to avoid redundancy.
2217 /// \param[in] norm Norm to use.
2218 /// \return Status code.
2219 Status LFCC(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
2220             int32_t n_filter, int32_t n_lfcc, int32_t dct_type, bool log_lf, int32_t n_fft, int32_t win_length,
2221             int32_t hop_length, float f_min, float f_max, int32_t pad, WindowType window, float power, bool normalized,
2222             bool center, BorderType pad_mode, bool onesided, NormMode norm);
2223 
2224 /// \brief Create MelSpectrogram for a raw audio signal.
2225 /// \param[in] input Input tensor.
2226 /// \param[out] output Output tensor.
2227 /// \param[in] sample_rate Sample rate of audio signal.
2228 /// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins.
2229 /// \param[in] win_length Window size.
2230 /// \param[in] hop_length Length of hop between STFT windows.
2231 /// \param[in] f_min Minimum frequency, which must be non negative.
2232 /// \param[in] f_max Maximum frequency, which must be positive.
2233 /// \param[in] pad Two sided padding of signal.
2234 /// \param[in] n_mels Number of mel filter, which must be positive.
2235 /// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
2236 /// \param[in] power Exponent for the magnitude spectrogram, (must be > 0) e.g., 1 for energy, 2 for power, etc.
2237 /// \param[in] normalized Whether to normalize by magnitude after stft.
2238 /// \param[in] center Whether to pad waveform on both sides.
2239 /// \param[in] pad_mode controls the padding method used when center is True.
2240 /// \param[in] onesided controls whether to return half of results to avoid redundancy.
2241 /// \param[in] norm If 'slaney', divide the triangular mel weights by the width of the mel band (area normalization).
2242 /// \param[in] mel_scale Scale to use: htk or slaney.
2243 /// \return Status return code.
2244 Status MelSpectrogram(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
2245                       int32_t n_fft, int32_t win_length, int32_t hop_length, float f_min, float f_max, int32_t pad,
2246                       int32_t n_mels, WindowType window, float power, bool normalized, bool center, BorderType pad_mode,
2247                       bool onesided, NormType norm, MelType mel_scale);
2248 
2249 /// \brief Create MFCC for a raw audio signal.
2250 /// \param[in] input Input tensor.
2251 /// \param[out] output Output tensor.
2252 /// \param[in] sample_rate Sample rate of audio signal.
2253 /// \param[in] n_mfcc Number of mfc coefficients to retain.
2254 /// \param[in] dct_type Type of DCT (discrete cosine transform) to use.
2255 /// \param[in] log_mels Whether to use log-mel spectrograms instead of db-scaled.
2256 /// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins.
2257 /// \param[in] win_length Window size.
2258 /// \param[in] hop_length Length of hop between STFT windows.
2259 /// \param[in] f_min Minimum frequency.
2260 /// \param[in] f_max Maximum frequency.
2261 /// \param[in] pad Two sided padding of signal.
2262 /// \param[in] n_mels Number of mel filterbanks.
2263 /// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
2264 /// \param[in] power Exponent for the magnitude spectrogram, (must be > 0) e.g., 1 for energy, 2 for power, etc.
2265 /// \param[in] normalized Whether to normalize by magnitude after stft.
2266 /// \param[in] center Whether to pad waveform on both sides.
2267 /// \param[in] pad_mode Controls the padding method used when center is True.
2268 /// \param[in] onesided Controls whether to return half of results to avoid redundancy.
2269 /// \param[in] norm Norm to use.
2270 /// \param[in] norm_M If 'slaney', divide the triangular mel weights by the width of the mel band (area normalization).
2271 /// \param[in] mel_scale Scale to use: htk or slaney.
2272 /// \return Status return code.
2273 Status MFCC(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate, int32_t n_mfcc,
2274             int32_t dct_type, bool log_mels, int32_t n_fft, int32_t win_length, int32_t hop_length, float f_min,
2275             float f_max, int32_t pad, int32_t n_mels, WindowType window, float power, bool normalized, bool center,
2276             BorderType pad_mode, bool onesided, NormType norm, NormMode norm_M, MelType mel_scale);
2277 
2278 /// \brief Shift the pitch of a waveform by steps.
2279 /// \param[in] input Input tensor.
2280 /// \param[out] output Output tensor.
2281 /// \param[in] sample_rate Sample rate of audio signal.
2282 /// \param[in] n_steps The steps to shift audio signal.
2283 /// \param[in] bins_per_octave The number of steps per octave
2284 /// \param[in] n_fft Size of FFT, creates n_fft // 2 + 1 bins.
2285 /// \param[in] win_length Window size.
2286 /// \param[in] hop_length Length of hop between STFT windows.
2287 /// \param[in] window A function to create a window tensor that is applied/multiplied to each frame/window.
2288 /// \return Status return code.
2289 Status PitchShift(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
2290                   int32_t n_steps, int32_t bins_per_octave, int32_t n_fft, int32_t win_length, int32_t hop_length,
2291                   WindowType window);
2292 }  // namespace dataset
2293 }  // namespace mindspore
2294 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_AUDIO_UTILS_H_
2295