/**
 * Copyright 2021-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "minddata/dataset/audio/kernels/audio_utils.h"

#include <fstream>

#include "mindspore/core/base/float16.h"
#include "minddata/dataset/core/type_id.h"
#include "minddata/dataset/util/random.h"
#include "utils/file_utils.h"

namespace mindspore {
namespace dataset {
// global lock for the cyl_bessel_i and cyl_bessel_if functions
std::mutex cyl_bessel_mux_;

// Compute the number of worker threads and the size of the chunk each thread processes.
Status CountThreadNums(size_t input_size, float block_size, size_t *task_num, size_t *once_compute_size) {
  CHECK_FAIL_RETURN_UNEXPECTED(block_size > 0, "Invalid data, the value of 'block_size' should be greater than 0.");
  size_t audio_thread_num = 4;
  size_t thread_num =
    input_size < block_size * audio_thread_num ? std::ceil(input_size / block_size) : audio_thread_num;
  *once_compute_size = (input_size + thread_num - 1) / thread_num;
  if (*once_compute_size % TWO == 1) {
    *once_compute_size += 1;
  }
  CHECK_FAIL_RETURN_UNEXPECTED((*once_compute_size) != 0,
                               "Invalid data, the value of 'once_compute_size' should not be 0, but got 0.");
  *task_num = input_size / (*once_compute_size);
  if (input_size % (*once_compute_size) != 0) {
    *task_num += 1;
  }
  return Status::OK();
}

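/// \brief Launch a task in parallel on the calling thread plus worker threads.
/// \param[in] task - Callable invoked as task(start, end, task_id, task_num) over element index ranges.
/// \param[in] input_size - Total number of elements to process.
/// \param[in] block_size - Number of elements that justifies one additional thread.
/// \return Status return code.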
Status AudioParallelLaunch(const std::function<void(size_t, size_t, size_t, size_t)> &task, size_t input_size,
                           float block_size) {
  std::vector<std::thread> threads;
  size_t task_num = 0;
  size_t once_compute_size = 0;
  RETURN_IF_NOT_OK(CountThreadNums(input_size, block_size, &task_num, &once_compute_size));

  for (size_t i = 0; i < task_num - 1; i++) {
    size_t start = i * once_compute_size;
    size_t end = start + once_compute_size;
    threads.push_back(std::thread(task, start, end, i, task_num));
  }

  size_t start = (task_num - 1) * once_compute_size;
  size_t end = input_size;
  task(start, end, task_num - 1, task_num);

  for (auto &thread : threads) {
    thread.join();
  }
  return Status::OK();
}

/// \brief Calculate complex tensor angle.
/// \param[in] input - Input tensor, must be complex, <channel, freq, time, complex=2>.
/// \param[out] output - Complex tensor angle.
/// \return Status return code.
template <typename T>
Status ComplexAngle(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  // check complex
  RETURN_IF_NOT_OK(ValidateTensorShape("ComplexAngle", input->IsComplex(), "<..., complex=2>"));
  TensorShape input_shape = input->shape();
  TensorShape out_shape({input_shape[0], input_shape[1], input_shape[2]});
  std::vector<T> phase(input_shape[0] * input_shape[1] * input_shape[2]);
  size_t ind = 0;

  for (auto itr = input->begin<T>(); itr != input->end<T>(); itr++, ind++) {
    auto x = (*itr);
    itr++;
    auto y = (*itr);
    phase[ind] = atan2(y, x);
  }

  std::shared_ptr<Tensor> out_t;
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(phase, out_shape, &out_t));
  phase.clear();
  phase.shrink_to_fit();
  *output = out_t;
  return Status::OK();
}

/// \brief Calculate complex tensor abs.
/// \param[in] input - Input tensor, must be complex, <channel, freq, time, complex=2>.
/// \param[out] output - Complex tensor abs.
/// \return Status return code.
template <typename T>
Status ComplexAbs(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  // check complex
  RETURN_IF_NOT_OK(ValidateTensorShape("ComplexAbs", input->IsComplex(), "<..., complex=2>"));
  TensorShape input_shape = input->shape();
  TensorShape out_shape({input_shape[0], input_shape[1], input_shape[2]});
  std::vector<T> abs(input_shape[0] * input_shape[1] * input_shape[2]);
  size_t ind = 0;
  for (auto itr = input->begin<T>(); itr != input->end<T>(); itr++, ind++) {
    T x = (*itr);
    itr++;
    T y = (*itr);
    abs[ind] = sqrt(pow(y, 2) + pow(x, 2));
  }

  std::shared_ptr<Tensor> out_t;
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(abs, out_shape, &out_t));
  *output = out_t;
  return Status::OK();
}

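/// \brief Create a linear-frequency triangular filterbank.
/// \param[out] output - Filterbank tensor of shape <n_freqs, n_filter>.
/// \param[in] n_freqs - Number of frequency bins.
/// \param[in] f_min - Minimum frequency in Hz.
/// \param[in] f_max - Maximum frequency in Hz.
/// \param[in] n_filter - Number of linear filters.
/// \param[in] sample_rate - Sample rate of the audio in Hz.
/// \return Status return code.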
Status CreateLinearFbanks(std::shared_ptr<Tensor> *output, int32_t n_freqs, float f_min, float f_max, int32_t n_filter,
                          int32_t sample_rate) {
  // all_freqs is the frequency grid on which the filterbank is constructed.
  std::shared_ptr<Tensor> all_freqs;
  // the sampling frequency must be at least twice the highest frequency of the signal.
  const double signal_times = 2;
  RETURN_IF_NOT_OK(Linspace<float>(&all_freqs, 0, sample_rate / signal_times, n_freqs));
  // f_pts is the sequence of filter boundary points, evenly spaced in (f_min, f_max).
  std::shared_ptr<Tensor> f_pts;
  const int32_t bias = 2;
  RETURN_IF_NOT_OK(Linspace<float>(&f_pts, f_min, f_max, n_filter + bias));
  // create filterbank
  std::shared_ptr<Tensor> fb;
  RETURN_IF_NOT_OK(CreateTriangularFilterbank<float>(&fb, all_freqs, f_pts));
  *output = fb;
  return Status::OK();
}

/// \brief Reconstruct complex tensor from norm and angle.
/// \param[in] abs - The absolute value of the complex tensor.
/// \param[in] angle - The angle of the complex tensor.
/// \param[out] output - Complex tensor, <channel, freq, time, complex=2>.
/// \return Status return code.
template <typename T>
Status Polar(const std::shared_ptr<Tensor> &abs, const std::shared_ptr<Tensor> &angle,
             std::shared_ptr<Tensor> *output) {
  // check shape
  if (abs->shape() != angle->shape()) {
    std::string err_msg = "Polar: the shape of input tensor abs and angle should be the same, but got: abs " +
                          abs->shape().ToString() + " and angle " + angle->shape().ToString();
    LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  TensorShape input_shape = abs->shape();
  TensorShape out_shape({input_shape[0], input_shape[1], input_shape[2], 2});
  std::vector<T> complex_vec(input_shape[0] * input_shape[1] * input_shape[2] * 2);
  size_t ind = 0;
  auto itr_abs = abs->begin<T>();
  auto itr_angle = angle->begin<T>();

  for (; itr_abs != abs->end<T>(); itr_abs++, itr_angle++) {
    complex_vec[ind++] = cos(*itr_angle) * (*itr_abs);
    complex_vec[ind++] = sin(*itr_angle) * (*itr_abs);
  }

  std::shared_ptr<Tensor> out_t;
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(complex_vec, out_shape, &out_t));
  *output = out_t;
  return Status::OK();
}

/// \brief Pad complex tensor.
/// \param[in] input - The complex tensor.
/// \param[in] length - The length of padding.
/// \param[in] dim - The dim index for padding.
/// \param[out] output - Complex tensor, <channel, freq, time, complex=2>.
/// \return Status return code.
template <typename T>
Status PadComplexTensor(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int length, size_t dim) {
  TensorShape input_shape = input->shape();
  std::vector<int64_t> pad_shape_vec = {input_shape[0], input_shape[1], input_shape[2], input_shape[3]};
  pad_shape_vec[dim] += static_cast<int64_t>(length);
  TensorShape input_shape_with_pad(pad_shape_vec);
  std::vector<T> in_vect(input_shape_with_pad[0] * input_shape_with_pad[1] * input_shape_with_pad[2] *
                         input_shape_with_pad[3]);
  auto itr_input = input->begin<T>();
  int64_t input_cnt = 0;
  /*lint -e{446} ind is modified in the body of the for loop */
  for (auto ind = 0; ind < static_cast<int>(in_vect.size()); ind++) {
    in_vect[ind] = (*itr_input);
    input_cnt = (input_cnt + 1) % (input_shape[2] * input_shape[3]);
    itr_input++;
    // the last dim of a complex tensor equals 2, so 2 * length zeros are filled per row
    if (input_cnt == 0 && ind != 0) {
      for (int c = 0; c < length * 2; c++) {
        in_vect[++ind] = 0.0f;
      }
    }
  }
  std::shared_ptr<Tensor> out_t;
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(in_vect, input_shape_with_pad, &out_t));
  *output = out_t;
  return Status::OK();
}

/// \brief Calculate phase.
/// \param[in] angle_0 - The phase angle of the earlier frame slice.
/// \param[in] angle_1 - The phase angle of the later frame slice.
/// \param[in] phase_advance - The expected phase advance per frame.
/// \param[in] phase_time0 - The phase at time 0.
/// \param[out] output - Phase tensor.
/// \return Status return code.
template <typename T>
Status Phase(const std::shared_ptr<Tensor> &angle_0, const std::shared_ptr<Tensor> &angle_1,
             const std::shared_ptr<Tensor> &phase_advance, const std::shared_ptr<Tensor> &phase_time0,
             std::shared_ptr<Tensor> *output) {
  TensorShape phase_shape = angle_0->shape();
  std::vector<T> phase(phase_shape[0] * phase_shape[1] * phase_shape[2]);
  auto itr_angle_0 = angle_0->begin<T>();
  auto itr_angle_1 = angle_1->begin<T>();
  auto itr_pa = phase_advance->begin<T>();
  for (int ind = 0, input_cnt = 0; itr_angle_0 != angle_0->end<T>(); itr_angle_0++, itr_angle_1++, ind++) {
    if (ind != 0 && ind % phase_shape[2] == 0) {
      itr_pa++;
      if (itr_pa == phase_advance->end<T>()) {
        itr_pa = phase_advance->begin<T>();
      }
      input_cnt++;
    }
    phase[ind] = (*itr_angle_1) - (*itr_angle_0) - (*itr_pa);
    phase[ind] = phase[ind] - 2 * PI * round(phase[ind] / (2 * PI)) + (*itr_pa);
  }

  // concat phase time 0
  size_t ind = 0;
  auto itr_p0 = phase_time0->begin<T>();
  (void)phase.insert(phase.begin(), (*itr_p0));
  itr_p0++;
  while (itr_p0 != phase_time0->end<T>()) {
    ind += phase_shape[2];
    phase[ind] = (*itr_p0);
    itr_p0++;
  }
  (void)phase.erase(phase.begin() + static_cast<int>(angle_0->Size()), phase.end());

  // accumulate phase along the time axis
  for (ind = 0; ind < static_cast<int64_t>(phase.size()); ind++) {
    if (ind % phase_shape[2] != 0) {
      phase[ind] = phase[ind] + phase[ind - 1];
    }
  }
  std::shared_ptr<Tensor> phase_tensor;
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(phase, phase_shape, &phase_tensor));
  *output = phase_tensor;
  return Status::OK();
}

/// \brief Calculate magnitude by linear interpolation.
/// \param[in] abs_0 - The magnitude of the earlier frame slice.
/// \param[in] abs_1 - The magnitude of the later frame slice.
/// \param[in] alphas - The interpolation weights.
/// \param[out] output - Magnitude tensor.
/// \return Status return code.
template <typename T>
Status Mag(const std::shared_ptr<Tensor> &abs_0, const std::shared_ptr<Tensor> &abs_1, std::shared_ptr<Tensor> *output,
           const std::vector<T> &alphas) {
  TensorShape mag_shape = abs_0->shape();
  std::vector<T> mag(mag_shape[0] * mag_shape[1] * mag_shape[2]);
  auto itr_abs_0 = abs_0->begin<T>();
  auto itr_abs_1 = abs_1->begin<T>();
  for (auto ind = 0; itr_abs_0 != abs_0->end<T>(); itr_abs_0++, itr_abs_1++, ind++) {
    mag[ind] = alphas[ind % mag_shape[2]] * (*itr_abs_1) + (1 - alphas[ind % mag_shape[2]]) * (*itr_abs_0);
  }
  std::shared_ptr<Tensor> mag_tensor;
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(mag, mag_shape, &mag_tensor));
  *output = mag_tensor;
  return Status::OK();
}

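/// \brief Stretch an STFT in time at a given rate, without changing the pitch.
/// \param[in] input - Complex tensor of shape <..., freq, time, complex=2>.
/// \param[in] rate - Stretch factor.
/// \param[in] phase_advance - Expected phase advance per frame for each frequency bin.
/// \param[out] output - Time-stretched complex tensor.
/// \return Status return code.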
template <typename T>
Status TimeStretch(std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, float rate,
                   const std::shared_ptr<Tensor> &phase_advance) {
  RETURN_UNEXPECTED_IF_NULL(input);
  RETURN_UNEXPECTED_IF_NULL(output);
  // pack <..., freq, time, complex>
  TensorShape input_shape = input->shape();
  TensorShape toShape({input->Size() / (input_shape[-1] * input_shape[-2] * input_shape[-3]), input_shape[-3],
                       input_shape[-2], input_shape[-1]});
  RETURN_IF_NOT_OK(input->Reshape(toShape));
  if (std::fabs(rate - 1.0) <= std::numeric_limits<float>::epsilon()) {
    *output = input;
    return Status::OK();
  }
  // calculate time steps and alphas
  std::vector<dsize_t> time_steps_0, time_steps_1;
  std::vector<T> alphas;
  for (int ind = 0;; ind++) {
    T val = static_cast<float>(ind) * rate;
    if (val >= input_shape[-2]) {
      break;
    }
    auto val_int = static_cast<dsize_t>(val);
    time_steps_0.push_back(val_int);
    time_steps_1.push_back(val_int + 1);
    alphas.push_back(fmod(val, 1));
  }

  // calculate phase at time 0
  std::shared_ptr<Tensor> spec_time0, phase_time0;
  RETURN_IF_NOT_OK(
    input->Slice(&spec_time0, std::vector<SliceOption>({SliceOption(true), SliceOption(true),
                                                        SliceOption(std::vector<dsize_t>{0}), SliceOption(true)})));
  RETURN_IF_NOT_OK(ComplexAngle<T>(spec_time0, &phase_time0));

  // time pad: add zeros to the time dim
  RETURN_IF_NOT_OK(PadComplexTensor<T>(input, &input, 2, 2));

  // slice
  std::shared_ptr<Tensor> spec_0;
  RETURN_IF_NOT_OK(input->Slice(&spec_0, std::vector<SliceOption>({SliceOption(true), SliceOption(true),
                                                                   SliceOption(time_steps_0), SliceOption(true)})));
  std::shared_ptr<Tensor> spec_1;
  RETURN_IF_NOT_OK(input->Slice(&spec_1, std::vector<SliceOption>({SliceOption(true), SliceOption(true),
                                                                   SliceOption(time_steps_1), SliceOption(true)})));

  // new slices angle and abs <channel, freq, time>
  std::shared_ptr<Tensor> angle_0, angle_1, abs_0, abs_1;
  RETURN_IF_NOT_OK(ComplexAngle<T>(spec_0, &angle_0));
  RETURN_IF_NOT_OK(ComplexAbs<T>(spec_0, &abs_0));
  RETURN_IF_NOT_OK(ComplexAngle<T>(spec_1, &angle_1));
  RETURN_IF_NOT_OK(ComplexAbs<T>(spec_1, &abs_1));

  // calculate phase; there is some precision loss between MindSpore and PyTorch here
  std::shared_ptr<Tensor> phase_tensor;
  RETURN_IF_NOT_OK(Phase<T>(angle_0, angle_1, phase_advance, phase_time0, &phase_tensor));

  // calculate magnitude
  std::shared_ptr<Tensor> mag_tensor;
  RETURN_IF_NOT_OK(Mag<T>(abs_0, abs_1, &mag_tensor, alphas));

  // reconstruct complex from norm and angle
  std::shared_ptr<Tensor> complex_spec_stretch;
  RETURN_IF_NOT_OK(Polar<T>(mag_tensor, phase_tensor, &complex_spec_stretch));

  // unpack
  auto output_shape_vec = input_shape.AsVector();
  output_shape_vec.pop_back();
  output_shape_vec.pop_back();
  output_shape_vec.push_back(complex_spec_stretch->shape()[-2]);
  output_shape_vec.push_back(input_shape[-1]);
  RETURN_IF_NOT_OK(complex_spec_stretch->Reshape(TensorShape(output_shape_vec)));
  *output = complex_spec_stretch;
  return Status::OK();
}

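/// \brief Dispatch TimeStretch by data type, building the phase advance from hop_length and n_freq.
/// \param[in] input - Complex tensor of shape <..., freq, time, complex=2>.
/// \param[in] rate - Stretch factor.
/// \param[in] hop_length - Length of hop between STFT windows.
/// \param[in] n_freq - Number of filter banks from STFT.
/// \param[out] output - Time-stretched complex tensor.
/// \return Status return code.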
Status TimeStretch(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float rate, float hop_length,
                   int32_t n_freq) {
  RETURN_UNEXPECTED_IF_NULL(input);
  RETURN_UNEXPECTED_IF_NULL(output);
  std::shared_ptr<Tensor> phase_advance;
  switch (input->type().value()) {
    case DataType::DE_FLOAT32:
      RETURN_IF_NOT_OK(Linspace<float>(&phase_advance, 0, PI * hop_length, n_freq));
      RETURN_IF_NOT_OK(TimeStretch<float>(input, output, rate, phase_advance));
      break;
    case DataType::DE_FLOAT64:
      RETURN_IF_NOT_OK(Linspace<double>(&phase_advance, 0, PI * hop_length, n_freq));
      RETURN_IF_NOT_OK(TimeStretch<double>(input, output, rate, phase_advance));
      break;
    default:
      RETURN_IF_NOT_OK(ValidateTensorFloat("TimeStretch", input));
  }
  return Status::OK();
}

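/// \brief Given an STFT tensor, speed up in time without modifying pitch by a factor of rate.
/// \param[in] input - Complex tensor of shape <..., freq, num_frame, complex=2>.
/// \param[in] rate - Speed-up factor.
/// \param[in] phase_advance - Expected phase advance in each bin; its first dimension must equal freq.
/// \param[out] output - Stretched complex tensor.
/// \return Status return code.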
Status PhaseVocoder(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float rate,
                    const std::shared_ptr<Tensor> &phase_advance) {
  const int32_t kFrequencePosInComplex = -3;
  RETURN_IF_NOT_OK(ValidateTensorShape("PhaseVocoder", input->shape().Size() > kDefaultAudioDim && input->IsComplex(),
                                       "<..., freq, num_frame, complex=2>"));
  RETURN_IF_NOT_OK(ValidateTensorNumeric("PhaseVocoder", input));
  RETURN_IF_NOT_OK(ValidateEqual("PhaseVocoder", "first dimension of 'phase_advance'", phase_advance->shape()[0],
                                 "freq dimension length of input tensor", input->shape()[kFrequencePosInComplex]));
  CHECK_FAIL_RETURN_UNEXPECTED(phase_advance->type() == input->type(),
                               "PhaseVocoder: invalid parameter, data type of phase_advance should be equal to data "
                               "type of input tensor, but got: data type of phase_advance " +
                                 phase_advance->type().ToString() + " while data type of input tensor " +
                                 input->type().ToString() + ".");
  std::shared_ptr<Tensor> input_tensor;
  if (input->type().value() != DataType::DE_FLOAT64) {
    RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)));
    RETURN_IF_NOT_OK(TimeStretch<float>(input_tensor, output, rate, phase_advance));
  } else {
    RETURN_IF_NOT_OK(TimeStretch<double>(input, output, rate, phase_advance));
  }
  return Status::OK();
}

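/// \brief Create a DCT transformation matrix, scaled by the requested norm.
/// \param[out] output - The DCT matrix of shape <n_mels, n_mfcc>.
/// \param[in] n_mfcc - Number of MFCC coefficients.
/// \param[in] n_mels - Number of mel filter banks.
/// \param[in] norm - Norm to use, none or ortho.
/// \return Status return code.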
Status Dct(std::shared_ptr<Tensor> *output, int n_mfcc, int n_mels, NormMode norm) {
  TensorShape dct_shape({n_mels, n_mfcc});
  RETURN_IF_NOT_OK(Tensor::CreateEmpty(dct_shape, DataType(DataType::DE_FLOAT32), output));
  auto iter = (*output)->begin<float>();
  const float sqrt_2 = 1 / sqrt(2.0f);
  auto sqrt_2_n_mels = static_cast<float>(sqrt(2.0 / n_mels));
  for (int i = 0; i < n_mels; i++) {
    for (int j = 0; j < n_mfcc; j++) {
      // calculate temp:
      // 1. when norm is None, use 2*cos(PI*(i+0.5)*j/n_mels)
      // 2. when norm is Ortho, divide the first column by sqrt(2),
      //    then use sqrt(2.0 / n_mels)*cos(PI*(i+0.5)*j/n_mels)
      auto temp = static_cast<float>(PI / n_mels * (i + 0.5) * j);
      temp = cos(temp);
      if (norm == NormMode::kOrtho) {
        if (j == 0) {
          temp *= sqrt_2;
        }
        temp *= sqrt_2_n_mels;
      } else {
        temp *= 2;
      }
      (*iter++) = temp;
    }
  }
  return Status::OK();
}

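/// \brief Randomly apply a mask along the given axis.
/// \param[in] input - The input tensor.
/// \param[in] mask_param - Maximum mask width; the width is drawn uniformly from [0, mask_param].
/// \param[in] mask_value - The value to fill the masked region with.
/// \param[in] axis - Axis to apply masking on (1 for frequency, 2 for time).
/// \param[in] rnd - Random number generator.
/// \param[out] output - The masked tensor.
/// \return Status return code.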
Status RandomMaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t mask_param,
                           float mask_value, int axis, std::mt19937 *rnd) {
  std::uniform_int_distribution<int32_t> mask_width_value(0, mask_param);
  TensorShape input_shape = input->shape();
  int32_t mask_dim_size = axis == 1 ? input_shape[-2] : input_shape[-1];
  int32_t mask_width = mask_width_value(*rnd);
  std::uniform_int_distribution<int32_t> min_freq_value(0, mask_dim_size - mask_width);
  int32_t mask_start = min_freq_value(*rnd);

  return MaskAlongAxis(input, output, mask_width, mask_start, mask_value, axis);
}

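/// \brief Apply a mask along the given axis.
/// \param[in] input - The input tensor.
/// \param[in] mask_width - The width of the mask.
/// \param[in] mask_start - Starting position of the mask.
/// \param[in] mask_value - The value to fill the masked region with.
/// \param[in] axis - Axis to apply masking on (1 for frequency, 2 for time).
/// \param[out] output - The masked tensor.
/// \return Status return code.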
Status MaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t mask_width,
                     int32_t mask_start, float mask_value, int32_t axis) {
  RETURN_IF_NOT_OK(Tensor::CreateFromTensor(input, output));
  if (mask_width == 0) {
    return Status::OK();
  }
  if (axis != 2 && axis != 1) {
    LOG_AND_RETURN_STATUS_SYNTAX_ERROR(
      "MaskAlongAxis: invalid parameter, 'axis' can only be 1 for Frequency Masking or 2 for Time Masking.");
  }
  TensorShape input_shape = (*output)->shape();
  // squeeze output
  TensorShape squeeze_shape =
    TensorShape({input_shape.NumOfElements() / input_shape[-2] / input_shape[-1], input_shape[-2], input_shape[-1]});
  RETURN_IF_NOT_OK((*output)->Reshape(squeeze_shape));

  int check_dim_ind = (axis == 1) ? -2 : -1;
  CHECK_FAIL_RETURN_SYNTAX_ERROR(mask_start >= 0 && mask_start <= input_shape[check_dim_ind],
                                 "MaskAlongAxis: invalid parameter, 'mask_start' should be non-negative and no "
                                 "greater than the length of the masked dimension, but got: 'mask_start' " +
                                   std::to_string(mask_start) + " and length " +
                                   std::to_string(input_shape[check_dim_ind]));
  CHECK_FAIL_RETURN_SYNTAX_ERROR(
    mask_start + mask_width <= input_shape[check_dim_ind],
    "MaskAlongAxis: invalid parameter, the sum of 'mask_start' and 'mask_width' should be no more "
    "than the length of the masked dimension, but got: 'mask_start' " +
      std::to_string(mask_start) + ", 'mask_width' " + std::to_string(mask_width) + " and length " +
      std::to_string(input_shape[check_dim_ind]));

  size_t cell_size = (*output)->type().SizeInBytes();

  if (axis == 1) {
    // freq
    const auto kShapeIndex = -2;
    for (auto ind = 0; ind < (*output)->Size() / input_shape[kShapeIndex] * mask_width; ind++) {
      int block_num = ind / (mask_width * input_shape[-1]);
      auto start_pos = ind % (mask_width * input_shape[-1]) + mask_start * input_shape[-1] +
                       input_shape[-1] * input_shape[kShapeIndex] * block_num;
      auto start_mem_pos = const_cast<uchar *>((*output)->GetBuffer() + start_pos * cell_size);
      if ((*output)->type() != DataType::DE_FLOAT64) {
        // tensor float 32
        auto mask_val = static_cast<float>(mask_value);
        auto ret_code = memcpy_s(start_mem_pos, cell_size, &mask_val, cell_size);
        CHECK_FAIL_RETURN_UNEXPECTED(ret_code == EOK, "MaskAlongAxis: mask failed, memory copy error, ret code: " +
                                                        std::to_string(ret_code) + ".");
      } else {
        // tensor float 64
        auto mask_val = static_cast<double>(mask_value);
        auto ret_code = memcpy_s(start_mem_pos, cell_size, &mask_val, cell_size);
        CHECK_FAIL_RETURN_UNEXPECTED(ret_code == EOK, "MaskAlongAxis: mask failed, memory copy error, ret code: " +
                                                        std::to_string(ret_code) + ".");
      }
    }
  } else {
    // time
    for (int ind = 0; ind < (*output)->Size() / input_shape[-1] * mask_width; ind++) {
      int row_num = ind / mask_width;
      auto start_pos = ind % mask_width + mask_start + input_shape[-1] * row_num;
      auto start_mem_pos = const_cast<uchar *>((*output)->GetBuffer() + start_pos * cell_size);
      if ((*output)->type() != DataType::DE_FLOAT64) {
        // tensor float 32
        auto mask_val = static_cast<float>(mask_value);
        auto ret_code = memcpy_s(start_mem_pos, cell_size, &mask_val, cell_size);
        CHECK_FAIL_RETURN_UNEXPECTED(ret_code == EOK, "MaskAlongAxis: mask failed, memory copy error, ret code: " +
                                                        std::to_string(ret_code) + ".");
      } else {
        // tensor float 64
        auto mask_val = static_cast<double>(mask_value);
        auto ret_code = memcpy_s(start_mem_pos, cell_size, &mask_val, cell_size);
        CHECK_FAIL_RETURN_UNEXPECTED(ret_code == EOK, "MaskAlongAxis: mask failed, memory copy error, ret code: " +
                                                        std::to_string(ret_code) + ".");
      }
    }
  }
  // unsqueeze output
  RETURN_IF_NOT_OK((*output)->Reshape(input_shape));
  return Status::OK();
}

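/// \brief Calculate the norm of a complex tensor in parallel.
/// \param[in] input - Complex tensor of shape <..., complex=2>.
/// \param[in] power - Power of the norm.
/// \param[out] output - The norm tensor, with the trailing complex dimension removed.
/// \return Status return code.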
template <typename T>
Status Norm(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float power) {
  // calculate the output dimension
  auto input_size = input->shape().AsVector();
  auto dim_back = input_size.back();
  RETURN_IF_NOT_OK(
    ValidateTensorShape("ComplexNorm", input->IsComplex(), "<..., complex=2>", std::to_string(dim_back)));
  input_size.pop_back();
  std::shared_ptr<Tensor> out;
  RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape(input_size), input->type(), &out));

  size_t count = input->Size();
  float block_size = 30000;

  auto task = [&input, &out, &power, &count](size_t start, size_t end, size_t num, size_t task_num) {
    if ((task_num - num) == 1) {
      end = count;
    }

    auto itr_start = input->begin<T>();
    itr_start += start;
    auto itr_end = input->begin<T>();
    itr_end += end;
    auto itr_out = out->begin<T>();
    size_t offset = start / TWO;
    itr_out += offset;

    // calculate norm, using: .pow(2.).sum(-1).pow(0.5 * power)
    for (auto itr = itr_start; itr != itr_end; itr++) {
      auto a = *itr;
      ++itr;
      auto b = *itr;
      auto res = pow(a, TWO) + pow(b, TWO);
      *itr_out = static_cast<T>(pow(res, (HALF * power)));
      itr_out++;
    }
  };
  RETURN_IF_NOT_OK(AudioParallelLaunch(task, count, block_size));
  *output = out;

  return Status::OK();
}

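/// \brief Compute the norm of a complex tensor, dispatching by data type.
/// \param[in] input - Complex tensor of shape <..., complex=2>.
/// \param[in] power - Power of the norm.
/// \param[out] output - The norm tensor.
/// \return Status return code.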
Status ComplexNorm(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float power) {
  if (input->type().value() >= DataType::DE_BOOL && input->type().value() <= DataType::DE_FLOAT16) {
    // convert the data type to float
    std::shared_ptr<Tensor> input_tensor;
    RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)));

    RETURN_IF_NOT_OK(Norm<float>(input_tensor, output, power));
  } else if (input->type().value() == DataType::DE_FLOAT32) {
    RETURN_IF_NOT_OK(Norm<float>(input, output, power));
  } else if (input->type().value() == DataType::DE_FLOAT64) {
    RETURN_IF_NOT_OK(Norm<double>(input, output, power));
  } else {
    RETURN_IF_NOT_OK(ValidateTensorNumeric("ComplexNorm", input));
  }
  return Status::OK();
}

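// Return the sign of val: 1 for positive, -1 for negative, 0 for zero.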
template <typename T>
inline float sgn(T val) {
  return (val > 0) ? 1 : ((val < 0) ? -1 : 0);
}

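/// \brief Decode mu-law encoded values back to waveform samples.
/// \param[in] input - Tensor of mu-law encoded values.
/// \param[in] mu - Number of quantization levels minus one.
/// \param[out] output - The decoded waveform tensor.
/// \return Status return code.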
template <typename T>
Status Decoding(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T mu) {
  RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), input->type(), output));
  auto itr_out = (*output)->begin<T>();
  auto itr = input->begin<T>();
  auto end = input->end<T>();

  while (itr != end) {
    auto x_mu = *itr;
    CHECK_FAIL_RETURN_SYNTAX_ERROR(mu != 0, "Decoding: invalid parameter, 'mu' can not be zero.");
    x_mu = ((x_mu) / mu) * 2 - 1.0;
    x_mu = sgn(x_mu) * expm1(fabs(x_mu) * log1p(mu)) / mu;
    *itr_out = x_mu;
    ++itr_out;
    ++itr;
  }
  return Status::OK();
}

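/// \brief Decode a mu-law encoded signal, dispatching by data type.
/// \param[in] input - Tensor of mu-law encoded values.
/// \param[in] quantization_channels - Number of channels, i.e. quantization levels.
/// \param[out] output - The decoded waveform tensor.
/// \return Status return code.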
Status MuLawDecoding(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output,
                     int32_t quantization_channels) {
  if (input->type() != DataType::DE_FLOAT64) {
    float f_mu = static_cast<float>(quantization_channels) - 1;
    // convert the data type to float
    std::shared_ptr<Tensor> input_tensor;
    RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)));
    RETURN_IF_NOT_OK(Decoding<float>(input_tensor, output, f_mu));
  } else {
    double f_mu = static_cast<double>(quantization_channels) - 1;
    RETURN_IF_NOT_OK(Decoding<double>(input, output, f_mu));
  }
  return Status::OK();
}

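/// \brief Encode waveform samples with mu-law companding into integer codes.
/// \param[in] input - Tensor of waveform samples, expected in the range [-1, 1].
/// \param[in] mu - Number of quantization levels minus one.
/// \param[out] output - The encoded int32 tensor.
/// \return Status return code.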
template <typename T>
Status Encoding(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T mu) {
  RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), DataType(DataType::DE_INT32), output));
  auto itr_out = (*output)->begin<int32_t>();
  auto itr = input->begin<T>();
  auto end = input->end<T>();

  while (itr != end) {
    auto x = *itr;
    x = sgn(x) * log1p(mu * fabs(x)) / log1p(mu);
    x = (x + 1) / 2 * mu + 0.5;
    *itr_out = static_cast<int32_t>(x);
    ++itr_out;
    ++itr;
  }
  return Status::OK();
}

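/// \brief Encode a signal based on mu-law companding, dispatching by data type.
/// \param[in] input - Tensor of waveform samples.
/// \param[in] quantization_channels - Number of channels, i.e. quantization levels.
/// \param[out] output - The encoded tensor.
/// \return Status return code.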
Status MuLawEncoding(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output,
                     int32_t quantization_channels) {
  if (input->type() != DataType::DE_FLOAT64) {
    float f_mu = static_cast<float>(quantization_channels) - 1;
    // convert the data type to float
    std::shared_ptr<Tensor> input_tensor;
    RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)));
    RETURN_IF_NOT_OK(Encoding<float>(input_tensor, output, f_mu));
  } else {
    double f_mu = static_cast<double>(quantization_channels) - 1;
    RETURN_IF_NOT_OK(Encoding<double>(input, output, f_mu));
  }
  return Status::OK();
}

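/// \brief Generate fade-in scale factors for the given fade shape.
/// \param[out] output - Tensor of fade_in_len scale factors.
/// \param[in] fade_in_len - Length of the fade-in, in frames.
/// \param[in] fade_shape - Shape of the fade curve.
/// \return Status return code.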
template <typename T>
Status FadeIn(std::shared_ptr<Tensor> *output, int32_t fade_in_len, FadeShape fade_shape) {
  T start = 0;
  T end = 1;
  RETURN_IF_NOT_OK(Linspace<T>(output, start, end, fade_in_len));
  for (auto iter = (*output)->begin<T>(); iter != (*output)->end<T>(); iter++) {
    switch (fade_shape) {
      case FadeShape::kLinear:
        break;
      case FadeShape::kExponential:
        // Compute the scale factor of the exponential function, pow(2.0, *iter - 1.0) * (*iter)
        *iter = static_cast<T>(std::pow(2.0, *iter - 1.0) * (*iter));
        break;
      case FadeShape::kLogarithmic:
        // Compute the scale factor of the logarithmic function, log10(*iter + 0.1) + 1.0
        *iter = static_cast<T>(std::log10(*iter + 0.1) + 1.0);
        break;
      case FadeShape::kQuarterSine:
        // Compute the scale factor of the quarter_sine function, sin((*iter) * PI / 2.0)
        *iter = static_cast<T>(std::sin((*iter) * PI / 2.0));
        break;
      case FadeShape::kHalfSine:
        // Compute the scale factor of the half_sine function, sin((*iter) * PI - PI / 2.0) / 2.0 + 0.5
        *iter = static_cast<T>(std::sin((*iter) * PI - PI / 2.0) / 2.0 + 0.5);
        break;
      default:
        RETURN_STATUS_UNEXPECTED(
          "The fade_shape parameter only supports these types [FadeShape::kLinear, FadeShape::kExponential, "
          "FadeShape::kLogarithmic, FadeShape::kQuarterSine, FadeShape::kHalfSine].");
    }
  }
  return Status::OK();
}

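/// \brief Generate fade-out scale factors for the given fade shape.
/// \param[out] output - Tensor of fade_out_len scale factors.
/// \param[in] fade_out_len - Length of the fade-out, in frames.
/// \param[in] fade_shape - Shape of the fade curve.
/// \return Status return code.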
template <typename T>
Status FadeOut(std::shared_ptr<Tensor> *output, int32_t fade_out_len, FadeShape fade_shape) {
  T start = 0;
  T end = 1;
  RETURN_IF_NOT_OK(Linspace<T>(output, start, end, fade_out_len));
  for (auto iter = (*output)->begin<T>(); iter != (*output)->end<T>(); iter++) {
    switch (fade_shape) {
      case FadeShape::kLinear:
        // In fade out, invert *iter
        *iter = static_cast<T>(1.0 - *iter);
        break;
      case FadeShape::kExponential:
        // Compute the scale factor of the exponential function
        *iter = static_cast<T>(std::pow(2.0, -*iter) * (1.0 - *iter));
        break;
      case FadeShape::kLogarithmic:
        // Compute the scale factor of the logarithmic function
        *iter = static_cast<T>(std::log10(1.1 - *iter) + 1.0);
        break;
      case FadeShape::kQuarterSine:
        // Compute the scale factor of the quarter_sine function
        *iter = static_cast<T>(std::sin((*iter) * PI / 2.0 + PI / 2.0));
        break;
      case FadeShape::kHalfSine:
        // Compute the scale factor of the half_sine function
        *iter = static_cast<T>(std::sin((*iter) * PI + PI / 2.0) / 2.0 + 0.5);
        break;
      default:
        RETURN_STATUS_UNEXPECTED(
          "The fade_shape parameter only supports these types [FadeShape::kLinear, FadeShape::kExponential, "
          "FadeShape::kLogarithmic, FadeShape::kQuarterSine, FadeShape::kHalfSine].");
    }
  }
  return Status::OK();
}

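/// \brief Apply fade-in and fade-out to each waveform in the input tensor.
/// \param[in] input - Tensor of shape <..., time>.
/// \param[in] fade_in_len - Length of the fade-in, in frames.
/// \param[in] fade_out_len - Length of the fade-out, in frames.
/// \param[in] fade_shape - Shape of the fade curve.
/// \param[out] output - The faded tensor.
/// \return Status return code.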
template <typename T>
Status Fade(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t fade_in_len,
            int32_t fade_out_len, FadeShape fade_shape) {
  RETURN_IF_NOT_OK(Tensor::CreateFromTensor(input, output));
  const TensorShape input_shape = input->shape();
  auto waveform_length = static_cast<int32_t>(input_shape[-1]);
  RETURN_IF_NOT_OK(ValidateNoGreaterThan("Fade", "fade_in_len", fade_in_len, "length of waveform", waveform_length));
  RETURN_IF_NOT_OK(ValidateNoGreaterThan("Fade", "fade_out_len", fade_out_len, "length of waveform", waveform_length));
  auto num_waveform = static_cast<int32_t>(input->Size() / waveform_length);
  TensorShape toShape = TensorShape({num_waveform, waveform_length});
  RETURN_IF_NOT_OK((*output)->Reshape(toShape));
  TensorPtr fade_in;
  RETURN_IF_NOT_OK(FadeIn<T>(&fade_in, fade_in_len, fade_shape));
  TensorPtr fade_out;
  RETURN_IF_NOT_OK(FadeOut<T>(&fade_out, fade_out_len, fade_shape));

  // Apply the fade-in to each waveform
  auto output_iter = (*output)->begin<T>();
  for (auto fade_in_iter = fade_in->begin<T>(); fade_in_iter != fade_in->end<T>(); fade_in_iter++) {
    *output_iter = (*output_iter) * (*fade_in_iter);
    for (int32_t j = 1; j < num_waveform; j++) {
      output_iter += waveform_length;
      *output_iter = (*output_iter) * (*fade_in_iter);
    }
    output_iter -= ((num_waveform - 1) * waveform_length);
    ++output_iter;
  }

  // Apply the fade-out to each waveform
  output_iter = (*output)->begin<T>();
  output_iter += (waveform_length - fade_out_len);
  for (auto fade_out_iter = fade_out->begin<T>(); fade_out_iter != fade_out->end<T>(); fade_out_iter++) {
    *output_iter = (*output_iter) * (*fade_out_iter);
    for (int32_t j = 1; j < num_waveform; j++) {
      output_iter += waveform_length;
      *output_iter = (*output_iter) * (*fade_out_iter);
    }
    output_iter -= ((num_waveform - 1) * waveform_length);
    ++output_iter;
  }
  RETURN_IF_NOT_OK((*output)->Reshape(input_shape));
  return Status::OK();
}

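/// \brief Add fade-in and/or fade-out to the waveform, dispatching by data type.
/// \param[in] input - Tensor of shape <..., time>.
/// \param[in] fade_in_len - Length of the fade-in, in frames.
/// \param[in] fade_out_len - Length of the fade-out, in frames.
/// \param[in] fade_shape - Shape of the fade curve.
/// \param[out] output - The faded tensor.
/// \return Status return code.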
Status Fade(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t fade_in_len,
            int32_t fade_out_len, FadeShape fade_shape) {
  if (DataType::DE_BOOL <= input->type().value() && input->type().value() <= DataType::DE_FLOAT32) {
    std::shared_ptr<Tensor> waveform;
    RETURN_IF_NOT_OK(TypeCast(input, &waveform, DataType(DataType::DE_FLOAT32)));
    RETURN_IF_NOT_OK(Fade<float>(waveform, output, fade_in_len, fade_out_len, fade_shape));
  } else if (input->type().value() == DataType::DE_FLOAT64) {
    RETURN_IF_NOT_OK(Fade<double>(input, output, fade_in_len, fade_out_len, fade_shape));
  } else {
    RETURN_IF_NOT_OK(ValidateTensorNumeric("Fade", input));
  }
  return Status::OK();
}

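/// \brief Separate a complex-valued spectrogram into its magnitude and phase.
/// \param[in] input - TensorRow whose first tensor is complex, <..., complex=2>.
/// \param[in] power - Power of the norm.
/// \param[out] output - TensorRow holding the magnitude and phase tensors.
/// \return Status return code.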
Status Magphase(const TensorRow &input, TensorRow *output, float power) {
  std::shared_ptr<Tensor> mag;
  std::shared_ptr<Tensor> phase;

  RETURN_IF_NOT_OK(ComplexNorm(input[0], &mag, power));
  if (input[0]->type() == DataType(DataType::DE_FLOAT64)) {
    RETURN_IF_NOT_OK(Angle<double>(input[0], &phase));
  } else {
    std::shared_ptr<Tensor> tmp;
    RETURN_IF_NOT_OK(TypeCast(input[0], &tmp, DataType(DataType::DE_FLOAT32)));
    RETURN_IF_NOT_OK(Angle<float>(tmp, &phase));
  }
  (*output).push_back(mag);
  (*output).push_back(phase);

  return Status::OK();
}

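/// \brief Apply median smoothing to the input over a sliding window.
/// \param[in] input - Tensor of shape <channel, num_of_frames>.
/// \param[in] win_length - Length of the median window.
/// \param[out] output - The smoothed tensor.
/// \return Status return code.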
Status MedianSmoothing(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t win_length) {
  auto channel = input->shape()[0];
  auto num_of_frames = input->shape()[1];
  // Centered windowed
  int32_t pad_length = (win_length - 1) / 2;
  int32_t out_length = num_of_frames + pad_length - win_length + 1;
  TensorShape out_shape({channel, out_length});
  std::vector<int> signal;
  std::vector<int> out;
  std::vector<int> indices(channel * (num_of_frames + pad_length), 0);
  // "replicate" padding at the start of each channel
  for (auto itr = input->begin<int>(); itr != input->end<int>(); ++itr) {
    signal.push_back(*itr);
  }
  for (int i = 0; i < channel; ++i) {
    for (int j = 0; j < pad_length; ++j) {
      indices[i * (num_of_frames + pad_length) + j] = signal[i * num_of_frames];
    }
  }
  for (int i = 0; i < channel; ++i) {
    for (int j = 0; j < num_of_frames; ++j) {
      indices[i * (num_of_frames + pad_length) + j + pad_length] = signal[i * num_of_frames + j];
    }
  }
  for (int i = 0; i < channel; ++i) {
    int32_t index = i * (num_of_frames + pad_length);
    for (int j = 0; j < out_length; ++j) {
      std::vector<int> tem(indices.begin() + index, indices.begin() + win_length + index);
      std::sort(tem.begin(), tem.end());
      out.push_back(tem[pad_length]);
      ++index;
    }
  }
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(out, out_shape, output));
  return Status::OK();
}

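/// \brief Detect pitch frequency using the normalized cross-correlation function (NCCF).
/// \param[in] input - Tensor of shape <..., time>.
/// \param[in] sample_rate - Sample rate of the audio in Hz.
/// \param[in] frame_time - Duration of a frame.
/// \param[in] win_length - Window length for median smoothing, in frames.
/// \param[in] freq_low - Lowest frequency that can be detected, in Hz.
/// \param[in] freq_high - Highest frequency that can be detected, in Hz.
/// \param[out] output - Tensor of detected pitch frequencies.
/// \return Status return code.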
Status DetectPitchFrequency(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
                            float frame_time, int32_t win_length, int32_t freq_low, int32_t freq_high) {
  std::shared_ptr<Tensor> nccf;
  std::shared_ptr<Tensor> indices;
  std::shared_ptr<Tensor> smooth_indices;
  // pack batch
  TensorShape input_shape = input->shape();
  TensorShape to_shape({input->Size() / input_shape[-1], input_shape[-1]});
  RETURN_IF_NOT_OK(input->Reshape(to_shape));
  if (input->type() == DataType(DataType::DE_FLOAT32)) {
    RETURN_IF_NOT_OK(ComputeNccf<float>(input, &nccf, sample_rate, frame_time, freq_low));
    RETURN_IF_NOT_OK(FindMaxPerFrame<float>(nccf, &indices, sample_rate, freq_high));
  } else if (input->type() == DataType(DataType::DE_FLOAT64)) {
    RETURN_IF_NOT_OK(ComputeNccf<double>(input, &nccf, sample_rate, frame_time, freq_low));
    RETURN_IF_NOT_OK(FindMaxPerFrame<double>(nccf, &indices, sample_rate, freq_high));
  } else {
    RETURN_IF_NOT_OK(ComputeNccf<float16>(input, &nccf, sample_rate, frame_time, freq_low));
    RETURN_IF_NOT_OK(FindMaxPerFrame<float16>(nccf, &indices, sample_rate, freq_high));
  }
  RETURN_IF_NOT_OK(MedianSmoothing(indices, &smooth_indices, win_length));

  // Convert indices to frequency
  constexpr double EPSILON = 1e-9;
  TensorShape freq_shape = smooth_indices->shape();
  std::vector<float> out;
  for (auto itr_fre = smooth_indices->begin<int>(); itr_fre != smooth_indices->end<int>(); ++itr_fre) {
    out.push_back(sample_rate / (EPSILON + *itr_fre));
  }

  // unpack batch
  auto shape_vec = input_shape.AsVector();
  shape_vec[shape_vec.size() - 1] = freq_shape[-1];
  TensorShape out_shape(shape_vec);
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(out, out_shape, output));
  return Status::OK();
}

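/// \brief Generate a wave table of the given modulation, rescaled to [min, max].
/// \param[out] output - The generated wave table tensor.
/// \param[in] type - Output data type, int or float.
/// \param[in] modulation - Modulation of the waveform, sinusoidal or triangular.
/// \param[in] table_size - Number of samples in the table.
/// \param[in] min - Minimum value of the output.
/// \param[in] max - Maximum value of the output.
/// \param[in] phase - Phase offset in radians.
/// \return Status return code.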
Status GenerateWaveTable(std::shared_ptr<Tensor> *output, const DataType &type, Modulation modulation,
                         int32_t table_size, float min, float max, float phase) {
  RETURN_UNEXPECTED_IF_NULL(output);
  CHECK_FAIL_RETURN_UNEXPECTED(table_size > 0,
                               "table_size must be more than 0, but got: " + std::to_string(table_size));
  auto phase_offset = static_cast<int32_t>(phase / PI / 2 * table_size + 0.5);
  // get the offset of the i-th sample
  std::vector<int32_t> point;
  for (auto i = 0; i < table_size; i++) {
    point.push_back((i + phase_offset) % table_size);
  }

  std::shared_ptr<Tensor> wave_table;
  RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({table_size}), DataType(DataType::DE_FLOAT32), &wave_table));

  auto iter = wave_table->begin<float>();

  if (modulation == Modulation::kSinusoidal) {
    for (int i = 0; i < table_size; iter++, i++) {
      // change phase
      *iter = (sin(point[i] * PI / table_size * 2) + 1) / 2;
    }
  } else {
    for (int i = 0; i < table_size; iter++, i++) {
      // change phase
      *iter = point[i] * 2.0 / table_size;
      // get the quarter of the period the sample falls into
      auto value = static_cast<int>(4 * point[i] / table_size);
      // adjust the value of the triangular wave according to the quarter of the period
      if (value == 0) {
        *iter = *iter + 0.5;
      } else if (value == 1 || value == 2) {
        *iter = 1.5 - *iter;
      } else if (value == 3) {
        *iter = *iter - 1.5;
      }
    }
  }
  for (iter = wave_table->begin<float>(); iter != wave_table->end<float>(); iter++) {
    *iter = *iter * (max - min) + min;
  }
  if (type.IsInt()) {
    for (iter = wave_table->begin<float>(); iter != wave_table->end<float>(); iter++) {
      if (*iter < 0) {
        *iter = *iter - 0.5;
      } else {
        *iter = *iter + 0.5;
      }
    }
    RETURN_IF_NOT_OK(TypeCast(wave_table, output, DataType(DataType::DE_INT32)));
  } else if (type.IsFloat()) {
    RETURN_IF_NOT_OK(TypeCast(wave_table, output, DataType(DataType::DE_FLOAT32)));
  }

  return Status::OK();
}

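/// \brief Read an audio file into a normalized float waveform, assuming 16-bit PCM WAV data.
/// \param[in] wav_file_dir - Path of the WAV file.
/// \param[out] waveform_vec - Samples scaled to [-1, 1].
/// \param[out] sample_rate - Sample rate read from the WAV header.
/// \return Status return code.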
Status ReadWaveFile(const std::string &wav_file_dir, std::vector<float> *waveform_vec, int32_t *sample_rate) {
  RETURN_UNEXPECTED_IF_NULL(waveform_vec);
  RETURN_UNEXPECTED_IF_NULL(sample_rate);
  auto wav_realpath = FileUtils::GetRealPath(wav_file_dir.c_str());
  if (!wav_realpath.has_value()) {
    LOG_AND_RETURN_STATUS_SYNTAX_ERROR("Invalid file path, get real path failed: " + wav_file_dir);
  }

  const float kMaxVal = 32767.0;
  Path file_path(wav_realpath.value());
  CHECK_FAIL_RETURN_UNEXPECTED(file_path.Exists() && !file_path.IsDirectory(),
                               "Invalid file path, failed to find waveform file: " + file_path.ToString());
  std::ifstream in(file_path.ToString(), std::ios::in | std::ios::binary);
  CHECK_FAIL_RETURN_UNEXPECTED(in.is_open(), "Invalid file, failed to open waveform file: " + file_path.ToString() +
                                               ", make sure the file is not damaged and that you have read permission.");
  auto header = std::make_unique<WavHeader>();
  in.read(reinterpret_cast<char *>(header.get()), sizeof(WavHeader));
  *sample_rate = header->sample_rate;
  float bytes_per_sample = header->bits_per_sample / 8;
  auto sub_chunk2_size = header->sub_chunk2_size;
  if (bytes_per_sample == 0) {
    in.close();
    RETURN_STATUS_UNEXPECTED("ReadWaveFile: zero division error, bits per sample of the audio can not be zero.");
  }
  int num_samples = sub_chunk2_size / bytes_per_sample;
  std::unique_ptr<int16_t[]> data = std::make_unique<int16_t[]>(num_samples);
  in.read(reinterpret_cast<char *>(data.get()), sizeof(int16_t) * num_samples);
  waveform_vec->resize(num_samples);
  for (int i = 0; i < num_samples; i++) {
    (*waveform_vec)[i] = data[i] / kMaxVal;
  }
  in.close();
  return Status::OK();
}

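/// \brief Compute the start and end of the CMN sliding window for frame idx, clamped to [0, num_frames].
/// \param[in] cmn_window - Window size in frames for running average CMN.
/// \param[in] min_cmn_window - Minimum CMN window used at the start of decoding.
/// \param[in] center - Whether the window is centered on the current frame.
/// \param[in] idx - Index of the current frame.
/// \param[in] num_frames - Total number of frames.
/// \param[out] cmn_window_start_p - Computed window start.
/// \param[out] cmn_window_end_p - Computed window end.
/// \return Status return code.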
Status ComputeCmnStartAndEnd(int32_t cmn_window, int32_t min_cmn_window, bool center, int32_t idx, int32_t num_frames,
                             int32_t *cmn_window_start_p, int32_t *cmn_window_end_p) {
  RETURN_UNEXPECTED_IF_NULL(cmn_window_start_p);
  RETURN_UNEXPECTED_IF_NULL(cmn_window_end_p);
  RETURN_IF_NOT_OK(ValidateNonNegative("SlidingWindowCmn", "cmn_window", cmn_window));
  RETURN_IF_NOT_OK(ValidateNonNegative("SlidingWindowCmn", "min_cmn_window", min_cmn_window));
  int32_t cmn_window_start = 0, cmn_window_end = 0;
  const constexpr int window_center = 2;
  if (center) {
    cmn_window_start = idx - cmn_window / window_center;
    cmn_window_end = cmn_window_start + cmn_window;
  } else {
    cmn_window_start = idx - cmn_window;
    cmn_window_end = idx + 1;
  }
  if (cmn_window_start < 0) {
    cmn_window_end -= cmn_window_start;
    cmn_window_start = 0;
  }
  if (!center) {
    if (cmn_window_end > idx) {
      cmn_window_end = std::max(idx + 1, min_cmn_window);
    }
  }
  if (cmn_window_end > num_frames) {
    cmn_window_start -= (cmn_window_end - num_frames);
    cmn_window_end = num_frames;
    if (cmn_window_start < 0) {
      cmn_window_start = 0;
    }
  }

  *cmn_window_start_p = cmn_window_start;
  *cmn_window_end_p = cmn_window_end;
  return Status::OK();
}

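/// \brief Apply sliding-window cepstral mean (and optionally variance) normalization per channel.
/// \param[in] input - Tensor of shape <num_channels, num_frames, num_feats>.
/// \param[out] cmn_waveform_p - The normalized tensor, same shape as the input.
/// \param[in] num_channels - Number of channels.
/// \param[in] num_frames - Number of frames per channel.
/// \param[in] num_feats - Number of features per frame.
/// \param[in] cmn_window - Window size in frames for running average CMN.
/// \param[in] min_cmn_window - Minimum CMN window used at the start of decoding.
/// \param[in] center - Whether the window is centered on the current frame.
/// \param[in] norm_vars - Whether to normalize variance to one.
/// \return Status return code.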
template <typename T>
Status ComputeCmnWaveform(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *cmn_waveform_p,
                          int32_t num_channels, int32_t num_frames, int32_t num_feats, int32_t cmn_window,
                          int32_t min_cmn_window, bool center, bool norm_vars) {
  using ArrayXT = Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
  constexpr int square_num = 2;
  int32_t last_window_start = -1, last_window_end = -1;
  ArrayXT cur_sum = ArrayXT(num_channels, num_feats);
  ArrayXT cur_sum_sq;
  if (norm_vars) {
    cur_sum_sq = ArrayXT(num_channels, num_feats);
  }
  for (int i = 0; i < num_frames; ++i) {
    int32_t cmn_window_start = 0, cmn_window_end = 0;
    RETURN_IF_NOT_OK(
      ComputeCmnStartAndEnd(cmn_window, min_cmn_window, center, i, num_frames, &cmn_window_start, &cmn_window_end));
    int32_t row = cmn_window_end - cmn_window_start * 2;
    int32_t cmn_window_frames = cmn_window_end - cmn_window_start;
    for (int32_t m = 0; m < num_channels; ++m) {
      if (last_window_start == -1) {
        auto it = reinterpret_cast<T *>(const_cast<uchar *>(input->GetBuffer()));
        it += (m * num_frames * num_feats + cmn_window_start * num_feats);
        auto tmp_map = Eigen::Map<ArrayXT>(it, row, num_feats);
        if (i > 0) {
          cur_sum.row(m) += tmp_map.colwise().sum();
          if (norm_vars) {
            cur_sum_sq.row(m) += tmp_map.pow(square_num).colwise().sum();
          }
        } else {
          cur_sum.row(m) = tmp_map.colwise().sum();
          if (norm_vars) {
            cur_sum_sq.row(m) = tmp_map.pow(square_num).colwise().sum();
          }
        }
      } else {
        if (cmn_window_start > last_window_start) {
          auto it = reinterpret_cast<T *>(const_cast<uchar *>(input->GetBuffer()));
          it += (m * num_frames * num_feats + last_window_start * num_feats);
          auto tmp_map = Eigen::Map<ArrayXT>(it, 1, num_feats);
          cur_sum.row(m) -= tmp_map;
          if (norm_vars) {
            cur_sum_sq.row(m) -= tmp_map.pow(square_num);
          }
        }
        if (cmn_window_end > last_window_end) {
          auto it = reinterpret_cast<T *>(const_cast<uchar *>(input->GetBuffer()));
          it += (m * num_frames * num_feats + last_window_end * num_feats);
          auto tmp_map = Eigen::Map<ArrayXT>(it, 1, num_feats);
          cur_sum.row(m) += tmp_map;
          if (norm_vars) {
            cur_sum_sq.row(m) += tmp_map.pow(square_num);
          }
        }
      }

      auto it = reinterpret_cast<T *>(const_cast<uchar *>(input->GetBuffer()));
      auto cmn_it = reinterpret_cast<T *>(const_cast<uchar *>((*cmn_waveform_p)->GetBuffer()));
      it += (m * num_frames * num_feats + i * num_feats);
      cmn_it += (m * num_frames * num_feats + i * num_feats);
      CHECK_FAIL_RETURN_UNEXPECTED(cmn_window_frames != 0,
                                   "SlidingWindowCmn: invalid parameter, 'cmn_window_frames' can not be zero.");
      Eigen::Map<ArrayXT>(cmn_it, 1, num_feats) =
        Eigen::Map<ArrayXT>(it, 1, num_feats) - cur_sum.row(m) / cmn_window_frames;
      if (norm_vars) {
        if (cmn_window_frames == 1) {
          auto cmn_it_1 = reinterpret_cast<T *>(const_cast<uchar *>((*cmn_waveform_p)->GetBuffer()));
          cmn_it_1 += (m * num_frames * num_feats + i * num_feats);
          Eigen::Map<ArrayXT>(cmn_it_1, 1, num_feats).setZero();
        } else {
          auto variance = (Eigen::Map<ArrayXT>(cur_sum_sq.data(), num_channels, num_feats) / cmn_window_frames) -
                          (cur_sum.pow(2) / std::pow(cmn_window_frames, 2));
          auto cmn_it_2 = reinterpret_cast<T *>(const_cast<uchar *>((*cmn_waveform_p)->GetBuffer()));
          cmn_it_2 += (m * num_frames * num_feats + i * num_feats);
          Eigen::Map<ArrayXT>(cmn_it_2, 1, num_feats) =
            Eigen::Map<ArrayXT>(cmn_it_2, 1, num_feats) * (1 / variance.sqrt()).row(m);
        }
      }
    }
    last_window_start = cmn_window_start;
    last_window_end = cmn_window_end;
  }
  return Status::OK();
}

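/// \brief Reshape the input to <channel, freq, time>, run ComputeCmnWaveform, and restore the original shape.
/// \param[in] input - Tensor of shape <..., freq, time>.
/// \param[in] cmn_window - Window size in frames for running average CMN.
/// \param[in] min_cmn_window - Minimum CMN window used at the start of decoding.
/// \param[in] center - Whether the window is centered on the current frame.
/// \param[in] norm_vars - Whether to normalize variance to one.
/// \param[out] output - The normalized tensor.
/// \return Status return code.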
template <typename T>
Status SlidingWindowCmnHelper(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t cmn_window,
                              int32_t min_cmn_window, bool center, bool norm_vars) {
  int32_t num_frames = input->shape()[Tensor::HandleNeg(-2, input->shape().Size())];
  int32_t num_feats = input->shape()[Tensor::HandleNeg(-1, input->shape().Size())];

  int32_t first_index = 1;
  std::vector<dsize_t> input_shape = input->shape().AsVector();
  std::for_each(input_shape.begin(), input_shape.end(), [&first_index](const dsize_t &item) { first_index *= item; });
  RETURN_IF_NOT_OK(
    input->Reshape(TensorShape({static_cast<int>(first_index / (num_frames * num_feats)), num_frames, num_feats})));

  auto num_channels = static_cast<int32_t>(input->shape()[0]);
  TensorPtr cmn_waveform;
  RETURN_IF_NOT_OK(
    Tensor::CreateEmpty(TensorShape({num_channels, num_frames, num_feats}), input->type(), &cmn_waveform));
  RETURN_IF_NOT_OK(ComputeCmnWaveform<T>(input, &cmn_waveform, num_channels, num_frames, num_feats, cmn_window,
                                         min_cmn_window, center, norm_vars));

  std::vector<dsize_t> re_shape = input_shape;
  auto r_it = re_shape.rbegin();
  *r_it++ = num_feats;
  *r_it = num_frames;
  RETURN_IF_NOT_OK(cmn_waveform->Reshape(TensorShape(re_shape)));

  const constexpr int specify_input_shape = 2;
  const constexpr int specify_first_shape = 1;
  if (input_shape.size() == specify_input_shape && cmn_waveform->shape()[0] == specify_first_shape) {
    cmn_waveform->Squeeze();
  }
  *output = cmn_waveform;
  return Status::OK();
}

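/// \brief Apply sliding-window cepstral mean (and optionally variance) normalization, dispatching by data type.
/// \param[in] input - Tensor of shape <..., freq, time>.
/// \param[in] cmn_window - Window size in frames for running average CMN.
/// \param[in] min_cmn_window - Minimum CMN window used at the start of decoding.
/// \param[in] center - Whether the window is centered on the current frame.
/// \param[in] norm_vars - Whether to normalize variance to one.
/// \param[out] output - The normalized tensor.
/// \return Status return code.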
SlidingWindowCmn(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t cmn_window,int32_t min_cmn_window,bool center,bool norm_vars)1125 Status SlidingWindowCmn(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t cmn_window,
1126                         int32_t min_cmn_window, bool center, bool norm_vars) {
1127   RETURN_IF_NOT_OK(ValidateLowRank("SlidingWindowCmn", input, kDefaultAudioDim, "<..., freq, time>"));
1128 
1129   if (input->type().IsNumeric() && input->type().value() != DataType::DE_FLOAT64) {
1130     std::shared_ptr<Tensor> temp;
1131     RETURN_IF_NOT_OK(TypeCast(input, &temp, DataType(DataType::DE_FLOAT32)));
1132     RETURN_IF_NOT_OK(SlidingWindowCmnHelper<float>(temp, output, cmn_window, min_cmn_window, center, norm_vars));
1133   } else if (input->type().value() == DataType::DE_FLOAT64) {
1134     RETURN_IF_NOT_OK(SlidingWindowCmnHelper<double>(input, output, cmn_window, min_cmn_window, center, norm_vars));
1135   } else {
1136     RETURN_IF_NOT_OK(ValidateTensorNumeric("SlidingWindowCmn", input));
1137   }
1138   return Status::OK();
1139 }
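
// Illustrative usage sketch (not exercised in this file; the window sizes are
// hypothetical): per-utterance CMN over a float feature tensor `feats` of
// shape <frames, feats>, using a 600-frame window and no variance norm:
//   std::shared_ptr<Tensor> cmn_out;
//   RETURN_IF_NOT_OK(SlidingWindowCmn(feats, &cmn_out, 600, 100, false, false));
// With norm_vars = true the helper also divides by the windowed standard deviation.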
1140 
1141 template <typename T>
Pad(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t pad_left,int32_t pad_right,BorderType padding_mode,T value=0)1142 Status Pad(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t pad_left, int32_t pad_right,
1143            BorderType padding_mode, T value = 0) {
1144   RETURN_IF_NOT_OK(ValidateLowRank("Pad", input, kMinAudioDim, "<..., time>"));
1145   RETURN_IF_NOT_OK(ValidateTensorNumeric("Pad", input));
1146   RETURN_IF_NOT_OK(ValidateNonNegative("Pad", "pad_left", pad_left));
1147   RETURN_IF_NOT_OK(ValidateNonNegative("Pad", "pad_right", pad_right));
1148   TensorShape input_shape = input->shape();
1149   int32_t wave_length = input_shape[-1];
1150   auto num_wavs = static_cast<int32_t>(input->Size() / wave_length);
1151   TensorShape to_shape = TensorShape({num_wavs, wave_length});
1152   RETURN_IF_NOT_OK(input->Reshape(to_shape));
1153   int32_t pad_length = wave_length + pad_left + pad_right;
1154   TensorShape new_shape = TensorShape({num_wavs, pad_length});
1155   RETURN_IF_NOT_OK(Tensor::CreateEmpty(new_shape, input->type(), output));
1156   using MatrixXT = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
1157   using Eigen::Map;
1158   constexpr int pad_mul = 2;
1159   T *input_data = reinterpret_cast<T *>(const_cast<uchar *>(input->GetBuffer()));
1160   T *output_data = reinterpret_cast<T *>(const_cast<uchar *>((*output)->GetBuffer()));
1161   auto input_map = Map<MatrixXT>(input_data, num_wavs, wave_length);
1162   auto output_map = Map<MatrixXT>(output_data, num_wavs, pad_length);
1163   output_map.block(0, pad_left, num_wavs, wave_length) = input_map;
1164   if (padding_mode == BorderType::kConstant) {
1165     output_map.block(0, 0, num_wavs, pad_left).setConstant(value);
1166     output_map.block(0, pad_left + wave_length, num_wavs, pad_right).setConstant(value);
1167   } else if (padding_mode == BorderType::kEdge) {
1168     output_map.block(0, 0, num_wavs, pad_left).colwise() = input_map.col(0);
1169     output_map.block(0, pad_left + wave_length, num_wavs, pad_right).colwise() = input_map.col(wave_length - 1);
1170   } else if (padding_mode == BorderType::kReflect) {
1171     // First, deal with the pad operation on the right.
1172     int32_t current_pad = wave_length - 1;
1173     while (pad_right >= current_pad) {
1174       // current_pad: the length of pad required for current loop.
1175       // pad_right: the length of the remaining pad on the right.
1176       output_map.block(0, pad_left + current_pad + 1, num_wavs, current_pad) =
1177         output_map.block(0, pad_left, num_wavs, current_pad).rowwise().reverse();
1178       pad_right -= current_pad;
1179       current_pad += current_pad;
1180     }
1181     output_map.block(0, pad_length - pad_right, num_wavs, pad_right) =
1182       output_map.block(0, pad_length - pad_right * pad_mul - 1, num_wavs, pad_right).rowwise().reverse();
1183     // Next, deal with the pad operation on the left.
1184     current_pad = wave_length - 1;
1185     while (pad_left >= current_pad) {
1186       // current_pad: the length of pad required for current loop.
1187       // pad_left: the length of the remaining pad on the left.
1188       output_map.block(0, pad_left - current_pad, num_wavs, current_pad) =
1189         output_map.block(0, pad_left + 1, num_wavs, current_pad).rowwise().reverse();
1190       pad_left -= current_pad;
1191       current_pad += current_pad;
1192     }
1193     output_map.block(0, 0, num_wavs, pad_left) =
1194       output_map.block(0, pad_left + 1, num_wavs, pad_left).rowwise().reverse();
1195   } else if (padding_mode == BorderType::kSymmetric) {
1196     // First, deal with the pad operation on the right.
1197     int32_t current_pad = wave_length;
1198     while (pad_right >= current_pad) {
1199       // current_pad: the length of pad required for current loop.
1200       // pad_right: the length of the remaining pad on the right.
1201       output_map.block(0, pad_left + current_pad, num_wavs, current_pad) =
1202         output_map.block(0, pad_left, num_wavs, current_pad).rowwise().reverse();
1203       pad_right -= current_pad;
1204       current_pad += current_pad;
1205     }
1206     output_map.block(0, pad_length - pad_right, num_wavs, pad_right) =
1207       output_map.block(0, pad_length - pad_right * pad_mul, num_wavs, pad_right).rowwise().reverse();
1208     // Next, deal with the pad operation on the left.
1209     current_pad = wave_length;
1210     while (pad_left >= current_pad) {
1211       // current_pad: the length of pad required for current loop.
1212       // pad_left: the length of the remaining pad on the left.
1213       output_map.block(0, pad_left - current_pad, num_wavs, current_pad) =
1214         output_map.block(0, pad_left, num_wavs, current_pad).rowwise().reverse();
1215       pad_left -= current_pad;
1216       current_pad += current_pad;
1217     }
1218     output_map.block(0, 0, num_wavs, pad_left) = output_map.block(0, pad_left, num_wavs, pad_left).rowwise().reverse();
1219   } else {
1220     LOG_AND_RETURN_STATUS_SYNTAX_ERROR("Pad: invalid padding_mode value, check the optional value of BorderType.");
1221   }
1222   std::vector<dsize_t> shape_vec = input_shape.AsVector();
1223   shape_vec[shape_vec.size() - 1] = static_cast<dsize_t>(pad_length);
1224   TensorShape output_shape(shape_vec);
1225   RETURN_IF_NOT_OK((*output)->Reshape(output_shape));
1226   return Status::OK();
1227 }
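
// Worked example of the four padding modes implemented above, for one row
// [1, 2, 3, 4] with pad_left = 2 and pad_right = 2 (values illustrative):
//   kConstant(0): [0, 0, 1, 2, 3, 4, 0, 0]
//   kEdge:        [1, 1, 1, 2, 3, 4, 4, 4]
//   kReflect:     [3, 2, 1, 2, 3, 4, 3, 2]
//   kSymmetric:   [2, 1, 1, 2, 3, 4, 4, 3]
// kReflect mirrors around the edge sample without repeating it, while
// kSymmetric repeats the edge sample.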
1228 
1229 template <typename T>
ComputeDeltasImpl(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int all_freqs,int n_frame,int n)1230 Status ComputeDeltasImpl(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int all_freqs,
1231                          int n_frame, int n) {
1232   using VectorXT = Eigen::Matrix<T, Eigen::Dynamic, 1>;
1233   using MatrixXT = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
1234   using Eigen::Map;
1235   int32_t denom = n * (n + 1) * (n * 2 + 1) / 3;
1236   // denom = twice the sum of squares 1..n, i.e. n * (n + 1) * (2n + 1) / 3
1237   VectorXT kernel = VectorXT::LinSpaced(2 * n + 1, -n, n);                         // 2n+1
1238   T *input_data = reinterpret_cast<T *>(const_cast<uchar *>(input->GetBuffer()));  // [all_freqs, n_frame + 2n]
1239   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape{all_freqs, n_frame}, input->type(), output));
1240   T *output_data = reinterpret_cast<T *>(const_cast<uchar *>((*output)->GetBuffer()));
1241   for (int freq = 0; freq < all_freqs; ++freq) {  // conv with im2col
1242     auto input_map = Map<MatrixXT, 0, Eigen::OuterStride<1>>(input_data + freq * (n_frame + 2 * n), n_frame,
1243                                                              2 * n + 1);  // n_frame, 2n+1
1244     Map<VectorXT>(output_data + freq * n_frame, n_frame) = (input_map * kernel).array() / T(denom);
1245   }
1246   return Status::OK();
1247 }
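
// The loop above is the standard delta-coefficient regression, applied row by
// row over the padded input:
//   d_t = sum_{k=-n}^{n} k * c_{t+k} / (2 * sum_{k=1}^{n} k^2),
// where the denominator equals `denom` = n * (n + 1) * (2n + 1) / 3 and the
// strided Eigen map realizes the im2col view of each length-(2n + 1) window.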
1248 
Bartlett(std::shared_ptr<Tensor> * output,int len)1249 Status Bartlett(std::shared_ptr<Tensor> *output, int len) {
1250   CHECK_FAIL_RETURN_UNEXPECTED(len != 0, "Bartlett: len cannot be zero.");
1251   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({len}), DataType(DataType::DE_FLOAT32), output));
1252   // Bartlett window function.
1253   auto iter = (*output)->begin<float>();
1254   float twice = 2.0;
1255   for (ptrdiff_t i = 0; i < len; ++i) {
1256     *(iter + i) = 1.0 - std::abs(twice * i / len - 1.0);
1257   }
1258   return Status::OK();
1259 }
1260 
Blackman(std::shared_ptr<Tensor> * output,int len)1261 Status Blackman(std::shared_ptr<Tensor> *output, int len) {
1262   CHECK_FAIL_RETURN_UNEXPECTED(len != 0, "Blackman: len cannot be zero.");
1263   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({len}), DataType(DataType::DE_FLOAT32), output));
1264   // Blackman window function.
1265   auto iter = (*output)->begin<float>();
1266   const float alpha = 0.42;
1267   const float half = 0.5;
1268   const float delta = 0.08;
1269   for (ptrdiff_t i = 0; i < len; ++i) {
1270     *(iter + i) = alpha - half * std::cos(TWO * PI * i / len) + delta * std::cos(TWO * TWO * PI * i / len);
1271   }
1272   return Status::OK();
1273 }
1274 
Hamming(std::shared_ptr<Tensor> * output,int len,float alpha=0.54,float beta=0.46)1275 Status Hamming(std::shared_ptr<Tensor> *output, int len, float alpha = 0.54, float beta = 0.46) {
1276   CHECK_FAIL_RETURN_UNEXPECTED(len != 0, "Hamming: len cannot be zero.");
1277   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({len}), DataType(DataType::DE_FLOAT32), output));
1278   // Hamming window function.
1279   auto iter = (*output)->begin<float>();
1280   for (ptrdiff_t i = 0; i < len; ++i) {
1281     *(iter + i) = alpha - beta * std::cos(TWO * PI * i / len);
1282   }
1283   return Status::OK();
1284 }
1285 
Hann(std::shared_ptr<Tensor> * output,int len)1286 Status Hann(std::shared_ptr<Tensor> *output, int len) {
1287   CHECK_FAIL_RETURN_UNEXPECTED(len != 0, "Hann: len cannot be zero.");
1288   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({len}), DataType(DataType::DE_FLOAT32), output));
1289   // Hann window function.
1290   auto iter = (*output)->begin<float>();
1291   const float half = 0.5;
1292   for (ptrdiff_t i = 0; i < len; ++i) {
1293     *(iter + i) = half - half * std::cos(TWO * PI * i / len);
1294   }
1295   return Status::OK();
1296 }
1297 
Kaiser(std::shared_ptr<Tensor> * output,int len,float beta=12.0)1298 Status Kaiser(std::shared_ptr<Tensor> *output, int len, float beta = 12.0) {
1299 #ifdef __APPLE__
1300   return Status(StatusCode::kMDNotImplementedYet, "For macOS, Kaiser window is not supported.");
1301 #else
1302   CHECK_FAIL_RETURN_UNEXPECTED(len != 0, "Kaiser: len cannot be zero.");
1303   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({len}), DataType(DataType::DE_FLOAT32), output));
1304   // Kaiser window function.
1305   auto iter = (*output)->begin<float>();
1306   float twice = 2.0;
1307   {
1308     std::unique_lock<std::mutex> _lock(cyl_bessel_mux_);
1309     for (ptrdiff_t i = 0; i < len; ++i) {
1310       *(iter + i) =
1311         std::cyl_bessel_i(0, beta * std::sqrt(1 - std::pow(i * twice / len - 1.0, TWO))) / std::cyl_bessel_i(0, beta);
1312     }
1313   }
1314   return Status::OK();
1315 #endif
1316 }
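
// The Kaiser window above follows w(i) = I0(beta * sqrt(1 - (2i / len - 1)^2)) / I0(beta),
// where I0 is the zeroth-order modified Bessel function of the first kind
// (std::cyl_bessel_i). The cyl_bessel_mux_ guard serializes the calls,
// presumably because the library implementation is not thread-safe everywhere.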
1317 
Window(std::shared_ptr<Tensor> * output,WindowType window_type,int len)1318 Status Window(std::shared_ptr<Tensor> *output, WindowType window_type, int len) {
1319   switch (window_type) {
1320     case WindowType::kBartlett:
1321       return Bartlett(output, len);
1322     case WindowType::kBlackman:
1323       return Blackman(output, len);
1324     case WindowType::kHamming:
1325       return Hamming(output, len);
1326     case WindowType::kHann:
1327       return Hann(output, len);
1328     case WindowType::kKaiser:
1329       return Kaiser(output, len);
1330     default:
1331       return Hann(output, len);
1332   }
1333 }
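
// Illustrative sketch (the length 400 is hypothetical): building a Hann
// window for STFT framing; unknown window types fall back to Hann, matching
// the default case above.
//   std::shared_ptr<Tensor> win;
//   RETURN_IF_NOT_OK(Window(&win, WindowType::kHann, 400));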
1334 
1335 // Expand a onesided STFT result back to the full (two-sided) spectrum.
1336 template <typename T>
Onesided(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int n_fft,int n_columns)1337 Status Onesided(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int n_fft, int n_columns) {
1338   std::shared_ptr<Tensor> output_onesided;
1339   RETURN_IF_NOT_OK(
1340     Tensor::CreateEmpty(TensorShape({input->shape()[0], n_fft, n_columns, 2}), input->type(), &output_onesided));
1341   auto onesided_begin = output_onesided->begin<T>();
1342   auto spec_f_begin = input->begin<T>();
1343   std::vector<int> spec_f_slice = {(n_fft / 2 + 1) * n_columns * 2, n_columns * 2, 2};
1344   for (int r = 0; r < input->shape()[0]; r++) {
1345     for (int i = 0; i < (n_fft / TWO + 1); i++) {
1346       for (int j = 0; j < n_columns; j++) {
1347         ptrdiff_t onesided_offset_0 = r * n_fft * n_columns * 2 + i * spec_f_slice[1] + j * spec_f_slice[2];
1348         ptrdiff_t spec_f_offset_0 = r * spec_f_slice[0] + i * spec_f_slice[1] + j * spec_f_slice[2];
1349         ptrdiff_t onesided_offset_1 = onesided_offset_0 + 1;
1350         ptrdiff_t spec_f_offset_1 = spec_f_offset_0 + 1;
1351         *(onesided_begin + onesided_offset_0) = *(spec_f_begin + spec_f_offset_0);
1352         *(onesided_begin + onesided_offset_1) = *(spec_f_begin + spec_f_offset_1);
1353       }
1354     }
1355     for (int i = n_fft / 2 + 1; i < n_fft; i++) {
1356       for (int j = 0; j < n_columns; j++) {
1357         ptrdiff_t onesided_offset_0 = r * n_fft * n_columns * 2 + i * spec_f_slice[1] + j * spec_f_slice[2];
1358         ptrdiff_t spec_f_offset_0 = r * spec_f_slice[0] + (n_fft - i) * spec_f_slice[1] + j * spec_f_slice[2];
1359         ptrdiff_t onesided_offset_1 = onesided_offset_0 + 1;
1360         ptrdiff_t spec_f_offset_1 = spec_f_offset_0 + 1;
1361         *(onesided_begin + onesided_offset_0) = *(spec_f_begin + spec_f_offset_0);
1362         *(onesided_begin + onesided_offset_1) = *(spec_f_begin + spec_f_offset_1);
1363       }
1364     }
1365   }
1366   *output = output_onesided;
1367   return Status::OK();
1368 }
1369 
1370 template <typename T>
PowerStft(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,float power,int n_columns,int n_length)1371 Status PowerStft(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float power, int n_columns,
1372                  int n_length) {
1373   auto spec_f_begin = input->begin<T>();
1374   std::vector<int> spec_f_slice = {n_length * n_columns * 2, n_columns * 2, 2};
1375   std::vector<int> spec_p_slice = {n_length * n_columns, n_columns};
1376   std::shared_ptr<Tensor> spec_p;
1377   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({input->shape()[0], n_length, n_columns}), input->type(), &spec_p));
1378   auto spec_p_begin = spec_p->begin<T>();
1379   for (int r = 0; r < input->shape()[0]; r++) {
1380     for (int i = 0; i < n_length; i++) {
1381       for (int j = 0; j < n_columns; j++) {
1382         ptrdiff_t spec_f_offset_0 = r * spec_f_slice[0] + i * spec_f_slice[1] + j * spec_f_slice[2];
1383         ptrdiff_t spec_f_offset_1 = spec_f_offset_0 + 1;
1384         ptrdiff_t spec_p_offset = r * spec_p_slice[0] + i * spec_p_slice[1] + j;
1385         T spec_power_0 = *(spec_f_begin + spec_f_offset_0);
1386         T spec_power_1 = *(spec_f_begin + spec_f_offset_1);
1387         *(spec_p_begin + spec_p_offset) =
1388           std::pow(std::sqrt(std::pow(spec_power_0, TWO) + std::pow(spec_power_1, TWO)), power);
1389       }
1390     }
1391   }
1392   *output = spec_p;
1393   return Status::OK();
1394 }
1395 
1396 template <typename T>
Stft(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int n_fft,const std::shared_ptr<Tensor> & win,int win_length,int n_columns,bool normalized,float power,bool onesided)1397 Status Stft(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int n_fft,
1398             const std::shared_ptr<Tensor> &win, int win_length, int n_columns, bool normalized, float power,
1399             bool onesided) {
1400   CHECK_FAIL_RETURN_UNEXPECTED(win_length != 0, "Spectrogram: win_length cannot be zero.");
1401   double win_sum = 0.;
1402   float twice = 2.0;
1403   for (auto iter_win = win->begin<float>(); iter_win != win->end<float>(); iter_win++) {
1404     win_sum += (*iter_win) * (*iter_win);
1405   }
1406   win_sum = std::sqrt(win_sum);
1407   std::shared_ptr<Tensor> spec_f;
1408 
1409   RETURN_IF_NOT_OK(
1410     Tensor::CreateEmpty(TensorShape({input->shape()[0], n_fft / 2 + 1, n_columns, 2}), input->type(), &spec_f));
1411 
1412   auto spec_f_begin = spec_f->begin<T>();
1413   auto input_win_begin = input->begin<T>();
1414   std::vector<int> spec_f_slice = {(n_fft / 2 + 1) * n_columns * 2, n_columns * 2, 2};
1415   std::vector<int> input_win_slice = {n_columns * win_length, win_length};
1416   std::shared_ptr<Tensor> spec_p;
1417   RETURN_IF_NOT_OK(
1418     Tensor::CreateEmpty(TensorShape({input->shape()[0], n_fft / 2 + 1, n_columns}), input->type(), &spec_p));
1419   std::shared_ptr<Tensor> exp_complex;
1420   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({n_fft / 2 + 1, win_length, 2}), input->type(), &exp_complex));
1421   auto exp_complex_begin = exp_complex->begin<T>();
1422   std::vector<int> exp_complex_slice = {win_length * 2, 2};
1423   for (int i = 0; i < (n_fft / TWO + 1); i++) {
1424     for (int k = 0; k <= win_length - 1; k++) {
1425       ptrdiff_t exp_complex_offset_0 = i * exp_complex_slice[0] + k * exp_complex_slice[1];
1426       ptrdiff_t exp_complex_offset_1 = exp_complex_offset_0 + 1;
1427       *(exp_complex_begin + exp_complex_offset_0) = std::cos(twice * PI * i * k / win_length);
1428       *(exp_complex_begin + exp_complex_offset_1) = std::sin(twice * PI * i * k / win_length);
1429     }
1430   }
1431   for (int r = 0; r < input->shape()[0]; r++) {
1432     for (int i = 0; i < (n_fft / TWO + 1); i++) {
1433       for (int j = 0; j < n_columns; j++) {
1434         T spec_f_0 = 0.;
1435         T spec_f_1 = 0.;
1436         ptrdiff_t exp_complex_offset_0 = i * exp_complex_slice[0];
1437         for (int k = 0; k < win_length; k++) {
1438           ptrdiff_t exp_complex_offset_1 = exp_complex_offset_0 + 1;
1439           T exp_complex_a = *(exp_complex_begin + exp_complex_offset_0);
1440           T exp_complex_b = *(exp_complex_begin + exp_complex_offset_1);
1441           ptrdiff_t input_win_offset = r * input_win_slice[0] + j * input_win_slice[1] + k;
1442           T input_value = *(input_win_begin + input_win_offset);
1443           spec_f_0 += input_value * exp_complex_a;
1444           spec_f_1 += -input_value * exp_complex_b;
1445           exp_complex_offset_0 = exp_complex_offset_1 + 1;
1446         }
1447         ptrdiff_t spec_f_offset_0 = r * spec_f_slice[0] + i * spec_f_slice[1] + j * spec_f_slice[2];
1448         ptrdiff_t spec_f_offset_1 = spec_f_offset_0 + 1;
1449         *(spec_f_begin + spec_f_offset_0) = spec_f_0;
1450         *(spec_f_begin + spec_f_offset_1) = spec_f_1;
1451       }
1452     }
1453   }
1454   CHECK_FAIL_RETURN_UNEXPECTED(win_sum != 0, "Window: the sum of squares of the window function cannot be zero.");
1455   if (normalized) {
1456     for (int r = 0; r < input->shape()[0]; r++) {
1457       for (int i = 0; i < (n_fft / TWO + 1); i++) {
1458         for (int j = 0; j < n_columns; j++) {
1459           ptrdiff_t spec_f_offset_0 = r * spec_f_slice[0] + i * spec_f_slice[1] + j * spec_f_slice[2];
1460           ptrdiff_t spec_f_offset_1 = spec_f_offset_0 + 1;
1461           T spec_norm_a = *(spec_f_begin + spec_f_offset_0);
1462           T spec_norm_b = *(spec_f_begin + spec_f_offset_1);
1463           *(spec_f_begin + spec_f_offset_0) = spec_norm_a / win_sum;
1464           *(spec_f_begin + spec_f_offset_1) = spec_norm_b / win_sum;
1465         }
1466       }
1467     }
1468   }
1469   std::shared_ptr<Tensor> output_onesided;
1470   if (!onesided) {
1471     RETURN_IF_NOT_OK(Onesided<T>(spec_f, &output_onesided, n_fft, n_columns));
1472     if (power == 0) {
1473       *output = output_onesided;
1474       return Status::OK();
1475     }
1476     RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({input->shape()[0], n_fft, n_columns}), input->type(), &spec_p));
1477     RETURN_IF_NOT_OK(PowerStft<T>(output_onesided, &spec_p, power, n_columns, n_fft));
1478     *output = spec_p;
1479     return Status::OK();
1480   }
1481   if (power == 0) {
1482     *output = spec_f;
1483     return Status::OK();
1484   }
1485   RETURN_IF_NOT_OK(PowerStft<T>(spec_f, &spec_p, power, n_columns, n_fft / TWO + 1));
1486   *output = spec_p;
1487   return Status::OK();
1488 }
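
// Stft evaluates the DFT of each windowed frame directly from the definition,
// using the precomputed exp_complex table: for bin i and frame j,
//   X(i, j) = sum_{k=0}^{N-1} x(j, k) * (cos(2 * PI * i * k / N) - sqrt(-1) * sin(2 * PI * i * k / N)),
// with N = win_length. When normalized is set, both components are divided by
// the window's L2 norm. This is O(N^2) per frame rather than an O(N log N) FFT.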
1489 
1490 template <typename T>
SpectrogramImpl(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int pad,WindowType window,int n_fft,int hop_length,int win_length,float power,bool normalized,bool center,BorderType pad_mode,bool onesided)1491 Status SpectrogramImpl(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int pad,
1492                        WindowType window, int n_fft, int hop_length, int win_length, float power, bool normalized,
1493                        bool center, BorderType pad_mode, bool onesided) {
1494   std::shared_ptr<Tensor> fft_window_tensor;
1495   std::shared_ptr<Tensor> fft_window_later;
1496   TensorShape shape = input->shape();
1497   std::vector output_shape = shape.AsVector();
1498   output_shape.pop_back();
1499   int input_len = input->shape()[-1];
1500 
1501   RETURN_IF_NOT_OK(input->Reshape(TensorShape({input->Size() / input_len, input_len})));
1502 
1503   DataType data_type = input->type();
1504   // get the windows
1505   RETURN_IF_NOT_OK(Window(&fft_window_tensor, window, win_length));
1506   if (win_length == 1) {
1507     RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({1}), DataType(DataType::DE_FLOAT32), &fft_window_tensor));
1508     auto win = fft_window_tensor->begin<float>();
1509     *(win) = 1;
1510   }
1511 
1512   // Pad window length
1513   int pad_left = (n_fft - win_length) / 2;
1514   int pad_right = n_fft - win_length - pad_left;
1515   RETURN_IF_NOT_OK(fft_window_tensor->Reshape(TensorShape({1, win_length})));
1516   RETURN_IF_NOT_OK(Pad<float>(fft_window_tensor, &fft_window_later, pad_left, pad_right, BorderType::kConstant));
1517   RETURN_IF_NOT_OK(fft_window_later->Reshape(TensorShape({n_fft})));
1518 
1519   int length = input_len + pad * 2 + n_fft;
1520 
1521   std::shared_ptr<Tensor> input_data_tensor;
1522   std::shared_ptr<Tensor> input_data_tensor_pad;
1523   RETURN_IF_NOT_OK(
1524     Tensor::CreateEmpty(TensorShape({input->shape()[0], input_len + pad * 2}), data_type, &input_data_tensor_pad));
1525   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({input->shape()[0], length}), data_type, &input_data_tensor));
1526 
1527   RETURN_IF_NOT_OK(Pad<T>(input, &input_data_tensor_pad, pad, pad, BorderType::kConstant));
1528 
1529   if (center) {
1530     RETURN_IF_NOT_OK(Pad<T>(input_data_tensor_pad, &input_data_tensor, n_fft / TWO, n_fft / TWO, pad_mode));
1531   } else {
1532     input_data_tensor = input_data_tensor_pad;
1533   }
1534 
1535   CHECK_FAIL_RETURN_UNEXPECTED(n_fft <= input_data_tensor->shape()[-1],
1536                                "Spectrogram: n_fft should be positive and no greater than " +
1537                                  std::to_string(input_data_tensor->shape()[-1]) +
1538                                  ", but got n_fft: " + std::to_string(n_fft) + ".");
1539 
1540   // calculate how many times the window slides, i.e. the number of frames
1541   CHECK_FAIL_RETURN_UNEXPECTED(hop_length != 0, "Spectrogram: hop_length cannot be zero.");
1542   int n_columns = 0;
1543   while ((1 + n_columns++) * hop_length + n_fft <= input_data_tensor->shape()[-1]) {
1544   }
1545   std::shared_ptr<Tensor> stft_compute;
1546 
1547   auto input_begin = input_data_tensor->begin<T>();
1548   std::vector<int> input_win_slice = {n_columns * n_fft, n_fft};
1549   auto iter_win = fft_window_later->begin<float>();
1550   std::shared_ptr<Tensor> input_win;
1551   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({input_data_tensor->shape()[0], n_columns, n_fft}),
1552                                        input_data_tensor->type(), &input_win));
1553   auto input_win_begin = input_win->begin<T>();
1554   for (int r = 0; r < input_data_tensor->shape()[0]; r++) {
1555     for (int j = 0; j < n_columns; j++) {
1556       for (int k = 0; k < n_fft; k++) {
1557         ptrdiff_t win_offset = k;
1558         float win_value = *(iter_win + win_offset);
1559         ptrdiff_t input_stft_offset = r * input_data_tensor->shape()[-1] + j * hop_length + k;
1560         T input_value = *(input_begin + input_stft_offset);
1561         ptrdiff_t input_win_offset = r * input_win_slice[0] + j * input_win_slice[1] + k;
1562         *(input_win_begin + input_win_offset) = win_value * input_value;
1563       }
1564     }
1565   }
1566   RETURN_IF_NOT_OK(
1567     Stft<T>(input_win, &stft_compute, n_fft, fft_window_later, n_fft, n_columns, normalized, power, onesided));
1568   if (onesided) {
1569     output_shape.push_back(n_fft / TWO + 1);
1570   } else {
1571     output_shape.push_back(n_fft);
1572   }
1573   output_shape.push_back(n_columns);
1574   if (power == 0) {
1575     output_shape.push_back(TWO);
1576   }
1577   // reshape the output
1578   RETURN_IF_NOT_OK(stft_compute->Reshape(TensorShape({output_shape})));
1579   *output = stft_compute;
1580   return Status::OK();
1581 }
1582 
Spectrogram(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int pad,WindowType window,int n_fft,int hop_length,int win_length,float power,bool normalized,bool center,BorderType pad_mode,bool onesided)1583 Status Spectrogram(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int pad, WindowType window,
1584                    int n_fft, int hop_length, int win_length, float power, bool normalized, bool center,
1585                    BorderType pad_mode, bool onesided) {
1586   TensorShape input_shape = input->shape();
1587 
1588   CHECK_FAIL_RETURN_UNEXPECTED(
1589     input->type().IsNumeric(),
1590     "Spectrogram: input tensor type should be int, float or double, but got: " + input->type().ToString());
1591 
1592   CHECK_FAIL_RETURN_UNEXPECTED(input_shape.Size() > 0, "Spectrogram: input tensor is not in shape of <..., time>.");
1593 
1594   std::shared_ptr<Tensor> input_tensor;
1595   if (input->type() != DataType::DE_FLOAT64) {
1596     RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)));
1597     return SpectrogramImpl<float>(input_tensor, output, pad, window, n_fft, hop_length, win_length, power, normalized,
1598                                   center, pad_mode, onesided);
1599   } else {
1600     input_tensor = input;
1601     return SpectrogramImpl<double>(input_tensor, output, pad, window, n_fft, hop_length, win_length, power, normalized,
1602                                    center, pad_mode, onesided);
1603   }
1604 }
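
// Illustrative usage sketch (parameter values are hypothetical): a onesided
// power spectrogram of a waveform tensor `wav` with n_fft = 400 and 50% overlap:
//   std::shared_ptr<Tensor> spec;
//   RETURN_IF_NOT_OK(Spectrogram(wav, &spec, 0, WindowType::kHann, 400, 200, 400,
//                                2.0, false, true, BorderType::kReflect, true));
// With power = 0 the raw complex result is kept, adding a trailing dimension of
// size 2; with onesided = true the output has n_fft / 2 + 1 frequency rows.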
1605 
1606 template <typename T>
SpectralCentroidImpl(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int sample_rate,int n_fft,int win_length,int hop_length,int pad,WindowType window)1607 Status SpectralCentroidImpl(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int sample_rate,
1608                             int n_fft, int win_length, int hop_length, int pad, WindowType window) {
1609   std::shared_ptr<Tensor> output_tensor;
1610   std::shared_ptr<Tensor> spectrogram_tensor;
1611   if (input->type() == DataType::DE_FLOAT64) {
1612     RETURN_IF_NOT_OK(SpectrogramImpl<double>(input, &spectrogram_tensor, pad, window, n_fft, hop_length, win_length,
1613                                              1.0, false, true, BorderType::kReflect, true));
1614   } else {
1615     RETURN_IF_NOT_OK(SpectrogramImpl<float>(input, &spectrogram_tensor, pad, window, n_fft, hop_length, win_length,
1616                                             1.0, false, true, BorderType::kReflect, true));
1617   }
1618   std::shared_ptr<Tensor> freqs;
1619   // 1 + n_fft / 2 bin frequencies, linearly spaced from 0 up to the Nyquist frequency (sample_rate / 2)
1620   RETURN_IF_NOT_OK(Linspace<T>(&freqs, 0, sample_rate / TWO, 1 + n_fft / TWO));
1621   auto itr_freq = freqs->begin<T>();
1622   int num = freqs->Size();
1623   TensorShape spectrogram_shape = spectrogram_tensor->shape();
1624   int waveform = spectrogram_shape[-1];
1625   int channels = spectrogram_shape[-2];
1626   std::vector output_shape = spectrogram_shape.AsVector();
1627   output_shape[output_shape.size() - TWO] = 1;
1628   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape{output_shape}, input->type(), &output_tensor));
1629   Eigen::MatrixXd freqs_r = Eigen::MatrixXd::Zero(num, 1);
1630   for (int i = 0; i < num; ++i) {
1631     freqs_r(i, 0) = *itr_freq;
1632     itr_freq++;
1633   }
1634   int k_num = spectrogram_tensor->Size() / (waveform * channels);
1635   std::vector<Eigen::MatrixXd> specgram;
1636   std::vector<Eigen::MatrixXd> specgram_result;
1637   std::vector<Eigen::MatrixXd> specgram_sum;
1638   Eigen::MatrixXd tmp = Eigen::MatrixXd::Zero(channels, waveform);
1639   auto itr_spectrogram = spectrogram_tensor->begin<T>();
1640   for (int k = 0; k < k_num; k++) {
1641     for (int i = 0; i < channels; ++i) {
1642       for (int j = 0; j < waveform; ++j) {
1643         tmp(i, j) = *itr_spectrogram;
1644         itr_spectrogram++;
1645       }
1646     }
1647     specgram.push_back(tmp);
1648     (void)specgram_sum.emplace_back(specgram[k].colwise().sum());
1649   }
1650   for (int k = 0; k < k_num; k++) {
1651     for (int i = 0; i < channels; ++i) {
1652       for (int j = 0; j < waveform; ++j) {
1653         tmp(i, j) = freqs_r(i, 0) * specgram[k](i, j);
1654       }
1655     }
1656     (void)specgram_result.emplace_back((tmp).colwise().sum());
1657   }
1658   auto itr_output = output_tensor->begin<T>();
1659   for (int k = 0; k < k_num; k++) {
1660     for (int i = 0; i < waveform; ++i) {
1661       *itr_output = specgram_result[k](0, i) / specgram_sum[k](0, i);
1662       itr_output++;
1663     }
1664   }
1665   *output = output_tensor;
1666   return Status::OK();
1667 }
1668 
SpectralCentroid(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int sample_rate,int n_fft,int win_length,int hop_length,int pad,WindowType window)1669 Status SpectralCentroid(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int sample_rate,
1670                         int n_fft, int win_length, int hop_length, int pad, WindowType window) {
1671   RETURN_IF_NOT_OK(ValidateLowRank("SpectralCentroid", input, kMinAudioDim, "<..., time>"));
1672   RETURN_IF_NOT_OK(ValidateTensorNumeric("SpectralCentroid", input));
1673 
1674   std::shared_ptr<Tensor> input_tensor;
1675   if (input->type() != DataType::DE_FLOAT64) {
1676     RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)));
1677     return SpectralCentroidImpl<float>(input_tensor, output, sample_rate, n_fft, win_length, hop_length, pad, window);
1678   } else {
1679     input_tensor = input;
1680     return SpectralCentroidImpl<double>(input_tensor, output, sample_rate, n_fft, win_length, hop_length, pad, window);
1681   }
1682 }
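
// The value computed above is the magnitude-weighted mean frequency of each
// frame: centroid(t) = sum_f f * S(f, t) / sum_f S(f, t), where S is the
// power-1.0 (magnitude) spectrogram and f are the bin frequencies from
// Linspace(0, sample_rate / 2, 1 + n_fft / 2).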
1683 
ComputeDeltas(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t win_length,const BorderType & mode)1684 Status ComputeDeltas(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t win_length,
1685                      const BorderType &mode) {
1686   RETURN_IF_NOT_OK(ValidateLowRank("ComputeDeltas", input, kDefaultAudioDim, "<..., freq, time>"));
1687   RETURN_IF_NOT_OK(ValidateTensorNumeric("ComputeDeltas", input));
1688 
1689   // reshape Tensor from <..., freq, time> to <-1, time>
1690   auto raw_shape = input->shape();
1691   int32_t n_frames = raw_shape[-1];
1692   int32_t all_freqs = raw_shape.NumOfElements() / n_frames;
1693   RETURN_IF_NOT_OK(input->Reshape(TensorShape{all_freqs, n_frames}));
1694 
1695   int32_t n = (win_length - 1) / 2;
1696 
1697   std::shared_ptr<Tensor> specgram_local_pad;
1698   if (input->type() == DataType(DataType::DE_FLOAT64)) {
1699     RETURN_IF_NOT_OK(Pad<double>(input, &specgram_local_pad, n, n, mode));
1700     RETURN_IF_NOT_OK(ComputeDeltasImpl<double>(specgram_local_pad, output, all_freqs, n_frames, n));
1701   } else {
1702     std::shared_ptr<Tensor> float_tensor;
1703     RETURN_IF_NOT_OK(TypeCast(input, &float_tensor, DataType(DataType::DE_FLOAT32)));
1704     RETURN_IF_NOT_OK(Pad<float>(float_tensor, &specgram_local_pad, n, n, mode));
1705     RETURN_IF_NOT_OK(ComputeDeltasImpl<float>(specgram_local_pad, output, all_freqs, n_frames, n));
1706   }
1707   RETURN_IF_NOT_OK((*output)->Reshape(raw_shape));
1708   return Status::OK();
1709 }
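
// Illustrative sketch (win_length = 5 is hypothetical): 5-point deltas of a
// spectrogram `spec` of shape <freq, time>, replicating the edge frames:
//   std::shared_ptr<Tensor> deltas;
//   RETURN_IF_NOT_OK(ComputeDeltas(spec, &deltas, 5, BorderType::kEdge));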
1710 
1711 /// \brief Inverse FFT of a onesided (conjugate-symmetric) spectrum, producing a real signal.
IRFFT(const Eigen::MatrixXcd & stft_matrix,Eigen::MatrixXd * inverse)1712 Status IRFFT(const Eigen::MatrixXcd &stft_matrix, Eigen::MatrixXd *inverse) {
1713   int32_t n = 2 * (stft_matrix.rows() - 1);
1714   int32_t s = stft_matrix.rows() - 1;
1715   Eigen::FFT<double> fft;
1716   for (int k = 0; k < stft_matrix.cols(); ++k) {
1717     Eigen::VectorXcd output_complex(n);
1718     // pad input
1719     Eigen::VectorXcd input(n);
1720     input.head(s + 1) = stft_matrix.col(k);
1721     auto reverse_pad = stft_matrix.col(k).segment(1, s - 1).colwise().reverse().conjugate();
1722     input.segment(s + 1, s - 1) = reverse_pad;
1723     fft.inv(output_complex, input);
1724     Eigen::VectorXd output_real = output_complex.real().eval();
1725     inverse->col(k) = Eigen::Map<Eigen::MatrixXd>(output_real.data(), n, 1);
1726   }
1727   return Status::OK();
1728 }
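
// IRFFT reconstructs a length n = 2 * (rows - 1) real signal per column: the
// missing upper half of the spectrum is restored via conjugate symmetry
// (X[n - k] = conj(X[k])) before running a full complex inverse FFT and
// keeping the real part.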
1729 
1730 /// \brief Overlap Add
OverlapAdd(Eigen::VectorXd * out_buf,const Eigen::MatrixXd & win_inverse_stft,int32_t hop_length)1731 Status OverlapAdd(Eigen::VectorXd *out_buf, const Eigen::MatrixXd &win_inverse_stft, int32_t hop_length) {
1732   int32_t n_fft = win_inverse_stft.rows();
1733   for (int frame = 0; frame < win_inverse_stft.cols(); frame++) {
1734     int32_t sample = frame * hop_length;
1735     out_buf->middleRows(sample, n_fft) += win_inverse_stft.col(frame);
1736   }
1737   return Status::OK();
1738 }
1739 
1740 /// \brief Window Sum Square
WindowSumSquare(const Eigen::MatrixXf & window_matrix,Eigen::VectorXf * win_sum_square,const int32_t n_frames,int32_t n_fft,int32_t hop_length)1741 Status WindowSumSquare(const Eigen::MatrixXf &window_matrix, Eigen::VectorXf *win_sum_square, const int32_t n_frames,
1742                        int32_t n_fft, int32_t hop_length) {
1743   Eigen::MatrixXf win_norm = window_matrix.array().pow(2);
1744   // window sum square fill
1745   int32_t n = n_fft + hop_length * (n_frames - 1);
1746   // check n_fft
1747   CHECK_FAIL_RETURN_UNEXPECTED(
1748     n_fft == win_norm.rows(),
1749     "GriffinLim: n_fft must be equal to the length of the window during window sum square calculation.");
1750   for (int ind = 0; ind < n_frames; ind++) {
1751     int sample = ind * hop_length;
1752     int end_ss = std::min(n, sample + n_fft);
1753     int end_win = std::max(0, std::min(n_fft, n - sample));
1754     win_sum_square->segment(sample, end_ss - sample) += win_norm.col(0).head(end_win);
1755   }
1756   return Status::OK();
1757 }
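
// The accumulated envelope sum_t w^2(s - t * hop_length) is the denominator of
// the usual overlap-add normalization: dividing the overlap-added frames by it
// makes ISTFT an (approximate) inverse of STFT wherever the window overlap
// satisfies the nonzero-overlap-add (NOLA) condition.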
1758 
1759 /// \brief ISTFT.
1760 /// \param stft_matrix: Complex Eigen matrix, shape of <freq, time>.
1761 /// \param output: Tensor of shape <time>.
1762 /// \param n_fft: Size of Fourier transform.
1763 /// \param hop_length: The distance between neighboring sliding window frames.
1764 /// \param win_length: The size of window frame and STFT filter.
1765 /// \param window_type: The type of window function.
1766 /// \param center: Whether the input was padded on both sides so that the `t`-th frame is centered at time
1767 ///     `t * hop_length`.
1768 /// \param length: The expected length of the reconstructed signal, or 0 to keep the full overlap-add result.
1771 /// \return Status code.
1772 template <typename T>
ISTFT(const Eigen::MatrixXcd & stft_matrix,std::shared_ptr<Tensor> * output,int32_t n_fft,int32_t hop_length,int32_t win_length,WindowType window_type,bool center,int32_t length)1773 Status ISTFT(const Eigen::MatrixXcd &stft_matrix, std::shared_ptr<Tensor> *output, int32_t n_fft, int32_t hop_length,
1774              int32_t win_length, WindowType window_type, bool center, int32_t length) {
1775   // check input
1776   constexpr int64_t transform_size = 2;
1777   CHECK_FAIL_RETURN_UNEXPECTED(n_fft == ((stft_matrix.rows() - 1) * transform_size),
1778                                "GriffinLim: the frequency dimension of the input should be equal to n_fft / 2 + 1.");
1779 
1780   // window
1781   std::shared_ptr<Tensor> ifft_window_tensor;
1782   RETURN_IF_NOT_OK(Window(&ifft_window_tensor, window_type, win_length));
1783   if (win_length == 1) {
1784     RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({1}), DataType(DataType::DE_FLOAT32), &ifft_window_tensor));
1785     auto win = ifft_window_tensor->begin<float>();
1786     *(win) = 1;
1787   }
1788   // pad window to match n_fft
1789   std::shared_ptr<Tensor> ifft_window_pad;
1790   ifft_window_pad = ifft_window_tensor;
1791   if (win_length < n_fft) {
1792     RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({n_fft}), DataType(DataType::DE_FLOAT32), &ifft_window_pad));
1793     int pad_left = (n_fft - win_length) / 2;
1794     int pad_right = n_fft - win_length - pad_left;
1795     RETURN_IF_NOT_OK(Pad<float>(ifft_window_tensor, &ifft_window_pad, pad_left, pad_right, BorderType::kConstant));
1796   }
1797 
1798   int32_t n_frames = 0;
1799   if ((length != 0) && (hop_length != 0)) {
1800     int32_t padded_length = center ? (length + n_fft) : length;
1801     n_frames = std::min(static_cast<int32_t>(stft_matrix.cols()),
1802                         static_cast<int32_t>(std::ceil(static_cast<float>(padded_length) / hop_length)));
1803   } else {
1804     n_frames = stft_matrix.cols();
1805   }
1806   int32_t expected_signal_len = n_fft + hop_length * (n_frames - 1);
1807   Eigen::VectorXd y(expected_signal_len);
1808   y.setZero();
1809 
1810   // constrain STFT block sizes to 256 KB
1811   int32_t max_mem_block = std::pow(2, 8) * std::pow(2, 10);
1812   int32_t n_columns = max_mem_block / (stft_matrix.rows() * sizeof(T) * TWO);
1813   n_columns = std::max(n_columns, 1);
1814 
1815   // turn window to eigen matrix
1816   auto data_ptr = &*ifft_window_pad->begin<float>();
1817   Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> ifft_window_matrix(data_ptr,
1818                                                                                       ifft_window_pad->shape()[0], 1);
1819   for (int bl_s = 0; bl_s < n_frames;) {
1820     int bl_t = std::min(bl_s + n_columns, n_frames);
1821     // calculate ifft
1822     Eigen::MatrixXcd stft_temp = stft_matrix.middleCols(bl_s, bl_t - bl_s).eval();
1823     Eigen::MatrixXd inverse(TWO * (stft_temp.rows() - 1), stft_temp.cols());
1824     inverse.setZero();
1825     RETURN_IF_NOT_OK(IRFFT(stft_temp, &inverse));
1826     auto ytmp = ifft_window_matrix.template cast<double>().replicate(1, inverse.cols()).cwiseProduct(inverse);
1827     RETURN_IF_NOT_OK(OverlapAdd(&y, ytmp, hop_length));
1829     bl_s += n_columns;
1830   }
1831   // normalize by sum of squared window
1832   int32_t n = n_fft + hop_length * (n_frames - 1);
1833   Eigen::VectorXf ifft_win_sum(n);
1834   ifft_win_sum.setZero();
1835   RETURN_IF_NOT_OK(WindowSumSquare(ifft_window_matrix, &ifft_win_sum, n_frames, n_fft, hop_length));
1836 
1837   for (int32_t ind = 0; ind < y.rows(); ind++) {
1838     if (ifft_win_sum[ind] > std::numeric_limits<float>::min()) {
1839       y[ind] /= ifft_win_sum[ind];
1840     }
1841   }
1842 
1843   std::shared_ptr<Tensor> res_tensor;
1844   if (length == 0 && center) {
1845     int32_t y_start = n_fft / 2;
1846     int32_t y_end = y.rows() - y_start;
1847     auto tmp = y.middleRows(y_start, y_end - y_start);
1848     std::vector<T> y_res(tmp.data(), tmp.data() + tmp.rows() * tmp.cols());
1849     RETURN_IF_NOT_OK(Tensor::CreateFromVector(y_res, TensorShape({tmp.size()}), &res_tensor));
1850   } else {
1851     int32_t start = center ? n_fft / 2 : 0;
1852     auto tmp = y.tail(y.rows() - start);
1853     // pad with zeros or truncate so that the result has exactly `length` samples
1854     std::vector<T> y_res(tmp.data(), tmp.data() + tmp.rows() * tmp.cols());
1855     y_res.resize(static_cast<size_t>(length), 0);
1864     RETURN_IF_NOT_OK(Tensor::CreateFromVector(y_res, TensorShape({length}), &res_tensor));
1865   }
1866   *output = res_tensor;
1867   return Status::OK();
1868 }
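
// ISTFT above follows a librosa-style pipeline: per-block IRFFT of the
// spectrogram columns, multiplication by the synthesis window, OverlapAdd into
// the signal buffer, normalization by WindowSumSquare, and finally trimming
// the n_fft / 2 samples of center padding and/or fixing the requested length.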
1869 
1870 template <typename T>
Normalized(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t n_fft,int32_t win_length,WindowType window)1871 Status Normalized(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_fft,
1872                   int32_t win_length, WindowType window) {
1873   RETURN_UNEXPECTED_IF_NULL(input);
1874   RETURN_UNEXPECTED_IF_NULL(output);
1875   // window
1876   std::shared_ptr<Tensor> ifft_win_tensor;
1877   RETURN_IF_NOT_OK(Window(&ifft_win_tensor, window, win_length));
1878   if (win_length == 1) {
1879     RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({1}), DataType(DataType::DE_FLOAT32), &ifft_win_tensor));
1880     auto win = ifft_win_tensor->begin<float>();
1881     *(win) = 1;
1882   }
1883   // pad window to match n_fft
1884   std::shared_ptr<Tensor> ifft_win_pad;
1885   ifft_win_pad = ifft_win_tensor;
1886   if (win_length < n_fft) {
1887     RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({n_fft}), DataType(DataType::DE_FLOAT32), &ifft_win_pad));
1888     int pad_left = (n_fft - win_length) / TWO;
1889     int pad_right = n_fft - win_length - pad_left;
1890     RETURN_IF_NOT_OK(Pad<float>(ifft_win_tensor, &ifft_win_pad, pad_left, pad_right, BorderType::kConstant));
1891   }
1892   float win_sum = 0.;
1893   for (auto iter_win = ifft_win_pad->begin<float>(); iter_win != ifft_win_pad->end<float>(); ++iter_win) {
1894     win_sum += (*iter_win) * (*iter_win);
1895   }
1896   win_sum = std::sqrt(win_sum);
1897   for (auto iter_input = input->begin<T>(); iter_input != input->end<T>(); ++iter_input) {
1898     *iter_input = (*iter_input) * win_sum;
1899   }
1900   *output = input;
1901   return Status::OK();
1902 }
1903 
1904 template <typename T>
Transistft(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t length,int32_t n_fft,int32_t win_length,int32_t hop_length,WindowType window,bool center,bool onesided)1905 Status Transistft(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t length, int32_t n_fft,
1906                   int32_t win_length, int32_t hop_length, WindowType window, bool center, bool onesided) {
1907   RETURN_UNEXPECTED_IF_NULL(output);
1908   TensorShape input_shape = input->shape();
1909   const int32_t kThreeAudioDim = 3;
1910   auto m = n_fft / TWO + 1;
1911   auto n = input_shape[-kDefaultAudioDim];
1912   auto r = input_shape[-kThreeAudioDim];
1913   auto size = r * n * input_shape[kThreeAudioDim];
1914   // construct new tensor by input
1915   std::shared_ptr<Tensor> istft_results;
1916   for (int dim = 0; dim < input_shape[0]; dim++) {
1917     // slice and squeeze the first dim
1918     Tensor::TensorIterator<T> itr = input->begin<T>();
1919     itr += dim * size;
1920     // rows to read: all r input rows if already onesided, otherwise only the first m = n_fft / 2 + 1
1921     int a = onesided ? r : m;
1922     // turn tensor to eigen matrixXcd
1923     std::shared_ptr<Tensor> waveform;
1924     Eigen::MatrixXcd stft_matrix(a, n);
1925     for (int32_t index = 0; index < a; index++) {
1926       for (int32_t cnt = 0; cnt < n; cnt++) {
1927         if (itr < (dim + 1) * size + input->begin<T>()) {
1928           T real = *(itr++);
1929           T img = *(itr++);
1930           stft_matrix(index, cnt) = std::complex<double>(real, img);
1931         }
1932       }
1933     }
1934     RETURN_IF_NOT_OK(ISTFT<T>(stft_matrix, &waveform, n_fft, hop_length, win_length, window, center, length));
1935     if (input->shape().Rank() == TWO) {
1936       // do not expand dim
1937       istft_results = waveform;
1938       continue;
1939     }
1940 
1941     if (istft_results != nullptr) {
1942       RETURN_IF_NOT_OK(istft_results->InsertTensor({dim}, waveform));
1943     } else {
1944       RETURN_IF_NOT_OK(
1945         Tensor::CreateEmpty(TensorShape({input_shape[0], waveform->shape()[0]}), waveform->type(), &istft_results));
1946       RETURN_IF_NOT_OK(istft_results->InsertTensor({dim}, waveform));
1947     }
1948   }
1949   *output = istft_results;
1950   return Status::OK();
1951 }
1952 
1953 template <typename T>
InverseSpectrogramImpl(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t length,int32_t n_fft,int32_t win_length,int32_t hop_length,int32_t pad,WindowType window,bool normalized,bool center,BorderType pad_mode,bool onesided)1954 Status InverseSpectrogramImpl(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t length,
1955                               int32_t n_fft, int32_t win_length, int32_t hop_length, int32_t pad, WindowType window,
1956                               bool normalized, bool center, BorderType pad_mode, bool onesided) {
1957   RETURN_UNEXPECTED_IF_NULL(output);
1958   // pack
1959   TensorShape input_shape = input->shape();
1960   std::vector output_shape = input_shape.AsVector();
1961   TensorShape input_reshape({input->Size() / input_shape[-1] / input_shape[-2] / input_shape[-3], input_shape[-3],
1962                              input_shape[-2], input_shape[-1]});
1963   RETURN_IF_NOT_OK(input->Reshape(input_reshape));
1964   // normalized
1965   std::shared_ptr<Tensor> normal_out;
1966   normal_out = input;
1967   if (normalized) {
1968     RETURN_IF_NOT_OK(Normalized<T>(input, &normal_out, n_fft, win_length, window));
1969   }
1970   // construct new tensor by input
1971   std::shared_ptr<Tensor> final_results;
1972   int32_t output_length = length == 0 ? (input_shape[-kDefaultAudioDim] - 1) * hop_length : length + TWO * pad;
1973   RETURN_IF_NOT_OK(
1974     Transistft<T>(normal_out, &final_results, output_length, n_fft, win_length, hop_length, window, center, onesided));
1975   // remove padding from the front and back
1976   TensorShape waveform_shape = final_results->shape();
1977   auto dim1_size = waveform_shape[-1];
1978   auto dim2_size = waveform_shape[-kDefaultAudioDim];
1979   auto cols = waveform_shape[-1] - TWO * pad;
1980   auto start = &*final_results->begin<T>();
1981   auto dim = dim1_size * dim2_size;
1982   std::shared_ptr<Tensor> waveform;
1983   if (length != 0 && (pad > 0)) {
1984     std::vector<T> result(dim2_size * cols);
1985     int32_t index = 0;
1986     for (auto i = 0; i < dim; i += dim1_size) {
1987       int32_t cnt = 0;
1988       for (auto itr = start + i + pad; cnt < cols; ++itr) {
1989         result[index] = *itr;
1990         ++index;
1991         ++cnt;
1992       }
1993     }
1994     TensorShape change_shape({dim2_size, cols});
1995     RETURN_IF_NOT_OK(Tensor::CreateFromVector(result, change_shape, &waveform));
1996   } else {
1997     waveform = final_results;
1998   }
1999   // unpack
2000   output_shape.pop_back();
2001   output_shape.pop_back();
2002   output_shape.pop_back();
2003   output_shape.push_back(waveform->shape()[-1]);
2004   RETURN_IF_NOT_OK(waveform->Reshape(TensorShape(output_shape)));
2005   *output = waveform;
2006   return Status::OK();
2007 }
2008 
InverseSpectrogram(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t length,int32_t n_fft,int32_t win_length,int32_t hop_length,int32_t pad,WindowType window,bool normalized,bool center,BorderType pad_mode,bool onesided)2009 Status InverseSpectrogram(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t length,
2010                           int32_t n_fft, int32_t win_length, int32_t hop_length, int32_t pad, WindowType window,
2011                           bool normalized, bool center, BorderType pad_mode, bool onesided) {
2012   RETURN_UNEXPECTED_IF_NULL(output);
2013   RETURN_IF_NOT_OK(ValidateTensorShape("InverseSpectrogram",
2014                                        input->shape().Size() > kDefaultAudioDim && input->IsComplex(),
2015                                        "<..., freq, time, complex=2>"));
2016   RETURN_IF_NOT_OK(ValidateTensorNumeric("InverseSpectrogram", input));
2017   if (input->type().value() != DataType::DE_FLOAT64) {
2018     RETURN_IF_NOT_OK(InverseSpectrogramImpl<float>(input, output, length, n_fft, win_length, hop_length, pad, window,
2019                                                    normalized, center, pad_mode, onesided));
2020   } else {
2021     RETURN_IF_NOT_OK(InverseSpectrogramImpl<double>(input, output, length, n_fft, win_length, hop_length, pad, window,
2022                                                     normalized, center, pad_mode, onesided));
2023   }
2024   return Status::OK();
2025 }
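
// Illustrative round trip (hypothetical values, paired with a Spectrogram call
// that used power = 0 so the complex result was kept):
//   std::shared_ptr<Tensor> wav_rec;
//   RETURN_IF_NOT_OK(InverseSpectrogram(spec, &wav_rec, 16000, 400, 400, 200, 0,
//                                       WindowType::kHann, false, true,
//                                       BorderType::kReflect, true));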
2026 
2027 template <typename T>
GriffinLimImpl(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t n_fft,int32_t n_iter,int32_t win_length,int32_t hop_length,WindowType window_type,float power,float momentum,int32_t length,bool rand_init,std::mt19937 * rnd)2028 Status GriffinLimImpl(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_fft,
2029                       int32_t n_iter, int32_t win_length, int32_t hop_length, WindowType window_type, float power,
2030                       float momentum, int32_t length, bool rand_init, std::mt19937 *rnd) {
2031   RETURN_IF_NOT_OK(Tensor::CreateFromTensor(input, output));
2032   // pack
2033   TensorShape shape = (*output)->shape();
2034   TensorShape new_shape({(*output)->Size() / shape[-1] / shape[-2], shape[-2], shape[-1]});
2035   RETURN_IF_NOT_OK((*output)->Reshape(new_shape));
2036   // power
2037   CHECK_FAIL_RETURN_UNEXPECTED(power != 0, "GriffinLim: power cannot be zero.");
2038   for (auto itr = (*output)->begin<T>(); itr != (*output)->end<T>(); itr++) {
2039     *itr = std::pow(*itr, 1 / power);
2040   }
2041   // window
2042   std::shared_ptr<Tensor> fft_window_tensor;
2043   RETURN_IF_NOT_OK(Window(&fft_window_tensor, window_type, win_length));
2044   if (win_length == 1) {
2045     RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({1}), DataType(DataType::DE_FLOAT32), &fft_window_tensor));
2046     auto win = fft_window_tensor->begin<float>();
2047     *(win) = 1;
2048   }
2049   std::shared_ptr<Tensor> final_results;
2050   for (int dim = 0; dim < new_shape[0]; dim++) {
2051     // init complex phase
2052     Eigen::MatrixXcd angles(shape[(-1) * TWO], shape[-1]);
2053     if (rand_init) {
2055       std::uniform_real_distribution<double> dist(0, 1);
2056       angles = angles.unaryExpr(
2057         [&dist, &rnd](std::complex<double> value) { return std::complex<double>(dist(*rnd), dist(*rnd)); });
2058     } else {
2059       angles = angles.unaryExpr([](std::complex<double> value) { return std::complex<double>(1, 0); });
2060     }
2061     // slice and squeeze the first dim
2062     std::shared_ptr<Tensor> spec_tensor_slice;
2063     RETURN_IF_NOT_OK((*output)->Slice(
2064       &spec_tensor_slice,
2065       std::vector<SliceOption>({SliceOption(std::vector<dsize_t>{dim}), SliceOption(true), SliceOption(true)})));
2066     TensorShape new_slice_shape({shape[-2], shape[-1]});
2067     RETURN_IF_NOT_OK(spec_tensor_slice->Reshape(new_slice_shape));
2068     // turn tensor into eigen MatrixXd
2069     auto data_ptr = &*spec_tensor_slice->begin<T>();
2070     Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> spec_matrix_transpose(data_ptr, shape[-1],
2071                                                                                        shape[(-1) * TWO]);
2072     Eigen::MatrixXd spec_matrix = spec_matrix_transpose.transpose().template cast<double>();
2073     auto stft_complex = angles.cwiseProduct(spec_matrix);
2074 
2075     // init tprev zero mat
2076     Eigen::MatrixXcd tprev(shape[(-1) * TWO], shape[-1]);
2077     Eigen::MatrixXcd rebuilt(shape[(-1) * TWO], shape[-1]);
2078     tprev.setZero();
2079     rebuilt.setZero();
2080     for (int iter = 0; iter < n_iter; iter++) {
2081       // istft
2082       std::shared_ptr<Tensor> inverse;
2083       RETURN_IF_NOT_OK(ISTFT<T>(stft_complex, &inverse, n_fft, hop_length, win_length, window_type, true, length));
2084       // stft
2085       std::shared_ptr<Tensor> stft_out;
2086       RETURN_IF_NOT_OK(SpectrogramImpl<T>(inverse, &stft_out, 0, window_type, n_fft, hop_length, win_length, 0, false,
2087                                           true, BorderType::kReflect, true));
2088 
2089       rebuilt.transposeInPlace();
2090       Tensor::TensorIterator<T> itr = stft_out->begin<T>();
2091       rebuilt = rebuilt.unaryExpr([&itr](std::complex<double> value) {
2092         T real = *(itr++);
2093         T img = *(itr++);
2094         return std::complex<double>(real, img);
2095       });
2096       rebuilt.transposeInPlace();
2097       angles = rebuilt.array();
2098 
2099       if (momentum != 0) {
2100         tprev = tprev * ((momentum) / (1 + momentum));
2101         angles = angles.array() - tprev.array();
2102       }
2103 
2104       float eps = 1e-16;
2105       auto angles_abs = angles.cwiseAbs().eval();
2106       angles_abs.array() += eps;
2107       angles = angles.array() / angles_abs.array();
2108       tprev = rebuilt.array();
2109     }
2110 
2111     // ISTFT with the final phase estimate
2112     auto stft_complex_fin = angles.cwiseProduct(spec_matrix);
2113     std::shared_ptr<Tensor> waveform;
2114     RETURN_IF_NOT_OK(ISTFT<T>(stft_complex_fin, &waveform, n_fft, hop_length, win_length, window_type, true, length));
2115 
2116     if (shape.Rank() == TWO) {
2117       // do not expand dim
2118       final_results = waveform;
2119       continue;
2120     }
2121 
2122     if (final_results != nullptr) {
2123       RETURN_IF_NOT_OK(final_results->InsertTensor({dim}, waveform));
2124     } else {
2125       RETURN_IF_NOT_OK(
2126         Tensor::CreateEmpty(TensorShape({new_shape[0], waveform->shape()[0]}), waveform->type(), &final_results));
2127       RETURN_IF_NOT_OK(final_results->InsertTensor({dim}, waveform));
2128     }
2129   }
2130   *output = final_results;
2131   return Status::OK();
2132 }
2133 
GriffinLim(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t n_fft,int32_t n_iter,int32_t win_length,int32_t hop_length,WindowType window_type,float power,float momentum,int32_t length,bool rand_init,std::mt19937 * rnd)2134 Status GriffinLim(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_fft, int32_t n_iter,
2135                   int32_t win_length, int32_t hop_length, WindowType window_type, float power, float momentum,
2136                   int32_t length, bool rand_init, std::mt19937 *rnd) {
2137   std::shared_ptr<Tensor> input_tensor;
2138   if (input->type() != DataType::DE_FLOAT64) {
2139     RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)));
2140     return GriffinLimImpl<float>(input_tensor, output, n_fft, n_iter, win_length, hop_length, window_type, power,
2141                                  momentum, length, rand_init, rnd);
2142   } else {
2143     input_tensor = input;
2144     return GriffinLimImpl<double>(input_tensor, output, n_fft, n_iter, win_length, hop_length, window_type, power,
2145                                   momentum, length, rand_init, rnd);
2146   }
2147 }
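
// GriffinLimImpl implements Griffin-Lim phase recovery with the momentum
// (fast Griffin-Lim) variant: starting from random or unit phase, each
// iteration runs ISTFT then STFT, keeps only the phase of the rebuilt spectrum
// (corrected by momentum against the previous iterate), and the final phase
// estimate is applied to the input magnitudes for one last ISTFT.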
2148 
2149 template <typename T>
InverseMelScaleImpl(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t n_stft,int32_t n_mels,int32_t sample_rate,float f_min,float f_max,int32_t max_iter,float tolerance_loss,float tolerance_change,float sgd_lr,float sgd_momentum,NormType norm,MelType mel_type,std::mt19937 * rnd)2150 Status InverseMelScaleImpl(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_stft,
2151                            int32_t n_mels, int32_t sample_rate, float f_min, float f_max, int32_t max_iter,
2152                            float tolerance_loss, float tolerance_change, float sgd_lr, float sgd_momentum,
2153                            NormType norm, MelType mel_type, std::mt19937 *rnd) {
2154   constexpr int32_t sample_rate_factor = 2;
2155   f_max = std::fabs(f_max) <= std::numeric_limits<float>::epsilon()
2156             ? static_cast<T>(std::floor(sample_rate / sample_rate_factor))
2157             : f_max;
2158   // create fb mat <freq, n_mels>
2159   std::shared_ptr<Tensor> freq_bin_mat;
2160   RETURN_IF_NOT_OK(CreateFbanks<T>(&freq_bin_mat, n_stft, f_min, f_max, n_mels, sample_rate, norm, mel_type));
2161 
2162   auto fb_ptr = &*freq_bin_mat->begin<float>();
2163   Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> matrix_fb(fb_ptr, n_mels, n_stft);
2164   // pack melspec <n, n_mels, time>
2165   TensorShape input_shape = input->shape();
2166   TensorShape input_reshape({input->Size() / input_shape[-1] / input_shape[-2], input_shape[-2], input_shape[-1]});
2167   RETURN_IF_NOT_OK(input->Reshape(input_reshape));
2168   CHECK_FAIL_RETURN_UNEXPECTED(n_mels == input_shape[-TWO],
2169                                "InverseMelScale: n_mels must be equal to the penultimate dimension of input.");
2170 
2171   int time = input_shape[-1];
2172   int freq = matrix_fb.cols();
2173   // input matrix 3d
2174   std::vector<T> specgram;
2175   // uniform distribution used for the random initialization
2176   std::uniform_real_distribution<T> dist(0, 1);
2177   for (int channel = 0; channel < input_reshape[0]; channel++) {
2178     // slice by first dimension
2179     auto data_ptr = &*input->begin<T>();
2180     Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> input_channel(data_ptr + time * n_mels * channel, time,
2181                                                                                n_mels);
2182     // init specgram at n=channel with uniform random values (the Zero matrix only supplies the shape)
2183     Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> mat_channel =
2184       Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>::Zero(time, freq).unaryExpr([&rnd, &dist](T dummy) { return dist(*rnd); });
2185     std::vector<T> vec_channel(mat_channel.data(), mat_channel.data() + mat_channel.size());
2186     std::shared_ptr<Tensor> param_channel;
2188     RETURN_IF_NOT_OK(Tensor::CreateFromVector(vec_channel, TensorShape({freq * time}), &param_channel));
2189     // sgd
2190     T loss = std::numeric_limits<T>::max();
2191     for (int epoch = 0; epoch < max_iter; epoch++) {
2192       auto pred = mat_channel * (matrix_fb.transpose().template cast<T>());
2193       // compute the loss between the prediction and the ground truth
2194       auto diff = input_channel - pred;
2195       T new_loss = diff.array().square().mean();
2196       // compute the gradient of the loss w.r.t. the current estimate
2197       auto grad = diff * (matrix_fb.template cast<T>()) * (-1) / time;
2198       Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> mat_grad = grad;
2199       std::vector<T> vec_grad(mat_grad.data(), mat_grad.data() + mat_grad.size());
2200       std::shared_ptr<Tensor> tensor_grad;
2201       RETURN_IF_NOT_OK(Tensor::CreateFromVector(vec_grad, TensorShape({grad.size()}), &tensor_grad));
2202 
2203       std::shared_ptr<Tensor> nspec;
2204       RETURN_IF_NOT_OK(SGD<T>(param_channel, &nspec, tensor_grad, sgd_lr, sgd_momentum));
2205 
2206       T diff_loss = std::abs(loss - new_loss);
2207       if ((new_loss < tolerance_loss) || (diff_loss < tolerance_change)) {
2208         break;
2209       }
2210       loss = new_loss;
2211       data_ptr = &*nspec->begin<T>();
2212       mat_channel = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>(data_ptr, time, freq);
2213       // use new mat_channel to update param_channel
2214       RETURN_IF_NOT_OK(Tensor::CreateFromTensor(nspec, &param_channel));
2215     }
2216     // clamp the estimate to non-negative values
2217     auto res = mat_channel.cwiseMax(0);
2218     Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> mat_res = res;
2219     std::vector<T> spec_channel(mat_res.data(), mat_res.data() + mat_res.size());
2220     specgram.insert(specgram.end(), spec_channel.begin(), spec_channel.end());
2221   }
2222   std::shared_ptr<Tensor> final_out;
2223   if (input_shape.Size() > TWO) {
2224     std::vector<int64_t> out_shape_vec = input_shape.AsVector();
2225     out_shape_vec[input_shape.Size() - 1] = time;
2226     out_shape_vec[input_shape.Size() - TWO] = freq;
2227     TensorShape output_shape(out_shape_vec);
2228     RETURN_IF_NOT_OK(Tensor::CreateFromVector(specgram, output_shape, &final_out));
2229   } else {
2230     TensorShape output_shape = TensorShape({input_reshape[0], freq, time});
2231     RETURN_IF_NOT_OK(Tensor::CreateFromVector(specgram, output_shape, &final_out));
2232   }
2233   *output = final_out;
2234   return Status::OK();
2235 }
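
// The epoch loop above solves, per channel, a non-negative least-squares problem:
//   minimize_X  mean((melspec - X * FB^T)^2),  X >= 0,
// via plain SGD with momentum (see SGD<T>), using the gradient computed above,
//   grad = -(melspec - X * FB^T) * FB / time,
// and deferring the non-negativity clamp (cwiseMax(0)) until after the last epoch.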
2236 
2237 Status InverseMelScale(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_stft,
2238                        int32_t n_mels, int32_t sample_rate, float f_min, float f_max, int32_t max_iter,
2239                        float tolerance_loss, float tolerance_change, float sgd_lr, float sgd_momentum, NormType norm,
2240                        MelType mel_type, std::mt19937 *rnd) {
2241   std::shared_ptr<Tensor> input_tensor;
2242   if (input->type() != DataType::DE_FLOAT64) {
2243     RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)));
2244     return InverseMelScaleImpl<float>(input_tensor, output, n_stft, n_mels, sample_rate, f_min, f_max, max_iter,
2245                                       tolerance_loss, tolerance_change, sgd_lr, sgd_momentum, norm, mel_type, rnd);
2246   } else {
2247     input_tensor = input;
2248     return InverseMelScaleImpl<double>(input_tensor, output, n_stft, n_mels, sample_rate, f_min, f_max, max_iter,
2249                                        tolerance_loss, tolerance_change, sgd_lr, sgd_momentum, norm, mel_type, rnd);
2250   }
2251 }
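
// Illustrative usage sketch (the `melspec` tensor and all values are example-only assumptions):
//   std::shared_ptr<Tensor> specgram;
//   std::mt19937 rnd(GetSeed());
//   RETURN_IF_NOT_OK(InverseMelScale(melspec, &specgram, /*n_stft=*/201, /*n_mels=*/128,
//                                    /*sample_rate=*/16000, /*f_min=*/0.0, /*f_max=*/8000.0,
//                                    /*max_iter=*/1000, /*tolerance_loss=*/1e-5,
//                                    /*tolerance_change=*/1e-8, /*sgd_lr=*/0.1,
//                                    /*sgd_momentum=*/0.9, NormType::kNone, MelType::kHtk, &rnd));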
2252 
2253 template <typename T>
2254 Status GetSincResampleKernel(const int32_t orig_freq, const int32_t des_freq, ResampleMethod resample_method,
2255                              int32_t lowpass_filter_width, float rolloff, float beta, DataType datatype,
2256                              Eigen::MatrixX<T> *kernel_ptr, int32_t *width) {
2257   CHECK_FAIL_RETURN_UNEXPECTED(orig_freq != 0, "Resample: invalid parameter, 'orig_freq' cannot be zero.");
2258   CHECK_FAIL_RETURN_UNEXPECTED(des_freq != 0, "Resample: invalid parameter, 'des_freq' cannot be zero.");
2259   float base_freq = static_cast<float>(std::min(orig_freq, des_freq));
2260   // Removing the highest frequencies to perform antialiasing filtering. This is
2261   // needed in both upsampling and downsampling
2262   base_freq *= rolloff;
2263   *width = static_cast<int32_t>(ceil(static_cast<float>(lowpass_filter_width * orig_freq) / base_freq));
2264   int32_t kernel_length = 2 * (*width) + orig_freq;
2265   Eigen::MatrixX<T> idx(1, kernel_length);
2266   auto idx_it = idx.data();
2267   for (int32_t i = -(*width); i < (*width + orig_freq); i++) {
2268     *idx_it = static_cast<T>(i) / orig_freq;
2269     ++idx_it;
2270   }
2271   Eigen::VectorX<T> filter(des_freq);
2272   auto filter_it = filter.data();
2273   for (int32_t i = 0; i > -des_freq; i--) {
2274     *filter_it = static_cast<T>(i) / des_freq;
2275     ++filter_it;
2276   }
2277   Eigen::MatrixX<T> kernel_matrix(des_freq, kernel_length);  // one row of taps per output phase
2278   kernel_matrix.noalias() = filter.replicate(1, kernel_length) + idx.replicate(des_freq, 1);
2279   kernel_matrix *= base_freq;
2280   // clamp
2281   kernel_matrix.noalias() = kernel_matrix.cwiseMin(lowpass_filter_width).cwiseMax(-lowpass_filter_width);
2282   Eigen::MatrixX<T> window;
2283   if (resample_method == ResampleMethod::kSincInterpolation) {
2284     window = kernel_matrix * PI / lowpass_filter_width / TWO;
2285     window.noalias() = window.unaryExpr([=](T x) -> T {
2286       T temp = cos(x);
2287       return temp * temp;
2288     });
2289   } else {
2290     // kaiser_window
2291 #ifdef __APPLE__
2292     RETURN_STATUS_ERROR(StatusCode::kMDNotImplementedYet,
2293                         "Resample: ResampleMethod of Kaiser Window is not "
2294                         "supported on MacOS yet.");
2295 #else
2296     {
2297       std::unique_lock<std::mutex> _lock(cyl_bessel_mux_);
2298       T beta_bessel = std::cyl_bessel_if(0, beta);
2299       window.noalias() = kernel_matrix.unaryExpr([=](T x) -> T {
2300         return std::cyl_bessel_i(0, beta * sqrt(1 - pow((x / lowpass_filter_width), TWO))) / beta_bessel;
2301       });
2302     }
2303 #endif
2304   }
2305   kernel_matrix *= PI;
2306   T scale = base_freq / orig_freq;
2307   kernel_matrix = kernel_matrix.unaryExpr([=](T x) -> T { return x == 0 ? scale : scale * (sin(x) / x); });
2308   kernel_matrix = kernel_matrix.cwiseProduct(window);
2309   *kernel_ptr = kernel_matrix;
2310   return Status::OK();
2311 }
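
// For reference, each tap produced above follows the classic windowed-sinc form
// (a sketch; t is the clamped time offset of output phase j against input sample i):
//   kernel(j, i) = scale * sinc(PI * base_freq * t) * window(t),
// where window(t) is a squared cosine (sinc interpolation) or an I0-normalized
// Kaiser window, and scale = base_freq / orig_freq compensates for the bandwidth reduction.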
2312 
2313 template <typename T>
2314 std::shared_ptr<Eigen::MatrixX<T>> Cov1dStride(const Eigen::MatrixX<T> &waveform_pad_matrix,
2315                                                const Eigen::MatrixX<T> &kernel, const int32_t num_waveform,
2316                                                const int32_t orig_freq, const int32_t target_length,
2317                                                const int32_t pad_length) {
2318   Eigen::MatrixX<T> output_matrix(target_length, num_waveform);
2319   Eigen::MatrixX<T> mul_matrix;
2320   const int32_t kernel_x = static_cast<int32_t>(kernel.rows());
2321   const int32_t kernel_y = static_cast<int32_t>(kernel.cols());
2322   const int32_t resample_num = static_cast<int32_t>(ceil(static_cast<float>(target_length) / kernel_x));
2323   Eigen::MatrixX<T> multi_input(kernel_y, resample_num);
2324   for (int32_t i = 0; i < num_waveform; i++) {
2325     for (int32_t j = 0, x_dim = 0; j + kernel_y < pad_length && x_dim < resample_num; j += orig_freq, ++x_dim) {
2326       multi_input.col(x_dim) = waveform_pad_matrix(Eigen::seqN(j, kernel_y), Eigen::seqN(i, 1));
2327     }
2328     mul_matrix.noalias() = kernel * multi_input;
2329     output_matrix.col(i) = Eigen::Map<Eigen::MatrixX<T>>(mul_matrix.data(), target_length, 1);
2330   }
2331   return std::make_shared<Eigen::MatrixX<T>>(output_matrix);
2332 }
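
// Cov1dStride realizes the strided 1-D convolution as a single matrix product:
// for each waveform it gathers every kernel-length window of the padded signal
// (stride = orig_freq) into a column of `multi_input`, so one GEMM,
// kernel * multi_input, evaluates all polyphase outputs of that waveform at once.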
2333 
2334 template <typename T>
2335 Status Resample(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float orig_freq, float des_freq,
2336                 ResampleMethod resample_method, int32_t lowpass_filter_width, float rolloff, float beta) {
2337   const TensorShape input_shape = input->shape();
2338   const int32_t waveform_length = input_shape[-1];
2339   const int32_t num_waveform = input->Size() / waveform_length;
2340   const TensorShape to_shape = TensorShape({num_waveform, waveform_length});
2341   RETURN_IF_NOT_OK(input->Reshape(to_shape));
2342 
2343   const int32_t gcd = std::gcd(static_cast<int32_t>(orig_freq), static_cast<int32_t>(des_freq));
2344   CHECK_FAIL_RETURN_UNEXPECTED(gcd != 0, "Resample: gcd cannot be equal to 0.");
2345   const int32_t orig_freq_prime = static_cast<int32_t>(floor(orig_freq / gcd));
2346   const int32_t des_freq_prime = static_cast<int32_t>(floor(des_freq / gcd));
2347   CHECK_FAIL_RETURN_UNEXPECTED(orig_freq_prime != 0, "Resample: invalid parameter, 'orig_freq_prime' cannot be zero.");
2348   Eigen::MatrixX<T> kernel;
2349   int32_t width = 0;
2350   RETURN_IF_NOT_OK(GetSincResampleKernel<T>(orig_freq_prime, des_freq_prime, resample_method, lowpass_filter_width,
2351                                             rolloff, beta, input->type(), &kernel, &width));
2352   const int32_t ZERO = 0;
2353   const int32_t kernel_rows = static_cast<int32_t>(kernel.rows());
2354   const int32_t kernel_cols = static_cast<int32_t>(kernel.cols());
2355   RETURN_IF_NOT_OK(ValidateGreaterThan("Resample", "kernel.cols()", kernel_cols, "boundary", ZERO));
2356   RETURN_IF_NOT_OK(ValidateGreaterThan("Resample", "kernel.rows()", kernel_rows, "boundary", ZERO));
2357 
2358   // padding
2359   const int32_t pad_length = waveform_length + 2 * width + orig_freq_prime;
2360   std::shared_ptr<Tensor> waveform_pad;
2361   RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({1, pad_length}), input->type(), &waveform_pad));
2362   RETURN_IF_NOT_OK(Pad<T>(input, &waveform_pad, width, width + orig_freq_prime, BorderType::kConstant));
2363   Eigen::MatrixX<T> waveform_pad_matrix =
2364     Eigen::Map<Eigen::MatrixX<T>>(&*waveform_pad->begin<T>(), waveform_pad->shape()[1], waveform_pad->shape()[0]);
2365   const int32_t target_length =
2366     static_cast<int32_t>(std::ceil(static_cast<double>(des_freq_prime * waveform_length) / orig_freq_prime));
2367 
2368   // conv1d with stride = orig_freq_prime
2369   std::shared_ptr<Eigen::MatrixX<T>> output_ptr =
2370     Cov1dStride<T>(waveform_pad_matrix, kernel, num_waveform, orig_freq_prime, target_length, pad_length);
2371   std::vector<dsize_t> shape_vec = input_shape.AsVector();
2372   shape_vec.at(shape_vec.size() - 1) = static_cast<dsize_t>(target_length);
2373   const auto output_data = reinterpret_cast<const uchar *>(output_ptr->data());
2374   RETURN_IF_NOT_OK(Tensor::CreateFromMemory(TensorShape(shape_vec), DataType::FromCType<T>(), output_data, output));
2375   return Status::OK();
2376 }
2377 
2378 Status Resample(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float orig_freq, float des_freq,
2379                 ResampleMethod resample_method, int32_t lowpass_filter_width, float rolloff, float beta) {
2380   RETURN_IF_NOT_OK(ValidateLowRank("Resample", input, kMinAudioDim, "<..., time>"));
2381   RETURN_IF_NOT_OK(ValidateTensorNumeric("Resample", input));
2382   if (input->type().value() == DataType::DE_FLOAT64) {
2383     RETURN_IF_NOT_OK(
2384       Resample<double>(input, output, orig_freq, des_freq, resample_method, lowpass_filter_width, rolloff, beta));
2385   } else {
2386     std::shared_ptr<Tensor> waveform;
2387     RETURN_IF_NOT_OK(TypeCast(input, &waveform, DataType(DataType::DE_FLOAT32)));
2388     RETURN_IF_NOT_OK(
2389       Resample<float>(waveform, output, orig_freq, des_freq, resample_method, lowpass_filter_width, rolloff, beta));
2390   }
2391   return Status::OK();
2392 }
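
// Illustrative usage sketch (the `audio` tensor and all values are example-only assumptions):
//   std::shared_ptr<Tensor> resampled;
//   RETURN_IF_NOT_OK(Resample(audio, &resampled, /*orig_freq=*/44100.0, /*des_freq=*/16000.0,
//                             ResampleMethod::kSincInterpolation, /*lowpass_filter_width=*/6,
//                             /*rolloff=*/0.99, /*beta=*/14.769656459379492));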
2393 
2394 Status LFCC(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
2395             int32_t n_filter, int32_t n_lfcc, int32_t dct_type, bool log_lf, int32_t n_fft, int32_t win_length,
2396             int32_t hop_length, float f_min, float f_max, int32_t pad, WindowType window, float power, bool normalized,
2397             bool center, BorderType pad_mode, bool onesided, NormMode norm) {
2398   RETURN_UNEXPECTED_IF_NULL(input);
2399   RETURN_UNEXPECTED_IF_NULL(output);
2400   std::shared_ptr<Tensor> spectrogram;
2401   std::shared_ptr<Tensor> filter_mat;
2402   std::shared_ptr<Tensor> dct_mat;
2403   RETURN_IF_NOT_OK(Spectrogram(input, &spectrogram, pad, window, n_fft, hop_length, win_length, power, normalized,
2404                                center, pad_mode, onesided));
2405   RETURN_IF_NOT_OK(
2406     CreateLinearFbanks(&filter_mat, static_cast<int32_t>(floor(n_fft / TWO)) + 1, f_min, f_max, n_filter, sample_rate));
2407   RETURN_IF_NOT_OK(Dct(&dct_mat, n_lfcc, n_filter, norm));
2408   std::shared_ptr<Tensor> spectrogramxfilter;
2409   std::shared_ptr<Tensor> specgram_temp;
2410 
2411   TensorShape specgram_shape = spectrogram->shape();
2412   TensorShape specgram_reshape(
2413     {spectrogram->Size() / specgram_shape[-1] / specgram_shape[-2], specgram_shape[-2], specgram_shape[-1]});
2414   RETURN_IF_NOT_OK(spectrogram->Reshape(specgram_reshape));
2415   auto filter_mat_ptr = &*filter_mat->begin<float>();
2416   Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> matrix_fm(
2417     filter_mat_ptr, n_filter, static_cast<int32_t>(floor(n_fft / TWO)) + 1);
2418   auto dct_mat_ptr = &*dct_mat->begin<float>();
2419   Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> matrix_dm(dct_mat_ptr, n_lfcc, n_filter);
2420 
2421   int rows = specgram_reshape[1];
2422   int cols = specgram_reshape[TWO];
2423 
2424   std::vector<float> sf_temp;
2425   Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> mat_res;
2426 
2427   for (int c = 0; c < specgram_reshape[0]; c++) {
2428     Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> matrix_c(
2429       &*spectrogram->begin<float>() + rows * cols * c, cols, rows);
2430     mat_res.noalias() = (matrix_c * matrix_fm.transpose());
2431     std::vector<float> vec_c(mat_res.data(), mat_res.data() + mat_res.size());
2432     sf_temp.insert(sf_temp.end(), vec_c.begin(), vec_c.end());
2433   }
2434   // unpack
2435   std::vector<int64_t> sf_shape_vec = specgram_shape.AsVector();
2436   sf_shape_vec[specgram_shape.Size() - 1] = cols;
2437   sf_shape_vec[specgram_shape.Size() - TWO] = n_filter;
2438   TensorShape sf_shape(sf_shape_vec);
2439   RETURN_IF_NOT_OK(Tensor::CreateFromVector(sf_temp, sf_shape, &spectrogramxfilter));
2440 
2441   if (log_lf) {
2442     float log_offset = 1e-6;
2443     for (auto itr = spectrogramxfilter->begin<float>(); itr != spectrogramxfilter->end<float>(); ++itr) {
2444       *itr = log(*itr + log_offset);
2445     }
2446     specgram_temp = spectrogramxfilter;
2447   } else {
2448     std::shared_ptr<Tensor> amplitude_to_db;
2449     float multiplier = 10.0;
2450     float db_multiplier = 0.0;
2451     float amin = 1e-10;
2452     float top_db = 80.0;
2453     RETURN_IF_NOT_OK(AmplitudeToDB(spectrogramxfilter, &amplitude_to_db, multiplier, amin, db_multiplier, top_db));
2454     specgram_temp = amplitude_to_db;
2455   }
2456 
2457   TensorShape st_shape = specgram_temp->shape();
2458   TensorShape st_reshape({specgram_temp->Size() / st_shape[-1] / st_shape[-2], st_shape[-2], st_shape[-1]});
2459   RETURN_IF_NOT_OK(specgram_temp->Reshape(st_reshape));
2460 
2461   rows = st_reshape[1];
2462   cols = st_reshape[TWO];
2463   std::vector<float> out_temp;
2464 
2465   for (int c = 0; c < st_reshape[0]; c++) {
2466     Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> matrix_c(
2467       &*specgram_temp->begin<float>() + rows * cols * c, cols, rows);
2468     mat_res.noalias() = (matrix_c * matrix_dm.transpose());
2469     std::vector<float> vec_c(mat_res.data(), mat_res.data() + mat_res.size());
2470     out_temp.insert(out_temp.end(), vec_c.begin(), vec_c.end());
2471   }
2472   // unpack
2473   std::vector<int64_t> output_shape_vec = st_shape.AsVector();
2474   output_shape_vec[st_shape.Size() - 1] = cols;
2475   output_shape_vec[st_shape.Size() - TWO] = n_lfcc;
2476   TensorShape output_shape(output_shape_vec);
2477   RETURN_IF_NOT_OK(Tensor::CreateFromVector(out_temp, output_shape, output));
2478 
2479   return Status::OK();
2480 }
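
// Illustrative usage sketch (the `audio` tensor and all values are example-only assumptions):
//   std::shared_ptr<Tensor> lfcc;
//   RETURN_IF_NOT_OK(LFCC(audio, &lfcc, /*sample_rate=*/16000, /*n_filter=*/128, /*n_lfcc=*/40,
//                         /*dct_type=*/2, /*log_lf=*/false, /*n_fft=*/400, /*win_length=*/400,
//                         /*hop_length=*/200, /*f_min=*/0.0, /*f_max=*/8000.0, /*pad=*/0,
//                         WindowType::kHann, /*power=*/2.0, /*normalized=*/false, /*center=*/true,
//                         BorderType::kReflect, /*onesided=*/true, NormMode::kOrtho));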
2481 
2482 Status MelSpectrogram(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
2483                       int32_t n_fft, int32_t win_length, int32_t hop_length, float f_min, float f_max, int32_t pad,
2484                       int32_t n_mels, WindowType window, float power, bool normalized, bool center, BorderType pad_mode,
2485                       bool onesided, NormType norm, MelType mel_scale) {
2486   RETURN_UNEXPECTED_IF_NULL(input);
2487   RETURN_UNEXPECTED_IF_NULL(output);
2488   auto input_shape_vec = input->shape().AsVector();
2489   CHECK_FAIL_RETURN_UNEXPECTED(n_fft < TWO * input_shape_vec[input_shape_vec.size() - 1],
2490                                "MelSpectrogram: n_fft should be less than twice the length of the last input dimension.");
2491   std::shared_ptr<Tensor> spectrogram;
2492   RETURN_IF_NOT_OK(Spectrogram(input, &spectrogram, pad, window, n_fft, hop_length, win_length, power, normalized,
2493                                center, pad_mode, onesided));
2494   RETURN_IF_NOT_OK(
2495     MelScale<float>(spectrogram, output, n_mels, sample_rate, f_min, f_max, n_fft / TWO + 1, norm, mel_scale));
2496   return Status::OK();
2497 }
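
// Illustrative usage sketch (the `audio` tensor and all values are example-only assumptions):
//   std::shared_ptr<Tensor> melspec;
//   RETURN_IF_NOT_OK(MelSpectrogram(audio, &melspec, /*sample_rate=*/16000, /*n_fft=*/400,
//                                   /*win_length=*/400, /*hop_length=*/200, /*f_min=*/0.0,
//                                   /*f_max=*/8000.0, /*pad=*/0, /*n_mels=*/128, WindowType::kHann,
//                                   /*power=*/2.0, /*normalized=*/false, /*center=*/true,
//                                   BorderType::kReflect, /*onesided=*/true, NormType::kNone,
//                                   MelType::kHtk));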
2498 
2499 Status MFCC(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate, int32_t n_mfcc,
2500             int32_t dct_type, bool log_mels, int32_t n_fft, int32_t win_length, int32_t hop_length, float f_min,
2501             float f_max, int32_t pad, int32_t n_mels, WindowType window, float power, bool normalized, bool center,
2502             BorderType pad_mode, bool onesided, NormType norm, NormMode norm_M, MelType mel_scale) {
2503   RETURN_UNEXPECTED_IF_NULL(input);
2504   RETURN_UNEXPECTED_IF_NULL(output);
2505   std::shared_ptr<Tensor> mel_spectrogram;
2506   std::shared_ptr<Tensor> dct_mat;
2507   RETURN_IF_NOT_OK(MelSpectrogram(input, &mel_spectrogram, sample_rate, n_fft, win_length, hop_length, f_min, f_max,
2508                                   pad, n_mels, window, power, normalized, center, pad_mode, onesided, norm, mel_scale));
2509   RETURN_IF_NOT_OK(Dct(&dct_mat, n_mfcc, n_mels, norm_M));
2510   if (log_mels) {
2511     float log_offset = 1e-6;
2512     for (auto itr = mel_spectrogram->begin<float>(); itr != mel_spectrogram->end<float>(); ++itr) {
2513       *itr = log(*itr + log_offset);
2514     }
2515   } else {
2516     std::shared_ptr<Tensor> amplitude_to_db;
2517     float multiplier = 10.0;
2518     float db_multiplier = 0.0;
2519     float amin = 1e-10;
2520     float top_db = 80.0;
2521     RETURN_IF_NOT_OK(AmplitudeToDB(mel_spectrogram, &amplitude_to_db, multiplier, amin, db_multiplier, top_db));
2522     mel_spectrogram = amplitude_to_db;
2523   }
2524   auto dct_mat_ptr = &*dct_mat->begin<float>();
2525   Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> mat_res;
2526   Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> matrix_dm(dct_mat_ptr, n_mfcc, n_mels);
2527   TensorShape st_shape = mel_spectrogram->shape();
2528   TensorShape st_reshape({mel_spectrogram->Size() / st_shape[-1] / st_shape[-2], st_shape[-2], st_shape[-1]});
2529   RETURN_IF_NOT_OK(mel_spectrogram->Reshape(st_reshape));
2530 
2531   const dsize_t kRowIndex = 1;
2532   const dsize_t kColIndex = 2;
2533   int rows = st_reshape[kRowIndex];
2534   int cols = st_reshape[kColIndex];
2535   std::vector<float> out_temp;
2536 
2537   for (int c = 0; c < st_reshape[0]; c++) {
2538     Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> matrix_c(
2539       &*mel_spectrogram->begin<float>() + rows * cols * c, cols, rows);
2540     mat_res.noalias() = (matrix_c * matrix_dm.transpose());
2541     std::vector<float> vec_c(mat_res.data(), mat_res.data() + mat_res.size());
2542     out_temp.insert(out_temp.end(), vec_c.begin(), vec_c.end());
2543   }
2544   // unpack
2545   std::vector<int64_t> output_shape_vec = st_shape.AsVector();
2546   output_shape_vec[st_shape.Size() - 1] = cols;
2547   output_shape_vec[st_shape.Size() - TWO] = n_mfcc;
2548   TensorShape output_shape(output_shape_vec);
2549   RETURN_IF_NOT_OK(Tensor::CreateFromVector(out_temp, output_shape, output));
2550 
2551   return Status::OK();
2552 }
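
// Illustrative usage sketch (the `audio` tensor and all values are example-only assumptions):
//   std::shared_ptr<Tensor> mfcc;
//   RETURN_IF_NOT_OK(MFCC(audio, &mfcc, /*sample_rate=*/16000, /*n_mfcc=*/40, /*dct_type=*/2,
//                         /*log_mels=*/false, /*n_fft=*/400, /*win_length=*/400, /*hop_length=*/200,
//                         /*f_min=*/0.0, /*f_max=*/8000.0, /*pad=*/0, /*n_mels=*/128,
//                         WindowType::kHann, /*power=*/2.0, /*normalized=*/false, /*center=*/true,
//                         BorderType::kReflect, /*onesided=*/true, NormType::kNone,
//                         NormMode::kOrtho, MelType::kHtk));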
2553 
2554 template <typename T>
2555 Status PitchShiftImpl(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
2556                       int32_t n_steps, int32_t bins_per_octave, int32_t n_fft, int32_t win_length, int32_t hop_length,
2557                       WindowType window) {
2558   DataType data_type = input->type();
2559   // pack batch
2560   TensorShape shape = input->shape();
2561   int input_len = shape[-1];
2562 
2563   RETURN_IF_NOT_OK(input->Reshape(TensorShape({input->Size() / input_len, input_len})));
2564   std::shared_ptr<Tensor> spec_f;
2565 
2566   RETURN_IF_NOT_OK(SpectrogramImpl<T>(input, &spec_f, 0, window, n_fft, hop_length, win_length, 0, false, true,
2567                                       BorderType::kReflect, true));
2568 
2569   std::shared_ptr<Tensor> phase_advance;
2570   int spec_f_length = spec_f->shape()[-3];
2571   RETURN_IF_NOT_OK(Linspace<T>(&phase_advance, 0, PI * hop_length, spec_f_length));
2573   RETURN_IF_NOT_OK(phase_advance->Reshape(TensorShape(phase_advance->shape().AppendDim(1))));
2574   std::shared_ptr<Tensor> spec_stretch;
2575   float a = static_cast<float>(-n_steps) / bins_per_octave;
2576   float rate = pow(2, a);
2577 
2578   RETURN_IF_NOT_OK(TimeStretch<T>(spec_f, &spec_stretch, rate, phase_advance));
2579 
2580   int ori_len = shape[-1];
2581   int len_stretch = round(ori_len / rate);
2582   std::shared_ptr<Tensor> final_results;
2583   TensorShape tshape = spec_stretch->shape();
2584   TensorShape new_shape({tshape[0], tshape[-3], spec_stretch->Size() / tshape[-3] / tshape[0]});
2585   RETURN_IF_NOT_OK(spec_stretch->Reshape(new_shape));
2587   Tensor::TensorIterator<T> itr = spec_stretch->begin<T>();
2588 
2589   for (int dim = 0; dim < new_shape[0]; dim++) {
2590     // slice and squeeze the first dim
2591     const int row = -3;
2592     const int col = -2;
2593     Eigen::MatrixXcd rebuilt(tshape[row], tshape[col]);
2594 
2595     for (int j = 0; j < tshape[row]; j++) {
2596       for (int i = 0; i < tshape[col]; i++) {
2597         // consume one interleaved (real, imaginary) pair per matrix cell
2598         T real = *(itr++);
2599         T img = *(itr++);
2600         rebuilt(j, i) = std::complex<double>(real, img);
2601       }
2602     }
2604 
2605     std::shared_ptr<Tensor> waveform;
2606 
2607     RETURN_IF_NOT_OK(ISTFT<T>(rebuilt, &waveform, n_fft, hop_length, win_length, window, true, len_stretch));
2608 
2609     if (spec_stretch->shape().Rank() == TWO) {
2610       // do not expand dim
2611       final_results = waveform;
2612       continue;
2613     }
2614 
2615     if (final_results == nullptr) {
2616       RETURN_IF_NOT_OK(
2617         Tensor::CreateEmpty(TensorShape({new_shape[0], waveform->shape()[0]}), waveform->type(), &final_results));
2618     }
2619     RETURN_IF_NOT_OK(final_results->InsertTensor({dim}, waveform));
2622   }
2623 
2624   std::shared_ptr<Tensor> waveform_shift;
2625   const int32_t lowpass_filter_width = 6;
2626   const float rolloff = 0.99;
2627   const int new_rate = static_cast<int>(sample_rate / rate);
2628   const float beta = 14.769656459379492;
2629 
2630   RETURN_IF_NOT_OK(Resample<T>(final_results, &waveform_shift, new_rate, sample_rate,
2631                                ResampleMethod::kSincInterpolation, lowpass_filter_width, rolloff, beta));
2632 
2633   int shift_len = waveform_shift->shape()[-1];
2634   std::shared_ptr<Tensor> waveform_end;
2635   if (shift_len > ori_len) {
2636     int32_t waveform_length = waveform_shift->shape()[-1];
2637     int32_t num_waveform = waveform_shift->Size() / waveform_length;
2638     auto ind = waveform_shift->begin<T>();
2639     for (int i = 0; i < num_waveform; i++) {
2640       std::shared_ptr<Tensor> wave_final;
2641       RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({ori_len}), input->type(), &wave_final));
2642       if (i != 0) { ind += shift_len - ori_len; }  // skip past the tail of the previous (longer) waveform
2643       for (auto atr = wave_final->begin<T>(); atr != wave_final->end<T>(); atr++) {
2644         *atr = *(ind++);
2645       }
2646       if (waveform_end == nullptr) {
2647         RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({num_waveform, ori_len}), wave_final->type(), &waveform_end));
2648       }
2649       RETURN_IF_NOT_OK(waveform_end->InsertTensor({i}, wave_final));
2652     }
2653   } else {
2654     RETURN_IF_NOT_OK(Pad<T>(waveform_shift, &waveform_end, 0, ori_len - shift_len, BorderType::kConstant));
2655   }
2656 
2657   auto output_shape = shape.AsVector();
2658   output_shape.pop_back();
2659   output_shape.push_back(waveform_end->shape()[-1]);
2660   RETURN_IF_NOT_OK(waveform_end->Reshape(TensorShape(output_shape)));
2661   *output = waveform_end;
2662   return Status::OK();
2663 }
2664 
2665 Status PitchShift(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
2666                   int32_t n_steps, int32_t bins_per_octave, int32_t n_fft, int32_t win_length, int32_t hop_length,
2667                   WindowType window) {
2668   RETURN_UNEXPECTED_IF_NULL(input);
2669   RETURN_UNEXPECTED_IF_NULL(output);
2670   RETURN_IF_NOT_OK(ValidateLowRank("PitchShift", input, kMinAudioDim, "<..., time>"));
2671   RETURN_IF_NOT_OK(ValidateTensorNumeric("PitchShift", input));
2672   if (input->type().value() == DataType::DE_FLOAT64) {
2673     return PitchShiftImpl<double>(input, output, sample_rate, n_steps, bins_per_octave, n_fft, win_length, hop_length,
2674                                   window);
2675   } else {
2676     std::shared_ptr<Tensor> waveform;
2677     RETURN_IF_NOT_OK(TypeCast(input, &waveform, DataType(DataType::DE_FLOAT32)));
2678     return PitchShiftImpl<float>(waveform, output, sample_rate, n_steps, bins_per_octave, n_fft, win_length, hop_length,
2679                                  window);
2680   }
2681 }
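
// Illustrative usage sketch (the `audio` tensor and all values are example-only assumptions):
//   std::shared_ptr<Tensor> shifted;
//   RETURN_IF_NOT_OK(PitchShift(audio, &shifted, /*sample_rate=*/16000, /*n_steps=*/4,
//                               /*bins_per_octave=*/12, /*n_fft=*/512, /*win_length=*/512,
//                               /*hop_length=*/128, WindowType::kHann));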
2682 }  // namespace dataset
2683 }  // namespace mindspore
2684