/**
 * Copyright 2021-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "minddata/dataset/audio/kernels/audio_utils.h"

#include <fstream>

#include "mindspore/core/base/float16.h"
#include "minddata/dataset/core/type_id.h"
#include "minddata/dataset/util/random.h"
#include "utils/file_utils.h"

namespace mindspore {
namespace dataset {
// global lock for the cyl_bessel_i and cyl_bessel_if functions
std::mutex cyl_bessel_mux_;

/// \brief Compute the number of parallel tasks and the per-task element count.
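/// \param[in] input_size - Total number of elements to process.
/// \param[in] block_size - Minimum number of elements that justifies one extra thread.
/// \param[out] task_num - Number of tasks to launch.
/// \param[out] once_compute_size - Number of elements each task processes, rounded up to an even value.
/// \return Status return code.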
Status CountThreadNums(size_t input_size, float block_size, size_t *task_num, size_t *once_compute_size) {
  CHECK_FAIL_RETURN_UNEXPECTED(block_size > 0, "Invalid data, the value of 'block_size' should be greater than 0.");
  size_t audio_thread_num = 4;
  size_t thread_num =
    input_size < block_size * audio_thread_num ? std::ceil(input_size / block_size) : audio_thread_num;
  *once_compute_size = (input_size + thread_num - 1) / thread_num;
  if (*once_compute_size % TWO == 1) {
    *once_compute_size += 1;
  }
  CHECK_FAIL_RETURN_UNEXPECTED((*once_compute_size) != 0,
                               "Invalid data, the value of 'once_compute_size' should not be 0, but got 0.");
  *task_num = input_size / (*once_compute_size);
  if (input_size % (*once_compute_size) != 0) {
    *task_num += 1;
  }
  return Status::OK();
}

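/// \brief Launch an audio computation in parallel; the final sub-range runs on the calling thread.
/// \param[in] task - Callable invoked as task(start, end, task_id, task_num) over the element range [start, end).
/// \param[in] input_size - Total number of elements to process.
/// \param[in] block_size - Minimum number of elements that justifies one extra thread.
/// \return Status return code.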
Status AudioParallelLaunch(const std::function<void(size_t, size_t, size_t, size_t)> &task, size_t input_size,
                           float block_size) {
  std::vector<std::thread> threads;
  size_t task_num = 0;
  size_t once_compute_size = 0;
  RETURN_IF_NOT_OK(CountThreadNums(input_size, block_size, &task_num, &once_compute_size));

  for (size_t i = 0; i < task_num - 1; i++) {
    size_t start = i * once_compute_size;
    size_t end = start + once_compute_size;
    threads.push_back(std::thread(task, start, end, i, task_num));
  }

  size_t start = (task_num - 1) * once_compute_size;
  size_t end = input_size;
  task(start, end, task_num - 1, task_num);

  for (auto &thread : threads) {
    thread.join();
  }
  return Status::OK();
}

/// \brief Calculate complex tensor angle.
/// \param[in] input - Input tensor, must be complex, <channel, freq, time, complex=2>.
/// \param[out] output - Complex tensor angle.
/// \return Status return code.
template <typename T>
Status ComplexAngle(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  // check complex
  RETURN_IF_NOT_OK(ValidateTensorShape("ComplexAngle", input->IsComplex(), "<..., complex=2>"));
  TensorShape input_shape = input->shape();
  TensorShape out_shape({input_shape[0], input_shape[1], input_shape[2]});
  std::vector<T> phase(input_shape[0] * input_shape[1] * input_shape[2]);
  size_t ind = 0;

  // each complex element occupies two consecutive values: (real, imag)
  for (auto itr = input->begin<T>(); itr != input->end<T>(); itr++, ind++) {
    auto x = (*itr);
    itr++;
    auto y = (*itr);
    phase[ind] = atan2(y, x);
  }

  std::shared_ptr<Tensor> out_t;
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(phase, out_shape, &out_t));
  phase.clear();
  phase.shrink_to_fit();
  *output = out_t;
  return Status::OK();
}

/// \brief Calculate complex tensor abs.
/// \param[in] input - Input tensor, must be complex, <channel, freq, time, complex=2>.
/// \param[out] output - Complex tensor abs.
/// \return Status return code.
template <typename T>
Status ComplexAbs(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
  // check complex
  RETURN_IF_NOT_OK(ValidateTensorShape("ComplexAbs", input->IsComplex(), "<..., complex=2>"));
  TensorShape input_shape = input->shape();
  TensorShape out_shape({input_shape[0], input_shape[1], input_shape[2]});
  std::vector<T> abs(input_shape[0] * input_shape[1] * input_shape[2]);
  size_t ind = 0;
  for (auto itr = input->begin<T>(); itr != input->end<T>(); itr++, ind++) {
    T x = (*itr);
    itr++;
    T y = (*itr);
    abs[ind] = sqrt(pow(y, 2) + pow(x, 2));
  }

  std::shared_ptr<Tensor> out_t;
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(abs, out_shape, &out_t));
  *output = out_t;
  return Status::OK();
}

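/// \brief Create a linear-frequency triangular filterbank, shape <n_freqs, n_filter>.
/// \param[out] output - Filterbank tensor.
/// \param[in] n_freqs - Number of frequency bins.
/// \param[in] f_min - Minimum frequency in Hz.
/// \param[in] f_max - Maximum frequency in Hz.
/// \param[in] n_filter - Number of linear filters.
/// \param[in] sample_rate - Sample rate of the audio in Hz.
/// \return Status return code.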
Status CreateLinearFbanks(std::shared_ptr<Tensor> *output, int32_t n_freqs, float f_min, float f_max, int32_t n_filter,
                          int32_t sample_rate) {
  // all_freqs is the full frequency grid used for filterbank construction.
  std::shared_ptr<Tensor> all_freqs;
  // the sampling frequency is at least twice the highest frequency of the signal.
  const double signal_times = 2;
  RETURN_IF_NOT_OK(Linspace<float>(&all_freqs, 0, sample_rate / signal_times, n_freqs));
  // f_pts is the sequence of filter band edges, evenly spaced in (f_min, f_max).
  std::shared_ptr<Tensor> f_pts;
  const int32_t bias = 2;
  RETURN_IF_NOT_OK(Linspace<float>(&f_pts, f_min, f_max, n_filter + bias));
  // create filterbank
  std::shared_ptr<Tensor> fb;
  RETURN_IF_NOT_OK(CreateTriangularFilterbank<float>(&fb, all_freqs, f_pts));
  *output = fb;
  return Status::OK();
}

/// \brief Reconstruct complex tensor from norm and angle.
/// \param[in] abs - The absolute value of the complex tensor.
/// \param[in] angle - The angle of the complex tensor.
/// \param[out] output - Complex tensor, <channel, freq, time, complex=2>.
/// \return Status return code.
template <typename T>
Status Polar(const std::shared_ptr<Tensor> &abs, const std::shared_ptr<Tensor> &angle,
             std::shared_ptr<Tensor> *output) {
  // check shape
  if (abs->shape() != angle->shape()) {
    std::string err_msg = "Polar: the shape of input tensor abs and angle should be the same, but got: abs " +
                          abs->shape().ToString() + " and angle " + angle->shape().ToString();
    LOG_AND_RETURN_STATUS_SYNTAX_ERROR(err_msg);
  }

  TensorShape input_shape = abs->shape();
  TensorShape out_shape({input_shape[0], input_shape[1], input_shape[2], 2});
  std::vector<T> complex_vec(input_shape[0] * input_shape[1] * input_shape[2] * 2);
  size_t ind = 0;
  auto itr_abs = abs->begin<T>();
  auto itr_angle = angle->begin<T>();

  for (; itr_abs != abs->end<T>(); itr_abs++, itr_angle++) {
    complex_vec[ind++] = cos(*itr_angle) * (*itr_abs);
    complex_vec[ind++] = sin(*itr_angle) * (*itr_abs);
  }

  std::shared_ptr<Tensor> out_t;
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(complex_vec, out_shape, &out_t));
  *output = out_t;
  return Status::OK();
}

/// \brief Pad complex tensor.
/// \param[in] input - The complex tensor.
/// \param[in] length - The length of padding.
/// \param[in] dim - The dim index for padding.
/// \param[out] output - Complex tensor, <channel, freq, time, complex=2>.
/// \return Status return code.
template <typename T>
Status PadComplexTensor(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int length,
                        size_t dim) {
  TensorShape input_shape = input->shape();
  std::vector<int64_t> pad_shape_vec = {input_shape[0], input_shape[1], input_shape[2], input_shape[3]};
  pad_shape_vec[dim] += static_cast<int64_t>(length);
  TensorShape input_shape_with_pad(pad_shape_vec);
  std::vector<T> in_vect(input_shape_with_pad[0] * input_shape_with_pad[1] * input_shape_with_pad[2] *
                         input_shape_with_pad[3]);
  auto itr_input = input->begin<T>();
  int64_t input_cnt = 0;
  /*lint -e{446} ind is modified in the body of the for loop */
  for (auto ind = 0; ind < static_cast<int>(in_vect.size()); ind++) {
    in_vect[ind] = (*itr_input);
    input_cnt = (input_cnt + 1) % (input_shape[2] * input_shape[3]);
    itr_input++;
    // the last dim of a complex tensor is 2, so the number of zeros to fill is 2 * length
    if (input_cnt == 0 && ind != 0) {
      for (int c = 0; c < length * 2; c++) {
        in_vect[++ind] = 0.0f;
      }
    }
  }
  std::shared_ptr<Tensor> out_t;
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(in_vect, input_shape_with_pad, &out_t));
  *output = out_t;
  return Status::OK();
}

/// \brief Calculate phase.
/// \param[in] angle_0 - The angle of the spectrogram slice at the current time steps.
/// \param[in] angle_1 - The angle of the spectrogram slice one frame ahead.
/// \param[in] phase_advance - The phase advance.
/// \param[in] phase_time0 - The phase at time 0.
/// \param[out] output - Phase tensor.
/// \return Status return code.
template <typename T>
Status Phase(const std::shared_ptr<Tensor> &angle_0, const std::shared_ptr<Tensor> &angle_1,
             const std::shared_ptr<Tensor> &phase_advance, const std::shared_ptr<Tensor> &phase_time0,
             std::shared_ptr<Tensor> *output) {
  TensorShape phase_shape = angle_0->shape();
  std::vector<T> phase(phase_shape[0] * phase_shape[1] * phase_shape[2]);
  auto itr_angle_0 = angle_0->begin<T>();
  auto itr_angle_1 = angle_1->begin<T>();
  auto itr_pa = phase_advance->begin<T>();
  for (int ind = 0, input_cnt = 0; itr_angle_0 != angle_0->end<T>(); itr_angle_0++, itr_angle_1++, ind++) {
    if (ind != 0 && ind % phase_shape[2] == 0) {
      itr_pa++;
      if (itr_pa == phase_advance->end<T>()) {
        itr_pa = phase_advance->begin<T>();
      }
      input_cnt++;
    }
    phase[ind] = (*itr_angle_1) - (*itr_angle_0) - (*itr_pa);
    phase[ind] = phase[ind] - 2 * PI * round(phase[ind] / (2 * PI)) + (*itr_pa);
  }

  // concat phase time 0
  size_t ind = 0;
  auto itr_p0 = phase_time0->begin<T>();
  (void)phase.insert(phase.begin(), (*itr_p0));
  itr_p0++;
  while (itr_p0 != phase_time0->end<T>()) {
    ind += phase_shape[2];
    phase[ind] = (*itr_p0);
    itr_p0++;
  }
  (void)phase.erase(phase.begin() + static_cast<int>(angle_0->Size()), phase.end());

  // calculate phase accumulation
  for (ind = 0; ind < static_cast<int64_t>(phase.size()); ind++) {
    if (ind % phase_shape[2] != 0) {
      phase[ind] = phase[ind] + phase[ind - 1];
    }
  }
  std::shared_ptr<Tensor> phase_tensor;
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(phase, phase_shape, &phase_tensor));
  *output = phase_tensor;
  return Status::OK();
}

/// \brief Calculate magnitude.
/// \param[in] abs_0 - The norm of the spectrogram slice at the current time steps.
/// \param[in] abs_1 - The norm of the spectrogram slice one frame ahead.
/// \param[out] output - Magnitude tensor.
/// \param[in] alphas - The interpolation weights between the two norms.
/// \return Status return code.
template <typename T>
Status Mag(const std::shared_ptr<Tensor> &abs_0, const std::shared_ptr<Tensor> &abs_1, std::shared_ptr<Tensor> *output,
           const std::vector<T> &alphas) {
  TensorShape mag_shape = abs_0->shape();
  std::vector<T> mag(mag_shape[0] * mag_shape[1] * mag_shape[2]);
  auto itr_abs_0 = abs_0->begin<T>();
  auto itr_abs_1 = abs_1->begin<T>();
  for (auto ind = 0; itr_abs_0 != abs_0->end<T>(); itr_abs_0++, itr_abs_1++, ind++) {
    mag[ind] = alphas[ind % mag_shape[2]] * (*itr_abs_1) + (1 - alphas[ind % mag_shape[2]]) * (*itr_abs_0);
  }
  std::shared_ptr<Tensor> mag_tensor;
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(mag, mag_shape, &mag_tensor));
  *output = mag_tensor;
  return Status::OK();
}

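/// \brief Stretch an STFT in time at a given rate, without changing the pitch.
/// \param[in] input - Complex tensor of shape <..., freq, time, complex=2>.
/// \param[out] output - Time-stretched complex tensor.
/// \param[in] rate - Stretch factor; values greater than 1 speed up, values less than 1 slow down.
/// \param[in] phase_advance - Expected phase advance per hop for each frequency bin.
/// \return Status return code.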
template <typename T>
Status TimeStretch(std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, float rate,
                   const std::shared_ptr<Tensor> &phase_advance) {
  RETURN_UNEXPECTED_IF_NULL(input);
  RETURN_UNEXPECTED_IF_NULL(output);
  // pack <..., freq, time, complex>
  TensorShape input_shape = input->shape();
  TensorShape to_shape({input->Size() / (input_shape[-1] * input_shape[-2] * input_shape[-3]), input_shape[-3],
                        input_shape[-2], input_shape[-1]});
  RETURN_IF_NOT_OK(input->Reshape(to_shape));
  if (std::fabs(rate - 1.0) <= std::numeric_limits<float>::epsilon()) {
    *output = input;
    return Status::OK();
  }
  // calculate time steps and alphas
  std::vector<dsize_t> time_steps_0, time_steps_1;
  std::vector<T> alphas;
  for (int ind = 0;; ind++) {
    T val = static_cast<T>(ind) * rate;
    if (val >= input_shape[-2]) {
      break;
    }
    auto val_int = static_cast<dsize_t>(val);
    time_steps_0.push_back(val_int);
    time_steps_1.push_back(val_int + 1);
    alphas.push_back(fmod(val, 1));
  }

  // calculate phase on time 0
  std::shared_ptr<Tensor> spec_time0, phase_time0;
  RETURN_IF_NOT_OK(
    input->Slice(&spec_time0, std::vector<SliceOption>({SliceOption(true), SliceOption(true),
                                                        SliceOption(std::vector<dsize_t>{0}), SliceOption(true)})));
  RETURN_IF_NOT_OK(ComplexAngle<T>(spec_time0, &phase_time0));

  // time pad: add zero to time dim
  RETURN_IF_NOT_OK(PadComplexTensor<T>(input, &input, 2, 2));

  // slice
  std::shared_ptr<Tensor> spec_0;
  RETURN_IF_NOT_OK(input->Slice(&spec_0, std::vector<SliceOption>({SliceOption(true), SliceOption(true),
                                                                   SliceOption(time_steps_0), SliceOption(true)})));
  std::shared_ptr<Tensor> spec_1;
  RETURN_IF_NOT_OK(input->Slice(&spec_1, std::vector<SliceOption>({SliceOption(true), SliceOption(true),
                                                                   SliceOption(time_steps_1), SliceOption(true)})));

  // new slices angle and abs <channel, freq, time>
  std::shared_ptr<Tensor> angle_0, angle_1, abs_0, abs_1;
  RETURN_IF_NOT_OK(ComplexAngle<T>(spec_0, &angle_0));
  RETURN_IF_NOT_OK(ComplexAbs<T>(spec_0, &abs_0));
  RETURN_IF_NOT_OK(ComplexAngle<T>(spec_1, &angle_1));
  RETURN_IF_NOT_OK(ComplexAbs<T>(spec_1, &abs_1));

  // calculate phase; there is some precision difference between MindSpore and PyTorch here
  std::shared_ptr<Tensor> phase_tensor;
  RETURN_IF_NOT_OK(Phase<T>(angle_0, angle_1, phase_advance, phase_time0, &phase_tensor));

  // calculate magnitude
  std::shared_ptr<Tensor> mag_tensor;
  RETURN_IF_NOT_OK(Mag<T>(abs_0, abs_1, &mag_tensor, alphas));

  // reconstruct complex from norm and angle
  std::shared_ptr<Tensor> complex_spec_stretch;
  RETURN_IF_NOT_OK(Polar<T>(mag_tensor, phase_tensor, &complex_spec_stretch));

  // unpack
  auto output_shape_vec = input_shape.AsVector();
  output_shape_vec.pop_back();
  output_shape_vec.pop_back();
  output_shape_vec.push_back(complex_spec_stretch->shape()[-2]);
  output_shape_vec.push_back(input_shape[-1]);
  RETURN_IF_NOT_OK(complex_spec_stretch->Reshape(TensorShape(output_shape_vec)));
  *output = complex_spec_stretch;
  return Status::OK();
}

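/// \brief Dispatch TimeStretch on the input data type, computing the phase advance from hop_length and n_freq.
/// \param[in] input - Complex tensor of shape <..., freq, time, complex=2>.
/// \param[out] output - Time-stretched complex tensor.
/// \param[in] rate - Stretch factor.
/// \param[in] hop_length - Length of hop between STFT windows.
/// \param[in] n_freq - Number of frequency bins.
/// \return Status return code.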
Status TimeStretch(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float rate, float hop_length,
                   int32_t n_freq) {
  RETURN_UNEXPECTED_IF_NULL(input);
  RETURN_UNEXPECTED_IF_NULL(output);
  std::shared_ptr<Tensor> phase_advance;
  switch (input->type().value()) {
    case DataType::DE_FLOAT32:
      RETURN_IF_NOT_OK(Linspace<float>(&phase_advance, 0, PI * hop_length, n_freq));
      RETURN_IF_NOT_OK(TimeStretch<float>(input, output, rate, phase_advance));
      break;
    case DataType::DE_FLOAT64:
      RETURN_IF_NOT_OK(Linspace<double>(&phase_advance, 0, PI * hop_length, n_freq));
      RETURN_IF_NOT_OK(TimeStretch<double>(input, output, rate, phase_advance));
      break;
    default:
      RETURN_IF_NOT_OK(ValidateTensorFloat("TimeStretch", input));
  }
  return Status::OK();
}

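/// \brief Given an STFT tensor, speed up in time without modifying pitch by a factor of rate.
/// \param[in] input - Complex tensor of shape <..., freq, num_frame, complex=2>.
/// \param[out] output - Stretched complex tensor.
/// \param[in] rate - Speed-up factor.
/// \param[in] phase_advance - Expected phase advance per hop for each frequency bin; must match the input data type.
/// \return Status return code.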
Status PhaseVocoder(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float rate,
                    const std::shared_ptr<Tensor> &phase_advance) {
  const int32_t kFrequencePosInComplex = -3;
  RETURN_IF_NOT_OK(ValidateTensorShape("PhaseVocoder", input->shape().Size() > kDefaultAudioDim && input->IsComplex(),
                                       "<..., freq, num_frame, complex=2>"));
  RETURN_IF_NOT_OK(ValidateTensorNumeric("PhaseVocoder", input));
  RETURN_IF_NOT_OK(ValidateEqual("PhaseVocoder", "first dimension of 'phase_advance'", phase_advance->shape()[0],
                                 "freq dimension length of input tensor", input->shape()[kFrequencePosInComplex]));
  CHECK_FAIL_RETURN_UNEXPECTED(phase_advance->type() == input->type(),
                               "PhaseVocoder: invalid parameter, data type of phase_advance should be equal to data "
                               "type of input tensor, but got: data type of phase_advance " +
                                 phase_advance->type().ToString() + " while data type of input tensor " +
                                 input->type().ToString() + ".");
  std::shared_ptr<Tensor> input_tensor;
  if (input->type().value() != DataType::DE_FLOAT64) {
    RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)));
    RETURN_IF_NOT_OK(TimeStretch<float>(input_tensor, output, rate, phase_advance));
  } else {
    RETURN_IF_NOT_OK(TimeStretch<double>(input, output, rate, phase_advance));
  }
  return Status::OK();
}

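/// \brief Create a DCT-II transformation matrix of shape <n_mels, n_mfcc>.
/// \param[out] output - The DCT matrix.
/// \param[in] n_mfcc - Number of MFC coefficients to retain.
/// \param[in] n_mels - Number of mel filterbanks.
/// \param[in] norm - Norm to use, kNone or kOrtho.
/// \return Status return code.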
Status Dct(std::shared_ptr<Tensor> *output, int n_mfcc, int n_mels, NormMode norm) {
  TensorShape dct_shape({n_mels, n_mfcc});
  RETURN_IF_NOT_OK(Tensor::CreateEmpty(dct_shape, DataType(DataType::DE_FLOAT32), output));
  auto iter = (*output)->begin<float>();
  const float sqrt_2 = 1 / sqrt(2.0f);
  auto sqrt_2_n_mels = static_cast<float>(sqrt(2.0 / n_mels));
  for (int i = 0; i < n_mels; i++) {
    for (int j = 0; j < n_mfcc; j++) {
      // calculate temp:
      // 1. when norm is None, use 2 * cos(PI * (i + 0.5) * j / n_mels)
      // 2. when norm is Ortho, divide the first column by sqrt(2),
      //    then use sqrt(2.0 / n_mels) * cos(PI * (i + 0.5) * j / n_mels)
      auto temp = static_cast<float>(PI / n_mels * (i + 0.5) * j);
      temp = cos(temp);
      if (norm == NormMode::kOrtho) {
        if (j == 0) {
          temp *= sqrt_2;
        }
        temp *= sqrt_2_n_mels;
      } else {
        temp *= 2;
      }
      (*iter++) = temp;
    }
  }
  return Status::OK();
}

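/// \brief Apply a mask of random width at a random position along the given axis.
/// \param[in] input - Tensor of shape <..., freq, time>.
/// \param[out] output - Masked tensor.
/// \param[in] mask_param - Upper bound (inclusive) for the randomly drawn mask width.
/// \param[in] mask_value - Value to assign to the masked elements.
/// \param[in] axis - Axis to apply masking on (1 for frequency, 2 for time).
/// \param[in] rnd - Random generator.
/// \return Status return code.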
Status RandomMaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t mask_param,
                           float mask_value, int axis, std::mt19937 *rnd) {
  std::uniform_int_distribution<int32_t> mask_width_value(0, mask_param);
  TensorShape input_shape = input->shape();
  int32_t mask_dim_size = axis == 1 ? input_shape[-2] : input_shape[-1];
  int32_t mask_width = mask_width_value(*rnd);
  std::uniform_int_distribution<int32_t> min_freq_value(0, mask_dim_size - mask_width);
  int32_t mask_start = min_freq_value(*rnd);

  return MaskAlongAxis(input, output, mask_width, mask_start, mask_value, axis);
}

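/// \brief Apply a mask along the given axis.
/// \param[in] input - Tensor of shape <..., freq, time>.
/// \param[out] output - Masked tensor.
/// \param[in] mask_width - Width of the mask.
/// \param[in] mask_start - Starting position of the mask.
/// \param[in] mask_value - Value to assign to the masked elements.
/// \param[in] axis - Axis to apply masking on (1 for frequency, 2 for time).
/// \return Status return code.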
Status MaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t mask_width,
                     int32_t mask_start, float mask_value, int32_t axis) {
  RETURN_IF_NOT_OK(Tensor::CreateFromTensor(input, output));
  if (mask_width == 0) {
    return Status::OK();
  }
  if (axis != 2 && axis != 1) {
    LOG_AND_RETURN_STATUS_SYNTAX_ERROR(
      "MaskAlongAxis: invalid parameter, 'axis' can only be 1 for Frequency Masking or 2 for Time Masking.");
  }
  TensorShape input_shape = (*output)->shape();
  // squeeze output
  TensorShape squeeze_shape =
    TensorShape({input_shape.NumOfElements() / input_shape[-2] / input_shape[-1], input_shape[-2], input_shape[-1]});
  RETURN_IF_NOT_OK((*output)->Reshape(squeeze_shape));

  int check_dim_ind = (axis == 1) ? -2 : -1;
  CHECK_FAIL_RETURN_SYNTAX_ERROR(mask_start >= 0 && mask_start <= input_shape[check_dim_ind],
                                 "MaskAlongAxis: invalid parameter, 'mask_start' should be less than the length of the "
                                 "masked dimension, but got: 'mask_start' " +
                                   std::to_string(mask_start) + " and length " +
                                   std::to_string(input_shape[check_dim_ind]));
  CHECK_FAIL_RETURN_SYNTAX_ERROR(
    mask_start + mask_width <= input_shape[check_dim_ind],
    "MaskAlongAxis: invalid parameter, the sum of 'mask_start' and 'mask_width' should be no more "
    "than the length of the masked dimension, but got: 'mask_start' " +
      std::to_string(mask_start) + ", 'mask_width' " + std::to_string(mask_width) + " and length " +
      std::to_string(input_shape[check_dim_ind]));

  size_t cell_size = (*output)->type().SizeInBytes();

  if (axis == 1) {
    // freq
    const auto kShapeIndex = -2;
    for (auto ind = 0; ind < (*output)->Size() / input_shape[kShapeIndex] * mask_width; ind++) {
      int block_num = ind / (mask_width * input_shape[-1]);
      auto start_pos = ind % (mask_width * input_shape[-1]) + mask_start * input_shape[-1] +
                       input_shape[-1] * input_shape[kShapeIndex] * block_num;
      auto start_mem_pos = const_cast<uchar *>((*output)->GetBuffer() + start_pos * cell_size);
      if ((*output)->type() != DataType::DE_FLOAT64) {
        // tensor float 32
        auto mask_val = static_cast<float>(mask_value);
        auto ret_code = memcpy_s(start_mem_pos, cell_size, &mask_val, cell_size);
        CHECK_FAIL_RETURN_UNEXPECTED(ret_code == EOK, "MaskAlongAxis: mask failed, memory copy error, ret code: " +
                                                        std::to_string(ret_code) + ".");
      } else {
        // tensor float 64
        auto mask_val = static_cast<double>(mask_value);
        auto ret_code = memcpy_s(start_mem_pos, cell_size, &mask_val, cell_size);
        CHECK_FAIL_RETURN_UNEXPECTED(ret_code == EOK, "MaskAlongAxis: mask failed, memory copy error, ret code: " +
                                                        std::to_string(ret_code) + ".");
      }
    }
  } else {
    // time
    for (int ind = 0; ind < (*output)->Size() / input_shape[-1] * mask_width; ind++) {
      int row_num = ind / mask_width;
      auto start_pos = ind % mask_width + mask_start + input_shape[-1] * row_num;
      auto start_mem_pos = const_cast<uchar *>((*output)->GetBuffer() + start_pos * cell_size);
      if ((*output)->type() != DataType::DE_FLOAT64) {
        // tensor float 32
        auto mask_val = static_cast<float>(mask_value);
        auto ret_code = memcpy_s(start_mem_pos, cell_size, &mask_val, cell_size);
        CHECK_FAIL_RETURN_UNEXPECTED(ret_code == EOK, "MaskAlongAxis: mask failed, memory copy error, ret code: " +
                                                        std::to_string(ret_code) + ".");
      } else {
        // tensor float 64
        auto mask_val = static_cast<double>(mask_value);
        auto ret_code = memcpy_s(start_mem_pos, cell_size, &mask_val, cell_size);
        CHECK_FAIL_RETURN_UNEXPECTED(ret_code == EOK, "MaskAlongAxis: mask failed, memory copy error, ret code: " +
                                                        std::to_string(ret_code) + ".");
      }
    }
  }
  // unsqueeze output
  RETURN_IF_NOT_OK((*output)->Reshape(input_shape));
  return Status::OK();
}

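/// \brief Compute the norm of a complex tensor, pow(real^2 + imag^2, 0.5 * power), in parallel.
/// \param[in] input - Complex tensor of shape <..., complex=2>.
/// \param[out] output - Norm tensor with the complex dim reduced.
/// \param[in] power - Power of the norm.
/// \return Status return code.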
template <typename T>
Status Norm(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float power) {
  // calculate the output dimension
  auto input_size = input->shape().AsVector();
  auto dim_back = input_size.back();
  RETURN_IF_NOT_OK(
    ValidateTensorShape("ComplexNorm", input->IsComplex(), "<..., complex=2>", std::to_string(dim_back)));
  input_size.pop_back();
  std::shared_ptr<Tensor> out;
  RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape(input_size), input->type(), &out));

  size_t count = input->Size();
  float block_size = 30000;

  auto task = [&input, &out, &power, &count](size_t start, size_t end, size_t num, size_t task_num) {
    // the last task picks up any remaining elements
    if ((task_num - num) == 1) {
      end = count;
    }

    auto itr_start = input->begin<T>();
    itr_start += start;
    auto itr_end = input->begin<T>();
    itr_end += end;
    auto itr_out = out->begin<T>();
    size_t offset = start / TWO;
    itr_out += offset;

    // calculate norm, using: .pow(2.).sum(-1).pow(0.5 * power)
    for (auto itr = itr_start; itr != itr_end; itr++) {
      auto a = *itr;
      ++itr;
      auto b = *itr;
      auto res = pow(a, TWO) + pow(b, TWO);
      *itr_out = static_cast<T>(pow(res, (HALF * power)));
      itr_out++;
    }
  };
  RETURN_IF_NOT_OK(AudioParallelLaunch(task, count, block_size));
  *output = out;

  return Status::OK();
}

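/// \brief Dispatch Norm on the input data type, casting bool through float16 inputs to float32.
/// \param[in] input - Complex tensor of shape <..., complex=2>.
/// \param[out] output - Norm tensor.
/// \param[in] power - Power of the norm.
/// \return Status return code.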
Status ComplexNorm(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float power) {
  if (input->type().value() >= DataType::DE_BOOL && input->type().value() <= DataType::DE_FLOAT16) {
    // convert the data type to float
    std::shared_ptr<Tensor> input_tensor;
    RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)));

    RETURN_IF_NOT_OK(Norm<float>(input_tensor, output, power));
  } else if (input->type().value() == DataType::DE_FLOAT32) {
    RETURN_IF_NOT_OK(Norm<float>(input, output, power));
  } else if (input->type().value() == DataType::DE_FLOAT64) {
    RETURN_IF_NOT_OK(Norm<double>(input, output, power));
  } else {
    RETURN_IF_NOT_OK(ValidateTensorNumeric("ComplexNorm", input));
  }
  return Status::OK();
}

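// Return the sign of val: 1.0 for positive, -1.0 for negative, 0.0 for zero.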
template <typename T>
inline float sgn(T val) {
  return (val > 0) ? 1 : ((val < 0) ? -1 : 0);
}

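/// \brief Expand mu-law encoded values back to waveform values in [-1, 1].
/// \param[in] input - Tensor of mu-law encoded values.
/// \param[out] output - Decoded tensor.
/// \param[in] mu - Number of quantization channels minus one; must be non-zero.
/// \return Status return code.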
template <typename T>
Status Decoding(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T mu) {
  RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), input->type(), output));
  auto itr_out = (*output)->begin<T>();
  auto itr = input->begin<T>();
  auto end = input->end<T>();

  while (itr != end) {
    auto x_mu = *itr;
    CHECK_FAIL_RETURN_SYNTAX_ERROR(mu != 0, "Decoding: invalid parameter, 'mu' can not be zero.");
    x_mu = ((x_mu) / mu) * 2 - 1.0;
    x_mu = sgn(x_mu) * expm1(fabs(x_mu) * log1p(mu)) / mu;
    *itr_out = x_mu;
    ++itr_out;
    ++itr;
  }
  return Status::OK();
}

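/// \brief Decode a mu-law encoded signal, dispatching on the input data type.
/// \param[in] input - Tensor of mu-law encoded values.
/// \param[out] output - Decoded tensor.
/// \param[in] quantization_channels - Number of channels; mu equals quantization_channels - 1.
/// \return Status return code.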
Status MuLawDecoding(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output,
                     int32_t quantization_channels) {
  if (input->type() != DataType::DE_FLOAT64) {
    float f_mu = static_cast<float>(quantization_channels) - 1;
    // convert the data type to float
    std::shared_ptr<Tensor> input_tensor;
    RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)));
    RETURN_IF_NOT_OK(Decoding<float>(input_tensor, output, f_mu));
  } else {
    double f_mu = static_cast<double>(quantization_channels) - 1;
    RETURN_IF_NOT_OK(Decoding<double>(input, output, f_mu));
  }
  return Status::OK();
}

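/// \brief Compress waveform values in [-1, 1] to mu-law encoded integers in [0, mu].
/// \param[in] input - Tensor of waveform values.
/// \param[out] output - Encoded int32 tensor.
/// \param[in] mu - Number of quantization channels minus one.
/// \return Status return code.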
template <typename T>
Status Encoding(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T mu) {
  RETURN_IF_NOT_OK(Tensor::CreateEmpty(input->shape(), DataType(DataType::DE_INT32), output));
  auto itr_out = (*output)->begin<int32_t>();
  auto itr = input->begin<T>();
  auto end = input->end<T>();

  while (itr != end) {
    auto x = *itr;
    x = sgn(x) * log1p(mu * fabs(x)) / log1p(mu);
    x = (x + 1) / 2 * mu + 0.5;
    *itr_out = static_cast<int32_t>(x);
    ++itr_out;
    ++itr;
  }
  return Status::OK();
}

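/// \brief Encode a signal based on mu-law companding, dispatching on the input data type.
/// \param[in] input - Tensor of waveform values.
/// \param[out] output - Encoded int32 tensor.
/// \param[in] quantization_channels - Number of channels; mu equals quantization_channels - 1.
/// \return Status return code.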
Status MuLawEncoding(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output,
                     int32_t quantization_channels) {
  if (input->type() != DataType::DE_FLOAT64) {
    float f_mu = static_cast<float>(quantization_channels) - 1;
    // convert the data type to float
    std::shared_ptr<Tensor> input_tensor;
    RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)));
    RETURN_IF_NOT_OK(Encoding<float>(input_tensor, output, f_mu));
  } else {
    double f_mu = static_cast<double>(quantization_channels) - 1;
    RETURN_IF_NOT_OK(Encoding<double>(input, output, f_mu));
  }
  return Status::OK();
}

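/// \brief Generate a fade-in envelope of the given shape, rising from 0 to 1.
/// \param[out] output - Envelope tensor of shape <fade_in_len>.
/// \param[in] fade_in_len - Length of the fade-in in samples.
/// \param[in] fade_shape - Shape of the fade curve.
/// \return Status return code.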
template <typename T>
Status FadeIn(std::shared_ptr<Tensor> *output, int32_t fade_in_len, FadeShape fade_shape) {
  T start = 0;
  T end = 1;
  RETURN_IF_NOT_OK(Linspace<T>(output, start, end, fade_in_len));
  for (auto iter = (*output)->begin<T>(); iter != (*output)->end<T>(); iter++) {
    switch (fade_shape) {
      case FadeShape::kLinear:
        break;
      case FadeShape::kExponential:
        // Compute the scale factor of the exponential function, pow(2.0, *iter - 1.0) * (*iter)
        *iter = static_cast<T>(std::pow(2.0, *iter - 1.0) * (*iter));
        break;
      case FadeShape::kLogarithmic:
        // Compute the scale factor of the logarithmic function, log10(*iter + 0.1) + 1.0
        *iter = static_cast<T>(std::log10(*iter + 0.1) + 1.0);
        break;
      case FadeShape::kQuarterSine:
        // Compute the scale factor of the quarter_sine function, sin((*iter) * PI / 2.0)
        *iter = static_cast<T>(std::sin((*iter) * PI / 2.0));
        break;
      case FadeShape::kHalfSine:
        // Compute the scale factor of the half_sine function, sin((*iter) * PI - PI / 2.0) / 2.0 + 0.5
        *iter = static_cast<T>(std::sin((*iter) * PI - PI / 2.0) / 2.0 + 0.5);
        break;
      default:
        RETURN_STATUS_UNEXPECTED(
          "The fade_shape parameter only supports these types [FadeShape::kLinear, FadeShape::kExponential, "
          "FadeShape::kLogarithmic, FadeShape::kQuarterSine, FadeShape::kHalfSine].");
    }
  }
  return Status::OK();
}

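/// \brief Generate a fade-out envelope of the given shape, falling from 1 to 0.
/// \param[out] output - Envelope tensor of shape <fade_out_len>.
/// \param[in] fade_out_len - Length of the fade-out in samples.
/// \param[in] fade_shape - Shape of the fade curve.
/// \return Status return code.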
template <typename T>
Status FadeOut(std::shared_ptr<Tensor> *output, int32_t fade_out_len, FadeShape fade_shape) {
  T start = 0;
  T end = 1;
  RETURN_IF_NOT_OK(Linspace<T>(output, start, end, fade_out_len));
  for (auto iter = (*output)->begin<T>(); iter != (*output)->end<T>(); iter++) {
    switch (fade_shape) {
      case FadeShape::kLinear:
        // In fade out, invert *iter
        *iter = static_cast<T>(1.0 - *iter);
        break;
      case FadeShape::kExponential:
        // Compute the scale factor of the exponential function
        *iter = static_cast<T>(std::pow(2.0, -*iter) * (1.0 - *iter));
        break;
      case FadeShape::kLogarithmic:
        // Compute the scale factor of the logarithmic function
        *iter = static_cast<T>(std::log10(1.1 - *iter) + 1.0);
        break;
      case FadeShape::kQuarterSine:
        // Compute the scale factor of the quarter_sine function
        *iter = static_cast<T>(std::sin((*iter) * PI / 2.0 + PI / 2.0));
        break;
      case FadeShape::kHalfSine:
        // Compute the scale factor of the half_sine function
        *iter = static_cast<T>(std::sin((*iter) * PI + PI / 2.0) / 2.0 + 0.5);
        break;
      default:
        RETURN_STATUS_UNEXPECTED(
          "The fade_shape parameter only supports these types [FadeShape::kLinear, FadeShape::kExponential, "
          "FadeShape::kLogarithmic, FadeShape::kQuarterSine, FadeShape::kHalfSine].");
    }
  }
  return Status::OK();
}

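/// \brief Apply fade-in and fade-out envelopes to each waveform in the input.
/// \param[in] input - Tensor of shape <..., time>.
/// \param[out] output - Faded tensor.
/// \param[in] fade_in_len - Length of the fade-in in samples.
/// \param[in] fade_out_len - Length of the fade-out in samples.
/// \param[in] fade_shape - Shape of the fade curves.
/// \return Status return code.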
template <typename T>
Status Fade(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t fade_in_len,
            int32_t fade_out_len, FadeShape fade_shape) {
  RETURN_IF_NOT_OK(Tensor::CreateFromTensor(input, output));
  const TensorShape input_shape = input->shape();
  auto waveform_length = static_cast<int32_t>(input_shape[-1]);
  RETURN_IF_NOT_OK(ValidateNoGreaterThan("Fade", "fade_in_len", fade_in_len, "length of waveform", waveform_length));
  RETURN_IF_NOT_OK(ValidateNoGreaterThan("Fade", "fade_out_len", fade_out_len, "length of waveform", waveform_length));
  auto num_waveform = static_cast<int32_t>(input->Size() / waveform_length);
  TensorShape to_shape = TensorShape({num_waveform, waveform_length});
  RETURN_IF_NOT_OK((*output)->Reshape(to_shape));
  TensorPtr fade_in;
  RETURN_IF_NOT_OK(FadeIn<T>(&fade_in, fade_in_len, fade_shape));
  TensorPtr fade_out;
  RETURN_IF_NOT_OK(FadeOut<T>(&fade_out, fade_out_len, fade_shape));

  // Apply fade in to each waveform
  auto output_iter = (*output)->begin<T>();
  for (auto fade_in_iter = fade_in->begin<T>(); fade_in_iter != fade_in->end<T>(); fade_in_iter++) {
    *output_iter = (*output_iter) * (*fade_in_iter);
    for (int32_t j = 1; j < num_waveform; j++) {
      output_iter += waveform_length;
      *output_iter = (*output_iter) * (*fade_in_iter);
    }
    output_iter -= ((num_waveform - 1) * waveform_length);
    ++output_iter;
  }

  // Apply fade out to each waveform
  output_iter = (*output)->begin<T>();
  output_iter += (waveform_length - fade_out_len);
  for (auto fade_out_iter = fade_out->begin<T>(); fade_out_iter != fade_out->end<T>(); fade_out_iter++) {
    *output_iter = (*output_iter) * (*fade_out_iter);
    for (int32_t j = 1; j < num_waveform; j++) {
      output_iter += waveform_length;
      *output_iter = (*output_iter) * (*fade_out_iter);
    }
    output_iter -= ((num_waveform - 1) * waveform_length);
    ++output_iter;
  }
  RETURN_IF_NOT_OK((*output)->Reshape(input_shape));
  return Status::OK();
}

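/// \brief Apply Fade, dispatching on the input data type.
/// \param[in] input - Tensor of shape <..., time>.
/// \param[out] output - Faded tensor.
/// \param[in] fade_in_len - Length of the fade-in in samples.
/// \param[in] fade_out_len - Length of the fade-out in samples.
/// \param[in] fade_shape - Shape of the fade curves.
/// \return Status return code.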
Status Fade(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t fade_in_len,
            int32_t fade_out_len, FadeShape fade_shape) {
  if (DataType::DE_BOOL <= input->type().value() && input->type().value() <= DataType::DE_FLOAT32) {
    std::shared_ptr<Tensor> waveform;
    RETURN_IF_NOT_OK(TypeCast(input, &waveform, DataType(DataType::DE_FLOAT32)));
    RETURN_IF_NOT_OK(Fade<float>(waveform, output, fade_in_len, fade_out_len, fade_shape));
  } else if (input->type().value() == DataType::DE_FLOAT64) {
    RETURN_IF_NOT_OK(Fade<double>(input, output, fade_in_len, fade_out_len, fade_shape));
  } else {
    RETURN_IF_NOT_OK(ValidateTensorNumeric("Fade", input));
  }
  return Status::OK();
}

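/// \brief Separate a complex-valued spectrogram into its magnitude and phase.
/// \param[in] input - TensorRow whose first tensor is complex, <..., complex=2>.
/// \param[out] output - TensorRow holding the magnitude tensor and the phase tensor.
/// \param[in] power - Power of the norm.
/// \return Status return code.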
Status Magphase(const TensorRow &input, TensorRow *output, float power) {
  std::shared_ptr<Tensor> mag;
  std::shared_ptr<Tensor> phase;

  RETURN_IF_NOT_OK(ComplexNorm(input[0], &mag, power));
  if (input[0]->type() == DataType(DataType::DE_FLOAT64)) {
    RETURN_IF_NOT_OK(Angle<double>(input[0], &phase));
  } else {
    std::shared_ptr<Tensor> tmp;
    RETURN_IF_NOT_OK(TypeCast(input[0], &tmp, DataType(DataType::DE_FLOAT32)));
    RETURN_IF_NOT_OK(Angle<float>(tmp, &phase));
  }
  (*output).push_back(mag);
  (*output).push_back(phase);

  return Status::OK();
}

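/// \brief Apply median smoothing to the indices tensor with a sliding window.
/// \param[in] input - Int tensor of shape <channel, num_of_frames>.
/// \param[out] output - Smoothed tensor of shape <channel, out_length>.
/// \param[in] win_length - Median window length in frames.
/// \return Status return code.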
Status MedianSmoothing(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t win_length) {
  auto channel = input->shape()[0];
  auto num_of_frames = input->shape()[1];
  // Centered windowed
  int32_t pad_length = (win_length - 1) / 2;
  int32_t out_length = num_of_frames + pad_length - win_length + 1;
  TensorShape out_shape({channel, out_length});
  std::vector<int> signal;
  std::vector<int> out;
  std::vector<int> indices(channel * (num_of_frames + pad_length), 0);
  // "replicate" padding on the left edge of each channel
  for (auto itr = input->begin<int>(); itr != input->end<int>(); ++itr) {
    signal.push_back(*itr);
  }
  for (int i = 0; i < channel; ++i) {
    for (int j = 0; j < pad_length; ++j) {
      indices[i * (num_of_frames + pad_length) + j] = signal[i * num_of_frames];
    }
  }
  for (int i = 0; i < channel; ++i) {
    for (int j = 0; j < num_of_frames; ++j) {
      indices[i * (num_of_frames + pad_length) + j + pad_length] = signal[i * num_of_frames + j];
    }
  }
  for (int i = 0; i < channel; ++i) {
    int32_t index = i * (num_of_frames + pad_length);
    for (int j = 0; j < out_length; ++j) {
      // take the median of the current window
      std::vector<int> tem(indices.begin() + index, indices.begin() + win_length + index);
      std::sort(tem.begin(), tem.end());
      out.push_back(tem[pad_length]);
      ++index;
    }
  }
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(out, out_shape, output));
  return Status::OK();
}

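/// \brief Detect pitch frequency using the normalized cross-correlation function and median smoothing.
/// \param[in] input - Tensor of shape <..., time>.
/// \param[out] output - Pitch frequency tensor.
/// \param[in] sample_rate - Sample rate of the audio in Hz.
/// \param[in] frame_time - Duration of a frame in seconds.
/// \param[in] win_length - Window length for median smoothing, in number of frames.
/// \param[in] freq_low - Lowest detectable frequency in Hz.
/// \param[in] freq_high - Highest detectable frequency in Hz.
/// \return Status return code.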
Status DetectPitchFrequency(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
                            float frame_time, int32_t win_length, int32_t freq_low, int32_t freq_high) {
  std::shared_ptr<Tensor> nccf;
  std::shared_ptr<Tensor> indices;
  std::shared_ptr<Tensor> smooth_indices;
  // pack batch
  TensorShape input_shape = input->shape();
  TensorShape to_shape({input->Size() / input_shape[-1], input_shape[-1]});
  RETURN_IF_NOT_OK(input->Reshape(to_shape));
  if (input->type() == DataType(DataType::DE_FLOAT32)) {
    RETURN_IF_NOT_OK(ComputeNccf<float>(input, &nccf, sample_rate, frame_time, freq_low));
    RETURN_IF_NOT_OK(FindMaxPerFrame<float>(nccf, &indices, sample_rate, freq_high));
  } else if (input->type() == DataType(DataType::DE_FLOAT64)) {
    RETURN_IF_NOT_OK(ComputeNccf<double>(input, &nccf, sample_rate, frame_time, freq_low));
    RETURN_IF_NOT_OK(FindMaxPerFrame<double>(nccf, &indices, sample_rate, freq_high));
  } else {
    RETURN_IF_NOT_OK(ComputeNccf<float16>(input, &nccf, sample_rate, frame_time, freq_low));
    RETURN_IF_NOT_OK(FindMaxPerFrame<float16>(nccf, &indices, sample_rate, freq_high));
  }
  RETURN_IF_NOT_OK(MedianSmoothing(indices, &smooth_indices, win_length));

  // Convert indices to frequency
  constexpr double EPSILON = 1e-9;
  TensorShape freq_shape = smooth_indices->shape();
  std::vector<float> out;
  for (auto itr_fre = smooth_indices->begin<int>(); itr_fre != smooth_indices->end<int>(); ++itr_fre) {
    out.push_back(sample_rate / (EPSILON + *itr_fre));
  }

  // unpack batch
  auto shape_vec = input_shape.AsVector();
  shape_vec[shape_vec.size() - 1] = freq_shape[-1];
  TensorShape out_shape(shape_vec);
  RETURN_IF_NOT_OK(Tensor::CreateFromVector(out, out_shape, output));
  return Status::OK();
}

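/// \brief Generate a periodic wave table for modulation effects.
/// \param[out] output - Wave table tensor of shape <table_size>; int32 for integral types, float32 otherwise.
/// \param[in] type - Output data type.
/// \param[in] modulation - Modulation type.
/// \param[in] table_size - Number of points in the table.
/// \param[in] min - Minimum output value.
/// \param[in] max - Maximum output value.
/// \param[in] phase - Initial phase in radians.
/// \return Status return code.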
Status GenerateWaveTable(std::shared_ptr<Tensor> *output, const DataType &type, Modulation modulation,
                         int32_t table_size, float min, float max, float phase) {
  RETURN_UNEXPECTED_IF_NULL(output);
  CHECK_FAIL_RETURN_UNEXPECTED(table_size > 0,
                               "table_size must be greater than 0, but got: " + std::to_string(table_size));
  auto phase_offset = static_cast<int32_t>(phase / PI / 2 * table_size + 0.5);
  // get the offset of the i-th point
  std::vector<int32_t> point;
  for (auto i = 0; i < table_size; i++) {
    point.push_back((i + phase_offset) % table_size);
  }

  std::shared_ptr<Tensor> wave_table;
  RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({table_size}), DataType(DataType::DE_FLOAT32), &wave_table));

  auto iter = wave_table->begin<float>();

  if (modulation == Modulation::kSinusoidal) {
    for (int i = 0; i < table_size; iter++, i++) {
      // change phase
      *iter = (sin(point[i] * PI / table_size * 2) + 1) / 2;
    }
  } else {
    for (int i = 0; i < table_size; iter++, i++) {
      // change phase
      *iter = point[i] * 2.0 / table_size;
      // get the number of complete offsets
      auto value = static_cast<int>(4 * point[i] / table_size);
      // change the value of the wave according to the number of complete offsets
      if (value == 0) {
        *iter = *iter + 0.5;
      } else if (value == 1 || value == 2) {
        *iter = 1.5 - *iter;
      } else if (value == 3) {
        *iter = *iter - 1.5;
      }
    }
  }
  // scale the wave table into [min, max]
  for (iter = wave_table->begin<float>(); iter != wave_table->end<float>(); iter++) {
    *iter = *iter * (max - min) + min;
  }
  if (type.IsInt()) {
    // round to the nearest integer before the cast
    for (iter = wave_table->begin<float>(); iter != wave_table->end<float>(); iter++) {
      if (*iter < 0) {
        *iter = *iter - 0.5;
      } else {
        *iter = *iter + 0.5;
      }
    }
    RETURN_IF_NOT_OK(TypeCast(wave_table, output, DataType(DataType::DE_INT32)));
  } else if (type.IsFloat()) {
    RETURN_IF_NOT_OK(TypeCast(wave_table, output, DataType(DataType::DE_FLOAT32)));
  }

  return Status::OK();
}

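/// \brief Read a 16-bit PCM WAV file and normalize samples to [-1, 1].
/// \param[in] wav_file_dir - Path of the WAV file.
/// \param[out] waveform_vec - Vector of normalized samples.
/// \param[out] sample_rate - Sample rate read from the file header.
/// \return Status return code.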
Status ReadWaveFile(const std::string &wav_file_dir, std::vector<float> *waveform_vec, int32_t *sample_rate) {
  RETURN_UNEXPECTED_IF_NULL(waveform_vec);
  RETURN_UNEXPECTED_IF_NULL(sample_rate);
  auto wav_realpath = FileUtils::GetRealPath(wav_file_dir.c_str());
  if (!wav_realpath.has_value()) {
    LOG_AND_RETURN_STATUS_SYNTAX_ERROR("Invalid file path, get real path failed: " + wav_file_dir);
  }

  const float kMaxVal = 32767.0;
  Path file_path(wav_realpath.value());
  CHECK_FAIL_RETURN_UNEXPECTED(file_path.Exists() && !file_path.IsDirectory(),
                               "Invalid file path, failed to find waveform file: " + file_path.ToString());
  std::ifstream in(file_path.ToString(), std::ios::in | std::ios::binary);
  CHECK_FAIL_RETURN_UNEXPECTED(in.is_open(), "Invalid file, failed to open waveform file: " + file_path.ToString() +
                                               ", make sure the file is not damaged and you have read permission.");
  auto header = std::make_unique<WavHeader>();
  in.read(reinterpret_cast<char *>(header.get()), sizeof(WavHeader));
  *sample_rate = header->sample_rate;
  float bytes_per_sample = header->bits_per_sample / 8;
  auto sub_chunk2_size = header->sub_chunk2_size;
  if (bytes_per_sample == 0) {
    in.close();
    RETURN_STATUS_UNEXPECTED("ReadWaveFile: zero division error, bits per sample of the audio can not be zero.");
  }
  int num_samples = sub_chunk2_size / bytes_per_sample;
  std::unique_ptr<int16_t[]> data = std::make_unique<int16_t[]>(num_samples);
  in.read(reinterpret_cast<char *>(data.get()), sizeof(int16_t) * num_samples);
  waveform_vec->resize(num_samples);
  for (int i = 0; i < num_samples; i++) {
    (*waveform_vec)[i] = data[i] / kMaxVal;
  }
  in.close();
  return Status::OK();
}

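/// \brief Compute the [start, end) window bounds for sliding-window CMN at a given frame.
/// \param[in] cmn_window - Window size in frames.
/// \param[in] min_cmn_window - Minimum window size used at the start of the utterance (only when center is false).
/// \param[in] center - If true, center the window on the current frame.
/// \param[in] idx - Index of the current frame.
/// \param[in] num_frames - Total number of frames.
/// \param[out] cmn_window_start_p - Computed window start.
/// \param[out] cmn_window_end_p - Computed window end.
/// \return Status return code.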
Status ComputeCmnStartAndEnd(int32_t cmn_window, int32_t min_cmn_window, bool center, int32_t idx, int32_t num_frames,
                             int32_t *cmn_window_start_p, int32_t *cmn_window_end_p) {
  RETURN_UNEXPECTED_IF_NULL(cmn_window_start_p);
  RETURN_UNEXPECTED_IF_NULL(cmn_window_end_p);
  RETURN_IF_NOT_OK(ValidateNonNegative("SlidingWindowCmn", "cmn_window", cmn_window));
  RETURN_IF_NOT_OK(ValidateNonNegative("SlidingWindowCmn", "min_cmn_window", min_cmn_window));
  int32_t cmn_window_start = 0, cmn_window_end = 0;
  const constexpr int window_center = 2;
  if (center) {
    cmn_window_start = idx - cmn_window / window_center;
    cmn_window_end = cmn_window_start + cmn_window;
  } else {
    cmn_window_start = idx - cmn_window;
    cmn_window_end = idx + 1;
  }
  if (cmn_window_start < 0) {
    cmn_window_end -= cmn_window_start;
    cmn_window_start = 0;
  }
  if (!center) {
    if (cmn_window_end > idx) {
      cmn_window_end = std::max(idx + 1, min_cmn_window);
    }
  }
  if (cmn_window_end > num_frames) {
    cmn_window_start -= (cmn_window_end - num_frames);
    cmn_window_end = num_frames;
    if (cmn_window_start < 0) {
      cmn_window_start = 0;
    }
  }

  *cmn_window_start_p = cmn_window_start;
  *cmn_window_end_p = cmn_window_end;
  return Status::OK();
}

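/// \brief Subtract the sliding-window mean (and optionally divide by the sliding-window standard deviation) per frame.
/// \param[in] input - Tensor of shape <num_channels, num_frames, num_feats>.
/// \param[out] cmn_waveform_p - Normalized tensor of the same shape.
/// \param[in] num_channels - Number of channels.
/// \param[in] num_frames - Number of frames.
/// \param[in] num_feats - Number of features per frame.
/// \param[in] cmn_window - Window size in frames.
/// \param[in] min_cmn_window - Minimum window size at the start of the utterance.
/// \param[in] center - If true, center the window on the current frame.
/// \param[in] norm_vars - If true, also normalize the variance.
/// \return Status return code.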
template <typename T>
Status ComputeCmnWaveform(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *cmn_waveform_p,
                          int32_t num_channels, int32_t num_frames, int32_t num_feats, int32_t cmn_window,
                          int32_t min_cmn_window, bool center, bool norm_vars) {
  using ArrayXT = Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
  constexpr int square_num = 2;
  int32_t last_window_start = -1, last_window_end = -1;
  ArrayXT cur_sum = ArrayXT(num_channels, num_feats);
  ArrayXT cur_sum_sq;
  if (norm_vars) {
    cur_sum_sq = ArrayXT(num_channels, num_feats);
  }
  for (int i = 0; i < num_frames; ++i) {
    int32_t cmn_window_start = 0, cmn_window_end = 0;
    RETURN_IF_NOT_OK(
      ComputeCmnStartAndEnd(cmn_window, min_cmn_window, center, i, num_frames, &cmn_window_start, &cmn_window_end));
    int32_t row = cmn_window_end - cmn_window_start * 2;
    int32_t cmn_window_frames = cmn_window_end - cmn_window_start;
    for (int32_t m = 0; m < num_channels; ++m) {
      if (last_window_start == -1) {
        auto it = reinterpret_cast<T *>(const_cast<uchar *>(input->GetBuffer()));
        it += (m * num_frames * num_feats + cmn_window_start * num_feats);
        auto tmp_map = Eigen::Map<ArrayXT>(it, row, num_feats);
        if (i > 0) {
          cur_sum.row(m) += tmp_map.colwise().sum();
          if (norm_vars) {
            cur_sum_sq.row(m) += tmp_map.pow(square_num).colwise().sum();
          }
        } else {
          cur_sum.row(m) = tmp_map.colwise().sum();
          if (norm_vars) {
            cur_sum_sq.row(m) = tmp_map.pow(square_num).colwise().sum();
          }
        }
      } else {
        if (cmn_window_start > last_window_start) {
          auto it = reinterpret_cast<T *>(const_cast<uchar *>(input->GetBuffer()));
          it += (m * num_frames * num_feats + last_window_start * num_feats);
          auto tmp_map = Eigen::Map<ArrayXT>(it, 1, num_feats);
          cur_sum.row(m) -= tmp_map;
          if (norm_vars) {
            cur_sum_sq.row(m) -= tmp_map.pow(square_num);
          }
        }
        if (cmn_window_end > last_window_end) {
          auto it = reinterpret_cast<T *>(const_cast<uchar *>(input->GetBuffer()));
          it += (m * num_frames * num_feats + last_window_end * num_feats);
          auto tmp_map = Eigen::Map<ArrayXT>(it, 1, num_feats);
          cur_sum.row(m) += tmp_map;
          if (norm_vars) {
            cur_sum_sq.row(m) += tmp_map.pow(square_num);
          }
        }
      }

      auto it = reinterpret_cast<T *>(const_cast<uchar *>(input->GetBuffer()));
      auto cmn_it = reinterpret_cast<T *>(const_cast<uchar *>((*cmn_waveform_p)->GetBuffer()));
      it += (m * num_frames * num_feats + i * num_feats);
      cmn_it += (m * num_frames * num_feats + i * num_feats);
      CHECK_FAIL_RETURN_UNEXPECTED(cmn_window_frames != 0,
                                   "SlidingWindowCmn: invalid parameter, 'cmn_window_frames' can not be zero.");
      Eigen::Map<ArrayXT>(cmn_it, 1, num_feats) =
        Eigen::Map<ArrayXT>(it, 1, num_feats) - cur_sum.row(m) / cmn_window_frames;
      if (norm_vars) {
        if (cmn_window_frames == 1) {
          auto cmn_it_1 = reinterpret_cast<T *>(const_cast<uchar *>((*cmn_waveform_p)->GetBuffer()));
          cmn_it_1 += (m * num_frames * num_feats + i * num_feats);
          Eigen::Map<ArrayXT>(cmn_it_1, 1, num_feats).setZero();
        } else {
          auto variance = (Eigen::Map<ArrayXT>(cur_sum_sq.data(), num_channels, num_feats) / cmn_window_frames) -
                          (cur_sum.pow(2) / std::pow(cmn_window_frames, 2));
          auto cmn_it_2 = reinterpret_cast<T *>(const_cast<uchar *>((*cmn_waveform_p)->GetBuffer()));
          cmn_it_2 += (m * num_frames * num_feats + i * num_feats);
          Eigen::Map<ArrayXT>(cmn_it_2, 1, num_feats) =
            Eigen::Map<ArrayXT>(cmn_it_2, 1, num_feats) * (1 / variance.sqrt()).row(m);
        }
      }
    }
    last_window_start = cmn_window_start;
    last_window_end = cmn_window_end;
  }
  return Status::OK();
}

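/// \brief Reshape the input to <channel, num_frames, num_feats>, apply sliding-window CMN, and restore the shape.
/// \param[in] input - Tensor of shape <..., freq, time>.
/// \param[out] output - Normalized tensor.
/// \param[in] cmn_window - Window size in frames.
/// \param[in] min_cmn_window - Minimum window size at the start of the utterance.
/// \param[in] center - If true, center the window on the current frame.
/// \param[in] norm_vars - If true, also normalize the variance.
/// \return Status return code.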
template <typename T>
Status SlidingWindowCmnHelper(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output,
                              int32_t cmn_window, int32_t min_cmn_window, bool center, bool norm_vars) {
  int32_t num_frames = input->shape()[Tensor::HandleNeg(-2, input->shape().Size())];
  int32_t num_feats = input->shape()[Tensor::HandleNeg(-1, input->shape().Size())];

  int32_t first_index = 1;
  std::vector<dsize_t> input_shape = input->shape().AsVector();
  std::for_each(input_shape.begin(), input_shape.end(), [&first_index](const dsize_t &item) { first_index *= item; });
  RETURN_IF_NOT_OK(
    input->Reshape(TensorShape({static_cast<int>(first_index / (num_frames * num_feats)), num_frames, num_feats})));

  auto num_channels = static_cast<int32_t>(input->shape()[0]);
  TensorPtr cmn_waveform;
  RETURN_IF_NOT_OK(
    Tensor::CreateEmpty(TensorShape({num_channels, num_frames, num_feats}), input->type(), &cmn_waveform));
  RETURN_IF_NOT_OK(ComputeCmnWaveform<T>(input, &cmn_waveform, num_channels, num_frames, num_feats, cmn_window,
                                         min_cmn_window, center, norm_vars));

  std::vector<dsize_t> re_shape = input_shape;
  auto r_it = re_shape.rbegin();
  *r_it++ = num_feats;
  *r_it = num_frames;
  RETURN_IF_NOT_OK(cmn_waveform->Reshape(TensorShape(re_shape)));

  const constexpr int specify_input_shape = 2;
  const constexpr int specify_first_shape = 1;
  if (input_shape.size() == specify_input_shape && cmn_waveform->shape()[0] == specify_first_shape) {
    cmn_waveform->Squeeze();
  }
  *output = cmn_waveform;
  return Status::OK();
}

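/// \brief Apply sliding-window cepstral mean (and optionally variance) normalization per utterance.
/// \param[in] input - Tensor of shape <..., freq, time>.
/// \param[out] output - Normalized tensor.
/// \param[in] cmn_window - Window size in frames.
/// \param[in] min_cmn_window - Minimum CMN window used at the start of the utterance.
/// \param[in] center - If true, use a window centered on the current frame.
/// \param[in] norm_vars - If true, normalize the variance to one.
/// \return Status return code.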
Status SlidingWindowCmn(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t cmn_window,
                        int32_t min_cmn_window, bool center, bool norm_vars) {
  RETURN_IF_NOT_OK(ValidateLowRank("SlidingWindowCmn", input, kDefaultAudioDim, "<..., freq, time>"));

  if (input->type().IsNumeric() && input->type().value() != DataType::DE_FLOAT64) {
    std::shared_ptr<Tensor> temp;
    RETURN_IF_NOT_OK(TypeCast(input, &temp, DataType(DataType::DE_FLOAT32)));
    RETURN_IF_NOT_OK(SlidingWindowCmnHelper<float>(temp, output, cmn_window, min_cmn_window, center, norm_vars));
  } else if (input->type().value() == DataType::DE_FLOAT64) {
    RETURN_IF_NOT_OK(SlidingWindowCmnHelper<double>(input, output, cmn_window, min_cmn_window, center, norm_vars));
  } else {
    RETURN_IF_NOT_OK(ValidateTensorNumeric("SlidingWindowCmn", input));
  }
  return Status::OK();
}

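/// \brief Pad the last dimension of a waveform tensor on the left and right.
/// \param[in] input - Tensor of shape <..., time>.
/// \param[out] output - Padded tensor.
/// \param[in] pad_left - Number of samples to pad on the left.
/// \param[in] pad_right - Number of samples to pad on the right.
/// \param[in] padding_mode - Border type: kConstant, kEdge, kReflect or kSymmetric.
/// \param[in] value - Fill value used when padding_mode is kConstant.
/// \return Status return code.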
1141 template <typename T>
Pad(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,int32_t pad_left,int32_t pad_right,BorderType padding_mode,T value=0)1142 Status Pad(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t pad_left, int32_t pad_right,
1143 BorderType padding_mode, T value = 0) {
1144 RETURN_IF_NOT_OK(ValidateLowRank("Pad", input, kMinAudioDim, "<..., time>"));
1145 RETURN_IF_NOT_OK(ValidateTensorNumeric("Pad", input));
1146 RETURN_IF_NOT_OK(ValidateNonNegative("Pad", "pad_left", pad_left));
1147 RETURN_IF_NOT_OK(ValidateNonNegative("Pad", "pad_right", pad_right));
1148 TensorShape input_shape = input->shape();
1149 int32_t wave_length = input_shape[-1];
1150 auto num_wavs = static_cast<int32_t>(input->Size() / wave_length);
1151 TensorShape to_shape = TensorShape({num_wavs, wave_length});
1152 RETURN_IF_NOT_OK(input->Reshape(to_shape));
1153 int32_t pad_length = wave_length + pad_left + pad_right;
1154 TensorShape new_shape = TensorShape({num_wavs, pad_length});
1155 RETURN_IF_NOT_OK(Tensor::CreateEmpty(new_shape, input->type(), output));
1156 using MatrixXT = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
1157 using Eigen::Map;
1158 constexpr int pad_mul = 2;
1159 T *input_data = reinterpret_cast<T *>(const_cast<uchar *>(input->GetBuffer()));
1160 T *output_data = reinterpret_cast<T *>(const_cast<uchar *>((*output)->GetBuffer()));
1161 auto input_map = Map<MatrixXT>(input_data, num_wavs, wave_length);
1162 auto output_map = Map<MatrixXT>(output_data, num_wavs, pad_length);
1163 output_map.block(0, pad_left, num_wavs, wave_length) = input_map;
1164 if (padding_mode == BorderType::kConstant) {
1165 output_map.block(0, 0, num_wavs, pad_left).setConstant(value);
1166 output_map.block(0, pad_left + wave_length, num_wavs, pad_right).setConstant(value);
1167 } else if (padding_mode == BorderType::kEdge) {
1168 output_map.block(0, 0, num_wavs, pad_left).colwise() = input_map.col(0);
1169 output_map.block(0, pad_left + wave_length, num_wavs, pad_right).colwise() = input_map.col(wave_length - 1);
1170 } else if (padding_mode == BorderType::kReflect) {
1171 // First, deal with the pad operation on the right.
1172 int32_t current_pad = wave_length - 1;
1173 while (pad_right >= current_pad) {
1174 // current_pad: the length of pad required for current loop.
1175 // pad_right: the length of the remaining pad on the right.
1176 output_map.block(0, pad_left + current_pad + 1, num_wavs, current_pad) =
1177 output_map.block(0, pad_left, num_wavs, current_pad).rowwise().reverse();
1178 pad_right -= current_pad;
1179 current_pad += current_pad;
1180 }
1181 output_map.block(0, pad_length - pad_right, num_wavs, pad_right) =
1182 output_map.block(0, pad_length - pad_right * pad_mul - 1, num_wavs, pad_right).rowwise().reverse();
1183 // Next, deal with the pad operation on the left.
1184 current_pad = wave_length - 1;
1185 while (pad_left >= current_pad) {
1186 // current_pad: the length of pad required for current loop.
1187 // pad_left: the length of the remaining pad on the left.
1188 output_map.block(0, pad_left - current_pad, num_wavs, current_pad) =
1189 output_map.block(0, pad_left + 1, num_wavs, current_pad).rowwise().reverse();
1190 pad_left -= current_pad;
1191 current_pad += current_pad;
1192 }
1193 output_map.block(0, 0, num_wavs, pad_left) =
1194 output_map.block(0, pad_left + 1, num_wavs, pad_left).rowwise().reverse();
1195 } else if (padding_mode == BorderType::kSymmetric) {
1196 // First, deal with the pad operation on the right.
1197 int32_t current_pad = wave_length;
1198 while (pad_right >= current_pad) {
1199 // current_pad: the length of pad required for current loop.
1200 // pad_right: the length of the remaining pad on the right.
1201 output_map.block(0, pad_left + current_pad, num_wavs, current_pad) =
1202 output_map.block(0, pad_left, num_wavs, current_pad).rowwise().reverse();
1203 pad_right -= current_pad;
1204 current_pad += current_pad;
1205 }
1206 output_map.block(0, pad_length - pad_right, num_wavs, pad_right) =
1207 output_map.block(0, pad_length - pad_right * pad_mul, num_wavs, pad_right).rowwise().reverse();
1208 // Next, deal with the pad operation on the left.
1209 current_pad = wave_length;
1210 while (pad_left >= current_pad) {
1211 // current_pad: the length of pad required for current loop.
1212 // pad_left: the length of the remaining pad on the left.
1213 output_map.block(0, pad_left - current_pad, num_wavs, current_pad) =
1214 output_map.block(0, pad_left, num_wavs, current_pad).rowwise().reverse();
1215 pad_left -= current_pad;
1216 current_pad += current_pad;
1217 }
1218 output_map.block(0, 0, num_wavs, pad_left) = output_map.block(0, pad_left, num_wavs, pad_left).rowwise().reverse();
1219 } else {
1220 LOG_AND_RETURN_STATUS_SYNTAX_ERROR("Pad: invalid padding_mode value, check the optional value of BorderType.");
1221 }
1222 std::vector<dsize_t> shape_vec = input_shape.AsVector();
1223 shape_vec[shape_vec.size() - 1] = static_cast<dsize_t>(pad_length);
1224 TensorShape output_shape(shape_vec);
1225 RETURN_IF_NOT_OK((*output)->Reshape(output_shape));
1226 return Status::OK();
1227 }
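// Worked example of the four padding modes (illustrative, derived from the logic above):
// padding the waveform [1, 2, 3] with pad_left = 2 and pad_right = 2 gives
//   kConstant(0): [0, 0, 1, 2, 3, 0, 0]
//   kEdge:        [1, 1, 1, 2, 3, 3, 3]
//   kReflect:     [3, 2, 1, 2, 3, 2, 1]   (mirror, edge sample not repeated)
//   kSymmetric:   [2, 1, 1, 2, 3, 3, 2]   (mirror, edge sample repeated)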
1228
1229 template <typename T>
1230 Status ComputeDeltasImpl(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int all_freqs,
1231 int n_frame, int n) {
1232 using VectorXT = Eigen::Matrix<T, Eigen::Dynamic, 1>;
1233 using MatrixXT = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
1234 using Eigen::Map;
1235 int32_t denom = n * (n + 1) * (n * 2 + 1) / 3;
1236 // twice sum of integer squared
1237 VectorXT kernel = VectorXT::LinSpaced(2 * n + 1, -n, n); // 2n+1
1238 T *input_data = reinterpret_cast<T *>(const_cast<uchar *>(input->GetBuffer())); // [all_freqs, n_frame + 2n]
1239 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape{all_freqs, n_frame}, input->type(), output));
1240 T *output_data = reinterpret_cast<T *>(const_cast<uchar *>((*output)->GetBuffer()));
1241 for (int freq = 0; freq < all_freqs; ++freq) { // conv with im2col
1242 auto input_map = Map<MatrixXT, 0, Eigen::OuterStride<1>>(input_data + freq * (n_frame + 2 * n), n_frame,
1243 2 * n + 1); // n_frame, 2n+1
1244 Map<VectorXT>(output_data + freq * n_frame, n_frame) = (input_map * kernel).array() / T(denom);
1245 }
1246 return Status::OK();
1247 }
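// The delta coefficients computed above follow the standard regression formula
//   d_t = sum_{k=1}^{n} k * (c_{t+k} - c_{t-k}) / (2 * sum_{k=1}^{n} k^2),
// implemented as a dot product of each frame window with the kernel [-n, ..., n]
// divided by denom = n * (n + 1) * (2n + 1) / 3 = 2 * sum(k^2). For example, with
// win_length = 5 (so n = 2) the kernel is [-2, -1, 0, 1, 2] and denom = 10.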
1248
1249 Status Bartlett(std::shared_ptr<Tensor> *output, int len) {
1250 CHECK_FAIL_RETURN_UNEXPECTED(len != 0, "Bartlett: len can not be zero.");
1251 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({len}), DataType(DataType::DE_FLOAT32), output));
1252 // Bartlett window function.
1253 auto iter = (*output)->begin<float>();
1254 float twice = 2.0;
1255 for (ptrdiff_t i = 0; i < len; ++i) {
1256 *(iter + i) = 1.0 - std::abs(twice * i / len - 1.0);
1257 }
1258 return Status::OK();
1259 }
1260
1261 Status Blackman(std::shared_ptr<Tensor> *output, int len) {
1262 CHECK_FAIL_RETURN_UNEXPECTED(len != 0, "Blackman: len can not be zero.");
1263 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({len}), DataType(DataType::DE_FLOAT32), output));
1264 // Blackman window function.
1265 auto iter = (*output)->begin<float>();
1266 const float alpha = 0.42;
1267 const float half = 0.5;
1268 const float delta = 0.08;
1269 for (ptrdiff_t i = 0; i < len; ++i) {
1270 *(iter + i) = alpha - half * std::cos(TWO * PI * i / len) + delta * std::cos(TWO * TWO * PI * i / len);
1271 }
1272 return Status::OK();
1273 }
1274
1275 Status Hamming(std::shared_ptr<Tensor> *output, int len, float alpha = 0.54, float beta = 0.46) {
1276 CHECK_FAIL_RETURN_UNEXPECTED(len != 0, "Hamming: len can not be zero.");
1277 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({len}), DataType(DataType::DE_FLOAT32), output));
1278 // Hamming window function.
1279 auto iter = (*output)->begin<float>();
1280 for (ptrdiff_t i = 0; i < len; ++i) {
1281 *(iter + i) = alpha - beta * std::cos(TWO * PI * i / len);
1282 }
1283 return Status::OK();
1284 }
1285
1286 Status Hann(std::shared_ptr<Tensor> *output, int len) {
1287 CHECK_FAIL_RETURN_UNEXPECTED(len != 0, "Hann: len can not be zero.");
1288 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({len}), DataType(DataType::DE_FLOAT32), output));
1289 // Hann window function.
1290 auto iter = (*output)->begin<float>();
1291 const float half = 0.5;
1292 for (ptrdiff_t i = 0; i < len; ++i) {
1293 *(iter + i) = half - half * std::cos(TWO * PI * i / len);
1294 }
1295 return Status::OK();
1296 }
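// The windows above are "periodic" (denominator len rather than len - 1), as is usual
// for STFT analysis. Sample values for len = 4, computed from the formulas above:
//   Bartlett: [0.0, 0.5, 1.0, 0.5]
//   Hamming:  [0.08, 0.54, 1.0, 0.54]
//   Hann:     [0.0, 0.5, 1.0, 0.5]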
1297
1298 Status Kaiser(std::shared_ptr<Tensor> *output, int len, float beta = 12.0) {
1299 #ifdef __APPLE__
1300 return Status(StatusCode::kMDNotImplementedYet, "For macOS, Kaiser window is not supported.");
1301 #else
1302 CHECK_FAIL_RETURN_UNEXPECTED(len != 0, "Kaiser: len can not be zero.");
1303 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({len}), DataType(DataType::DE_FLOAT32), output));
1304 // Kaiser window function.
1305 auto iter = (*output)->begin<float>();
1306 float twice = 2.0;
1307 {
1308 std::unique_lock<std::mutex> _lock(cyl_bessel_mux_);
1309 for (ptrdiff_t i = 0; i < len; ++i) {
1310 *(iter + i) =
1311 std::cyl_bessel_i(0, beta * std::sqrt(1 - std::pow(i * twice / len - 1.0, TWO))) / std::cyl_bessel_i(0, beta);
1312 }
1313 }
1314 return Status::OK();
1315 #endif
1316 }
1317
1318 Status Window(std::shared_ptr<Tensor> *output, WindowType window_type, int len) {
1319 switch (window_type) {
1320 case WindowType::kBartlett:
1321 return Bartlett(output, len);
1322 case WindowType::kBlackman:
1323 return Blackman(output, len);
1324 case WindowType::kHamming:
1325 return Hamming(output, len);
1326 case WindowType::kHann:
1327 return Hann(output, len);
1328 case WindowType::kKaiser:
1329 return Kaiser(output, len);
1330 default:
1331 return Hann(output, len);
1332 }
1333 }
1334
1335 // Reconstruct the full two-sided spectrum from a onesided STFT result (used when onesided is false).
1336 template <typename T>
1337 Status Onesided(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int n_fft, int n_columns) {
1338 std::shared_ptr<Tensor> output_onsided;
1339 RETURN_IF_NOT_OK(
1340 Tensor::CreateEmpty(TensorShape({input->shape()[0], n_fft, n_columns, 2}), input->type(), &output_onsided));
1341 auto onside_begin = output_onsided->begin<T>();
1342 auto spec_f_begin = input->begin<T>();
1343 std::vector<int> spec_f_slice = {(n_fft / 2 + 1) * n_columns * 2, n_columns * 2, 2};
1344 for (int r = 0; r < input->shape()[0]; r++) {
1345 for (int i = 0; i < (n_fft / TWO + 1); i++) {
1346 for (int j = 0; j < n_columns; j++) {
1347 ptrdiff_t onside_offset_0 = r * n_fft * n_columns * 2 + i * spec_f_slice[1] + j * spec_f_slice[2];
1348 ptrdiff_t spec_f_offset_0 = r * spec_f_slice[0] + i * spec_f_slice[1] + j * spec_f_slice[2];
1349 ptrdiff_t onside_offset_1 = onside_offset_0 + 1;
1350 ptrdiff_t spec_f_offset_1 = spec_f_offset_0 + 1;
1351 *(onside_begin + onside_offset_0) = *(spec_f_begin + spec_f_offset_0);
1352 *(onside_begin + onside_offset_1) = *(spec_f_begin + spec_f_offset_1);
1353 }
1354 }
1355 for (int i = n_fft / 2 + 1; i < n_fft; i++) {
1356 for (int j = 0; j < n_columns; j++) {
1357 ptrdiff_t onside_offset_0 = r * n_fft * n_columns * 2 + i * spec_f_slice[1] + j * spec_f_slice[2];
1358 ptrdiff_t spec_f_offset_0 = r * spec_f_slice[0] + (n_fft - i) * spec_f_slice[1] + j * spec_f_slice[2];
1359 ptrdiff_t onside_offset_1 = onside_offset_0 + 1;
1360 ptrdiff_t spec_f_offset_1 = spec_f_offset_0 + 1;
1361 *(onside_begin + onside_offset_0) = *(spec_f_begin + spec_f_offset_0);
1362 *(onside_begin + onside_offset_1) = -*(spec_f_begin + spec_f_offset_1); // conjugate symmetry: X[n_fft - k] = conj(X[k])
1363 }
1364 }
1365 }
1366 *output = output_onsided;
1367 return Status::OK();
1368 }
1369
1370 template <typename T>
1371 Status PowerStft(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float power, int n_columns,
1372 int n_length) {
1373 auto spec_f_begin = input->begin<T>();
1374 std::vector<int> spec_f_slice = {n_length * n_columns * 2, n_columns * 2, 2};
1375 std::vector<int> spec_p_slice = {n_length * n_columns, n_columns};
1376 std::shared_ptr<Tensor> spec_p;
1377 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({input->shape()[0], n_length, n_columns}), input->type(), &spec_p));
1378 auto spec_p_begin = spec_p->begin<T>();
1379 for (int r = 0; r < input->shape()[0]; r++) {
1380 for (int i = 0; i < n_length; i++) {
1381 for (int j = 0; j < n_columns; j++) {
1382 ptrdiff_t spec_f_offset_0 = r * spec_f_slice[0] + i * spec_f_slice[1] + j * spec_f_slice[2];
1383 ptrdiff_t spec_f_offset_1 = spec_f_offset_0 + 1;
1384 ptrdiff_t spec_p_offset = r * spec_p_slice[0] + i * spec_p_slice[1] + j;
1385 T spec_power_0 = *(spec_f_begin + spec_f_offset_0);
1386 T spec_power_1 = *(spec_f_begin + spec_f_offset_1);
1387 *(spec_p_begin + spec_p_offset) =
1388 std::pow(std::sqrt(std::pow(spec_power_0, TWO) + std::pow(spec_power_1, TWO)), power);
1389 }
1390 }
1391 }
1392 *output = spec_p;
1393 return Status::OK();
1394 }
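// PowerStft maps each complex bin (re, im) to |re + im*i|^power. For example, the
// bin (3, 4) becomes 5 for power = 1 (magnitude spectrogram) and 25 for power = 2
// (power spectrogram).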
1395
1396 template <typename T>
1397 Status Stft(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int n_fft,
1398 const std::shared_ptr<Tensor> &win, int win_length, int n_columns, bool normalized, float power,
1399 bool onesided) {
1400 CHECK_FAIL_RETURN_UNEXPECTED(win_length != 0, "Spectrogram: win_length can not be zero.");
1401 double win_sum = 0.;
1402 float twice = 2.0;
1403 for (auto iter_win = win->begin<float>(); iter_win != win->end<float>(); iter_win++) {
1404 win_sum += (*iter_win) * (*iter_win);
1405 }
1406 win_sum = std::sqrt(win_sum);
1407 std::shared_ptr<Tensor> spec_f;
1408
1409 RETURN_IF_NOT_OK(
1410 Tensor::CreateEmpty(TensorShape({input->shape()[0], n_fft / 2 + 1, n_columns, 2}), input->type(), &spec_f));
1411
1412 auto spec_f_begin = spec_f->begin<T>();
1413 auto input_win_begin = input->begin<T>();
1414 std::vector<int> spec_f_slice = {(n_fft / 2 + 1) * n_columns * 2, n_columns * 2, 2};
1415 std::vector<int> input_win_slice = {n_columns * win_length, win_length};
1416 std::shared_ptr<Tensor> spec_p;
1417 RETURN_IF_NOT_OK(
1418 Tensor::CreateEmpty(TensorShape({input->shape()[0], n_fft / 2 + 1, n_columns}), input->type(), &spec_p));
1419 std::shared_ptr<Tensor> exp_complex;
1420 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({n_fft / 2 + 1, win_length, 2}), input->type(), &exp_complex));
1421 auto exp_complex_begin = exp_complex->begin<T>();
1422 std::vector<int> exp_complex_slice = {win_length * 2, 2};
1423 for (int i = 0; i < (n_fft / TWO + 1); i++) {
1424 for (int k = 0; k <= win_length - 1; k++) {
1425 ptrdiff_t exp_complex_offset_0 = i * exp_complex_slice[0] + k * exp_complex_slice[1];
1426 ptrdiff_t exp_complex_offset_1 = exp_complex_offset_0 + 1;
1427 *(exp_complex_begin + exp_complex_offset_0) = std::cos(twice * PI * i * k / win_length);
1428 *(exp_complex_begin + exp_complex_offset_1) = std::sin(twice * PI * i * k / win_length);
1429 }
1430 }
1431 for (int r = 0; r < input->shape()[0]; r++) {
1432 for (int i = 0; i < (n_fft / TWO + 1); i++) {
1433 for (int j = 0; j < n_columns; j++) {
1434 T spec_f_0 = 0.;
1435 T spec_f_1 = 0.;
1436 ptrdiff_t exp_complex_offset_0 = i * exp_complex_slice[0];
1437 for (int k = 0; k < win_length; k++) {
1438 ptrdiff_t exp_complex_offset_1 = exp_complex_offset_0 + 1;
1439 T exp_complex_a = *(exp_complex_begin + exp_complex_offset_0);
1440 T exp_complex_b = *(exp_complex_begin + exp_complex_offset_1);
1441 ptrdiff_t input_win_offset = r * input_win_slice[0] + j * input_win_slice[1] + k;
1442 T input_value = *(input_win_begin + input_win_offset);
1443 spec_f_0 += input_value * exp_complex_a;
1444 spec_f_1 += -input_value * exp_complex_b;
1445 exp_complex_offset_0 = exp_complex_offset_1 + 1;
1446 }
1447 ptrdiff_t spec_f_offset_0 = r * spec_f_slice[0] + i * spec_f_slice[1] + j * spec_f_slice[2];
1448 ptrdiff_t spec_f_offset_1 = spec_f_offset_0 + 1;
1449 *(spec_f_begin + spec_f_offset_0) = spec_f_0;
1450 *(spec_f_begin + spec_f_offset_1) = spec_f_1;
1451 }
1452 }
1453 }
1454 CHECK_FAIL_RETURN_UNEXPECTED(win_sum != 0, "Window: the sum of squared window values can not be zero.");
1455 if (normalized) {
1456 for (int r = 0; r < input->shape()[0]; r++) {
1457 for (int i = 0; i < (n_fft / TWO + 1); i++) {
1458 for (int j = 0; j < n_columns; j++) {
1459 ptrdiff_t spec_f_offset_0 = r * spec_f_slice[0] + i * spec_f_slice[1] + j * spec_f_slice[2];
1460 ptrdiff_t spec_f_offset_1 = spec_f_offset_0 + 1;
1461 T spec_norm_a = *(spec_f_begin + spec_f_offset_0);
1462 T spec_norm_b = *(spec_f_begin + spec_f_offset_1);
1463 *(spec_f_begin + spec_f_offset_0) = spec_norm_a / win_sum;
1464 *(spec_f_begin + spec_f_offset_1) = spec_norm_b / win_sum;
1465 }
1466 }
1467 }
1468 }
1469 std::shared_ptr<Tensor> output_onsided;
1470 if (!onesided) {
1471 RETURN_IF_NOT_OK(Onesided<T>(spec_f, &output_onsided, n_fft, n_columns));
1472 if (power == 0) {
1473 *output = output_onsided;
1474 return Status::OK();
1475 }
1476 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({input->shape()[0], n_fft, n_columns}), input->type(), &spec_p));
1477 RETURN_IF_NOT_OK(PowerStft<T>(output_onsided, &spec_p, power, n_columns, n_fft));
1478 *output = spec_p;
1479 return Status::OK();
1480 }
1481 if (power == 0) {
1482 *output = spec_f;
1483 return Status::OK();
1484 }
1485 RETURN_IF_NOT_OK(PowerStft<T>(spec_f, &spec_p, power, n_columns, n_fft / TWO + 1));
1486 *output = spec_p;
1487 return Status::OK();
1488 }
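// The loops above evaluate the DFT of each windowed frame directly:
//   X[freq, frame] = sum_k x[frame, k] * exp(-j * 2 * pi * freq * k / win_length),
// with the cos/sin factors precomputed in exp_complex. This is an O(n^2) transform
// per frame rather than an FFT, traded for implementation simplicity.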
1489
1490 template <typename T>
1491 Status SpectrogramImpl(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int pad,
1492 WindowType window, int n_fft, int hop_length, int win_length, float power, bool normalized,
1493 bool center, BorderType pad_mode, bool onesided) {
1494 std::shared_ptr<Tensor> fft_window_tensor;
1495 std::shared_ptr<Tensor> fft_window_later;
1496 TensorShape shape = input->shape();
1497 std::vector output_shape = shape.AsVector();
1498 output_shape.pop_back();
1499 int input_len = input->shape()[-1];
1500
1501 RETURN_IF_NOT_OK(input->Reshape(TensorShape({input->Size() / input_len, input_len})));
1502
1503 DataType data_type = input->type();
1504 // get the windows
1505 RETURN_IF_NOT_OK(Window(&fft_window_tensor, window, win_length));
1506 if (win_length == 1) {
1507 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({1}), DataType(DataType::DE_FLOAT32), &fft_window_tensor));
1508 auto win = fft_window_tensor->begin<float>();
1509 *(win) = 1;
1510 }
1511
1512 // Pad window length
1513 int pad_left = (n_fft - win_length) / 2;
1514 int pad_right = n_fft - win_length - pad_left;
1515 RETURN_IF_NOT_OK(fft_window_tensor->Reshape(TensorShape({1, win_length})));
1516 RETURN_IF_NOT_OK(Pad<float>(fft_window_tensor, &fft_window_later, pad_left, pad_right, BorderType::kConstant));
1517 RETURN_IF_NOT_OK(fft_window_later->Reshape(TensorShape({n_fft})));
1518
1519 int length = input_len + pad * 2 + n_fft;
1520
1521 std::shared_ptr<Tensor> input_data_tensor;
1522 std::shared_ptr<Tensor> input_data_tensor_pad;
1523 RETURN_IF_NOT_OK(
1524 Tensor::CreateEmpty(TensorShape({input->shape()[0], input_len + pad * 2}), data_type, &input_data_tensor_pad));
1525 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({input->shape()[0], length}), data_type, &input_data_tensor));
1526
1527 RETURN_IF_NOT_OK(Pad<T>(input, &input_data_tensor_pad, pad, pad, BorderType::kConstant));
1528
1529 if (center) {
1530 RETURN_IF_NOT_OK(Pad<T>(input_data_tensor_pad, &input_data_tensor, n_fft / TWO, n_fft / TWO, pad_mode));
1531 } else {
1532 input_data_tensor = input_data_tensor_pad;
1533 }
1534
1535 CHECK_FAIL_RETURN_UNEXPECTED(n_fft <= input_data_tensor->shape()[-1],
1536 "Spectrogram: n_fft should be more than 0 and no more than " +
1537 std::to_string(input_data_tensor->shape()[-1]) +
1538 ", but got n_fft: " + std::to_string(n_fft) + ".");
1539
1540 // calculate the number of frames, i.e. the sliding times of the window: n_columns = 1 + (padded_len - n_fft) / hop_length
1541 CHECK_FAIL_RETURN_UNEXPECTED(hop_length != 0, "Spectrogram: hop_length can not be zero.");
1542 int n_columns = 0;
1543 while ((1 + n_columns++) * hop_length + n_fft <= input_data_tensor->shape()[-1]) {
1544 }
1545 std::shared_ptr<Tensor> stft_compute;
1546
1547 auto input_begin = input_data_tensor->begin<T>();
1548 std::vector<int> input_win_slice = {n_columns * n_fft, n_fft};
1549 auto iter_win = fft_window_later->begin<float>();
1550 std::shared_ptr<Tensor> input_win;
1551 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({input_data_tensor->shape()[0], n_columns, n_fft}),
1552 input_data_tensor->type(), &input_win));
1553 auto input_win_begin = input_win->begin<T>();
1554 for (int r = 0; r < input_data_tensor->shape()[0]; r++) {
1555 for (int j = 0; j < n_columns; j++) {
1556 for (int k = 0; k < n_fft; k++) {
1557 ptrdiff_t win_offset = k;
1558 float win_value = *(iter_win + win_offset);
1559 ptrdiff_t input_stft_offset = r * input_data_tensor->shape()[-1] + j * hop_length + k;
1560 T input_value = *(input_begin + input_stft_offset);
1561 ptrdiff_t input_win_offset = r * input_win_slice[0] + j * input_win_slice[1] + k;
1562 *(input_win_begin + input_win_offset) = win_value * input_value;
1563 }
1564 }
1565 }
1566 RETURN_IF_NOT_OK(
1567 Stft<T>(input_win, &stft_compute, n_fft, fft_window_later, n_fft, n_columns, normalized, power, onesided));
1568 if (onesided) {
1569 output_shape.push_back(n_fft / TWO + 1);
1570 } else {
1571 output_shape.push_back(n_fft);
1572 }
1573 output_shape.push_back(n_columns);
1574 if (power == 0) {
1575 output_shape.push_back(TWO);
1576 }
1577 // reshape the output
1578 RETURN_IF_NOT_OK(stft_compute->Reshape(TensorShape({output_shape})));
1579 *output = stft_compute;
1580 return Status::OK();
1581 }
1582
1583 Status Spectrogram(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int pad, WindowType window,
1584 int n_fft, int hop_length, int win_length, float power, bool normalized, bool center,
1585 BorderType pad_mode, bool onesided) {
1586 TensorShape input_shape = input->shape();
1587
1588 CHECK_FAIL_RETURN_UNEXPECTED(
1589 input->type().IsNumeric(),
1590 "Spectrogram: input tensor type should be int, float or double, but got: " + input->type().ToString());
1591
1592 CHECK_FAIL_RETURN_UNEXPECTED(input_shape.Size() > 0, "Spectrogram: input tensor is not in shape of <..., time>.");
1593
1594 std::shared_ptr<Tensor> input_tensor;
1595 if (input->type() != DataType::DE_FLOAT64) {
1596 RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)));
1597 return SpectrogramImpl<float>(input_tensor, output, pad, window, n_fft, hop_length, win_length, power, normalized,
1598 center, pad_mode, onesided);
1599 } else {
1600 input_tensor = input;
1601 return SpectrogramImpl<double>(input_tensor, output, pad, window, n_fft, hop_length, win_length, power, normalized,
1602 center, pad_mode, onesided);
1603 }
1604 }
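// Usage sketch (illustrative only, not called anywhere in this file): a typical power
// spectrogram with a 400-sample Hann window, a 512-point FFT and a hop of 256 samples.
// The parameter values are conventional choices, not defaults mandated by this file.
inline Status ExampleSpectrogramUsage(const std::shared_ptr<Tensor> &waveform, std::shared_ptr<Tensor> *spec) {
  return Spectrogram(waveform, spec, /*pad=*/0, WindowType::kHann, /*n_fft=*/512, /*hop_length=*/256,
                     /*win_length=*/400, /*power=*/2.0, /*normalized=*/false, /*center=*/true, BorderType::kReflect,
                     /*onesided=*/true);
}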
1605
1606 template <typename T>
1607 Status SpectralCentroidImpl(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int sample_rate,
1608 int n_fft, int win_length, int hop_length, int pad, WindowType window) {
1609 std::shared_ptr<Tensor> output_tensor;
1610 std::shared_ptr<Tensor> spectrogram_tensor;
1611 if (input->type() == DataType::DE_FLOAT64) {
1612 RETURN_IF_NOT_OK(SpectrogramImpl<double>(input, &spectrogram_tensor, pad, window, n_fft, hop_length, win_length,
1613 1.0, false, true, BorderType::kReflect, true));
1614 } else {
1615 RETURN_IF_NOT_OK(SpectrogramImpl<float>(input, &spectrogram_tensor, pad, window, n_fft, hop_length, win_length,
1616 1.0, false, true, BorderType::kReflect, true));
1617 }
1618 std::shared_ptr<Tensor> freqs;
1619 // frequency bins run from 0 to sample_rate / 2, with 1 + n_fft / 2 points
1620 RETURN_IF_NOT_OK(Linspace<T>(&freqs, 0, sample_rate / TWO, 1 + n_fft / TWO));
1621 auto itr_freq = freqs->begin<T>();
1622 int num = freqs->Size();
1623 TensorShape spectrogram_shape = spectrogram_tensor->shape();
1624 int waveform = spectrogram_shape[-1];
1625 int channels = spectrogram_shape[-2];
1626 std::vector output_shape = spectrogram_shape.AsVector();
1627 output_shape[output_shape.size() - TWO] = 1;
1628 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape{output_shape}, input->type(), &output_tensor));
1629 Eigen::MatrixXd freqs_r = Eigen::MatrixXd::Zero(num, 1);
1630 for (int i = 0; i < num; ++i) {
1631 freqs_r(i, 0) = *itr_freq;
1632 itr_freq++;
1633 }
1634 int k_num = spectrogram_tensor->Size() / (waveform * channels);
1635 std::vector<Eigen::MatrixXd> specgram;
1636 std::vector<Eigen::MatrixXd> specgram_result;
1637 std::vector<Eigen::MatrixXd> specgram_sum;
1638 Eigen::MatrixXd tmp = Eigen::MatrixXd::Zero(channels, waveform);
1639 auto itr_spectrogram = spectrogram_tensor->begin<T>();
1640 for (int k = 0; k < k_num; k++) {
1641 for (int i = 0; i < channels; ++i) {
1642 for (int j = 0; j < waveform; ++j) {
1643 tmp(i, j) = *itr_spectrogram;
1644 itr_spectrogram++;
1645 }
1646 }
1647 specgram.push_back(tmp);
1648 (void)specgram_sum.emplace_back(specgram[k].colwise().sum());
1649 }
1650 for (int k = 0; k < k_num; k++) {
1651 for (int i = 0; i < channels; ++i) {
1652 for (int j = 0; j < waveform; ++j) {
1653 tmp(i, j) = freqs_r(i, 0) * specgram[k](i, j);
1654 }
1655 }
1656 (void)specgram_result.emplace_back((tmp).colwise().sum());
1657 }
1658 auto itr_output = output_tensor->begin<T>();
1659 for (int k = 0; k < k_num; k++) {
1660 for (int i = 0; i < waveform; ++i) {
1661 *itr_output = specgram_result[k](0, i) / specgram_sum[k](0, i);
1662 itr_output++;
1663 }
1664 }
1665 *output = output_tensor;
1666 return Status::OK();
1667 }
1668
1669 Status SpectralCentroid(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int sample_rate,
1670 int n_fft, int win_length, int hop_length, int pad, WindowType window) {
1671 RETURN_IF_NOT_OK(ValidateLowRank("SpectralCentroid", input, kMinAudioDim, "<..., time>"));
1672 RETURN_IF_NOT_OK(ValidateTensorNumeric("SpectralCentroid", input));
1673
1674 std::shared_ptr<Tensor> input_tensor;
1675 if (input->type() != DataType::DE_FLOAT64) {
1676 RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)));
1677 return SpectralCentroidImpl<float>(input_tensor, output, sample_rate, n_fft, win_length, hop_length, pad, window);
1678 } else {
1679 input_tensor = input;
1680 return SpectralCentroidImpl<double>(input_tensor, output, sample_rate, n_fft, win_length, hop_length, pad, window);
1681 }
1682 }
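// The spectral centroid is the magnitude-weighted mean frequency of each frame:
//   centroid_t = sum_f f * S(f, t) / sum_f S(f, t).
// For example, bins at {0, 100, 200} Hz with magnitudes {1, 2, 1} give
// (0*1 + 100*2 + 200*1) / (1 + 2 + 1) = 100 Hz.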
1683
1684 Status ComputeDeltas(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t win_length,
1685 const BorderType &mode) {
1686 RETURN_IF_NOT_OK(ValidateLowRank("ComputeDeltas", input, kDefaultAudioDim, "<..., freq, time>"));
1687 RETURN_IF_NOT_OK(ValidateTensorNumeric("ComputeDeltas", input));
1688
1689 // reshape Tensor from <..., freq, time> to <-1, time>
1690 auto raw_shape = input->shape();
1691 int32_t n_frames = raw_shape[-1];
1692 int32_t all_freqs = raw_shape.NumOfElements() / n_frames;
1693 RETURN_IF_NOT_OK(input->Reshape(TensorShape{all_freqs, n_frames}));
1694
1695 int32_t n = (win_length - 1) / 2;
1696
1697 std::shared_ptr<Tensor> specgram_local_pad;
1698 if (input->type() == DataType(DataType::DE_FLOAT64)) {
1699 RETURN_IF_NOT_OK(Pad<double>(input, &specgram_local_pad, n, n, mode));
1700 RETURN_IF_NOT_OK(ComputeDeltasImpl<double>(specgram_local_pad, output, all_freqs, n_frames, n));
1701 } else {
1702 std::shared_ptr<Tensor> float_tensor;
1703 RETURN_IF_NOT_OK(TypeCast(input, &float_tensor, DataType(DataType::DE_FLOAT32)));
1704 RETURN_IF_NOT_OK(Pad<float>(float_tensor, &specgram_local_pad, n, n, mode));
1705 RETURN_IF_NOT_OK(ComputeDeltasImpl<float>(specgram_local_pad, output, all_freqs, n_frames, n));
1706 }
1707 RETURN_IF_NOT_OK((*output)->Reshape(raw_shape));
1708 return Status::OK();
1709 }
1710
1711 /// \brief IRFFT: inverse FFT of a onesided (real-signal) spectrum, applied column by column.
1712 Status IRFFT(const Eigen::MatrixXcd &stft_matrix, Eigen::MatrixXd *inverse) {
1713 int32_t n = 2 * (stft_matrix.rows() - 1);
1714 int32_t s = stft_matrix.rows() - 1;
1715 Eigen::FFT<double> fft;
1716 for (int k = 0; k < stft_matrix.cols(); ++k) {
1717 Eigen::VectorXcd output_complex(n);
1718 // pad input
1719 Eigen::VectorXcd input(n);
1720 input.head(s + 1) = stft_matrix.col(k);
1721 auto reverse_pad = stft_matrix.col(k).segment(1, s - 1).colwise().reverse().conjugate();
1722 input.segment(s + 1, s - 1) = reverse_pad;
1723 fft.inv(output_complex, input);
1724 Eigen::VectorXd output_real = output_complex.real().eval();
1725 inverse->col(k) = Eigen::Map<Eigen::MatrixXd>(output_real.data(), n, 1);
1726 }
1727 return Status::OK();
1728 }
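// IRFFT reconstructs the full Hermitian spectrum before inverting: for a column
// [X0, X1, ..., Xs] of a real signal's onesided FFT, the length-n input becomes
// [X0, ..., Xs, conj(X_{s-1}), ..., conj(X_1)] with n = 2 * s. For instance, 3 input
// rows (s = 2) expand to the 4-point input [X0, X1, X2, conj(X1)].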
1729
1730 /// \brief Overlap-add the windowed inverse-STFT frames into the output buffer.
1731 Status OverlapAdd(Eigen::VectorXd *out_buf, const Eigen::MatrixXd &win_inverse_stft, int32_t hop_length) {
1732 int32_t n_fft = win_inverse_stft.rows();
1733 for (int frame = 0; frame < win_inverse_stft.cols(); frame++) {
1734 int32_t sample = frame * hop_length;
1735 out_buf->middleRows(sample, n_fft) += win_inverse_stft.col(frame);
1736 }
1737 return Status::OK();
1738 }
1739
1740 /// \brief Window Sum Square: accumulate the squared-window envelope across overlapping frames.
1741 Status WindowSumSquare(const Eigen::MatrixXf &window_matrix, Eigen::VectorXf *win_sum_square, const int32_t n_frames,
1742 int32_t n_fft, int32_t hop_length) {
1743 Eigen::MatrixXf win_norm = window_matrix.array().pow(2);
1744 // window sum square fill
1745 int32_t n = n_fft + hop_length * (n_frames - 1);
1746 // check n_fft
1747 CHECK_FAIL_RETURN_UNEXPECTED(
1748 n_fft == win_norm.rows(),
1749 "GriffinLim: n_fft must be equal to the length of the window during window sum square calculation.");
1750 for (int ind = 0; ind < n_frames; ind++) {
1751 int sample = ind * hop_length;
1752 int end_ss = std::min(n, sample + n_fft);
1753 int end_win = std::max(0, std::min(n_fft, n - sample));
1754 win_sum_square->segment(sample, end_ss - sample) += win_norm.col(0).head(end_win);
1755 }
1756 return Status::OK();
1757 }
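// The sum-square envelope accumulated above is
//   wss[t] = sum_m w^2[t - m * hop_length],
// i.e. how much total window energy covers sample t across overlapping frames;
// ISTFT divides the overlap-added signal by it to undo the analysis/synthesis windowing.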
1758
1759 /// \brief ISTFT.
1760 /// \param stft_matrix: Complex Eigen matrix, shape of <freq, time>.
1761 /// \param output: Tensor of shape <time>.
1762 /// \param n_fft: Size of Fourier transform.
1763 /// \param hop_length: The distance between neighboring sliding window frames.
1764 /// \param win_length: The size of window frame and STFT filter.
1765 /// \param window_type: The type of window function.
1766 /// \param center: Whether input was padded on both sides so that the `t`-th frame is centered at time
1767 /// `t * hop_length`.
1768 /// \param length: The expected length of the output signal; the result is trimmed or zero-padded to match
1769 /// (0 keeps the natural length).
1771 /// \return Status code.
1772 template <typename T>
1773 Status ISTFT(const Eigen::MatrixXcd &stft_matrix, std::shared_ptr<Tensor> *output, int32_t n_fft, int32_t hop_length,
1774 int32_t win_length, WindowType window_type, bool center, int32_t length) {
1775 // check input
1776 constexpr int64_t transform_size = 2;
1777 CHECK_FAIL_RETURN_UNEXPECTED(n_fft == ((stft_matrix.rows() - 1) * transform_size),
1778 "GriffinLim: the frequency dimension of the input should be equal to n_fft / 2 + 1.");
1779
1780 // window
1781 std::shared_ptr<Tensor> ifft_window_tensor;
1782 RETURN_IF_NOT_OK(Window(&ifft_window_tensor, window_type, win_length));
1783 if (win_length == 1) {
1784 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({1}), DataType(DataType::DE_FLOAT32), &ifft_window_tensor));
1785 auto win = ifft_window_tensor->begin<float>();
1786 *(win) = 1;
1787 }
1788 // pad window to match n_fft, and add a broadcasting axis
1789 std::shared_ptr<Tensor> ifft_window_pad;
1790 ifft_window_pad = ifft_window_tensor;
1791 if (win_length < n_fft) {
1792 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({n_fft}), DataType(DataType::DE_FLOAT32), &ifft_window_pad));
1793 int pad_left = (n_fft - win_length) / 2;
1794 int pad_right = n_fft - win_length - pad_left;
1795 RETURN_IF_NOT_OK(Pad<float>(ifft_window_tensor, &ifft_window_pad, pad_left, pad_right, BorderType::kConstant));
1796 }
1797
1798 int32_t n_frames = 0;
1799 if ((length != 0) && (hop_length != 0)) {
1800 int32_t padded_length = center ? (length + n_fft) : length;
1801 n_frames = std::min(static_cast<int32_t>(stft_matrix.cols()),
1802 static_cast<int32_t>(std::ceil(static_cast<float>(padded_length) / hop_length)));
1803 } else {
1804 n_frames = stft_matrix.cols();
1805 }
1806 int32_t expected_signal_len = n_fft + hop_length * (n_frames - 1);
1807 Eigen::VectorXd y(expected_signal_len);
1808 y.setZero();
1809
1810 // constrain STFT block sizes to 256 KB
1811 int32_t max_mem_block = std::pow(2, 8) * std::pow(2, 10);
1812 int32_t n_columns = max_mem_block / (stft_matrix.rows() * sizeof(T) * TWO);
1813 n_columns = std::max(n_columns, 1);
1814
1815 // turn window to eigen matrix
1816 auto data_ptr = &*ifft_window_pad->begin<float>();
1817 Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> ifft_window_matrix(data_ptr,
1818 ifft_window_pad->shape()[0], 1);
1819 for (int bl_s = 0, frame = 0; bl_s < n_frames;) {
1820 int bl_t = std::min(bl_s + n_columns, n_frames);
1821 // calculate ifft
1822 Eigen::MatrixXcd stft_temp = stft_matrix.middleCols(bl_s, bl_t - bl_s).eval();
1823 Eigen::MatrixXd inverse(TWO * (stft_temp.rows() - 1), stft_temp.cols());
1824 inverse.setZero();
1825 RETURN_IF_NOT_OK(IRFFT(stft_temp, &inverse));
1826 auto ytmp = ifft_window_matrix.template cast<double>().replicate(1, inverse.cols()).cwiseProduct(inverse);
1827 RETURN_IF_NOT_OK(OverlapAdd(&y, ytmp, hop_length));
1828 frame += bl_t - bl_s;
1829 bl_s += n_columns;
1830 }
1831 // normalize by sum of squared window
1832 int32_t n = n_fft + hop_length * (n_frames - 1);
1833 Eigen::VectorXf ifft_win_sum(n);
1834 ifft_win_sum.setZero();
1835 RETURN_IF_NOT_OK(WindowSumSquare(ifft_window_matrix, &ifft_win_sum, n_frames, n_fft, hop_length));
1836
1837 for (int32_t ind = 0; ind < y.rows(); ind++) {
1838 if (ifft_win_sum[ind] > std::numeric_limits<float>::min()) {
1839 y[ind] /= ifft_win_sum[ind];
1840 }
1841 }
1842
1843 std::shared_ptr<Tensor> res_tensor;
1844 if (length == 0 && center) {
1845 int32_t y_start = n_fft / 2;
1846 int32_t y_end = y.rows() - y_start;
1847 auto tmp = y.middleRows(y_start, y_end - y_start);
1848 std::vector<T> y_res(tmp.data(), tmp.data() + tmp.rows() * tmp.cols());
1849 RETURN_IF_NOT_OK(Tensor::CreateFromVector(y_res, TensorShape({tmp.size()}), &res_tensor));
1850 } else {
1851 int32_t start = center ? n_fft / 2 : 0;
1852 auto tmp = y.tail(y.rows() - start);
1853 // fix length
1854 std::vector<T> y_res(tmp.data(), tmp.data() + tmp.rows() * tmp.cols());
1855 if (length > y_res.size()) {
1856 while (y_res.size() != length) {
1857 y_res.push_back(0);
1858 }
1859 } else if (length < tmp.rows()) {
1860 while (y_res.size() != length) {
1861 y_res.pop_back();
1862 }
1863 }
1864 RETURN_IF_NOT_OK(Tensor::CreateFromVector(y_res, TensorShape({length}), &res_tensor));
1865 }
1866 *output = res_tensor;
1867 return Status::OK();
1868 }
1869
1870 template <typename T>
1871 Status Normalized(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_fft,
1872 int32_t win_length, WindowType window) {
1873 RETURN_UNEXPECTED_IF_NULL(input);
1874 RETURN_UNEXPECTED_IF_NULL(output);
1875 // window
1876 std::shared_ptr<Tensor> ifft_win_tensor;
1877 RETURN_IF_NOT_OK(Window(&ifft_win_tensor, window, win_length));
1878 if (win_length == 1) {
1879 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({1}), DataType(DataType::DE_FLOAT32), &ifft_win_tensor));
1880 auto win = ifft_win_tensor->begin<float>();
1881 *(win) = 1;
1882 }
1883 // pad window to match n_fft, and add a broadcasting axis
1884 std::shared_ptr<Tensor> ifft_win_pad;
1885 ifft_win_pad = ifft_win_tensor;
1886 if (win_length < n_fft) {
1887 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({n_fft}), DataType(DataType::DE_FLOAT32), &ifft_win_pad));
1888 int pad_left = (n_fft - win_length) / TWO;
1889 int pad_right = n_fft - win_length - pad_left;
1890 RETURN_IF_NOT_OK(Pad<float>(ifft_win_tensor, &ifft_win_pad, pad_left, pad_right, BorderType::kConstant));
1891 }
1892 float win_sum = 0.;
1893 for (auto iter_win = ifft_win_pad->begin<float>(); iter_win != ifft_win_pad->end<float>(); ++iter_win) {
1894 win_sum += (*iter_win) * (*iter_win);
1895 }
1896 win_sum = std::sqrt(win_sum);
1897 for (auto iter_input = input->begin<T>(); iter_input != input->end<T>(); ++iter_input) {
1898 *iter_input = (*iter_input) * win_sum;
1899 }
1900 *output = input;
1901 return Status::OK();
1902 }
1903
1904 template <typename T>
1905 Status Transistft(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t length, int32_t n_fft,
1906 int32_t win_length, int32_t hop_length, WindowType window, bool center, bool onesided) {
1907 RETURN_UNEXPECTED_IF_NULL(output);
1908 TensorShape input_shape = input->shape();
1909 const int32_t kThreeAudioDim = 3;
1910 auto m = n_fft / TWO + 1;
1911 auto n = input_shape[-kDefaultAudioDim];
1912 auto r = input_shape[-kThreeAudioDim];
1913 auto size = r * n * input_shape[kThreeAudioDim];
1914 // construct new tensor by input
1915 std::shared_ptr<Tensor> istft_results;
1916 for (int dim = 0; dim < input_shape[0]; dim++) {
1917 // slice and squeeze the first dim
1918 Tensor::TensorIterator<T> itr = input->begin<T>();
1919 itr += dim * size;
1920 // onesided
1921 int a = onesided ? r : m;
1922 // turn tensor to eigen matrixXcd
1923 std::shared_ptr<Tensor> waveform;
1924 Eigen::MatrixXcd stft_matrix(a, n);
1925 for (int32_t index = 0; index < a; index++) {
1926 for (int32_t cnt = 0; cnt < n; cnt++) {
1927 if (itr < input->begin<T>() + (dim + 1) * size) { // guard: stay within the current slice
1928 T real = *(itr++);
1929 T img = *(itr++);
1930 stft_matrix(index, cnt) = std::complex<double>(real, img);
1931 }
1932 }
1933 }
1934 RETURN_IF_NOT_OK(ISTFT<T>(stft_matrix, &waveform, n_fft, hop_length, win_length, window, center, length));
1935 if (input->shape().Rank() == TWO) {
1936 // do not expand dim
1937 istft_results = waveform;
1938 continue;
1939 }
1940
1941 if (istft_results != nullptr) {
1942 RETURN_IF_NOT_OK(istft_results->InsertTensor({dim}, waveform));
1943 } else {
1944 RETURN_IF_NOT_OK(
1945 Tensor::CreateEmpty(TensorShape({input_shape[0], waveform->shape()[0]}), waveform->type(), &istft_results));
1946 RETURN_IF_NOT_OK(istft_results->InsertTensor({dim}, waveform));
1947 }
1948 }
1949 *output = istft_results;
1950 return Status::OK();
1951 }
1952
1953 template <typename T>
1954 Status InverseSpectrogramImpl(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t length,
1955 int32_t n_fft, int32_t win_length, int32_t hop_length, int32_t pad, WindowType window,
1956 bool normalized, bool center, BorderType pad_mode, bool onesided) {
1957 RETURN_UNEXPECTED_IF_NULL(output);
1958 // pack
1959 TensorShape input_shape = input->shape();
1960 std::vector output_shape = input_shape.AsVector();
1961 TensorShape input_reshape({input->Size() / input_shape[-1] / input_shape[-2] / input_shape[-3], input_shape[-3],
1962 input_shape[-2], input_shape[-1]});
1963 RETURN_IF_NOT_OK(input->Reshape(input_reshape));
1964 // normalized
1965 std::shared_ptr<Tensor> normal_out;
1966 normal_out = input;
1967 if (normalized) {
1968 RETURN_IF_NOT_OK(Normalized<T>(input, &normal_out, n_fft, win_length, window));
1969 }
1970 // construct new tensor by input
1971 std::shared_ptr<Tensor> final_results;
1972 int32_t output_length = length == 0 ? (input_shape[-kDefaultAudioDim] - 1) * hop_length : length + TWO * pad;
1973 RETURN_IF_NOT_OK(
1974 Transistft<T>(normal_out, &final_results, output_length, n_fft, win_length, hop_length, window, center, onesided));
1975 // remove padding from the front and back
1976 TensorShape waveform_shape = final_results->shape();
1977 auto dim1_size = waveform_shape[-1];
1978 auto dim2_size = waveform_shape[-kDefaultAudioDim];
1979 auto cols = waveform_shape[-1] - TWO * pad;
1980 auto start = &*final_results->begin<T>();
1981 auto dim = dim1_size * dim2_size;
1982 std::shared_ptr<Tensor> waveform;
1983 if (length != 0 && (pad > 0)) {
1984 std::vector<float> result(dim2_size * cols);
1985 int32_t index = 0;
1986 for (auto i = 0; i < dim; i += dim1_size) {
1987 int32_t cnt = 0;
1988 for (auto itr = start + i + pad; cnt < cols; ++itr) {
1989 result[index] = *itr;
1990 ++index;
1991 ++cnt;
1992 }
1993 }
1994 TensorShape change_shape({dim2_size, cols});
1995 RETURN_IF_NOT_OK(Tensor::CreateFromVector(result, change_shape, &waveform));
1996 } else {
1997 waveform = final_results;
1998 }
1999 // unpack
2000 output_shape.pop_back();
2001 output_shape.pop_back();
2002 output_shape.pop_back();
2003 output_shape.push_back(cols);
2004 RETURN_IF_NOT_OK(waveform->Reshape(TensorShape(output_shape)));
2005 *output = waveform;
2006 return Status::OK();
2007 }
2008
2009 Status InverseSpectrogram(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t length,
2010 int32_t n_fft, int32_t win_length, int32_t hop_length, int32_t pad, WindowType window,
2011 bool normalized, bool center, BorderType pad_mode, bool onesided) {
2012 RETURN_UNEXPECTED_IF_NULL(output);
2013 RETURN_IF_NOT_OK(ValidateTensorShape("InverseSpectrogram",
2014 input->shape().Size() > kDefaultAudioDim && input->IsComplex(),
2015 "<..., freq, time, complex=2>"));
2016 RETURN_IF_NOT_OK(ValidateTensorNumeric("InverseSpectrogram", input));
2017 if (input->type().value() != DataType::DE_FLOAT64) {
2018 RETURN_IF_NOT_OK(InverseSpectrogramImpl<float>(input, output, length, n_fft, win_length, hop_length, pad, window,
2019 normalized, center, pad_mode, onesided));
2020 } else {
2021 RETURN_IF_NOT_OK(InverseSpectrogramImpl<double>(input, output, length, n_fft, win_length, hop_length, pad, window,
2022 normalized, center, pad_mode, onesided));
2023 }
2024 return Status::OK();
2025 }
2026
2027 template <typename T>
2028 Status GriffinLimImpl(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_fft,
2029 int32_t n_iter, int32_t win_length, int32_t hop_length, WindowType window_type, float power,
2030 float momentum, int32_t length, bool rand_init, std::mt19937 *rnd) {
2031 RETURN_IF_NOT_OK(Tensor::CreateFromTensor(input, output));
2032 // pack
2033 TensorShape shape = (*output)->shape();
2034 TensorShape new_shape({(*output)->Size() / shape[-1] / shape[-2], shape[-2], shape[-1]});
2035 RETURN_IF_NOT_OK((*output)->Reshape(new_shape));
2036 // power
2037 CHECK_FAIL_RETURN_UNEXPECTED(power != 0, "GriffinLim: power can not be zero.");
2038 for (auto itr = (*output)->begin<T>(); itr != (*output)->end<T>(); itr++) {
2039 *itr = pow(*itr, 1 / power);
2040 }
2041 // window
2042 std::shared_ptr<Tensor> fft_window_tensor;
2043 RETURN_IF_NOT_OK(Window(&fft_window_tensor, window_type, win_length));
2044 if (win_length == 1) {
2045 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({1}), DataType(DataType::DE_FLOAT32), &fft_window_tensor));
2046 auto win = fft_window_tensor->begin<float>();
2047 *(win) = 1;
2048 }
2049 std::shared_ptr<Tensor> final_results;
2050 for (int dim = 0; dim < new_shape[0]; dim++) {
2051 // init complex phase
2052 Eigen::MatrixXcd angles(shape[(-1) * TWO], shape[-1]);
2053 if (rand_init) {
2055 std::uniform_real_distribution<double> dist(0, 1);
2056 angles = angles.unaryExpr(
2057 [&dist, &rnd](std::complex<double> value) { return std::complex<double>(dist(*rnd), dist(*rnd)); });
2058 } else {
2059 angles = angles.unaryExpr([](std::complex<double> value) { return std::complex<double>(1, 0); });
2060 }
2061 // slice and squeeze the first dim
2062 std::shared_ptr<Tensor> spec_tensor_slice;
2063 RETURN_IF_NOT_OK((*output)->Slice(
2064 &spec_tensor_slice,
2065 std::vector<SliceOption>({SliceOption(std::vector<dsize_t>{dim}), SliceOption(true), SliceOption(true)})));
2066 TensorShape new_slice_shape({shape[-2], shape[-1]});
2067 RETURN_IF_NOT_OK(spec_tensor_slice->Reshape(new_slice_shape));
2068 // turn tensor into eigen MatrixXd
2069 auto data_ptr = &*spec_tensor_slice->begin<T>();
2070 Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> spec_matrix_transpose(data_ptr, shape[-1],
2071 shape[(-1) * TWO]);
2072 Eigen::MatrixXd spec_matrix = spec_matrix_transpose.transpose().template cast<double>();
2073 auto stft_complex = angles.cwiseProduct(spec_matrix);
2074
2075 // init tprev zero mat
2076 Eigen::MatrixXcd tprev(shape[(-1) * TWO], shape[-1]);
2077 Eigen::MatrixXcd rebuilt(shape[(-1) * TWO], shape[-1]);
2078 tprev.setZero();
2079 rebuilt.setZero();
2080 for (int iter = 0; iter < n_iter; iter++) {
2081 // istft
2082 std::shared_ptr<Tensor> inverse;
2083 RETURN_IF_NOT_OK(ISTFT<T>(stft_complex, &inverse, n_fft, hop_length, win_length, window_type, true, length));
2084 // stft
2085 std::shared_ptr<Tensor> stft_out;
2086 RETURN_IF_NOT_OK(SpectrogramImpl<T>(inverse, &stft_out, 0, window_type, n_fft, hop_length, win_length, 0, false,
2087 true, BorderType::kReflect, true));
2088
2089 rebuilt.transposeInPlace(); // transposed so the column-major fill below matches stft_out's <freq, time, complex> order
2090 Tensor::TensorIterator<T> itr = stft_out->begin<T>();
2091 rebuilt = rebuilt.unaryExpr([&itr](std::complex<double> value) {
2092 T real = *(itr++);
2093 T img = *(itr++);
2094 return std::complex<double>(real, img);
2095 });
2096 rebuilt.transposeInPlace();
2097 angles = rebuilt.array();
2098
2099 if (momentum != 0) {
2100 tprev = tprev * ((momentum) / (1 + momentum));
2101 angles = angles.array() - tprev.array();
2102 }
2103
2104 float eps = 1e-16;
2105 auto angles_abs = angles.cwiseAbs().eval();
2106 angles_abs.array() += eps;
2107 angles = angles.array() / angles_abs.array();
2108 tprev = rebuilt.array();
2109 }
2110
2111 // istft calculate final phase
2112 auto stft_complex_fin = angles.cwiseProduct(spec_matrix);
2113 std::shared_ptr<Tensor> waveform;
2114 RETURN_IF_NOT_OK(ISTFT<T>(stft_complex_fin, &waveform, n_fft, hop_length, win_length, window_type, true, length));
2115
2116 if (shape.Rank() == TWO) {
2117 // do not expand dim
2118 final_results = waveform;
2119 continue;
2120 }
2121
2122 if (final_results != nullptr) {
2123 RETURN_IF_NOT_OK(final_results->InsertTensor({dim}, waveform));
2124 } else {
2125 RETURN_IF_NOT_OK(
2126 Tensor::CreateEmpty(TensorShape({new_shape[0], waveform->shape()[0]}), waveform->type(), &final_results));
2127 RETURN_IF_NOT_OK(final_results->InsertTensor({dim}, waveform));
2128 }
2129 }
2130 *output = final_results;
2131 return Status::OK();
2132 }
2133
2134 Status GriffinLim(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_fft, int32_t n_iter,
2135 int32_t win_length, int32_t hop_length, WindowType window_type, float power, float momentum,
2136 int32_t length, bool rand_init, std::mt19937 *rnd) {
2137 std::shared_ptr<Tensor> input_tensor;
2138 if (input->type() != DataType::DE_FLOAT64) {
2139 RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)));
2140 return GriffinLimImpl<float>(input_tensor, output, n_fft, n_iter, win_length, hop_length, window_type, power,
2141 momentum, length, rand_init, rnd);
2142 } else {
2143 input_tensor = input;
2144 return GriffinLimImpl<double>(input_tensor, output, n_fft, n_iter, win_length, hop_length, window_type, power,
2145 momentum, length, rand_init, rnd);
2146 }
2147 }
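// GriffinLimImpl is the classic Griffin-Lim phase-recovery iteration (Griffin & Lim, 1984)
// with a momentum term in the style of the fast variant (Perraudin et al., 2013): starting
// from random (or unit-valued) initial phases, it repeatedly runs ISTFT followed by STFT,
// keeps only the phase of the rebuilt spectrum (normalizing by its magnitude plus a small
// eps), and reapplies the target magnitudes; the momentum step extrapolates against the
// previous rebuilt spectrum (tprev) to speed up convergence.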
2148
2149 template <typename T>
2150 Status InverseMelScaleImpl(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_stft,
2151 int32_t n_mels, int32_t sample_rate, float f_min, float f_max, int32_t max_iter,
2152 float tolerance_loss, float tolerance_change, float sgd_lr, float sgd_momentum,
2153 NormType norm, MelType mel_type, std::mt19937 *rnd) {
2154 constexpr int32_t sample_rate_factor = 2;
2155 f_max = std::fabs(f_max) <= std::numeric_limits<float>::epsilon()
2156 ? static_cast<T>(std::floor(sample_rate / sample_rate_factor))
2157 : f_max;
2158 // create fb mat <freq, n_mels>
2159 std::shared_ptr<Tensor> freq_bin_mat;
2160 RETURN_IF_NOT_OK(CreateFbanks<T>(&freq_bin_mat, n_stft, f_min, f_max, n_mels, sample_rate, norm, mel_type));
2161
2162 auto fb_ptr = &*freq_bin_mat->begin<float>();
2163 Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> matrix_fb(fb_ptr, n_mels, n_stft);
2164 // pack melspec <n, n_mels, time>
2165 TensorShape input_shape = input->shape();
2166 TensorShape input_reshape({input->Size() / input_shape[-1] / input_shape[-2], input_shape[-2], input_shape[-1]});
2167 RETURN_IF_NOT_OK(input->Reshape(input_reshape));
2168 CHECK_FAIL_RETURN_UNEXPECTED(n_mels == input_shape[-1 * TWO],
2169 "InverseMelScale: n_mels must be equal to the penultimate dimension of input.");
2170
2171 int time = input_shape[-1];
2172 int freq = matrix_fb.cols();
2173 // input matrix 3d
2174 std::vector<T> specgram;
2175 // engine for random matrix
2176 std::uniform_real_distribution<T> dist(0, 1);
2177 for (int channel = 0; channel < input_reshape[0]; channel++) {
2178 // slice by first dimension
2179 auto data_ptr = &*input->begin<T>();
2180 Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> input_channel(data_ptr + time * n_mels * channel, time,
2181 n_mels);
2182 // init specgram at n=channel
2183 Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> mat_channel =
2184 Eigen::MatrixXd::Zero(time, freq).unaryExpr([&rnd, &dist](double dummy) { return dist(*rnd); });
2185 std::vector<T> vec_channel(mat_channel.data(), mat_channel.data() + mat_channel.size());
2186 std::shared_ptr<Tensor> param_channel;
2187 TensorShape output_shape = TensorShape({freq, time});
2188 RETURN_IF_NOT_OK(Tensor::CreateFromVector(vec_channel, TensorShape({freq * time}), &param_channel));
2189 // sgd
2190 T loss = std::numeric_limits<T>::max();
2191 for (int epoch = 0; epoch < max_iter; epoch++) {
2192 auto pred = mat_channel * (matrix_fb.transpose().template cast<T>());
2193 // calculate the loss between the prediction and the input (ground truth)
2194 auto diff = input_channel - pred;
2195 T new_loss = diff.array().square().mean();
2196 // calculate the gradient
2197 auto grad = diff * (matrix_fb.template cast<T>()) * (-1) / time;
2198 Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> mat_grad = grad;
2199 std::vector<T> vec_grad(mat_grad.data(), mat_grad.data() + mat_grad.size());
2200 std::shared_ptr<Tensor> tensor_grad;
2201 RETURN_IF_NOT_OK(Tensor::CreateFromVector(vec_grad, TensorShape({grad.size()}), &tensor_grad));
2202
2203 std::shared_ptr<Tensor> nspec;
2204 RETURN_IF_NOT_OK(SGD<T>(param_channel, &nspec, tensor_grad, sgd_lr, sgd_momentum));
2205
2206 T diff_loss = std::abs(loss - new_loss);
2207 if ((new_loss < tolerance_loss) || (diff_loss < tolerance_change)) {
2208 break;
2209 }
2210 loss = new_loss;
2211 data_ptr = &*nspec->begin<T>();
2212 mat_channel = Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>(data_ptr, time, freq);
2213 // use new mat_channel to update param_channel
2214 RETURN_IF_NOT_OK(Tensor::CreateFromTensor(nspec, &param_channel));
2215 }
2216 // clamp and transpose
2217 auto res = mat_channel.cwiseMax(0);
2218 Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> mat_res = res;
2219 std::vector<T> spec_channel(mat_res.data(), mat_res.data() + mat_res.size());
2220 specgram.insert(specgram.end(), spec_channel.begin(), spec_channel.end());
2221 }
2222 std::shared_ptr<Tensor> final_out;
2223 if (input_shape.Size() > TWO) {
2224 std::vector<int64_t> out_shape_vec = input_shape.AsVector();
2225 out_shape_vec[input_shape.Size() - 1] = time;
2226 out_shape_vec[input_shape.Size() - TWO] = freq;
2227 TensorShape output_shape(out_shape_vec);
2228 RETURN_IF_NOT_OK(Tensor::CreateFromVector(specgram, output_shape, &final_out));
2229 } else {
2230 TensorShape output_shape = TensorShape({input_reshape[0], freq, time});
2231 RETURN_IF_NOT_OK(Tensor::CreateFromVector(specgram, output_shape, &final_out));
2232 }
2233 *output = final_out;
2234 return Status::OK();
2235 }
2236
2237 Status InverseMelScale(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t n_stft,
2238 int32_t n_mels, int32_t sample_rate, float f_min, float f_max, int32_t max_iter,
2239 float tolerance_loss, float tolerance_change, float sgd_lr, float sgd_momentum, NormType norm,
2240 MelType mel_type, std::mt19937 *rnd) {
2241 std::shared_ptr<Tensor> input_tensor;
2242 if (input->type() != DataType::DE_FLOAT64) {
2243 RETURN_IF_NOT_OK(TypeCast(input, &input_tensor, DataType(DataType::DE_FLOAT32)));
2244 return InverseMelScaleImpl<float>(input_tensor, output, n_stft, n_mels, sample_rate, f_min, f_max, max_iter,
2245 tolerance_loss, tolerance_change, sgd_lr, sgd_momentum, norm, mel_type, rnd);
2246 } else {
2247 input_tensor = input;
2248 return InverseMelScaleImpl<double>(input_tensor, output, n_stft, n_mels, sample_rate, f_min, f_max, max_iter,
2249 tolerance_loss, tolerance_change, sgd_lr, sgd_momentum, norm, mel_type, rnd);
2250 }
2251 }
2252
2253 template <typename T>
2254 Status GetSincResampleKernel(const int32_t orig_freq, const int32_t des_freq, ResampleMethod resample_method,
2255 int32_t lowpass_filter_width, float rolloff, float beta, DataType datatype,
2256 Eigen::MatrixX<T> *kernel_ptr, int32_t *width) {
2257 CHECK_FAIL_RETURN_UNEXPECTED(orig_freq != 0, "Resample: invalid parameter, 'orig_freq' can not be zero.");
2258 CHECK_FAIL_RETURN_UNEXPECTED(des_freq != 0, "Resample: invalid parameter, 'des_freq' can not be zero.");
2259 float base_freq = static_cast<float>(std::min(orig_freq, des_freq));
2260 // Removing the highest frequencies to perform antialiasing filtering. This is
2261 // needed in both upsampling and downsampling
2262 base_freq *= rolloff;
2263 *width = static_cast<int32_t>(ceil(static_cast<float>(lowpass_filter_width * orig_freq) / base_freq));
2264 int32_t kernel_length = 2 * (*width) + orig_freq;
2265 Eigen::MatrixX<T> idx(1, kernel_length);
2266 auto idx_it = idx.data();
2267 for (int32_t i = -(*width); i < (*width + orig_freq); i++) {
2268 *idx_it = static_cast<T>(i) / orig_freq;
2269 ++idx_it;
2270 }
2271 Eigen::VectorX<T> filter(des_freq);
2272 auto filter_it = filter.data();
2273 for (int32_t i = 0; i > -des_freq; i--) {
2274 *filter_it = static_cast<T>(i) / des_freq;
2275 ++filter_it;
2276 }
2277 Eigen::MatrixX<T> kernel_matrix(kernel_length, des_freq);
2278 kernel_matrix.noalias() = filter.replicate(1, kernel_length) + idx.replicate(des_freq, 1);
2279 kernel_matrix *= base_freq;
2280 // clamp
2281 kernel_matrix.noalias() = kernel_matrix.cwiseMin(lowpass_filter_width).cwiseMax(-lowpass_filter_width);
2282 Eigen::MatrixX<T> window;
2283 if (resample_method == ResampleMethod::kSincInterpolation) {
2284 window = kernel_matrix * PI / lowpass_filter_width / TWO;
2285 window.noalias() = window.unaryExpr([=](T x) -> T {
2286 T temp = cos(x);
2287 return temp * temp;
2288 });
2289 } else {
2290 // kaiser_window
2291 #ifdef __APPLE__
2292 RETURN_STATUS_ERROR(StatusCode::kMDNotImplementedYet,
2293 "Resample: ResampleMethod of Kaiser Window is not "
2294 "supported on MacOS yet.");
2295 #else
2296 {
2297 std::unique_lock<std::mutex> _lock(cyl_bessel_mux_);
2298 T beta_bessel = std::cyl_bessel_if(0, beta);
2299 window.noalias() = kernel_matrix.unaryExpr([=](T x) -> T {
2300 return std::cyl_bessel_i(0, beta * sqrt(1 - pow((x / lowpass_filter_width), TWO))) / beta_bessel;
2301 });
2302 }
2303 #endif
2304 }
2305 kernel_matrix *= PI;
2306 T scale = base_freq / orig_freq;
2307 kernel_matrix = kernel_matrix.unaryExpr([=](T x) -> T { return x == 0 ? scale : scale * (sin(x) / x); });
2308 kernel_matrix = kernel_matrix.cwiseProduct(window);
2309 *kernel_ptr = kernel_matrix;
2310 return Status::OK();
2311 }
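// Worked example (illustrative): resampling 48 kHz -> 32 kHz reduces to orig_freq = 3 and
// des_freq = 2 after dividing by gcd = 16000. With, say, lowpass_filter_width = 6 and
// rolloff = 0.99, base_freq = 2 * 0.99 = 1.98, width = ceil(6 * 3 / 1.98) = 10, and the
// kernel holds des_freq = 2 polyphase filters of kernel_length = 2 * 10 + 3 = 23
// windowed-sinc taps each.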
2312
2313 template <typename T>
2314 std::shared_ptr<Eigen::MatrixX<T>> Cov1dStride(const Eigen::MatrixX<T> &waveform_pad_matrix,
2315 const Eigen::MatrixX<T> &kernel, const int32_t num_waveform,
2316 const int32_t orig_freq, const int32_t target_length,
2317 const int32_t pad_length) {
2318 Eigen::MatrixX<T> output_matrix(target_length, num_waveform);
2319 Eigen::MatrixX<T> mul_matrix;
2320 const int32_t kernel_x = static_cast<int32_t>(kernel.rows());
2321 const int32_t kernel_y = static_cast<int32_t>(kernel.cols());
2322 const int32_t resample_num = static_cast<int32_t>(ceil(static_cast<float>(target_length) / kernel_x));
2323 Eigen::MatrixX<T> multi_input(kernel_y, resample_num);
2324 for (int32_t i = 0; i < num_waveform; i++) {
2325 for (int32_t j = 0, x_dim = 0; j + kernel_y < pad_length && x_dim < resample_num; j += orig_freq, ++x_dim) {
2326 multi_input.col(x_dim) = waveform_pad_matrix(Eigen::seqN(j, kernel_y), Eigen::seqN(i, 1));
2327 }
2328 mul_matrix.noalias() = kernel * multi_input;
2329 output_matrix.col(i) = Eigen::Map<Eigen::MatrixX<T>>(mul_matrix.data(), target_length, 1);
2330 }
2331 return std::make_shared<Eigen::MatrixX<T>>(output_matrix);
2332 }
2333
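/// \brief Typed resample implementation: reduce the two rates by their greatest common divisor,
/// build the lowpass kernel, and apply the strided convolution.
/// \param[in] input - Input tensor of shape <..., time>.
/// \param[out] output - Output tensor; its last dimension becomes ceil(time * des_freq / orig_freq).
/// \param[in] orig_freq - Original frequency of the signal.
/// \param[in] des_freq - Desired frequency.
/// \param[in] resample_method - Resample method, sinc interpolation or kaiser window.
/// \param[in] lowpass_filter_width - Controls the sharpness of the lowpass filter.
/// \param[in] rolloff - Roll-off factor applied to the lowpass cutoff for antialiasing.
/// \param[in] beta - Shape parameter of the Kaiser window.
/// \return Status return code.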
2334 template <typename T>
2335 Status Resample(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float orig_freq, float des_freq,
2336 ResampleMethod resample_method, int32_t lowpass_filter_width, float rolloff, float beta) {
2337 const TensorShape input_shape = input->shape();
2338 const int32_t waveform_length = input_shape[-1];
2339 const int32_t num_waveform = input->Size() / waveform_length;
2340 const TensorShape to_shape = TensorShape({num_waveform, waveform_length});
2341 RETURN_IF_NOT_OK(input->Reshape(to_shape));
2342
2343 const int32_t gcd = std::gcd(static_cast<int32_t>(orig_freq), static_cast<int32_t>(des_freq));
2344 CHECK_FAIL_RETURN_UNEXPECTED(gcd != 0, "Resample: gcd cannot be equal to 0.");
2345 const int32_t orig_freq_prime = static_cast<int32_t>(floor(orig_freq / gcd));
2346 const int32_t des_freq_prime = static_cast<int32_t>(floor(des_freq / gcd));
2347 CHECK_FAIL_RETURN_UNEXPECTED(orig_freq_prime != 0, "Resample: invalid parameter, 'orig_freq_prime' cannot be zero.");
2348 Eigen::MatrixX<T> kernel;
2349 int32_t width = 0;
2350 RETURN_IF_NOT_OK(GetSincResampleKernel<T>(orig_freq_prime, des_freq_prime, resample_method, lowpass_filter_width,
2351 rolloff, beta, input->type(), &kernel, &width));
2352 const int32_t ZERO = 0;
2353 const int32_t kernel_rows = static_cast<int32_t>(kernel.rows());
2354 const int32_t kernel_cols = static_cast<int32_t>(kernel.cols());
2355 RETURN_IF_NOT_OK(ValidateGreaterThan("Resample", "kernel.cols()", kernel_cols, "boundary", ZERO));
2356 RETURN_IF_NOT_OK(ValidateGreaterThan("Resample", "kernel.rows()", kernel_rows, "boundary", ZERO));
2357
2358 // padding
2359 const int32_t pad_length = waveform_length + 2 * width + orig_freq_prime;
2360 std::shared_ptr<Tensor> waveform_pad;
2361 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({1, pad_length}), input->type(), &waveform_pad));
2362 RETURN_IF_NOT_OK(Pad<T>(input, &waveform_pad, width, width + orig_freq_prime, BorderType::kConstant));
2363 Eigen::MatrixX<T> waveform_pad_matrix =
2364 Eigen::Map<Eigen::MatrixX<T>>(&*waveform_pad->begin<T>(), waveform_pad->shape()[1], waveform_pad->shape()[0]);
2365 const int32_t target_length =
2366 static_cast<int32_t>(std::ceil(static_cast<double>(des_freq_prime * waveform_length) / orig_freq_prime));
2367
2368 // conv1d with stride = orig_freq_prime
2369 std::shared_ptr<Eigen::MatrixX<T>> output_ptr =
2370 Cov1dStride<T>(waveform_pad_matrix, kernel, num_waveform, orig_freq_prime, target_length, pad_length);
2371 std::vector<dsize_t> shape_vec = input_shape.AsVector();
2372 shape_vec.at(shape_vec.size() - 1) = static_cast<dsize_t>(target_length);
2373 const auto output_data = reinterpret_cast<const uchar *>(output_ptr->data());
2374 RETURN_IF_NOT_OK(Tensor::CreateFromMemory(TensorShape(shape_vec), DataType::FromCType<T>(), output_data, output));
2375 return Status::OK();
2376 }
2377
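/// \brief Resample a signal along its last (time) dimension from orig_freq to des_freq.
/// float64 input is processed in double precision; every other numeric type is cast to float32 first.
/// A minimal usage sketch (the rates and filter settings below are illustrative, not defaults):
///   std::shared_ptr<Tensor> resampled;
///   RETURN_IF_NOT_OK(Resample(waveform, &resampled, 48000.0, 16000.0,
///                             ResampleMethod::kSincInterpolation, 6, 0.99, 14.769656459379492));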
2378 Status Resample(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float orig_freq, float des_freq,
2379 ResampleMethod resample_method, int32_t lowpass_filter_width, float rolloff, float beta) {
2380 RETURN_IF_NOT_OK(ValidateLowRank("Resample", input, kMinAudioDim, "<..., time>"));
2381 RETURN_IF_NOT_OK(ValidateTensorNumeric("Resample", input));
2382 if (input->type().value() == DataType::DE_FLOAT64) {
2383 RETURN_IF_NOT_OK(
2384 Resample<double>(input, output, orig_freq, des_freq, resample_method, lowpass_filter_width, rolloff, beta));
2385 } else {
2386 std::shared_ptr<Tensor> waveform;
2387 RETURN_IF_NOT_OK(TypeCast(input, &waveform, DataType(DataType::DE_FLOAT32)));
2388 RETURN_IF_NOT_OK(
2389 Resample<float>(waveform, output, orig_freq, des_freq, resample_method, lowpass_filter_width, rolloff, beta));
2390 }
2391 return Status::OK();
2392 }
2393
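/// \brief Compute linear-frequency cepstral coefficients: the spectrogram of the waveform is
/// projected onto a linear filter bank, moved to a log scale (log_lf=true) or a decibel scale
/// (log_lf=false), and multiplied by an n_lfcc x n_filter DCT matrix.
/// \param[in] input - Input waveform tensor of shape <..., time>.
/// \param[out] output - Output tensor of shape <..., n_lfcc, time>.
/// \return Status return code.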
2394 Status LFCC(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
2395 int32_t n_filter, int32_t n_lfcc, int32_t dct_type, bool log_lf, int32_t n_fft, int32_t win_length,
2396 int32_t hop_length, float f_min, float f_max, int32_t pad, WindowType window, float power, bool normalized,
2397 bool center, BorderType pad_mode, bool onesided, NormMode norm) {
2398 RETURN_UNEXPECTED_IF_NULL(input);
2399 RETURN_UNEXPECTED_IF_NULL(output);
2400 std::shared_ptr<Tensor> spectrogram;
2401 std::shared_ptr<Tensor> filter_mat;
2402 std::shared_ptr<Tensor> dct_mat;
2403 RETURN_IF_NOT_OK(Spectrogram(input, &spectrogram, pad, window, n_fft, hop_length, win_length, power, normalized,
2404 center, pad_mode, onesided));
2405 RETURN_IF_NOT_OK(
2406 CreateLinearFbanks(&filter_mat, static_cast<int32_t>(floor(n_fft / TWO)) + 1, f_min, f_max, n_filter, sample_rate));
2407 RETURN_IF_NOT_OK(Dct(&dct_mat, n_lfcc, n_filter, norm));
2408 std::shared_ptr<Tensor> spectrogramxfilter;
2409 std::shared_ptr<Tensor> specgram_temp;
2410
2411 TensorShape specgram_shape = spectrogram->shape();
2412 TensorShape specgram_reshape(
2413 {spectrogram->Size() / specgram_shape[-1] / specgram_shape[-2], specgram_shape[-2], specgram_shape[-1]});
2414 RETURN_IF_NOT_OK(spectrogram->Reshape(specgram_reshape));
2415 auto filter_mat_ptr = &*filter_mat->begin<float>();
2416 Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> matrix_fm(
2417 filter_mat_ptr, n_filter, static_cast<int32_t>(floor(n_fft / TWO)) + 1);
2418 auto dct_mat_ptr = &*dct_mat->begin<float>();
2419 Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> matrix_dm(dct_mat_ptr, n_lfcc, n_filter);
2420
2421 int rows = specgram_reshape[1];
2422 int cols = specgram_reshape[TWO];
2423
2424 std::vector<float> sf_temp;
2425 Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> mat_res;
2426
2427 for (int c = 0; c < specgram_reshape[0]; c++) {
2428 Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> matrix_c(
2429 &*spectrogram->begin<float>() + rows * cols * c, cols, rows);
2430 mat_res.noalias() = (matrix_c * matrix_fm.transpose());
2431 std::vector<float> vec_c(mat_res.data(), mat_res.data() + mat_res.size());
2432 sf_temp.insert(sf_temp.end(), vec_c.begin(), vec_c.end());
2433 }
2434 // unpack
2435 std::vector<int64_t> sf_shape_vec = specgram_shape.AsVector();
2436 sf_shape_vec[specgram_shape.Size() - 1] = cols;
2437 sf_shape_vec[specgram_shape.Size() - TWO] = n_filter;
2438 TensorShape sf_shape(sf_shape_vec);
2439 RETURN_IF_NOT_OK(Tensor::CreateFromVector(sf_temp, sf_shape, &spectrogramxfilter));
2440
2441 if (log_lf) {
2442 float log_offset = 1e-6;
2443 for (auto itr = spectrogramxfilter->begin<float>(); itr != spectrogramxfilter->end<float>(); ++itr) {
2444 *itr = log(*itr + log_offset);
2445 }
2446 specgram_temp = spectrogramxfilter;
2447 } else {
2448 std::shared_ptr<Tensor> amplitude_to_db;
2449 float multiplier = 10.0;
2450 float db_multiplier = 0.0;
2451 float amin = 1e-10;
2452 float top_db = 80.0;
2453 RETURN_IF_NOT_OK(AmplitudeToDB(spectrogramxfilter, &amplitude_to_db, multiplier, amin, db_multiplier, top_db));
2454 specgram_temp = amplitude_to_db;
2455 }
2456
2457 TensorShape st_shape = specgram_temp->shape();
2458 TensorShape st_reshape({specgram_temp->Size() / st_shape[-1] / st_shape[-2], st_shape[-2], st_shape[-1]});
2459 RETURN_IF_NOT_OK(specgram_temp->Reshape(st_reshape));
2460
2461 rows = st_reshape[1];
2462 cols = st_reshape[TWO];
2463 std::vector<float> out_temp;
2464
2465 for (int c = 0; c < st_reshape[0]; c++) {
2466 Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> matrix_c(
2467 &*specgram_temp->begin<float>() + rows * cols * c, cols, rows);
2468 mat_res.noalias() = (matrix_c * matrix_dm.transpose());
2469 std::vector<float> vec_c(mat_res.data(), mat_res.data() + mat_res.size());
2470 out_temp.insert(out_temp.end(), vec_c.begin(), vec_c.end());
2471 }
2472 // unpack
2473 std::vector<int64_t> output_shape_vec = st_shape.AsVector();
2474 output_shape_vec[st_shape.Size() - 1] = cols;
2475 output_shape_vec[st_shape.Size() - TWO] = n_lfcc;
2476 TensorShape output_shape(output_shape_vec);
2477 RETURN_IF_NOT_OK(Tensor::CreateFromVector(out_temp, output_shape, output));
2478
2479 return Status::OK();
2480 }
2481
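/// \brief Compute a mel-scale spectrogram: a linear spectrogram with n_fft / 2 + 1 frequency bins
/// followed by MelScale, which maps those bins onto n_mels mel bins.
/// \param[in] input - Input waveform tensor of shape <..., time>.
/// \param[out] output - Output tensor of shape <..., n_mels, time>.
/// \return Status return code.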
2482 Status MelSpectrogram(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
2483 int32_t n_fft, int32_t win_length, int32_t hop_length, float f_min, float f_max, int32_t pad,
2484 int32_t n_mels, WindowType window, float power, bool normalized, bool center, BorderType pad_mode,
2485 bool onesided, NormType norm, MelType mel_scale) {
2486 RETURN_UNEXPECTED_IF_NULL(input);
2487 RETURN_UNEXPECTED_IF_NULL(output);
2488 auto input_shape_vec = input->shape().AsVector();
2489 CHECK_FAIL_RETURN_UNEXPECTED(n_fft < TWO * input_shape_vec[input_shape_vec.size() - 1],
2490 "MelSpectrogram: Padding size should be less than the corresponding input dimension.");
2491 std::shared_ptr<Tensor> spectrogram;
2492 RETURN_IF_NOT_OK(Spectrogram(input, &spectrogram, pad, window, n_fft, hop_length, win_length, power, normalized,
2493 center, pad_mode, onesided));
2494 RETURN_IF_NOT_OK(
2495 MelScale<float>(spectrogram, output, n_mels, sample_rate, f_min, f_max, n_fft / TWO + 1, norm, mel_scale));
2496 return Status::OK();
2497 }
2498
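/// \brief Compute mel-frequency cepstral coefficients: the mel spectrogram of the waveform is moved
/// to a log scale (log_mels=true) or a decibel scale (log_mels=false) and multiplied by an
/// n_mfcc x n_mels DCT matrix.
/// \param[in] input - Input waveform tensor of shape <..., time>.
/// \param[out] output - Output tensor of shape <..., n_mfcc, time>.
/// \return Status return code.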
2499 Status MFCC(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate, int32_t n_mfcc,
2500 int32_t dct_type, bool log_mels, int32_t n_fft, int32_t win_length, int32_t hop_length, float f_min,
2501 float f_max, int32_t pad, int32_t n_mels, WindowType window, float power, bool normalized, bool center,
2502 BorderType pad_mode, bool onesided, NormType norm, NormMode norm_M, MelType mel_scale) {
2503 RETURN_UNEXPECTED_IF_NULL(input);
2504 RETURN_UNEXPECTED_IF_NULL(output);
2505 std::shared_ptr<Tensor> mel_spectrogram;
2506 std::shared_ptr<Tensor> dct_mat;
2507 RETURN_IF_NOT_OK(MelSpectrogram(input, &mel_spectrogram, sample_rate, n_fft, win_length, hop_length, f_min, f_max,
2508 pad, n_mels, window, power, normalized, center, pad_mode, onesided, norm, mel_scale));
2509 RETURN_IF_NOT_OK(Dct(&dct_mat, n_mfcc, n_mels, norm_M));
2510 if (log_mels) {
2511 for (auto itr = mel_spectrogram->begin<float>(); itr != mel_spectrogram->end<float>(); ++itr) {
2512 float log_offset = 1e-6;
2513 *itr = log(*itr + log_offset);
2514 }
2515 } else {
2516 std::shared_ptr<Tensor> amplitude_to_db;
2517 float multiplier = 10.0;
2518 float db_multiplier = 0.0;
2519 float amin = 1e-10;
2520 float top_db = 80.0;
2521 RETURN_IF_NOT_OK(AmplitudeToDB(mel_spectrogram, &amplitude_to_db, multiplier, amin, db_multiplier, top_db));
2522 mel_spectrogram = amplitude_to_db;
2523 }
2524 auto dct_mat_ptr = &*dct_mat->begin<float>();
2525 Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic> mat_res;
2526 Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> matrix_dm(dct_mat_ptr, n_mfcc, n_mels);
2527 TensorShape st_shape = mel_spectrogram->shape();
2528 TensorShape st_reshape({mel_spectrogram->Size() / st_shape[-1] / st_shape[-2], st_shape[-2], st_shape[-1]});
2529 RETURN_IF_NOT_OK(mel_spectrogram->Reshape(st_reshape));
2530
2531 const dsize_t kRowIndex = 1;
2532 const dsize_t kColIndex = 2;
2533 int rows = st_reshape[kRowIndex];
2534 int cols = st_reshape[kColIndex];
2535 std::vector<float> out_temp;
2536
2537 for (int c = 0; c < st_reshape[0]; c++) {
2538 Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>> matrix_c(
2539 &*mel_spectrogram->begin<float>() + rows * cols * c, cols, rows);
2540 mat_res.noalias() = (matrix_c * matrix_dm.transpose());
2541 std::vector<float> vec_c(mat_res.data(), mat_res.data() + mat_res.size());
2542 out_temp.insert(out_temp.end(), vec_c.begin(), vec_c.end());
2543 }
2544 // unpack
2545 std::vector<int64_t> output_shape_vec = st_shape.AsVector();
2546 output_shape_vec[st_shape.Size() - 1] = cols;
2547 output_shape_vec[st_shape.Size() - TWO] = n_mfcc;
2548 TensorShape output_shape(output_shape_vec);
2549 RETURN_IF_NOT_OK(Tensor::CreateFromVector(out_temp, output_shape, output));
2550
2551 return Status::OK();
2552 }
2553
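/// \brief Typed pitch-shift implementation. The waveform is moved to the time-frequency domain,
/// time-stretched by rate = 2^(-n_steps / bins_per_octave), rebuilt with ISTFT, and finally
/// resampled from sample_rate / rate back to sample_rate, so the duration is preserved while the
/// pitch moves by n_steps steps (bins_per_octave steps per octave).
/// \return Status return code.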
2554 template <typename T>
2555 Status PitchShiftImpl(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
2556 int32_t n_steps, int32_t bins_per_octave, int32_t n_fft, int32_t win_length, int32_t hop_length,
2557 WindowType window) {
2558 DataType data_type = input->type();
2559 // pack batch
2560 TensorShape shape = input->shape();
2561 int input_len = shape[-1];
2562
2563 RETURN_IF_NOT_OK(input->Reshape(TensorShape({input->Size() / input_len, input_len})));
2564 std::shared_ptr<Tensor> spec_f;
2565
2566 RETURN_IF_NOT_OK(SpectrogramImpl<T>(input, &spec_f, 0, window, n_fft, hop_length, win_length, 0, false, true,
2567 BorderType::kReflect, true));
2568
2569 std::shared_ptr<Tensor> phase_advance;
2570 int spec_f_length = spec_f->shape()[-3];
2571 RETURN_IF_NOT_OK(Linspace<T>(&phase_advance, 0, PI * hop_length, spec_f_length));
2573 RETURN_IF_NOT_OK(phase_advance->Reshape(TensorShape(phase_advance->shape().AppendDim(1))));
2574 std::shared_ptr<Tensor> spec_stretch;
2575 float a = static_cast<float>(-n_steps) / bins_per_octave;
2576 float rate = pow(2, a);
2577
2578 RETURN_IF_NOT_OK(TimeStretch<T>(spec_f, &spec_stretch, rate, phase_advance));
2579
2580 int ori_len = shape[-1];
2581 int len_stretch = static_cast<int>(round(ori_len / rate));
2582 std::shared_ptr<Tensor> final_results;
2583 TensorShape tshape = spec_stretch->shape();
2584 TensorShape new_shape({tshape[0], tshape[-3], spec_stretch->Size() / tshape[-3] / tshape[0]});
2585 RETURN_IF_NOT_OK(spec_stretch->Reshape(new_shape));
2587 Tensor::TensorIterator<T> itr = spec_stretch->begin<T>();
2588
2589 for (int dim = 0; dim < new_shape[0]; dim++) {
2590 // slice and squeeze the first dim
2591 const int row = -3;
2592 const int col = -2;
2593 Eigen::MatrixXcd rebuilt(tshape[row], tshape[col]);
2594
2595 for (int j = 0; j < tshape[row]; j++) {
2596 for (int i = 0; i < tshape[col]; i++) {
2597 if (itr < spec_stretch->end<T>()) {  // stay within the stretched spectrogram
2598 T real = *(itr++);
2599 T img = *(itr++);
2600 rebuilt(j, i) = std::complex<double>(real, img);
2601 }
2602 }
2603 }
2604
2605 std::shared_ptr<Tensor> waveform;
2606
2607 RETURN_IF_NOT_OK(ISTFT<T>(rebuilt, &waveform, n_fft, hop_length, win_length, window, true, len_stretch));
2608
2609 if (spec_stretch->shape().Rank() == TWO) {
2610 // do not expand dim
2611 final_results = waveform;
2612 continue;
2613 }
2614
2615 if (final_results != nullptr) {
2616 RETURN_IF_NOT_OK(final_results->InsertTensor({dim}, waveform));
2617 } else {
2618 RETURN_IF_NOT_OK(
2619 Tensor::CreateEmpty(TensorShape({new_shape[0], waveform->shape()[0]}), waveform->type(), &final_results));
2620 RETURN_IF_NOT_OK(final_results->InsertTensor({dim}, waveform));
2621 }
2622 }
2623
2624 std::shared_ptr<Tensor> waveform_shift;
2625 const int32_t lowpass_filter_width = 6;
2626 const float rolloff = 0.99;
2627 const int new_rate = static_cast<int>(sample_rate / rate);
2628 const float beta = 14.769656459379492;
2629
2630 RETURN_IF_NOT_OK(Resample<T>(final_results, &waveform_shift, new_rate, sample_rate,
2631 ResampleMethod::kSincInterpolation, lowpass_filter_width, rolloff, beta));
2632
2633 int shift_len = waveform_shift->shape()[-1];
2634 std::shared_ptr<Tensor> waveform_end;
2635 if (shift_len > ori_len) {
2636 int32_t waveform_length = waveform_shift->shape()[-1];
2637 int32_t num_waveform = waveform_shift->Size() / waveform_length;
2638 auto ind = waveform_shift->begin<T>();
2639 for (int i = 0; i < num_waveform; i++) {
2640 std::shared_ptr<Tensor> wave_final;
2641 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({ori_len}), input->type(), &wave_final));
2642 if (i != 0) { ind += shift_len - ori_len; }
2643 for (auto atr = wave_final->begin<T>(); atr != wave_final->end<T>(); atr++) {
2644 *atr = *(ind++);
2645 }
2646 if (waveform_end != nullptr) {
2647 RETURN_IF_NOT_OK(waveform_end->InsertTensor({i}, wave_final));
2648 } else {
2649 RETURN_IF_NOT_OK(Tensor::CreateEmpty(TensorShape({num_waveform, ori_len}), wave_final->type(), &waveform_end));
2650 RETURN_IF_NOT_OK(waveform_end->InsertTensor({i}, wave_final));
2651 }
2652 }
2653 } else {
2654 RETURN_IF_NOT_OK(Pad<T>(waveform_shift, &waveform_end, 0, ori_len - shift_len, BorderType::kConstant));
2655 }
2656
2657 auto output_shape = shape.AsVector();
2658 output_shape.pop_back();
2659 output_shape.push_back(waveform_end->shape()[-1]);
2660 RETURN_IF_NOT_OK(waveform_end->Reshape(TensorShape(output_shape)));
2661 *output = waveform_end;
2662 return Status::OK();
2663 }
2664
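/// \brief Shift the pitch of a waveform by n_steps steps, with bins_per_octave steps per octave.
/// float64 input is processed in double precision; every other numeric type is cast to float32 first.
/// A minimal usage sketch (the values below are illustrative):
///   std::shared_ptr<Tensor> shifted;
///   RETURN_IF_NOT_OK(PitchShift(waveform, &shifted, 16000, 4, 12, 512, 512, 128, WindowType::kHann));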
2665 Status PitchShift(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t sample_rate,
2666 int32_t n_steps, int32_t bins_per_octave, int32_t n_fft, int32_t win_length, int32_t hop_length,
2667 WindowType window) {
2668 RETURN_UNEXPECTED_IF_NULL(input);
2669 RETURN_UNEXPECTED_IF_NULL(output);
2670 RETURN_IF_NOT_OK(ValidateLowRank("PitchShift", input, kMinAudioDim, "<..., time>"));
2671 RETURN_IF_NOT_OK(ValidateTensorNumeric("PitchShift", input));
2672 if (input->type().value() == DataType::DE_FLOAT64) {
2673 return PitchShiftImpl<double>(input, output, sample_rate, n_steps, bins_per_octave, n_fft, win_length, hop_length,
2674 window);
2675 } else {
2676 std::shared_ptr<Tensor> waveform;
2677 RETURN_IF_NOT_OK(TypeCast(input, &waveform, DataType(DataType::DE_FLOAT32)));
2678 return PitchShiftImpl<float>(waveform, output, sample_rate, n_steps, bins_per_octave, n_fft, win_length, hop_length,
2679 window);
2680 }
2681 }
2682 } // namespace dataset
2683 } // namespace mindspore
2684