1 /**
2 * Copyright 2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_AUDIO_UTILS_H_
17 #define MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_AUDIO_UTILS_H_
18
19 #include <algorithm>
20 #include <cmath>
21 #include <limits>
22 #include <memory>
23 #include <random>
24 #include <string>
25 #include <vector>
26
27 #include "minddata/dataset/core/tensor.h"
28 #include "minddata/dataset/kernels/tensor_op.h"
29 #include "minddata/dataset/util/status.h"
30
31 constexpr double PI = 3.141592653589793;
32
33 namespace mindspore {
34 namespace dataset {
35
36 /// \brief Turn a tensor from the power/amplitude scale to the decibel scale.
37 /// \param input/output: Tensor of shape <..., freq, time>.
38 /// \param multiplier: power - 10, amplitude - 20.
39 /// \param amin: lower bound.
40 /// \param db_multiplier: multiplier for decibels.
41 /// \param top_db: the lower bound for decibels cut-off.
42 /// \return Status code.
43 template <typename T>
AmplitudeToDB(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,T multiplier,T amin,T db_multiplier,T top_db)44 Status AmplitudeToDB(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T multiplier, T amin,
45 T db_multiplier, T top_db) {
46 TensorShape input_shape = input->shape();
47 TensorShape to_shape = input_shape.Rank() == 2
48 ? TensorShape({1, 1, input_shape[-2], input_shape[-1]})
49 : TensorShape({input->Size() / (input_shape[-3] * input_shape[-2] * input_shape[-1]),
50 input_shape[-3], input_shape[-2], input_shape[-1]});
51 RETURN_IF_NOT_OK(input->Reshape(to_shape));
52
53 std::vector<T> max_val;
54 int step = to_shape[-3] * input_shape[-2] * input_shape[-1];
55 int cnt = 0;
56 T temp_max = std::numeric_limits<T>::lowest();
57 for (auto itr = input->begin<T>(); itr != input->end<T>(); itr++) {
58 // do clamp
59 *itr = *itr < amin ? log10(amin) * multiplier : log10(*itr) * multiplier;
60 *itr -= multiplier * db_multiplier;
61 // calculate max by axis
62 cnt++;
63 if ((*itr) > temp_max) temp_max = *itr;
64 if (cnt % step == 0) {
65 max_val.push_back(temp_max);
66 temp_max = std::numeric_limits<T>::lowest();
67 }
68 }
69
70 if (!std::isnan(top_db)) {
71 int ind = 0;
72 for (auto itr = input->begin<T>(); itr != input->end<T>(); itr++, ind++) {
73 float lower_bound = max_val[ind / step] - top_db;
74 *itr = std::max((*itr), static_cast<T>(lower_bound));
75 }
76 }
77 RETURN_IF_NOT_OK(input->Reshape(input_shape));
78 *output = input;
79 return Status::OK();
80 }
81
82 /// \brief Calculate the angles of the complex numbers.
83 /// \param input/output: Tensor of shape <..., time>.
84 template <typename T>
Angle(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output)85 Status Angle(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output) {
86 TensorShape shape = input->shape();
87 std::vector output_shape = shape.AsVector();
88 output_shape.pop_back();
89 std::shared_ptr<Tensor> output_tensor;
90 std::vector<T> out;
91 T o;
92 T x;
93 T y;
94 for (auto itr = input->begin<T>(); itr != input->end<T>(); itr++) {
95 x = static_cast<T>(*itr);
96 itr++;
97 y = static_cast<T>(*itr);
98 o = std::atan2(y, x);
99 out.emplace_back(o);
100 }
101 // Generate multidimensional results corresponding to input
102 Tensor::CreateFromVector(out, TensorShape{output_shape}, &output_tensor);
103 *output = output_tensor;
104 return Status::OK();
105 }
106
107 /// \brief Perform a biquad filter of input tensor.
108 /// \param input/output: Tensor of shape <..., time>.
109 /// \param a0: denominator coefficient of current output y[n], typically 1.
110 /// \param a1: denominator coefficient of current output y[n-1].
111 /// \param a2: denominator coefficient of current output y[n-2].
112 /// \param b0: numerator coefficient of current input, x[n].
113 /// \param b1: numerator coefficient of input one time step ago x[n-1].
114 /// \param b2: numerator coefficient of input two time steps ago x[n-2].
115 /// \return Status code.
116 template <typename T>
Biquad(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,T b0,T b1,T b2,T a0,T a1,T a2)117 Status Biquad(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T b0, T b1, T b2, T a0, T a1,
118 T a2) {
119 std::vector<T> a_coeffs;
120 std::vector<T> b_coeffs;
121 a_coeffs.push_back(a0);
122 a_coeffs.push_back(a1);
123 a_coeffs.push_back(a2);
124 b_coeffs.push_back(b0);
125 b_coeffs.push_back(b1);
126 b_coeffs.push_back(b2);
127 return LFilter(input, output, a_coeffs, b_coeffs, true);
128 }
129
130 /// \brief Apply contrast effect.
131 /// \param input/output: Tensor of shape <..., time>.
132 /// \param enhancement_amount: controls the amount of the enhancement.
133 /// \return Status code.
134 template <typename T>
Contrast(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,T enhancement_amount)135 Status Contrast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T enhancement_amount) {
136 const float enhancement_zoom = 750.0;
137 T enhancement_amount_value = enhancement_amount / enhancement_zoom;
138 TensorShape output_shape{input->shape()};
139 std::shared_ptr<Tensor> out;
140 RETURN_IF_NOT_OK(Tensor::CreateEmpty(output_shape, input->type(), &out));
141 auto itr_out = out->begin<T>();
142 for (auto itr_in = input->begin<T>(); itr_in != input->end<T>(); itr_in++) {
143 T temp1, temp2 = 0;
144 temp1 = static_cast<T>(*itr_in) * (PI / 2);
145 temp2 = enhancement_amount_value * std::sin(temp1 * 4);
146 *itr_out = std::sin(temp1 + temp2);
147 itr_out++;
148 }
149 *output = out;
150 return Status::OK();
151 }
152
153 /// \brief Apply a DC shift to the audio.
154 /// \param input/output: Tensor of shape <...,time>.
155 /// \param shift: the amount to shift the audio.
156 /// \param limiter_gain: used only on peaks to prevent clipping.
157 /// \return Status code.
template <typename T>
Status DCShift(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float shift, float limiter_gain) {
  float limiter_threshold = 0.0;
  // Limiter path: taken only when a non-zero shift was given and limiter_gain
  // differs from shift (NOTE(review): torchaudio gates this on limiter_gain != 0
  // instead — confirm the intended condition against the caller's contract).
  if (shift != limiter_gain && shift != 0) {
    // Samples whose magnitude exceeds this threshold are treated as peaks.
    limiter_threshold = 1.0 - (std::abs(shift) - limiter_gain);
    for (auto itr = input->begin<T>(); itr != input->end<T>(); itr++) {
      if (*itr > limiter_threshold && shift > 0) {
        // Positive peak: compress the portion above the threshold so the
        // upward shift cannot push it past the threshold (clipping guard).
        T peak = (*itr - limiter_threshold) * limiter_gain / (1 - limiter_threshold);
        T sample = (peak + limiter_threshold + shift);
        *itr = sample > limiter_threshold ? limiter_threshold : sample;
      } else if (*itr < -limiter_threshold && shift < 0) {
        // Negative peak: mirror image of the positive-peak case.
        T peak = (*itr + limiter_threshold) * limiter_gain / (1 - limiter_threshold);
        T sample = (peak + limiter_threshold + shift);
        *itr = sample < -limiter_threshold ? -limiter_threshold : sample;
      } else {
        // Non-peak sample: plain shift, hard-clamped to [-1, 1]
        // (assumes the waveform is normalized to that range — TODO confirm).
        T sample = (*itr + shift);
        *itr = (sample > 1 || sample < -1) ? (sample > 1 ? 1 : -1) : sample;
      }
    }
  } else {
    // No limiter: shift every sample and hard-clamp to [-1, 1].
    for (auto itr = input->begin<T>(); itr != input->end<T>(); itr++) {
      T sample = (*itr + shift);
      *itr = sample > 1 || sample < -1 ? (sample > 1 ? 1 : -1) : sample;
    }
  }
  // The input tensor is modified in place and aliased as the output.
  *output = input;
  return Status::OK();
}
186
187 /// \brief Perform an IIR filter by evaluating difference equation.
188 /// \param input/output: Tensor of shape <..., time>
189 /// \param a_coeffs: denominator coefficients of difference equation of dimension of (n_order + 1).
190 /// \param b_coeffs: numerator coefficients of difference equation of dimension of (n_order + 1).
191 /// \param clamp: If True, clamp the output signal to be in the range [-1, 1] (Default: True).
192 /// \return Status code
193 template <typename T>
LFilter(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,std::vector<T> a_coeffs,std::vector<T> b_coeffs,bool clamp)194 Status LFilter(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, std::vector<T> a_coeffs,
195 std::vector<T> b_coeffs, bool clamp) {
196 // pack batch
197 TensorShape input_shape = input->shape();
198 TensorShape toShape({input->Size() / input_shape[-1], input_shape[-1]});
199 input->Reshape(toShape);
200 auto shape_0 = input->shape()[0];
201 auto shape_1 = input->shape()[1];
202 std::vector<T> signal;
203 std::shared_ptr<Tensor> out;
204 std::vector<T> out_vect(shape_0 * shape_1);
205 size_t x_idx = 0;
206 size_t channel_idx = 1;
207 size_t m_num_order = b_coeffs.size() - 1;
208 size_t m_den_order = a_coeffs.size() - 1;
209 // init A_coeffs and B_coeffs by div(a0)
210 for (size_t i = 1; i < a_coeffs.size(); i++) {
211 a_coeffs[i] /= a_coeffs[0];
212 }
213 for (size_t i = 0; i < b_coeffs.size(); i++) {
214 b_coeffs[i] /= a_coeffs[0];
215 }
216 // Sliding window
217 T *m_px = new T[m_num_order + 1];
218 T *m_py = new T[m_den_order + 1];
219
220 // Tensor -> vector
221 for (auto itr = input->begin<T>(); itr != input->end<T>();) {
222 while (x_idx < shape_1 * channel_idx) {
223 signal.push_back(*itr);
224 itr++;
225 x_idx++;
226 }
227 // Sliding window
228 for (size_t j = 0; j < m_den_order; j++) {
229 m_px[j] = static_cast<T>(0);
230 }
231 for (size_t j = 0; j <= m_den_order; j++) {
232 m_py[j] = static_cast<T>(0);
233 }
234 // Each channel is processed with the sliding window
235 for (size_t i = x_idx - shape_1; i < x_idx; i++) {
236 m_px[m_num_order] = signal[i];
237 for (size_t j = 0; j < m_num_order + 1; j++) {
238 m_py[m_num_order] += b_coeffs[j] * m_px[m_num_order - j];
239 }
240 for (size_t j = 1; j < m_den_order + 1; j++) {
241 m_py[m_num_order] -= a_coeffs[j] * m_py[m_num_order - j];
242 }
243 if (clamp) {
244 if (m_py[m_num_order] > static_cast<T>(1.))
245 out_vect[i] = static_cast<T>(1.);
246 else if (m_py[m_num_order] < static_cast<T>(-1.))
247 out_vect[i] = static_cast<T>(-1.);
248 else
249 out_vect[i] = m_py[m_num_order];
250 } else {
251 out_vect[i] = m_py[m_num_order];
252 }
253 if (i + 1 == x_idx) continue;
254 for (size_t j = 0; j < m_num_order; j++) {
255 m_px[j] = m_px[j + 1];
256 }
257 for (size_t j = 0; j < m_num_order; j++) {
258 m_py[j] = m_py[j + 1];
259 }
260 m_py[m_num_order] = static_cast<T>(0);
261 }
262 if (x_idx % shape_1 == 0) {
263 ++channel_idx;
264 }
265 }
266 // unpack batch
267 Tensor::CreateFromVector(out_vect, input_shape, &out);
268 *output = out;
269 delete[] m_px;
270 delete[] m_py;
271 return Status::OK();
272 }
273
/// \brief Stretch STFT in time at a given rate, without changing the pitch.
/// \param input: Tensor of shape <..., freq, time>.
/// \param output: Tensor after stretch in time domain.
/// \param rate: Stretch factor.
/// \param hop_length: The audio sample length between adjacent STFT columns.
/// \param n_freq: Number of frequency bins.
/// \return Status code.
280 Status TimeStretch(std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, float rate, float hop_length,
281 float n_freq);
282
283 /// \brief Apply a mask along axis.
284 /// \param input: Tensor of shape <..., freq, time>.
285 /// \param output: Tensor of shape <..., freq, time>.
286 /// \param mask_param: Number of columns to be masked will be uniformly sampled from [0, mask_param].
287 /// \param mask_value: Value to assign to the masked columns.
288 /// \param axis: Axis to apply masking on (1 -> frequency, 2 -> time).
289 /// \param rnd: Number generator.
290 /// \return Status code.
291 Status RandomMaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t mask_param,
292 float mask_value, int axis, std::mt19937 rnd);
293
294 /// \brief Apply a mask along axis. All examples will have the same mask interval.
295 /// \param input: Tensor of shape <..., freq, time>.
296 /// \param output: Tensor of shape <..., freq, time>.
297 /// \param mask_width: The width of the mask.
298 /// \param mask_start: Starting position of the mask.
299 /// Mask will be applied from indices [mask_start, mask_start + mask_width).
300 /// \param mask_value: Value to assign to the masked columns.
301 /// \param axis: Axis to apply masking on (1 -> frequency, 2 -> time).
302 /// \return Status code.
303 Status MaskAlongAxis(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t mask_width,
304 int32_t mask_start, float mask_value, int32_t axis);
305
/// \brief Compute the norm of complex tensor input.
/// \param input Tensor of shape <..., complex=2>.
/// \param output Tensor of shape <...>.
/// \param power Power of the norm.
/// \return Status code.
311 Status ComplexNorm(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, float power);
312
313 /// \brief Decode mu-law encoded signal.
314 /// \param input Tensor of shape <..., time>.
315 /// \param output Tensor of shape <..., time>.
316 /// \param quantization_channels Number of channels.
317 /// \return Status code.
318 Status MuLawDecoding(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int quantization_channels);
319
320 /// \brief Add a fade in and/or fade out to an input.
321 /// \param[in] input: The input tensor.
322 /// \param[out] output: Added fade in and/or fade out audio with the same shape.
323 /// \param[in] fade_in_len: Length of fade-in (time frames).
324 /// \param[in] fade_out_len: Length of fade-out (time frames).
325 /// \param[in] fade_shape: Shape of fade.
326 Status Fade(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, int32_t fade_in_len,
327 int32_t fade_out_len, FadeShape fade_shape);
328
329 /// \brief Add a volume to an waveform.
330 /// \param input/output: Tensor of shape <..., time>.
331 /// \param gain: Gain value, varies according to the value of gain_type.
332 /// \param gain_type: Type of gain, should be one of [GainType::kAmplitude, GainType::kDb, GainType::kPower].
333 /// \return Status code.
334 template <typename T>
Vol(const std::shared_ptr<Tensor> & input,std::shared_ptr<Tensor> * output,T gain,GainType gain_type)335 Status Vol(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, T gain, GainType gain_type) {
336 const T lower_bound = -1;
337 const T upper_bound = 1;
338
339 // DB is a unit which converts a numeric value into decibel scale and for conversion, we have to use log10
340 // A(in dB) = 20log10(A in amplitude)
341 // When referring to measurements of power quantities, a ratio can be expressed as a level in decibels by evaluating
342 // ten times the base-10 logarithm of the ratio of the measured quantity to reference value
343 // A(in dB) = 10log10(A in power)
344 const int power_factor_div = 20;
345 const int power_factor_mul = 10;
346 const int base = 10;
347
348 if (gain_type == GainType::kDb) {
349 if (gain != 0) {
350 gain = std::pow(base, (gain / power_factor_div));
351 }
352 } else if (gain_type == GainType::kPower) {
353 gain = power_factor_mul * std::log10(gain);
354 gain = std::pow(base, (gain / power_factor_div));
355 }
356
357 for (auto itr = input->begin<T>(); itr != input->end<T>(); itr++) {
358 if (gain != 0 || gain_type == GainType::kAmplitude) {
359 *itr = (*itr) * gain;
360 }
361 *itr = std::min(std::max((*itr), lower_bound), upper_bound);
362 }
363
364 *output = input;
365
366 return Status::OK();
367 }
368
369 /// \brief Separate a complex-valued spectrogram with shape (…, 2) into its magnitude and phase.
370 /// \param input: Complex tensor.
371 /// \param output: The magnitude and phase of the complex tensor.
372 /// \param power: Power of the norm.
373 Status Magphase(const TensorRow &input, TensorRow *output, float power);
374
375 } // namespace dataset
376 } // namespace mindspore
377 #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_AUDIO_KERNELS_AUDIO_UTILS_H_
378