• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
# SPDX-License-Identifier: MIT
3
"""Classes used to extract Mel-frequency cepstral coefficients (MFCC) from audio frames and to build model input features from them."""
5
6import numpy as np
7
8
class MFCCParams:
    """
    Plain container for the settings consumed by the MFCC calculation.

    Attributes:
        sampling_freq: Sampling frequency of the audio, in Hz.
        num_fbank_bins: Number of mel filter bank bins.
        mel_lo_freq: Lower edge of the mel filter bank, in Hz.
        mel_hi_freq: Upper edge of the mel filter bank, in Hz.
        num_mfcc_feats: Number of MFCC features produced per frame.
        frame_len: Number of audio samples in one frame.
        use_htk_method: Boolean selecting the HTK mel formula.
        n_FFT: Number of points of the FFT.
    """

    def __init__(self, sampling_freq, num_fbank_bins,
                 mel_lo_freq, mel_hi_freq, num_mfcc_feats, frame_len, use_htk_method, n_FFT):
        # The values are stored untouched; no validation or conversion happens here.
        self.sampling_freq, self.num_fbank_bins = sampling_freq, num_fbank_bins
        self.mel_lo_freq, self.mel_hi_freq = mel_lo_freq, mel_hi_freq
        self.num_mfcc_feats, self.frame_len = num_mfcc_feats, frame_len
        self.use_htk_method, self.n_FFT = use_htk_method, n_FFT
20
21
class MFCC:
    """
    Computes Mel-frequency cepstral coefficients (MFCC) for single audio frames.

    The DCT matrix, the mel filter bank and the analysis window only depend on the
    parameters, so they are computed once at construction time and reused for every frame.

    Note: the analysis window has n_FFT points and is multiplied element-wise with a frame
    of frame_len samples, so frame_len is expected to be equal to n_FFT.
    """

    def __init__(self, mfcc_params):
        """
        Pre-computes everything that does not depend on the audio data.

        Args:
            mfcc_params: Object exposing sampling_freq, num_fbank_bins, mel_lo_freq,
                         mel_hi_freq, num_mfcc_feats, frame_len and n_FFT
                         (for instance an MFCCParams).
        """
        self.mfcc_params = mfcc_params
        # Constants of the piecewise mel scale: linear below MIN_LOG_HZ, logarithmic above.
        self.FREQ_STEP = 200.0 / 3
        self.MIN_LOG_HZ = 1000.0
        self.MIN_LOG_MEL = self.MIN_LOG_HZ / self.FREQ_STEP
        self.LOG_STEP = 1.8562979903656 / 27.0

        num_filters = self.mfcc_params.num_fbank_bins
        n_fft = self.mfcc_params.n_FFT

        # First / last FFT bin touched by each filter; filled in by create_mel_filter_bank().
        self.__filter_bank_filter_first = np.zeros(num_filters)
        self.__filter_bank_filter_last = np.zeros(num_filters)
        self.__dct_matrix = self.create_dct_matrix(num_filters, self.mfcc_params.num_mfcc_feats)
        self.__mel_filter_bank = self.create_mel_filter_bank()

        # The window is the same for every frame, so build it only once.
        self.__window = np.hanning(n_fft + 1)[0:n_fft]

        # Dense (num_fbank_bins x rfft bins) matrix so that the filter bank can be applied
        # with a single matrix product. It is stored as float32 because mfcc_compute()
        # evaluates the mel energies in float32.
        dense_bank = np.zeros([num_filters, int(n_fft / 2) + 1])
        for row, weights in enumerate(self.__mel_filter_bank):
            first = int(self.__filter_bank_filter_first[row])
            if first >= 0:  # -1 marks a filter that covers no FFT bin
                dense_bank[row, first:first + len(weights)] = weights
        self.__np_mel_bank = dense_bank.astype(np.float32)

    def mel_scale(self, freq, use_htk_method):
        """
        Converts a frequency in Hz to the mel scale.

        Args:
            freq: The frequency in Hz (a scalar).
            use_htk_method: Boolean to set whether to use HTK method or not.

        Returns:
            the mel value
        """
        if use_htk_method:
            return 1127.0 * np.log(1.0 + freq / 700.0)
        if freq >= self.MIN_LOG_HZ:
            return self.MIN_LOG_MEL + np.log(freq / self.MIN_LOG_HZ) / self.LOG_STEP
        return freq / self.FREQ_STEP

    def inv_mel_scale(self, mel_freq, use_htk_method):
        """
        Converts a mel value back to a frequency in Hz (inverse of mel_scale).

        Args:
            mel_freq: The mel frequency (a scalar).
            use_htk_method: Boolean to set whether to use HTK method or not.

        Returns:
            the frequency in Hz
        """
        if use_htk_method:
            return 700.0 * (np.exp(mel_freq / 1127.0) - 1.0)
        if mel_freq >= self.MIN_LOG_MEL:
            return self.MIN_LOG_HZ * np.exp(self.LOG_STEP * (mel_freq - self.MIN_LOG_MEL))
        return self.FREQ_STEP * mel_freq

    def mfcc_compute(self, audio_data):
        """
        Extracts the MFCC for a single frame.

        Args:
            audio_data: The audio data to process, frame_len samples long.

        Returns:
            the MFCC features, one value per MFCC feature

        Raises:
            ValueError: if audio_data does not hold exactly frame_len samples.
        """
        if len(audio_data) != self.mfcc_params.frame_len:
            raise ValueError(
                f"audio_data buffer size {len(audio_data)} does not match the frame length {self.mfcc_params.frame_len}")

        frame = np.asarray(audio_data)
        # Power spectrum of the windowed frame.
        spec = np.abs(np.fft.rfft(self.__window * frame, self.mfcc_params.n_FFT)) ** 2
        mel_energy = np.dot(self.__np_mel_bank, spec.astype(np.float32))

        # Small offset so that the logarithm of an empty band stays finite.
        mel_energy += 1e-10
        log_mel_energy = 10.0 * np.log10(mel_energy)

        # Limit the dynamic range to top_db below the loudest band.
        top_db = 80.0
        log_mel_energy = np.maximum(log_mel_energy, log_mel_energy.max() - top_db)

        return np.dot(self.__dct_matrix, log_mel_energy)

    def create_dct_matrix(self, num_fbank_bins, num_mfcc_feats):
        """
        Creates the Discrete Cosine Transform matrix to be used in the compute function.

        Args:
            num_fbank_bins: The number of filter bank bins
            num_mfcc_feats: the number of MFCC features

        Returns:
            the DCT matrix, of shape (num_mfcc_feats, num_fbank_bins)
        """
        k = np.arange(num_mfcc_feats).reshape(-1, 1)
        n = np.arange(num_fbank_bins).reshape(1, -1)
        cosines = np.cos((np.pi / num_fbank_bins) * (n + 0.5) * k)

        # Row 0 uses a different scale factor from all the other rows.
        scale = np.full((num_mfcc_feats, 1), 2 * np.sqrt(1 / (2 * num_fbank_bins)))
        scale[:1] = 2 * np.sqrt(1 / (4 * num_fbank_bins))
        return scale * cosines

    def create_mel_filter_bank(self):
        """
        Creates the Mel filter bank.

        The triangular filters are laid out on the bins of the n_FFT point FFT used by
        mfcc_compute(). As a side effect the first and last FFT bin covered by each
        filter are recorded (-1 for a filter that covers no bin).

        Returns:
            the mel filter bank: one array of weights per filter, holding only the
            weights between the first and the last FFT bin covered by that filter
        """
        n_fft = self.mfcc_params.n_FFT
        num_fft_bins = int(n_fft / 2)
        fft_bin_width = self.mfcc_params.sampling_freq / n_fft

        # NOTE(review): the non-HTK scale is always used here, mfcc_params.use_htk_method
        # is not consulted - confirm that this is intended.
        mel_low_freq = self.mel_scale(self.mfcc_params.mel_lo_freq, False)
        mel_high_freq = self.mel_scale(self.mfcc_params.mel_hi_freq, False)
        mel_freq_delta = (mel_high_freq - mel_low_freq) / (self.mfcc_params.num_fbank_bins + 1)

        # The mel value of each FFT bin is the same for every filter: compute it once.
        bin_mels = np.array([self.mel_scale(fft_bin_width * i, False) for i in range(num_fft_bins)],
                            dtype=float)

        mel_fbank = []
        for bin_num in range(self.mfcc_params.num_fbank_bins):
            left_mel = mel_low_freq + bin_num * mel_freq_delta
            center_mel = mel_low_freq + (bin_num + 1) * mel_freq_delta
            right_mel = mel_low_freq + (bin_num + 2) * mel_freq_delta

            inside = (bin_mels > left_mel) & (bin_mels < right_mel)
            covered = np.flatnonzero(inside)
            if covered.size == 0:
                self.__filter_bank_filter_first[bin_num] = -1
                self.__filter_bank_filter_last[bin_num] = -1
                mel_fbank.append(np.zeros(0))
                continue

            first_index, last_index = int(covered[0]), int(covered[-1])
            span = bin_mels[first_index:last_index + 1]
            rising = (span - left_mel) / (center_mel - left_mel)
            falling = (right_mel - span) / (right_mel - center_mel)

            # Scale each triangle by the inverse of its width in Hz.
            enorm = 2.0 / (self.inv_mel_scale(right_mel, False) - self.inv_mel_scale(left_mel, False))
            weights = np.where(span <= center_mel, rising, falling) * enorm
            # Bins between first and last that are not inside the triangle carry no weight.
            weights = np.where(inside[first_index:last_index + 1], weights, 0.0)

            self.__filter_bank_filter_first[bin_num] = first_index
            self.__filter_bank_filter_last[bin_num] = last_index
            mel_fbank.append(weights)

        return mel_fbank
193
194
class Preprocessor:
    """
    Builds the model input from raw audio: MFCC features of consecutive frames together
    with their first and second order derivatives, each normalized to mean 0 and std 1.
    """

    def __init__(self, mfcc, model_input_size, stride):
        """
        Args:
            mfcc: Object used to compute the MFCC of one frame. It has to provide
                  mfcc_compute(frame) as well as mfcc_params.frame_len and
                  mfcc_params.num_mfcc_feats.
            model_input_size: Number of frames (rows) of one model input.
            stride: Number of samples between the starts of two consecutive frames.
        """
        self.model_input_size = model_input_size
        self.stride = stride

        # Savitzky - Golay differential filters
        self.__savgol_order1_coeffs = np.array([6.66666667e-02, 5.00000000e-02, 3.33333333e-02,
                                                1.66666667e-02, -3.46944695e-18, -1.66666667e-02,
                                                -3.33333333e-02, -5.00000000e-02, -6.66666667e-02])

        self.savgol_order2_coeffs = np.array([0.06060606, 0.01515152, -0.01731602,
                                              -0.03679654, -0.04329004, -0.03679654,
                                              -0.01731602, 0.01515152, 0.06060606])

        self.__mfcc_calc = mfcc

    def __normalize(self, values):
        """
        Normalize values to mean 0 and std 1

        When all the values are equal the standard deviation is 0: the centred values
        (all zeros) are returned instead of dividing by zero.
        """
        centred = values - np.mean(values)
        spread = np.std(values)
        if spread == 0:
            return centred
        return centred / spread

    def __get_features(self, features, mfcc_instance, audio_data):
        """
        Appends to features the MFCC of model_input_size consecutive frames, the start of
        each frame being stride samples after the start of the previous one.
        """
        frame_len = int(mfcc_instance.mfcc_params.frame_len)
        start = 0
        for _ in range(self.model_input_size):
            features.extend(mfcc_instance.mfcc_compute(audio_data[start:start + frame_len]))
            start += self.stride

    def extract_features(self, audio_data):
        """
        Extracts the MFCC features, and calculates each features first and second order derivative.
        The matrix returned is sized for input to the model: one row per frame and, per row,
        the MFCC features followed by their first and then their second order derivatives.

        Args:
            audio_data: the audio data to be used for this calculation
        Returns:
            the derived MFCC feature vector as float32, of shape
            (model_input_size, 3 * num_mfcc_feats)
        Raises:
            ValueError: if audio_data holds fewer samples than needed for one inference
        """
        mfcc_params = self.__mfcc_calc.mfcc_params
        num_samples_per_inference = ((self.model_input_size - 1)
                                     * self.stride) + mfcc_params.frame_len
        if len(audio_data) < num_samples_per_inference:
            raise ValueError("audio_data size for feature extraction is smaller than "
                             "the expected number of samples needed for inference")

        features = []
        self.__get_features(features, self.__mfcc_calc, np.asarray(audio_data))
        features = np.reshape(np.array(features), (self.model_input_size, mfcc_params.num_mfcc_feats))

        mfcc_delta_np = np.zeros_like(features)
        mfcc_delta2_np = np.zeros_like(features)

        # The derivatives are taken along the time axis, one MFCC coefficient at a time.
        for col in range(features.shape[1]):
            coefficient = features[:, col]
            mfcc_delta_np[:, col] = np.convolve(coefficient, self.__savgol_order1_coeffs, 'same')
            mfcc_delta2_np[:, col] = np.convolve(coefficient, self.savgol_order2_coeffs, 'same')

        # Each of the three blocks is normalized on its own before being joined.
        features = np.concatenate((self.__normalize(features), self.__normalize(mfcc_delta_np),
                                   self.__normalize(mfcc_delta2_np)), axis=1)

        return np.float32(features)
261