/*
 * Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_processing/agc2/rnn_vad/rnn.h"

// Defines WEBRTC_ARCH_X86_FAMILY, used below.
#include "rtc_base/system/arch.h"

#if defined(WEBRTC_HAS_NEON)
#include <arm_neon.h>
#endif
#if defined(WEBRTC_ARCH_X86_FAMILY)
#include <emmintrin.h>
#endif
#include <algorithm>
#include <array>
#include <cmath>
#include <numeric>

#include "rtc_base/checks.h"
#include "rtc_base/logging.h"
#include "third_party/rnnoise/src/rnn_activations.h"
#include "third_party/rnnoise/src/rnn_vad_weights.h"

namespace webrtc {
namespace rnn_vad {
namespace {

using rnnoise::kWeightsScale;

using rnnoise::kInputLayerInputSize;
static_assert(kFeatureVectorSize == kInputLayerInputSize, "");
using rnnoise::kInputDenseBias;
using rnnoise::kInputDenseWeights;
using rnnoise::kInputLayerOutputSize;
static_assert(kInputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
              "Increase kFullyConnectedLayersMaxUnits.");

using rnnoise::kHiddenGruBias;
using rnnoise::kHiddenGruRecurrentWeights;
using rnnoise::kHiddenGruWeights;
using rnnoise::kHiddenLayerOutputSize;
static_assert(kHiddenLayerOutputSize <= kRecurrentLayersMaxUnits,
              "Increase kRecurrentLayersMaxUnits.");

using rnnoise::kOutputDenseBias;
using rnnoise::kOutputDenseWeights;
using rnnoise::kOutputLayerOutputSize;
static_assert(kOutputLayerOutputSize <= kFullyConnectedLayersMaxUnits,
              "Increase kFullyConnectedLayersMaxUnits.");

using rnnoise::SigmoidApproximated;
using rnnoise::TansigApproximated;

inline float RectifiedLinearUnit(float x) {
  return x < 0.f ? 0.f : x;
}

std::vector<float> GetScaledParams(rtc::ArrayView<const int8_t> params) {
  std::vector<float> scaled_params(params.size());
  std::transform(params.begin(), params.end(), scaled_params.begin(),
                 [](int8_t x) -> float {
                   return rnnoise::kWeightsScale * static_cast<float>(x);
                 });
  return scaled_params;
}

// TODO(bugs.chromium.org/10480): Hard-code optimized layout and remove this
// function to improve setup time.
// Casts and scales |weights| and re-arranges the layout.
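// The source |weights| are stored with the output dimension as the fastest
// varying index (i.e., laid out as [input_size, output_size]); the transposed
// destination layout makes the weights of each output unit contiguous, which
// matches the inner loop of the dot products in
// ComputeFullyConnectedLayerOutput().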
std::vector<float> GetPreprocessedFcWeights(
    rtc::ArrayView<const int8_t> weights,
    size_t output_size) {
  if (output_size == 1) {
    return GetScaledParams(weights);
  }
  // Transpose, scale and cast.
  const size_t input_size = rtc::CheckedDivExact(weights.size(), output_size);
  std::vector<float> w(weights.size());
  for (size_t o = 0; o < output_size; ++o) {
    for (size_t i = 0; i < input_size; ++i) {
      w[o * input_size + i] = rnnoise::kWeightsScale *
                              static_cast<float>(weights[i * output_size + o]);
    }
  }
  return w;
}

constexpr size_t kNumGruGates = 3;  // Update, reset, output.

// TODO(bugs.chromium.org/10480): Hard-code optimized layout and remove this
// function to improve setup time.
// Casts and scales |tensor_src| for a GRU layer and re-arranges the layout.
// It works for weights, recurrent weights and bias alike.
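// The source tensor is laid out as [n, kNumGruGates, output_size]; the
// destination groups all the coefficients of one gate together, as
// [kNumGruGates, output_size, n], so that each gate's parameters are
// contiguous and row-major with respect to the output units.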
std::vector<float> GetPreprocessedGruTensor(
    rtc::ArrayView<const int8_t> tensor_src,
    size_t output_size) {
  // Transpose, cast and scale.
  // |n| is the size of the first dimension of the 3-dim tensor |tensor_src|.
  const size_t n =
      rtc::CheckedDivExact(tensor_src.size(), output_size * kNumGruGates);
  const size_t stride_src = kNumGruGates * output_size;
  const size_t stride_dst = n * output_size;
  std::vector<float> tensor_dst(tensor_src.size());
  for (size_t g = 0; g < kNumGruGates; ++g) {
    for (size_t o = 0; o < output_size; ++o) {
      for (size_t i = 0; i < n; ++i) {
        tensor_dst[g * stride_dst + o * n + i] =
            rnnoise::kWeightsScale *
            static_cast<float>(
                tensor_src[i * stride_src + g * output_size + o]);
      }
    }
  }
  return tensor_dst;
}

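// Computes a sigmoid gate: for each output unit o,
// gate[o] = sigmoid(bias[o] + dot(weights row o, input) +
//                   dot(recurrent_weights row o, state)).
// Used for both the update and the reset gate of the GRU layer.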
void ComputeGruUpdateResetGates(size_t input_size,
                                size_t output_size,
                                rtc::ArrayView<const float> weights,
                                rtc::ArrayView<const float> recurrent_weights,
                                rtc::ArrayView<const float> bias,
                                rtc::ArrayView<const float> input,
                                rtc::ArrayView<const float> state,
                                rtc::ArrayView<float> gate) {
  for (size_t o = 0; o < output_size; ++o) {
    gate[o] = bias[o];
    for (size_t i = 0; i < input_size; ++i) {
      gate[o] += input[i] * weights[o * input_size + i];
    }
    for (size_t s = 0; s < output_size; ++s) {
      gate[o] += state[s] * recurrent_weights[o * output_size + s];
    }
    gate[o] = SigmoidApproximated(gate[o]);
  }
}

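// Computes the GRU candidate ("output") gate: for each output unit o,
// gate[o] = ReLU(bias[o] + dot(weights row o, input) +
//                dot(recurrent_weights row o, reset-gated state)).
// As in RNNoise, a rectified linear unit replaces the tanh activation of the
// conventional GRU formulation.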
void ComputeGruOutputGate(size_t input_size,
                          size_t output_size,
                          rtc::ArrayView<const float> weights,
                          rtc::ArrayView<const float> recurrent_weights,
                          rtc::ArrayView<const float> bias,
                          rtc::ArrayView<const float> input,
                          rtc::ArrayView<const float> state,
                          rtc::ArrayView<const float> reset,
                          rtc::ArrayView<float> gate) {
  for (size_t o = 0; o < output_size; ++o) {
    gate[o] = bias[o];
    for (size_t i = 0; i < input_size; ++i) {
      gate[o] += input[i] * weights[o * input_size + i];
    }
    for (size_t s = 0; s < output_size; ++s) {
      gate[o] += state[s] * recurrent_weights[o * output_size + s] * reset[s];
    }
    gate[o] = RectifiedLinearUnit(gate[o]);
  }
}

// Un-optimized implementation of a gated recurrent unit (GRU) layer.
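// For input x and previous state h, with u, r and c denoting the update,
// reset and candidate gates computed by the helpers above:
//   u = sigmoid(Wu x + Ru h + bu)
//   r = sigmoid(Wr x + Rr h + br)
//   c = ReLU(Wc x + Rc (r * h) + bc)   (element-wise product r * h)
//   h' = u * h + (1 - u) * c           (element-wise)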
void ComputeGruLayerOutput(size_t input_size,
                           size_t output_size,
                           rtc::ArrayView<const float> input,
                           rtc::ArrayView<const float> weights,
                           rtc::ArrayView<const float> recurrent_weights,
                           rtc::ArrayView<const float> bias,
                           rtc::ArrayView<float> state) {
  RTC_DCHECK_EQ(input_size, input.size());
  // Strides used to read the per-gate sections of the parameter arrays.
  const size_t stride_in = input_size * output_size;
  const size_t stride_out = output_size * output_size;

  // Update gate.
  std::array<float, kRecurrentLayersMaxUnits> update;
  ComputeGruUpdateResetGates(
      input_size, output_size, weights.subview(0, stride_in),
      recurrent_weights.subview(0, stride_out), bias.subview(0, output_size),
      input, state, update);

  // Reset gate.
  std::array<float, kRecurrentLayersMaxUnits> reset;
  ComputeGruUpdateResetGates(
      input_size, output_size, weights.subview(stride_in, stride_in),
      recurrent_weights.subview(stride_out, stride_out),
      bias.subview(output_size, output_size), input, state, reset);

  // Output gate.
  std::array<float, kRecurrentLayersMaxUnits> output;
  ComputeGruOutputGate(
      input_size, output_size, weights.subview(2 * stride_in, stride_in),
      recurrent_weights.subview(2 * stride_out, stride_out),
      bias.subview(2 * output_size, output_size), input, state, reset, output);

  // Blend the previous state and the candidate output through the update
  // gate and store the result as the new state.
  for (size_t o = 0; o < output_size; ++o) {
    output[o] = update[o] * state[o] + (1.f - update[o]) * output[o];
    state[o] = output[o];
  }
}

// Un-optimized implementation of a fully connected layer.
void ComputeFullyConnectedLayerOutput(
    size_t input_size,
    size_t output_size,
    rtc::ArrayView<const float> input,
    rtc::ArrayView<const float> bias,
    rtc::ArrayView<const float> weights,
    rtc::FunctionView<float(float)> activation_function,
    rtc::ArrayView<float> output) {
  RTC_DCHECK_EQ(input.size(), input_size);
  RTC_DCHECK_EQ(bias.size(), output_size);
  RTC_DCHECK_EQ(weights.size(), input_size * output_size);
  for (size_t o = 0; o < output_size; ++o) {
    output[o] = bias[o];
    // TODO(bugs.chromium.org/9076): Benchmark how different layouts for
    // |weights| change the performance across different platforms.
    for (size_t i = 0; i < input_size; ++i) {
      output[o] += input[i] * weights[o * input_size + i];
    }
    output[o] = activation_function(output[o]);
  }
}

#if defined(WEBRTC_ARCH_X86_FAMILY)
// SSE2 implementation of a fully connected layer.
void ComputeFullyConnectedLayerOutputSse2(
    size_t input_size,
    size_t output_size,
    rtc::ArrayView<const float> input,
    rtc::ArrayView<const float> bias,
    rtc::ArrayView<const float> weights,
    rtc::FunctionView<float(float)> activation_function,
    rtc::ArrayView<float> output) {
  RTC_DCHECK_EQ(input.size(), input_size);
  RTC_DCHECK_EQ(bias.size(), output_size);
  RTC_DCHECK_EQ(weights.size(), input_size * output_size);
  // Number of full 4-float blocks and index of the first remainder item.
  const size_t input_size_by_4 = input_size >> 2;
  const size_t offset = input_size & ~3;
  __m128 sum_wx_128;
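  // |v| gives scalar access to the four lanes of |sum_wx_128|; it is used
  // below to compute the horizontal sum after the vectorized loop.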
  const float* v = reinterpret_cast<const float*>(&sum_wx_128);
  for (size_t o = 0; o < output_size; ++o) {
    // Perform 128-bit vector operations.
    sum_wx_128 = _mm_set1_ps(0);
    const float* x_p = input.data();
    const float* w_p = weights.data() + o * input_size;
    for (size_t i = 0; i < input_size_by_4; ++i, x_p += 4, w_p += 4) {
      sum_wx_128 = _mm_add_ps(sum_wx_128,
                              _mm_mul_ps(_mm_loadu_ps(x_p), _mm_loadu_ps(w_p)));
    }
    // Perform non-vector operations for any remaining items, sum up the bias
    // term and the results from the vectorized code, and apply the activation
    // function.
    output[o] = activation_function(
        std::inner_product(input.begin() + offset, input.end(),
                           weights.begin() + o * input_size + offset,
                           bias[o] + v[0] + v[1] + v[2] + v[3]));
  }
}
#endif

}  // namespace

FullyConnectedLayer::FullyConnectedLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    rtc::FunctionView<float(float)> activation_function,
    Optimization optimization)
    : input_size_(input_size),
      output_size_(output_size),
      bias_(GetScaledParams(bias)),
      weights_(GetPreprocessedFcWeights(weights, output_size)),
      activation_function_(activation_function),
      optimization_(optimization) {
  RTC_DCHECK_LE(output_size_, kFullyConnectedLayersMaxUnits)
      << "Static over-allocation of fully-connected layers output vectors is "
         "not sufficient.";
  RTC_DCHECK_EQ(output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  RTC_DCHECK_EQ(input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
}

FullyConnectedLayer::~FullyConnectedLayer() = default;

rtc::ArrayView<const float> FullyConnectedLayer::GetOutput() const {
  return rtc::ArrayView<const float>(output_.data(), output_size_);
}

void FullyConnectedLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
    case Optimization::kSse2:
      ComputeFullyConnectedLayerOutputSse2(input_size_, output_size_, input,
                                           bias_, weights_,
                                           activation_function_, output_);
      break;
#endif
#if defined(WEBRTC_HAS_NEON)
    case Optimization::kNeon:
      // TODO(bugs.chromium.org/10480): Handle Optimization::kNeon.
      ComputeFullyConnectedLayerOutput(input_size_, output_size_, input, bias_,
                                       weights_, activation_function_, output_);
      break;
#endif
    default:
      ComputeFullyConnectedLayerOutput(input_size_, output_size_, input, bias_,
                                       weights_, activation_function_, output_);
  }
}

GatedRecurrentLayer::GatedRecurrentLayer(
    const size_t input_size,
    const size_t output_size,
    const rtc::ArrayView<const int8_t> bias,
    const rtc::ArrayView<const int8_t> weights,
    const rtc::ArrayView<const int8_t> recurrent_weights,
    Optimization optimization)
    : input_size_(input_size),
      output_size_(output_size),
      bias_(GetPreprocessedGruTensor(bias, output_size)),
      weights_(GetPreprocessedGruTensor(weights, output_size)),
      recurrent_weights_(
          GetPreprocessedGruTensor(recurrent_weights, output_size)),
      optimization_(optimization) {
  RTC_DCHECK_LE(output_size_, kRecurrentLayersMaxUnits)
      << "Static over-allocation of recurrent layers state vectors is not "
         "sufficient.";
  RTC_DCHECK_EQ(kNumGruGates * output_size_, bias_.size())
      << "Mismatching output size and bias terms array size.";
  RTC_DCHECK_EQ(kNumGruGates * input_size_ * output_size_, weights_.size())
      << "Mismatching input-output size and weight coefficients array size.";
  RTC_DCHECK_EQ(kNumGruGates * output_size_ * output_size_,
                recurrent_weights_.size())
      << "Mismatching input-output size and recurrent weight coefficients array"
         " size.";
  Reset();
}

GatedRecurrentLayer::~GatedRecurrentLayer() = default;

rtc::ArrayView<const float> GatedRecurrentLayer::GetOutput() const {
  return rtc::ArrayView<const float>(state_.data(), output_size_);
}

void GatedRecurrentLayer::Reset() {
  state_.fill(0.f);
}

void GatedRecurrentLayer::ComputeOutput(rtc::ArrayView<const float> input) {
  switch (optimization_) {
#if defined(WEBRTC_ARCH_X86_FAMILY)
    case Optimization::kSse2:
      // TODO(bugs.chromium.org/10480): Handle Optimization::kSse2.
      ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
                            recurrent_weights_, bias_, state_);
      break;
#endif
#if defined(WEBRTC_HAS_NEON)
    case Optimization::kNeon:
      // TODO(bugs.chromium.org/10480): Handle Optimization::kNeon.
      ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
                            recurrent_weights_, bias_, state_);
      break;
#endif
    default:
      ComputeGruLayerOutput(input_size_, output_size_, input, weights_,
                            recurrent_weights_, bias_, state_);
  }
}

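// Topology of the network: a fully connected input layer with tanh
// activation, a GRU hidden layer, and a fully connected output layer with
// sigmoid activation that produces the voice probability.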
RnnBasedVad::RnnBasedVad()
    : input_layer_(kInputLayerInputSize,
                   kInputLayerOutputSize,
                   kInputDenseBias,
                   kInputDenseWeights,
                   TansigApproximated,
                   DetectOptimization()),
      hidden_layer_(kInputLayerOutputSize,
                    kHiddenLayerOutputSize,
                    kHiddenGruBias,
                    kHiddenGruWeights,
                    kHiddenGruRecurrentWeights,
                    DetectOptimization()),
      output_layer_(kHiddenLayerOutputSize,
                    kOutputLayerOutputSize,
                    kOutputDenseBias,
                    kOutputDenseWeights,
                    SigmoidApproximated,
                    DetectOptimization()) {
  // Input-output chaining size checks.
  RTC_DCHECK_EQ(input_layer_.output_size(), hidden_layer_.input_size())
      << "The input and the hidden layer sizes do not match.";
  RTC_DCHECK_EQ(hidden_layer_.output_size(), output_layer_.input_size())
      << "The hidden and the output layer sizes do not match.";
}

RnnBasedVad::~RnnBasedVad() = default;

void RnnBasedVad::Reset() {
  hidden_layer_.Reset();
}

float RnnBasedVad::ComputeVadProbability(
    rtc::ArrayView<const float, kFeatureVectorSize> feature_vector,
    bool is_silence) {
  if (is_silence) {
    Reset();
    return 0.f;
  }
  input_layer_.ComputeOutput(feature_vector);
  hidden_layer_.ComputeOutput(input_layer_.GetOutput());
  output_layer_.ComputeOutput(hidden_layer_.GetOutput());
  const auto vad_output = output_layer_.GetOutput();
  return vad_output[0];
}

}  // namespace rnn_vad
}  // namespace webrtc