1 /*
2 * Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "modules/audio_coding/codecs/cng/audio_encoder_cng.h"
12
13 #include <cstdint>
14 #include <memory>
15 #include <utility>
16
17 #include "absl/types/optional.h"
18 #include "api/units/time_delta.h"
19 #include "modules/audio_coding/codecs/cng/webrtc_cng.h"
20 #include "rtc_base/checks.h"
21
22 namespace webrtc {
23
24 namespace {
25
// Upper bound, in milliseconds, on the packet duration accepted when encoding
// with VAD/CNG.
constexpr int kMaxFrameSizeMs = 60;
27
28 class AudioEncoderCng final : public AudioEncoder {
29 public:
30 explicit AudioEncoderCng(AudioEncoderCngConfig&& config);
31 ~AudioEncoderCng() override;
32
33 // Not copyable or moveable.
34 AudioEncoderCng(const AudioEncoderCng&) = delete;
35 AudioEncoderCng(AudioEncoderCng&&) = delete;
36 AudioEncoderCng& operator=(const AudioEncoderCng&) = delete;
37 AudioEncoderCng& operator=(AudioEncoderCng&&) = delete;
38
39 int SampleRateHz() const override;
40 size_t NumChannels() const override;
41 int RtpTimestampRateHz() const override;
42 size_t Num10MsFramesInNextPacket() const override;
43 size_t Max10MsFramesInAPacket() const override;
44 int GetTargetBitrate() const override;
45 EncodedInfo EncodeImpl(uint32_t rtp_timestamp,
46 rtc::ArrayView<const int16_t> audio,
47 rtc::Buffer* encoded) override;
48 void Reset() override;
49 bool SetFec(bool enable) override;
50 bool SetDtx(bool enable) override;
51 bool SetApplication(Application application) override;
52 void SetMaxPlaybackRate(int frequency_hz) override;
53 rtc::ArrayView<std::unique_ptr<AudioEncoder>> ReclaimContainedEncoders()
54 override;
55 void OnReceivedUplinkPacketLossFraction(
56 float uplink_packet_loss_fraction) override;
57 void OnReceivedUplinkBandwidth(
58 int target_audio_bitrate_bps,
59 absl::optional<int64_t> bwe_period_ms) override;
60 absl::optional<std::pair<TimeDelta, TimeDelta>> GetFrameLengthRange()
61 const override;
62
63 private:
64 EncodedInfo EncodePassive(size_t frames_to_encode, rtc::Buffer* encoded);
65 EncodedInfo EncodeActive(size_t frames_to_encode, rtc::Buffer* encoded);
66 size_t SamplesPer10msFrame() const;
67
68 std::unique_ptr<AudioEncoder> speech_encoder_;
69 const int cng_payload_type_;
70 const int num_cng_coefficients_;
71 const int sid_frame_interval_ms_;
72 std::vector<int16_t> speech_buffer_;
73 std::vector<uint32_t> rtp_timestamps_;
74 bool last_frame_active_;
75 std::unique_ptr<Vad> vad_;
76 std::unique_ptr<ComfortNoiseEncoder> cng_encoder_;
77 };
78
AudioEncoderCng(AudioEncoderCngConfig && config)79 AudioEncoderCng::AudioEncoderCng(AudioEncoderCngConfig&& config)
80 : speech_encoder_((static_cast<void>([&] {
81 RTC_CHECK(config.IsOk()) << "Invalid configuration.";
82 }()),
83 std::move(config.speech_encoder))),
84 cng_payload_type_(config.payload_type),
85 num_cng_coefficients_(config.num_cng_coefficients),
86 sid_frame_interval_ms_(config.sid_frame_interval_ms),
87 last_frame_active_(true),
88 vad_(config.vad ? std::unique_ptr<Vad>(config.vad)
89 : CreateVad(config.vad_mode)),
90 cng_encoder_(new ComfortNoiseEncoder(SampleRateHz(),
91 sid_frame_interval_ms_,
92 num_cng_coefficients_)) {}
93
94 AudioEncoderCng::~AudioEncoderCng() = default;
95
SampleRateHz() const96 int AudioEncoderCng::SampleRateHz() const {
97 return speech_encoder_->SampleRateHz();
98 }
99
NumChannels() const100 size_t AudioEncoderCng::NumChannels() const {
101 return 1;
102 }
103
RtpTimestampRateHz() const104 int AudioEncoderCng::RtpTimestampRateHz() const {
105 return speech_encoder_->RtpTimestampRateHz();
106 }
107
Num10MsFramesInNextPacket() const108 size_t AudioEncoderCng::Num10MsFramesInNextPacket() const {
109 return speech_encoder_->Num10MsFramesInNextPacket();
110 }
111
Max10MsFramesInAPacket() const112 size_t AudioEncoderCng::Max10MsFramesInAPacket() const {
113 return speech_encoder_->Max10MsFramesInAPacket();
114 }
115
GetTargetBitrate() const116 int AudioEncoderCng::GetTargetBitrate() const {
117 return speech_encoder_->GetTargetBitrate();
118 }
119
EncodeImpl(uint32_t rtp_timestamp,rtc::ArrayView<const int16_t> audio,rtc::Buffer * encoded)120 AudioEncoder::EncodedInfo AudioEncoderCng::EncodeImpl(
121 uint32_t rtp_timestamp,
122 rtc::ArrayView<const int16_t> audio,
123 rtc::Buffer* encoded) {
124 const size_t samples_per_10ms_frame = SamplesPer10msFrame();
125 RTC_CHECK_EQ(speech_buffer_.size(),
126 rtp_timestamps_.size() * samples_per_10ms_frame);
127 rtp_timestamps_.push_back(rtp_timestamp);
128 RTC_DCHECK_EQ(samples_per_10ms_frame, audio.size());
129 speech_buffer_.insert(speech_buffer_.end(), audio.cbegin(), audio.cend());
130 const size_t frames_to_encode = speech_encoder_->Num10MsFramesInNextPacket();
131 if (rtp_timestamps_.size() < frames_to_encode) {
132 return EncodedInfo();
133 }
134 RTC_CHECK_LE(frames_to_encode * 10, kMaxFrameSizeMs)
135 << "Frame size cannot be larger than " << kMaxFrameSizeMs
136 << " ms when using VAD/CNG.";
137
138 // Group several 10 ms blocks per VAD call. Call VAD once or twice using the
139 // following split sizes:
140 // 10 ms = 10 + 0 ms; 20 ms = 20 + 0 ms; 30 ms = 30 + 0 ms;
141 // 40 ms = 20 + 20 ms; 50 ms = 30 + 20 ms; 60 ms = 30 + 30 ms.
142 size_t blocks_in_first_vad_call =
143 (frames_to_encode > 3 ? 3 : frames_to_encode);
144 if (frames_to_encode == 4)
145 blocks_in_first_vad_call = 2;
146 RTC_CHECK_GE(frames_to_encode, blocks_in_first_vad_call);
147 const size_t blocks_in_second_vad_call =
148 frames_to_encode - blocks_in_first_vad_call;
149
150 // Check if all of the buffer is passive speech. Start with checking the first
151 // block.
152 Vad::Activity activity = vad_->VoiceActivity(
153 &speech_buffer_[0], samples_per_10ms_frame * blocks_in_first_vad_call,
154 SampleRateHz());
155 if (activity == Vad::kPassive && blocks_in_second_vad_call > 0) {
156 // Only check the second block if the first was passive.
157 activity = vad_->VoiceActivity(
158 &speech_buffer_[samples_per_10ms_frame * blocks_in_first_vad_call],
159 samples_per_10ms_frame * blocks_in_second_vad_call, SampleRateHz());
160 }
161
162 EncodedInfo info;
163 switch (activity) {
164 case Vad::kPassive: {
165 info = EncodePassive(frames_to_encode, encoded);
166 last_frame_active_ = false;
167 break;
168 }
169 case Vad::kActive: {
170 info = EncodeActive(frames_to_encode, encoded);
171 last_frame_active_ = true;
172 break;
173 }
174 case Vad::kError: {
175 FATAL(); // Fails only if fed invalid data.
176 break;
177 }
178 }
179
180 speech_buffer_.erase(
181 speech_buffer_.begin(),
182 speech_buffer_.begin() + frames_to_encode * samples_per_10ms_frame);
183 rtp_timestamps_.erase(rtp_timestamps_.begin(),
184 rtp_timestamps_.begin() + frames_to_encode);
185 return info;
186 }
187
Reset()188 void AudioEncoderCng::Reset() {
189 speech_encoder_->Reset();
190 speech_buffer_.clear();
191 rtp_timestamps_.clear();
192 last_frame_active_ = true;
193 vad_->Reset();
194 cng_encoder_.reset(new ComfortNoiseEncoder(
195 SampleRateHz(), sid_frame_interval_ms_, num_cng_coefficients_));
196 }
197
SetFec(bool enable)198 bool AudioEncoderCng::SetFec(bool enable) {
199 return speech_encoder_->SetFec(enable);
200 }
201
SetDtx(bool enable)202 bool AudioEncoderCng::SetDtx(bool enable) {
203 return speech_encoder_->SetDtx(enable);
204 }
205
SetApplication(Application application)206 bool AudioEncoderCng::SetApplication(Application application) {
207 return speech_encoder_->SetApplication(application);
208 }
209
SetMaxPlaybackRate(int frequency_hz)210 void AudioEncoderCng::SetMaxPlaybackRate(int frequency_hz) {
211 speech_encoder_->SetMaxPlaybackRate(frequency_hz);
212 }
213
214 rtc::ArrayView<std::unique_ptr<AudioEncoder>>
ReclaimContainedEncoders()215 AudioEncoderCng::ReclaimContainedEncoders() {
216 return rtc::ArrayView<std::unique_ptr<AudioEncoder>>(&speech_encoder_, 1);
217 }
218
OnReceivedUplinkPacketLossFraction(float uplink_packet_loss_fraction)219 void AudioEncoderCng::OnReceivedUplinkPacketLossFraction(
220 float uplink_packet_loss_fraction) {
221 speech_encoder_->OnReceivedUplinkPacketLossFraction(
222 uplink_packet_loss_fraction);
223 }
224
OnReceivedUplinkBandwidth(int target_audio_bitrate_bps,absl::optional<int64_t> bwe_period_ms)225 void AudioEncoderCng::OnReceivedUplinkBandwidth(
226 int target_audio_bitrate_bps,
227 absl::optional<int64_t> bwe_period_ms) {
228 speech_encoder_->OnReceivedUplinkBandwidth(target_audio_bitrate_bps,
229 bwe_period_ms);
230 }
231
232 absl::optional<std::pair<TimeDelta, TimeDelta>>
GetFrameLengthRange() const233 AudioEncoderCng::GetFrameLengthRange() const {
234 return speech_encoder_->GetFrameLengthRange();
235 }
236
EncodePassive(size_t frames_to_encode,rtc::Buffer * encoded)237 AudioEncoder::EncodedInfo AudioEncoderCng::EncodePassive(
238 size_t frames_to_encode,
239 rtc::Buffer* encoded) {
240 bool force_sid = last_frame_active_;
241 bool output_produced = false;
242 const size_t samples_per_10ms_frame = SamplesPer10msFrame();
243 AudioEncoder::EncodedInfo info;
244
245 for (size_t i = 0; i < frames_to_encode; ++i) {
246 // It's important not to pass &info.encoded_bytes directly to
247 // WebRtcCng_Encode(), since later loop iterations may return zero in
248 // that value, in which case we don't want to overwrite any value from
249 // an earlier iteration.
250 size_t encoded_bytes_tmp =
251 cng_encoder_->Encode(rtc::ArrayView<const int16_t>(
252 &speech_buffer_[i * samples_per_10ms_frame],
253 samples_per_10ms_frame),
254 force_sid, encoded);
255
256 if (encoded_bytes_tmp > 0) {
257 RTC_CHECK(!output_produced);
258 info.encoded_bytes = encoded_bytes_tmp;
259 output_produced = true;
260 force_sid = false;
261 }
262 }
263
264 info.encoded_timestamp = rtp_timestamps_.front();
265 info.payload_type = cng_payload_type_;
266 info.send_even_if_empty = true;
267 info.speech = false;
268 return info;
269 }
270
EncodeActive(size_t frames_to_encode,rtc::Buffer * encoded)271 AudioEncoder::EncodedInfo AudioEncoderCng::EncodeActive(size_t frames_to_encode,
272 rtc::Buffer* encoded) {
273 const size_t samples_per_10ms_frame = SamplesPer10msFrame();
274 AudioEncoder::EncodedInfo info;
275 for (size_t i = 0; i < frames_to_encode; ++i) {
276 info =
277 speech_encoder_->Encode(rtp_timestamps_.front(),
278 rtc::ArrayView<const int16_t>(
279 &speech_buffer_[i * samples_per_10ms_frame],
280 samples_per_10ms_frame),
281 encoded);
282 if (i + 1 == frames_to_encode) {
283 RTC_CHECK_GT(info.encoded_bytes, 0) << "Encoder didn't deliver data.";
284 } else {
285 RTC_CHECK_EQ(info.encoded_bytes, 0)
286 << "Encoder delivered data too early.";
287 }
288 }
289 return info;
290 }
291
SamplesPer10msFrame() const292 size_t AudioEncoderCng::SamplesPer10msFrame() const {
293 return rtc::CheckedDivExact(10 * SampleRateHz(), 1000);
294 }
295
296 } // namespace
297
298 AudioEncoderCngConfig::AudioEncoderCngConfig() = default;
299 AudioEncoderCngConfig::AudioEncoderCngConfig(AudioEncoderCngConfig&&) = default;
300 AudioEncoderCngConfig::~AudioEncoderCngConfig() = default;
301
IsOk() const302 bool AudioEncoderCngConfig::IsOk() const {
303 if (num_channels != 1)
304 return false;
305 if (!speech_encoder)
306 return false;
307 if (num_channels != speech_encoder->NumChannels())
308 return false;
309 if (sid_frame_interval_ms <
310 static_cast<int>(speech_encoder->Max10MsFramesInAPacket() * 10))
311 return false;
312 if (num_cng_coefficients > WEBRTC_CNG_MAX_LPC_ORDER ||
313 num_cng_coefficients <= 0)
314 return false;
315 return true;
316 }
317
CreateComfortNoiseEncoder(AudioEncoderCngConfig && config)318 std::unique_ptr<AudioEncoder> CreateComfortNoiseEncoder(
319 AudioEncoderCngConfig&& config) {
320 return std::make_unique<AudioEncoderCng>(std::move(config));
321 }
322
323 } // namespace webrtc
324