1 /*
2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 3-Clause Clear License
5 * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
6 * License was not distributed with this source code in the LICENSE file, you
7 * can obtain it at www.aomedia.org/license/software-license/bsd-3-c-c. If the
8 * Alliance for Open Media Patent License 1.0 was not distributed with this
9 * source code in the PATENTS file, you can obtain it at
10 * www.aomedia.org/license/patent.
11 */
12 #include "iamf/cli/proto_conversion/proto_to_obu/audio_frame_generator.h"
13
14 #include <algorithm>
15 #include <cstddef>
16 #include <cstdint>
17 #include <cstdio>
18 #include <cstdlib>
19 #include <cstring>
20 #include <deque>
21 #include <list>
22 #include <memory>
23 #include <optional>
24 #include <string>
25 #include <utility>
26 #include <vector>
27
28 #include "absl/container/flat_hash_map.h"
29 #include "absl/container/flat_hash_set.h"
30 #include "absl/log/check.h"
31 #include "absl/log/log.h"
32 #include "absl/memory/memory.h"
33 #include "absl/status/status.h"
34 #include "absl/strings/str_cat.h"
35 #include "absl/synchronization/mutex.h"
36 #include "absl/types/span.h"
37 #include "iamf/cli/audio_element_with_data.h"
38 #include "iamf/cli/audio_frame_with_data.h"
39 #include "iamf/cli/channel_label.h"
40 #include "iamf/cli/codec/aac_encoder.h"
41 #include "iamf/cli/codec/encoder_base.h"
42 #include "iamf/cli/codec/flac_encoder.h"
43 #include "iamf/cli/codec/lpcm_encoder.h"
44 #include "iamf/cli/codec/opus_encoder.h"
45 #include "iamf/cli/demixing_module.h"
46 #include "iamf/cli/global_timing_module.h"
47 #include "iamf/cli/parameters_manager.h"
48 #include "iamf/cli/proto/audio_frame.pb.h"
49 #include "iamf/cli/proto/codec_config.pb.h"
50 #include "iamf/cli/proto/test_vector_metadata.pb.h"
51 #include "iamf/cli/proto_conversion/channel_label_utils.h"
52 #include "iamf/common/utils/macros.h"
53 #include "iamf/obu/audio_frame.h"
54 #include "iamf/obu/codec_config.h"
55 #include "iamf/obu/demixing_info_parameter_data.h"
56 #include "iamf/obu/recon_gain_info_parameter_data.h"
57 #include "iamf/obu/types.h"
58 #include "src/google/protobuf/repeated_ptr_field.h"
59
60 namespace iamf_tools {
61
62 namespace {
63
64 constexpr bool kValidateCodecDelay = true;
65
InitializeEncoder(const iamf_tools_cli_proto::CodecConfig & codec_config_metadata,const CodecConfigObu & codec_config,int num_channels,std::unique_ptr<EncoderBase> & encoder,bool validate_codec_delay,int substream_id=0)66 absl::Status InitializeEncoder(
67 const iamf_tools_cli_proto::CodecConfig& codec_config_metadata,
68 const CodecConfigObu& codec_config, int num_channels,
69 std::unique_ptr<EncoderBase>& encoder, bool validate_codec_delay,
70 int substream_id = 0) {
71 switch (codec_config.GetCodecConfig().codec_id) {
72 using enum CodecConfig::CodecId;
73 case kCodecIdLpcm:
74 encoder = std::make_unique<LpcmEncoder>(codec_config, num_channels);
75 break;
76 case kCodecIdOpus:
77 encoder = std::make_unique<OpusEncoder>(
78 codec_config_metadata.decoder_config_opus().opus_encoder_metadata(),
79 codec_config, num_channels, substream_id);
80 break;
81 case kCodecIdAacLc:
82 encoder = std::make_unique<AacEncoder>(
83 codec_config_metadata.decoder_config_aac().aac_encoder_metadata(),
84 codec_config, num_channels);
85 break;
86 case kCodecIdFlac:
87 encoder = std::make_unique<FlacEncoder>(
88 codec_config_metadata.decoder_config_flac().flac_encoder_metadata(),
89 codec_config, num_channels);
90 break;
91 default:
92 return absl::InvalidArgumentError(absl::StrCat(
93 "Unknown codec_id= ", codec_config.GetCodecConfig().codec_id));
94 }
95 RETURN_IF_NOT_OK(encoder->Initialize(validate_codec_delay));
96 return absl::OkStatus();
97 }
98
99 // Gets data relevant to encoding (Codec Config OBU and AudioElementWithData)
100 // and initializes encoders.
GetEncodingDataAndInitializeEncoders(const absl::flat_hash_map<DecodedUleb128,iamf_tools_cli_proto::CodecConfig> codec_config_metadata,const AudioElementWithData & audio_element_with_data,absl::flat_hash_map<uint32_t,std::unique_ptr<EncoderBase>> & substream_id_to_encoder)101 absl::Status GetEncodingDataAndInitializeEncoders(
102 const absl::flat_hash_map<DecodedUleb128, iamf_tools_cli_proto::CodecConfig>
103 codec_config_metadata,
104 const AudioElementWithData& audio_element_with_data,
105 absl::flat_hash_map<uint32_t, std::unique_ptr<EncoderBase>>&
106 substream_id_to_encoder) {
107 for (const auto& [substream_id, labels] :
108 audio_element_with_data.substream_id_to_labels) {
109 const int num_channels = static_cast<int>(labels.size());
110 const CodecConfigObu& codec_config_obu =
111 *audio_element_with_data.codec_config;
112 auto codec_config_metadata_iter =
113 codec_config_metadata.find(codec_config_obu.GetCodecConfigId());
114 if (codec_config_metadata_iter == codec_config_metadata.end()) {
115 return absl::InvalidArgumentError(absl::StrCat(
116 "Failed to find codec config metadata for codec_config_id= ",
117 codec_config_obu.GetCodecConfigId()));
118 }
119
120 RETURN_IF_NOT_OK(InitializeEncoder(codec_config_metadata_iter->second,
121 codec_config_obu, num_channels,
122 substream_id_to_encoder[substream_id],
123 kValidateCodecDelay, substream_id));
124 }
125
126 return absl::OkStatus();
127 }
128
129 // Validates that the user requested number of samples to trim at start is
130 // enough to cover the delay that the encoder needs.
ValidateUserStartTrimIncludesCodecDelay(uint32_t user_samples_to_trim_at_start,uint32_t & encoder_required_samples_to_delay)131 absl::Status ValidateUserStartTrimIncludesCodecDelay(
132 uint32_t user_samples_to_trim_at_start,
133 uint32_t& encoder_required_samples_to_delay) {
134 // Return an error. But obey the user when
135 // `-DIGNORE_ERRORS_USE_ONLY_FOR_IAMF_TEST_SUITE` is set.
136 if (user_samples_to_trim_at_start < encoder_required_samples_to_delay) {
137 // Only pad up to what the user requests.
138 const auto message =
139 absl::StrCat("The encoder requires ", encoder_required_samples_to_delay,
140 " samples trimmed at the start but only ",
141 user_samples_to_trim_at_start, " were requested");
142 encoder_required_samples_to_delay = user_samples_to_trim_at_start;
143 return absl::InvalidArgumentError(message);
144 }
145
146 return absl::OkStatus();
147 }
148
GetNumSamplesToPadAtEndAndValidate(const uint32_t required_samples_to_pad_at_end,bool increment_samples_to_trim_at_end_by_padding,int64_t & user_samples_to_trim_at_end,uint32_t & num_samples_to_pad_at_end)149 absl::Status GetNumSamplesToPadAtEndAndValidate(
150 const uint32_t required_samples_to_pad_at_end,
151 bool increment_samples_to_trim_at_end_by_padding,
152 int64_t& user_samples_to_trim_at_end, uint32_t& num_samples_to_pad_at_end) {
153 if (increment_samples_to_trim_at_end_by_padding) {
154 // In this mode, the user's requested `samples_to_trim_at_end` represents
155 // the samples trimmed from the input data. Add in the virtual padded
156 // samples that the encoder will insert, to reflect the total number of
157 // samples which are trimmed in the OBU.
158 user_samples_to_trim_at_end += required_samples_to_pad_at_end;
159 }
160
161 num_samples_to_pad_at_end =
162 std::min(required_samples_to_pad_at_end,
163 static_cast<uint32_t>(user_samples_to_trim_at_end));
164 if (user_samples_to_trim_at_end < required_samples_to_pad_at_end) {
165 // Obey the user's request by setting `user_samples_to_trim_at_end`. But
166 // throw an error.
167 return absl::InvalidArgumentError(
168 absl::StrCat("User input requested ", user_samples_to_trim_at_end,
169 " trimmed samples. But ", required_samples_to_pad_at_end,
170 " samples are required to pad a full frame"));
171 }
172
173 return absl::OkStatus();
174 }
175
// Appends `num_samples_to_pad` all-zero time ticks, each `num_channels` wide,
// to the back of `samples`.
void PadSamples(const size_t num_samples_to_pad, const size_t num_channels,
                std::deque<std::vector<int32_t>>& samples) {
  const std::vector<int32_t> zero_tick(num_channels, 0);
  for (size_t i = 0; i < num_samples_to_pad; ++i) {
    samples.push_back(zero_tick);
  }
}
181
MoveSamples(const size_t num_samples,std::deque<std::vector<int32_t>> & source_samples,std::vector<std::vector<int32_t>> & destination_samples)182 void MoveSamples(const size_t num_samples,
183 std::deque<std::vector<int32_t>>& source_samples,
184 std::vector<std::vector<int32_t>>& destination_samples) {
185 CHECK_GE(source_samples.size(), num_samples);
186 std::copy(source_samples.begin(), source_samples.begin() + num_samples,
187 destination_samples.begin());
188 source_samples.erase(source_samples.begin(),
189 source_samples.begin() + num_samples);
190 }
191
// Creates one `SubstreamData` entry per substream and pre-pads its
// `samples_obu` queue with zero-valued "virtual" samples covering the delay
// the substream's encoder introduces at the start of the stream.
//
// Returns `InvalidArgumentError` if a substream has no encoder, or if the
// user's start trim does not cover the codec delay (the latter may be
// downgraded by `MAYBE_RETURN_IF_NOT_OK` — presumably under the
// `-DIGNORE_ERRORS_USE_ONLY_FOR_IAMF_TEST_SUITE` build mentioned in
// `ValidateUserStartTrimIncludesCodecDelay`; confirm in macros.h).
absl::Status InitializeSubstreamData(
    const SubstreamIdLabelsMap& substream_id_to_labels,
    const absl::flat_hash_map<uint32_t, std::unique_ptr<EncoderBase>>&
        substream_id_to_encoder,
    bool user_samples_to_trim_at_start_includes_codec_delay,
    const uint32_t user_samples_to_trim_at_start,
    absl::flat_hash_map<uint32_t, SubstreamData>&
        substream_id_to_substream_data) {
  // Validate user start trim is correct; it depends on the encoder. Insert
  // the "virtual samples" at the start up to the amount required by the codec
  // and encoder into the `samples_obu` queue. Trimming of additional optional
  // samples will occur later to keep trimming logic in one place as much as
  // possible.
  for (const auto& [substream_id, labels] : substream_id_to_labels) {
    const auto encoder_iter = substream_id_to_encoder.find(substream_id);
    if (encoder_iter == substream_id_to_encoder.end()) {
      return absl::InvalidArgumentError(absl::StrCat(
          "Failed to find encoder for substream ID= ", substream_id));
    }

    // Mutable: `ValidateUserStartTrimIncludesCodecDelay` clamps this down to
    // the user's request when the request is smaller than the codec delay.
    uint32_t encoder_required_samples_to_delay =
        encoder_iter->second->GetNumberOfSamplesToDelayAtStart();
    if (user_samples_to_trim_at_start_includes_codec_delay) {
      MAYBE_RETURN_IF_NOT_OK(ValidateUserStartTrimIncludesCodecDelay(
          user_samples_to_trim_at_start, encoder_required_samples_to_delay));
    }

    // Initialize a `SubstreamData` with virtual samples for any delay
    // introduced by the encoder. Aggregate order must match the field order
    // of `SubstreamData`.
    auto& substream_data_for_id = substream_id_to_substream_data[substream_id];
    substream_data_for_id = {
        substream_id,
        /*samples_obu=*/{},
        /*samples_encode=*/{},
        /*output_gains_linear=*/{},
        /*num_samples_to_trim_at_end=*/0,
        /*num_samples_to_trim_at_start=*/encoder_required_samples_to_delay};

    PadSamples(encoder_required_samples_to_delay, labels.size(),
               substream_data_for_id.samples_obu);
  }

  return absl::OkStatus();
}
236
237 // An audio element may contain many channels, denoted by their labels;
238 // this function returns whether all labels have their (same amount of)
239 // samples ready.
SamplesReadyForAudioElement(const LabelSamplesMap & label_to_samples,const absl::flat_hash_set<ChannelLabel::Label> & channel_labels_for_audio_element)240 bool SamplesReadyForAudioElement(const LabelSamplesMap& label_to_samples,
241 const absl::flat_hash_set<ChannelLabel::Label>&
242 channel_labels_for_audio_element) {
243 std::optional<size_t> common_num_samples;
244 for (const auto& label : channel_labels_for_audio_element) {
245 const auto label_to_samples_iter = label_to_samples.find(label);
246 if (label_to_samples_iter == label_to_samples.end()) {
247 return false;
248 }
249
250 const auto num_samples = label_to_samples_iter->second.size();
251 if (!common_num_samples.has_value()) {
252 common_num_samples = num_samples;
253 }
254
255 if (num_samples != *common_num_samples) {
256 return false;
257 }
258 }
259
260 return true;
261 }
262
DownMixSamples(const DecodedUleb128 audio_element_id,const DemixingModule & demixing_module,LabelSamplesMap & label_to_samples,ParametersManager & parameters_manager,absl::flat_hash_map<uint32_t,SubstreamData> & substream_id_to_substream_data,DownMixingParams & down_mixing_params)263 absl::Status DownMixSamples(const DecodedUleb128 audio_element_id,
264 const DemixingModule& demixing_module,
265 LabelSamplesMap& label_to_samples,
266 ParametersManager& parameters_manager,
267 absl::flat_hash_map<uint32_t, SubstreamData>&
268 substream_id_to_substream_data,
269 DownMixingParams& down_mixing_params) {
270 RETURN_IF_NOT_OK(parameters_manager.GetDownMixingParameters(
271 audio_element_id, down_mixing_params));
272 LOG_FIRST_N(INFO, 10) << "Using alpha=" << down_mixing_params.alpha
273 << " beta=" << down_mixing_params.beta
274 << " gamma=" << down_mixing_params.gamma
275 << " delta=" << down_mixing_params.delta
276 << " w_idx_offset=" << down_mixing_params.w_idx_offset
277 << " w_idx_used=" << down_mixing_params.w_idx_used
278 << " w=" << down_mixing_params.w;
279
280 // Down-mix OBU-aligned samples from input channels to substreams. May
281 // generate intermediate channels (e.g. L3 on the way of down-mixing L7 to L2)
282 // and expand `label_to_samples`.
283 RETURN_IF_NOT_OK(demixing_module.DownMixSamplesToSubstreams(
284 audio_element_id, down_mixing_params, label_to_samples,
285 substream_id_to_substream_data));
286
287 return absl::OkStatus();
288 }
289
// Gets the next frame of samples for all streams, either from "real" samples
// read from a file or from padding.
//
// Down-mixes any newly arrived samples into the per-substream queues, then
// zero-pads short frames (charging the padding against the user's end-trim
// budget) so every substream can emit a full frame.
absl::Status GetNextFrameSubstreamData(
    const DecodedUleb128 audio_element_id,
    const DemixingModule& demixing_module, const size_t num_samples_per_frame,
    const SubstreamIdLabelsMap& substream_id_to_labels,
    absl::flat_hash_map<uint32_t, AudioFrameGenerator::TrimmingState>&
        substream_id_to_trimming_state,
    LabelSamplesMap& label_to_samples, ParametersManager& parameters_manager,
    absl::flat_hash_map<uint32_t, SubstreamData>&
        substream_id_to_substream_data,
    DownMixingParams& down_mixing_params) {
  // True when no input channel contributed any new samples in this call.
  const bool no_sample_added =
      (label_to_samples.empty() ||
       std::all_of(label_to_samples.begin(), label_to_samples.end(),
                   [](const auto& entry) { return entry.second.empty(); }));
  // Nothing new arrived and every substream's OBU queue is drained: no work.
  if (no_sample_added &&
      (substream_id_to_substream_data.empty() ||
       std::all_of(substream_id_to_substream_data.begin(),
                   substream_id_to_substream_data.end(), [](const auto& entry) {
                     return entry.second.samples_obu.empty();
                   }))) {
    return absl::OkStatus();
  }

  RETURN_IF_NOT_OK(DownMixSamples(
      audio_element_id, demixing_module, label_to_samples, parameters_manager,
      substream_id_to_substream_data, down_mixing_params));

  // Padding.
  for (const auto& [substream_id, unused_labels] : substream_id_to_labels) {
    auto& substream_data = substream_id_to_substream_data.at(substream_id);
    const int num_channels = substream_data.samples_obu.front().size();
    if (substream_data.samples_obu.size() < num_samples_per_frame) {
      // Partial (final) frame: pad both queues up to a full frame, bounded by
      // the user's remaining end-trim budget.
      uint32_t num_samples_to_pad_at_end;
      auto& trimming_state = substream_id_to_trimming_state.at(substream_id);
      RETURN_IF_NOT_OK(GetNumSamplesToPadAtEndAndValidate(
          num_samples_per_frame - substream_data.samples_obu.size(),
          trimming_state.increment_samples_to_trim_at_end_by_padding,
          trimming_state.user_samples_left_to_trim_at_end,
          num_samples_to_pad_at_end));

      PadSamples(num_samples_to_pad_at_end, num_channels,
                 substream_data.samples_obu);
      PadSamples(num_samples_to_pad_at_end, num_channels,
                 substream_data.samples_encode);

      // Record the number of padded samples to be trimmed later.
      substream_data.num_samples_to_trim_at_end = num_samples_to_pad_at_end;
    }

    if (no_sample_added &&
        substream_data.samples_encode.size() < num_samples_per_frame) {
      const uint32_t num_samples_to_pad =
          num_samples_per_frame - substream_data.samples_encode.size();

      // It's possible to be in this state for the final frame when there
      // are multiple padded frames at the start. Extra virtual samples
      // need to be added. These samples will be "left in" the decoder
      // after all OBUs are processed, but they should not count as being
      // trimmed.
      PadSamples(num_samples_to_pad, num_channels,
                 substream_data.samples_encode);
    }
  }

  return absl::OkStatus();
}
358
// Take as many samples as possible out of the total number of samples to
// trim, up to the size of a full frame. Decrements both remaining-trim
// counters in place and returns {trimmed_at_start, trimmed_at_end} for this
// frame.
std::pair<uint32_t, uint32_t> GetNumSamplesToTrimForFrame(
    const uint32_t num_samples_in_frame, uint32_t& num_samples_to_trim_at_start,
    uint32_t& num_samples_to_trim_at_end) {
  const uint32_t trimmed_from_start =
      std::min(num_samples_in_frame, num_samples_to_trim_at_start);
  const uint32_t trimmed_from_end =
      std::min(num_samples_in_frame, num_samples_to_trim_at_end);
  num_samples_to_trim_at_start -= trimmed_from_start;
  num_samples_to_trim_at_end -= trimmed_from_end;

  return {trimmed_from_start, trimmed_from_end};
}
374
// Encode frames for an audio element if samples are ready.
//
// When every channel of the audio element has the same number of buffered
// samples, repeatedly: down-mixes/pads one frame's worth of samples per
// substream, pops full frames off the queues, stamps them with timestamps
// and trimming info, and hands them to the per-substream encoders. Loops
// until a frame is encoded or the queues drain.
absl::Status MaybeEncodeFramesForAudioElement(
    const DecodedUleb128 audio_element_id,
    const AudioElementWithData& audio_element_with_data,
    const DemixingModule& demixing_module,
    const absl::flat_hash_set<ChannelLabel::Label>&
        channel_labels_for_audio_element,
    LabelSamplesMap& label_to_samples,
    absl::flat_hash_map<uint32_t, AudioFrameGenerator::TrimmingState>&
        substream_id_to_trimming_state,
    ParametersManager& parameters_manager,
    absl::flat_hash_map<uint32_t, std::unique_ptr<EncoderBase>>&
        substream_id_to_encoder,
    absl::flat_hash_map<uint32_t, SubstreamData>&
        substream_id_to_substream_data,
    GlobalTimingModule& global_timing_module) {
  if (!SamplesReadyForAudioElement(label_to_samples,
                                   channel_labels_for_audio_element)) {
    // Waiting for more samples belonging to the same audio element; return
    // for now.
    return absl::OkStatus();
  }

  const CodecConfigObu& codec_config = *audio_element_with_data.codec_config;

  // Get some common information about this stream.
  const size_t num_samples_per_frame =
      static_cast<size_t>(codec_config.GetNumSamplesPerFrame());
  // TODO(b/310906409): Lossy codecs do not use PCM for internal
  //                    representation of data. We may need to measure loudness
  //                    at a different bit-depth than the input when AAC is
  //                    updated to support higher bit-depths.
  const int encoder_input_pcm_bit_depth =
      static_cast<int>(codec_config.GetBitDepthToMeasureLoudness());

  const uint32_t encoder_input_sample_rate = codec_config.GetInputSampleRate();
  const uint32_t decoder_output_sample_rate =
      codec_config.GetOutputSampleRate();
  if (encoder_input_sample_rate != decoder_output_sample_rate) {
    // Prevent cases where resampling would occur. This allows later code to
    // simplify assumptions when considering the number of samples in a frame or
    // the trimming information.
    return absl::InvalidArgumentError(absl::StrCat(
        "Input sample rate and output sample rate differ: (",
        encoder_input_sample_rate, " vs ", decoder_output_sample_rate, ")"));
  }

  DownMixingParams down_mixing_params;

  // Save a dummy label-to-empty samples map. This is used when automatically
  // padding zero samples at the end of a frame.
  LabelSamplesMap label_to_empty_samples;
  for (const auto& [label, unused_samples] : label_to_samples) {
    label_to_empty_samples[label] = {};
  }

  // Set once any substream's frame for this timestamp is encoded; all
  // substreams of the audio element must share the same start timestamp.
  std::optional<int32_t> encoded_timestamp;
  bool more_samples_to_encode = false;
  do {
    RETURN_IF_NOT_OK(GetNextFrameSubstreamData(
        audio_element_id, demixing_module, num_samples_per_frame,
        audio_element_with_data.substream_id_to_labels,
        substream_id_to_trimming_state, label_to_samples, parameters_manager,
        substream_id_to_substream_data, down_mixing_params));

    more_samples_to_encode = false;
    for (const auto& [substream_id, labels] :
         audio_element_with_data.substream_id_to_labels) {
      auto substream_data_iter =
          substream_id_to_substream_data.find(substream_id);
      if (substream_data_iter == substream_id_to_substream_data.end()) {
        // This substream has ended; that is only valid if no earlier
        // substream in this pass still had samples pending.
        if (more_samples_to_encode) {
          return absl::InvalidArgumentError(
              absl::StrCat("Within Audio Element ID= ", audio_element_id,
                           ", substream #", substream_id, " has ended but ",
                           " some other substreams have more samples to come"));
        }
        continue;
      }
      auto& substream_data = substream_data_iter->second;
      if (substream_data.samples_obu.empty()) {
        // It's possible the user signalled to flush the stream, but it was
        // already aligned. OK, there is nothing else to do.
        continue;
      }

      more_samples_to_encode = true;

      // Encode.
      if (substream_data.samples_encode.size() < num_samples_per_frame) {
        // Wait until there is a whole frame of samples to encode.
        LOG(INFO) << "Waiting for complete frame; samples_obu.size()="
                  << substream_data.samples_obu.size()
                  << " samples_encode.size()= "
                  << substream_data.samples_encode.size();

        // All frames corresponding to the same Audio Element should be skipped.
        CHECK(!encoded_timestamp.has_value());
        continue;
      }

      // Pop samples from the queues and arrange in (time, channel) axes.
      const size_t num_samples_to_encode =
          static_cast<size_t>(num_samples_per_frame);
      std::vector<std::vector<int32_t>> samples_encode(num_samples_to_encode);
      std::vector<std::vector<int32_t>> samples_obu(num_samples_to_encode);

      MoveSamples(num_samples_to_encode, substream_data.samples_obu,
                  samples_obu);
      MoveSamples(num_samples_to_encode, substream_data.samples_encode,
                  samples_encode);
      const auto [frame_samples_to_trim_at_start,
                  frame_samples_to_trim_at_end] =
          GetNumSamplesToTrimForFrame(
              num_samples_to_encode,
              substream_data.num_samples_to_trim_at_start,
              substream_data.num_samples_to_trim_at_end);

      // Both timestamps cover trimmed and regular samples.
      InternalTimestamp start_timestamp;
      InternalTimestamp end_timestamp;
      RETURN_IF_NOT_OK(global_timing_module.GetNextAudioFrameTimestamps(
          substream_id, samples_obu.size(), start_timestamp, end_timestamp));

      if (encoded_timestamp.has_value()) {
        // All frames corresponding to the same Audio Element should have
        // the same start timestamp.
        CHECK_EQ(*encoded_timestamp, start_timestamp);
      }

      auto partial_audio_frame_with_data =
          absl::WrapUnique(new AudioFrameWithData{
              .obu = AudioFrameObu(
                  {
                      .obu_trimming_status_flag =
                          (frame_samples_to_trim_at_end != 0 ||
                           frame_samples_to_trim_at_start != 0),
                      .num_samples_to_trim_at_end =
                          frame_samples_to_trim_at_end,
                      .num_samples_to_trim_at_start =
                          frame_samples_to_trim_at_start,
                  },
                  substream_id, {}),
              .start_timestamp = start_timestamp,
              .end_timestamp = end_timestamp,
              .pcm_samples = samples_obu,
              .down_mixing_params = down_mixing_params,
              .recon_gain_info_parameter_data = ReconGainInfoParameterData(),
              .audio_element_with_data = &audio_element_with_data});

      RETURN_IF_NOT_OK(
          substream_id_to_encoder.at(substream_id)
              ->EncodeAudioFrame(encoder_input_pcm_bit_depth, samples_encode,
                                 std::move(partial_audio_frame_with_data)));
      encoded_timestamp = start_timestamp;
    }

    // Clears the samples for the next iteration.
    label_to_samples = label_to_empty_samples;
  } while (!encoded_timestamp.has_value() && more_samples_to_encode);

  if (encoded_timestamp.has_value()) {
    // All audio frames corresponding to the audio element have been encoded;
    // update the parameter manager to use the next frame of parameters.
    RETURN_IF_NOT_OK(parameters_manager.UpdateDemixingState(
        audio_element_id, *encoded_timestamp + num_samples_per_frame));
  }

  return absl::OkStatus();
}
545
546 // Validates that all substreams share the same trimming information.
ValidateSubstreamsShareTrimming(const iamf_tools_cli_proto::AudioFrameObuMetadata & audio_frame_metadata,bool common_samples_to_trim_at_end_includes_padding,bool common_samples_to_trim_at_start_includes_codec_delay,int64_t common_samples_to_trim_at_start,int64_t common_samples_to_trim_at_end)547 absl::Status ValidateSubstreamsShareTrimming(
548 const iamf_tools_cli_proto::AudioFrameObuMetadata& audio_frame_metadata,
549 bool common_samples_to_trim_at_end_includes_padding,
550 bool common_samples_to_trim_at_start_includes_codec_delay,
551 int64_t common_samples_to_trim_at_start,
552 int64_t common_samples_to_trim_at_end) {
553 if (audio_frame_metadata.samples_to_trim_at_end() !=
554 common_samples_to_trim_at_end ||
555 audio_frame_metadata.samples_to_trim_at_start() !=
556 common_samples_to_trim_at_start ||
557 audio_frame_metadata.samples_to_trim_at_end_includes_padding() !=
558 common_samples_to_trim_at_end_includes_padding ||
559 audio_frame_metadata.samples_to_trim_at_start_includes_codec_delay() !=
560 common_samples_to_trim_at_start_includes_codec_delay) {
561 return absl::InvalidArgumentError(
562 "Expected all substreams to have the same trimming information");
563 }
564
565 return absl::OkStatus();
566 }
567
568 // Applies additional user trimming to one audio frame.
ApplyUserTrimForFrame(const bool from_start,const uint32_t num_samples_in_frame,int64_t & user_trim_left,uint32_t & num_samples_trimmed_in_obu,bool & obu_trimming_status_flag)569 absl::Status ApplyUserTrimForFrame(const bool from_start,
570 const uint32_t num_samples_in_frame,
571 int64_t& user_trim_left,
572 uint32_t& num_samples_trimmed_in_obu,
573 bool& obu_trimming_status_flag) {
574 // Trim as many samples as the user requested. Up to the size of a full frame.
575 const uint32_t frame_samples_to_trim =
576 std::min(static_cast<uint32_t>(num_samples_in_frame),
577 static_cast<uint32_t>(user_trim_left));
578
579 const std::string start_or_end_string = (from_start ? "start" : "end");
580
581 // Some samples may already be trimmed due to prior processing, validate
582 // that the user requested enough samples to accommodate them.
583 if (num_samples_trimmed_in_obu > frame_samples_to_trim) {
584 return absl::InvalidArgumentError(
585 absl::StrCat("More samples were trimmed from the ", start_or_end_string,
586 "than expected: (", num_samples_trimmed_in_obu, " vs ",
587 frame_samples_to_trim, ")"));
588 }
589
590 // Apply the trim for this frame.
591 num_samples_trimmed_in_obu = frame_samples_to_trim;
592 user_trim_left -= frame_samples_to_trim;
593
594 // Ensure the `obu_trimming_status_flag` is accurate.
595 if (num_samples_trimmed_in_obu != 0) {
596 obu_trimming_status_flag = true;
597 }
598
599 if (user_trim_left > 0 && !from_start) {
600 // Automatic padding, plus user requested trim, exceeds the size of a frame.
601 return absl::InvalidArgumentError(
602 "The spec disallows trimming multiple frames from the end.");
603 }
604
605 return absl::OkStatus();
606 }
607
608 // Apply user requested from the end to the input Audio Frames. The requested
609 // trim must be at least the amount that was needed to cover the
610 // padding in the final audio frame. Then the rest will be applied to
611 // consecutive OBUs from the end without modifying the underlying data.
ValidateAndApplyUserTrimming(const bool is_last_frame,AudioFrameGenerator::TrimmingState & trimming_state,AudioFrameWithData & audio_frame)612 absl::Status ValidateAndApplyUserTrimming(
613 const bool is_last_frame,
614 AudioFrameGenerator::TrimmingState& trimming_state,
615 AudioFrameWithData& audio_frame) {
616 CHECK_NE(audio_frame.audio_element_with_data, nullptr);
617 CHECK_NE(audio_frame.audio_element_with_data->codec_config, nullptr);
618 const uint32_t num_samples_in_frame =
619 audio_frame.audio_element_with_data->codec_config
620 ->GetNumSamplesPerFrame();
621
622 RETURN_IF_NOT_OK(ApplyUserTrimForFrame(
623 /*from_start=*/true, num_samples_in_frame,
624 trimming_state.user_samples_left_to_trim_at_start,
625 audio_frame.obu.header_.num_samples_to_trim_at_start,
626 audio_frame.obu.header_.obu_trimming_status_flag));
627
628 if (is_last_frame) {
629 RETURN_IF_NOT_OK(ApplyUserTrimForFrame(
630 /*from_start=*/false, num_samples_in_frame,
631 trimming_state.user_samples_left_to_trim_at_end,
632 audio_frame.obu.header_.num_samples_to_trim_at_end,
633 audio_frame.obu.header_.obu_trimming_status_flag));
634 }
635
636 return absl::OkStatus();
637 }
638
639 } // namespace
640
AudioFrameGenerator(const::google::protobuf::RepeatedPtrField<iamf_tools_cli_proto::AudioFrameObuMetadata> & audio_frame_metadata,const::google::protobuf::RepeatedPtrField<iamf_tools_cli_proto::CodecConfigObuMetadata> & codec_config_metadata,const absl::flat_hash_map<DecodedUleb128,AudioElementWithData> & audio_elements,const DemixingModule & demixing_module,ParametersManager & parameters_manager,GlobalTimingModule & global_timing_module)641 AudioFrameGenerator::AudioFrameGenerator(
642 const ::google::protobuf::RepeatedPtrField<
643 iamf_tools_cli_proto::AudioFrameObuMetadata>& audio_frame_metadata,
644 const ::google::protobuf::RepeatedPtrField<
645 iamf_tools_cli_proto::CodecConfigObuMetadata>& codec_config_metadata,
646 const absl::flat_hash_map<DecodedUleb128, AudioElementWithData>&
647 audio_elements,
648 const DemixingModule& demixing_module,
649 ParametersManager& parameters_manager,
650 GlobalTimingModule& global_timing_module)
651 : audio_elements_(audio_elements),
652 demixing_module_(demixing_module),
653 parameters_manager_(parameters_manager),
654 global_timing_module_(global_timing_module),
655 // Set to a state NOT taking samples at first; may be changed to
656 // `kTakingSamples` once `Initialize()` is called.
657 state_(kFlushingRemaining) {
658 for (const auto& audio_frame_obu_metadata : audio_frame_metadata) {
659 audio_frame_metadata_[audio_frame_obu_metadata.audio_element_id()] =
660 audio_frame_obu_metadata;
661 }
662
663 for (const auto& codec_config_obu_metadata : codec_config_metadata) {
664 codec_config_metadata_[codec_config_obu_metadata.codec_config_id()] =
665 codec_config_obu_metadata.codec_config();
666 }
667 }
668
GetNumberOfSamplesToDelayAtStart(const iamf_tools_cli_proto::CodecConfig & codec_config_metadata,const CodecConfigObu & codec_config)669 absl::StatusOr<uint32_t> AudioFrameGenerator::GetNumberOfSamplesToDelayAtStart(
670 const iamf_tools_cli_proto::CodecConfig& codec_config_metadata,
671 const CodecConfigObu& codec_config) {
672 // This function is useful when querying what the codec delay should be. We
673 // don't want it to fail if the user-provided codec delay is wrong.
674 constexpr bool kDontValidateCodecDelay = false;
675
676 std::unique_ptr<EncoderBase> encoder;
677 RETURN_IF_NOT_OK(InitializeEncoder(codec_config_metadata, codec_config,
678 /*num_channels=*/1, encoder,
679 kDontValidateCodecDelay));
680 if (encoder == nullptr) {
681 return absl::InvalidArgumentError("Failed to initialize encoder");
682 }
683 return encoder->GetNumberOfSamplesToDelayAtStart();
684 }
685
// Prepares the generator for sample ingestion: resolves channel labels,
// creates one encoder per substream, validates down-mixing prerequisites,
// and records per-substream trimming state. On success (with at least one
// substream) transitions the generator into the `kTakingSamples` state.
absl::Status AudioFrameGenerator::Initialize() {
  absl::MutexLock lock(&mutex_);
  // No metadata means nothing to generate; remain in the initial
  // `kFlushingRemaining` state.
  if (audio_frame_metadata_.empty()) {
    return absl::OkStatus();
  }
  // Trimming is required to be identical across all Audio Elements. Take the
  // first entry as the common reference; every other entry is validated
  // against it below via `ValidateSubstreamsShareTrimming`.
  const auto& first_audio_frame_metadata =
      audio_frame_metadata_.begin()->second;
  const int64_t common_samples_to_trim_at_start = static_cast<int64_t>(
      first_audio_frame_metadata.samples_to_trim_at_start());
  const int64_t common_samples_to_trim_at_end =
      static_cast<int64_t>(first_audio_frame_metadata.samples_to_trim_at_end());
  const bool common_samples_to_trim_at_end_includes_padding =
      first_audio_frame_metadata.samples_to_trim_at_end_includes_padding();
  const bool common_samples_to_trim_at_start_includes_codec_delay =
      first_audio_frame_metadata
          .samples_to_trim_at_start_includes_codec_delay();

  for (const auto& [audio_element_id, audio_frame_metadata] :
       audio_frame_metadata_) {
    // Precompute the `ChannelLabel::Label` for each channel label string.
    RETURN_IF_NOT_OK(ChannelLabelUtils::SelectConvertAndFillLabels(
        audio_frame_metadata, audio_element_id_to_labels_[audio_element_id]));

    // Find the Codec Config OBU for this mono or coupled stereo substream.
    const auto audio_elements_iter = audio_elements_.find(audio_element_id);
    if (audio_elements_iter == audio_elements_.end()) {
      return absl::InvalidArgumentError(absl::StrCat(
          "Audio Element with ID= ", audio_element_id, " not found"));
    }

    // Create an encoder for each substream.
    const AudioElementWithData& audio_element_with_data =
        audio_elements_iter->second;
    // Trimming more than one frame's worth of samples from the end is
    // disallowed by the spec; reject it up front.
    if (audio_frame_metadata.samples_to_trim_at_end() >
        audio_element_with_data.codec_config->GetNumSamplesPerFrame()) {
      return absl::InvalidArgumentError(
          "The spec disallows trimming multiple frames from the end.");
    }
    RETURN_IF_NOT_OK(GetEncodingDataAndInitializeEncoders(
        codec_config_metadata_, audio_element_with_data,
        substream_id_to_encoder_));

    // Intermediate data for all substreams belonging to an Audio Element.
    RETURN_IF_NOT_OK(InitializeSubstreamData(
        audio_element_with_data.substream_id_to_labels,
        substream_id_to_encoder_,
        audio_frame_metadata.samples_to_trim_at_start_includes_codec_delay(),
        audio_frame_metadata.samples_to_trim_at_start(),
        substream_id_to_substream_data_));

    // Validate that a `DemixingParamDefinition` is available if down-mixing
    // is needed.
    const std::list<Demixer>* down_mixers = nullptr;
    RETURN_IF_NOT_OK(
        demixing_module_.GetDownMixers(audio_element_id, down_mixers));
    if (!parameters_manager_.DemixingParamDefinitionAvailable(
            audio_element_id) &&
        !down_mixers->empty()) {
      return absl::InvalidArgumentError(
          "Must include `DemixingParamDefinition` in the Audio Element if "
          "down-mixers are required to produce audio substreams");
    }

    // Validate the assumption that trimming is the same for all substreams.
    RETURN_IF_NOT_OK(ValidateSubstreamsShareTrimming(
        audio_frame_metadata, common_samples_to_trim_at_end_includes_padding,
        common_samples_to_trim_at_start_includes_codec_delay,
        common_samples_to_trim_at_start, common_samples_to_trim_at_end));

    // Populate the map of trimming states with all substream ID.
    for (const auto& [substream_id, labels] :
         audio_element_with_data.substream_id_to_labels) {
      // Add in the codec delay when it was not included in the user input.
      // NOTE: relies on `substream_id_to_encoder_` already containing this
      // substream, which `GetEncodingDataAndInitializeEncoders` above ensures.
      const int64_t additional_samples_to_trim_at_start =
          common_samples_to_trim_at_start_includes_codec_delay
              ? 0
              : substream_id_to_encoder_[substream_id]
                    ->GetNumberOfSamplesToDelayAtStart();
      substream_id_to_trimming_state_[substream_id] = {
          .increment_samples_to_trim_at_end_by_padding =
              !audio_frame_metadata.samples_to_trim_at_end_includes_padding(),
          .user_samples_left_to_trim_at_end = common_samples_to_trim_at_end,
          .user_samples_left_to_trim_at_start =
              common_samples_to_trim_at_start +
              additional_samples_to_trim_at_start,
      };
    }
  }

  // If `substream_id_to_substream_data_` is not empty, meaning this generator
  // is expecting audio substreams and is ready to take audio samples.
  if (!substream_id_to_substream_data_.empty()) {
    state_ = kTakingSamples;
  }

  return absl::OkStatus();
}
783
TakingSamples() const784 bool AudioFrameGenerator::TakingSamples() const {
785 absl::MutexLock lock(&mutex_);
786 return (state_ == kTakingSamples);
787 }
788
AddSamples(const DecodedUleb128 audio_element_id,ChannelLabel::Label label,absl::Span<const InternalSampleType> samples)789 absl::Status AudioFrameGenerator::AddSamples(
790 const DecodedUleb128 audio_element_id, ChannelLabel::Label label,
791 absl::Span<const InternalSampleType> samples) {
792 absl::MutexLock lock(&mutex_);
793 if (state_ != kTakingSamples) {
794 LOG_FIRST_N(WARNING, 3)
795 << "Calling `AddSamples()` after `Finalize()` has no effect.";
796 return absl::OkStatus();
797 }
798
799 if (samples.empty()) {
800 return absl::InvalidArgumentError(
801 absl::StrCat("Adding emptry frames is not allowed before `Finalize()` ",
802 "has been called. audio_element_id= ", audio_element_id));
803 }
804
805 const auto& audio_element_labels_iter =
806 audio_element_id_to_labels_.find(audio_element_id);
807 if (audio_element_labels_iter == audio_element_id_to_labels_.end()) {
808 return absl::InvalidArgumentError(
809 absl::StrCat("No audio frame metadata found for Audio Element ID= ",
810 audio_element_id));
811 }
812
813 auto& labeled_samples = id_to_labeled_samples_[audio_element_id];
814 labeled_samples[label] =
815 std::vector<InternalSampleType>(samples.begin(), samples.end());
816
817 const auto audio_element_iter = audio_elements_.find(audio_element_id);
818 if (audio_element_iter == audio_elements_.end()) {
819 return absl::InvalidArgumentError(
820 absl::StrCat("No Audio Element found for ID= ", audio_element_id));
821 }
822 const auto& audio_element_with_data = audio_element_iter->second;
823
824 RETURN_IF_NOT_OK(MaybeEncodeFramesForAudioElement(
825 audio_element_id, audio_element_with_data, demixing_module_,
826 audio_element_labels_iter->second, labeled_samples,
827 substream_id_to_trimming_state_, parameters_manager_,
828 substream_id_to_encoder_, substream_id_to_substream_data_,
829 global_timing_module_));
830
831 return absl::OkStatus();
832 }
833
Finalize()834 absl::Status AudioFrameGenerator::Finalize() {
835 absl::MutexLock lock(&mutex_);
836 if (state_ == kTakingSamples) {
837 state_ = kFinalizedCalled;
838 }
839
840 return absl::OkStatus();
841 }
842
GeneratingFrames() const843 bool AudioFrameGenerator::GeneratingFrames() const {
844 absl::MutexLock lock(&mutex_);
845 return !substream_id_to_encoder_.empty();
846 }
847
// Drains encoded audio frames into `audio_frames`, applying user trimming to
// each popped frame. Drives the post-`Finalize()` state machine: the first
// call after `Finalize()` advances `kFinalizedCalled` -> `kFlushingRemaining`;
// subsequent calls flush queued samples one frame at a time, finalize each
// encoder once its substream data is exhausted, and erase finished encoders
// (which eventually makes `GeneratingFrames()` return false).
absl::Status AudioFrameGenerator::OutputFrames(
    std::list<AudioFrameWithData>& audio_frames) {
  absl::MutexLock lock(&mutex_);

  if (state_ == kFlushingRemaining) {
    // In this state, there might be some remaining samples queued in the
    // encoders waiting to be encoded; continue to encode them one frame at a
    // time.
    for (const auto& [audio_element_id, audio_element_with_data] :
         audio_elements_) {
      RETURN_IF_NOT_OK(MaybeEncodeFramesForAudioElement(
          audio_element_id, audio_element_with_data, demixing_module_,
          audio_element_id_to_labels_.at(audio_element_id),
          id_to_labeled_samples_[audio_element_id],
          substream_id_to_trimming_state_, parameters_manager_,
          substream_id_to_encoder_, substream_id_to_substream_data_,
          global_timing_module_));
    }
  } else if (state_ == kFinalizedCalled) {
    // The `Finalize()` has just been called, advance the state so that the
    // remaining samples will be encoded in the next iteration.
    state_ = kFlushingRemaining;
  }

  // Pop encoded audio frames from encoders.
  for (auto substream_id_to_encoder_iter = substream_id_to_encoder_.begin();
       substream_id_to_encoder_iter != substream_id_to_encoder_.end();) {
    auto& [substream_id, encoder] = *substream_id_to_encoder_iter;

    // Remove the substream data when the generator is in the
    // `kFlushingRemaining` state and the encoder can be finalized.
    if (state_ == kFlushingRemaining) {
      auto substream_data_iter =
          substream_id_to_substream_data_.find(substream_id);
      if (substream_data_iter != substream_id_to_substream_data_.end() &&
          substream_data_iter->second.samples_obu.empty()) {
        RETURN_IF_NOT_OK(encoder->Finalize());
        substream_id_to_substream_data_.erase(substream_data_iter);
      }
    }

    if (encoder->FramesAvailable()) {
      // `Pop()` appends to `audio_frames`, so the frame to trim is always
      // `audio_frames.back()`.
      RETURN_IF_NOT_OK(encoder->Pop(audio_frames));
      RETURN_IF_NOT_OK(ValidateAndApplyUserTrimming(
          /*is_last_frame=*/encoder->Finished(),
          substream_id_to_trimming_state_.at(substream_id),
          audio_frames.back()));
    }

    // Remove finished encoder or advance the iterator.
    // Post-increment before erase so the iterator stays valid: only the
    // erased element's iterator is invalidated in an absl::flat_hash_map.
    if (encoder->Finished()) {
      substream_id_to_encoder_.erase(substream_id_to_encoder_iter++);
    } else {
      ++substream_id_to_encoder_iter;
    }
  }

  return absl::OkStatus();
}
907
908 } // namespace iamf_tools
909