• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2023, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 3-Clause Clear License
5  * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
6  * License was not distributed with this source code in the LICENSE file, you
7  * can obtain it at www.aomedia.org/license/software-license/bsd-3-c-c. If the
8  * Alliance for Open Media Patent License 1.0 was not distributed with this
9  * source code in the PATENTS file, you can obtain it at
10  * www.aomedia.org/license/patent.
11  */
12 #include "iamf/cli/proto_conversion/proto_to_obu/audio_frame_generator.h"
13 
14 #include <algorithm>
15 #include <cstddef>
16 #include <cstdint>
17 #include <cstdio>
18 #include <cstdlib>
19 #include <cstring>
20 #include <deque>
21 #include <list>
22 #include <memory>
23 #include <optional>
24 #include <string>
25 #include <utility>
26 #include <vector>
27 
28 #include "absl/container/flat_hash_map.h"
29 #include "absl/container/flat_hash_set.h"
30 #include "absl/log/check.h"
31 #include "absl/log/log.h"
32 #include "absl/memory/memory.h"
33 #include "absl/status/status.h"
34 #include "absl/strings/str_cat.h"
35 #include "absl/synchronization/mutex.h"
36 #include "absl/types/span.h"
37 #include "iamf/cli/audio_element_with_data.h"
38 #include "iamf/cli/audio_frame_with_data.h"
39 #include "iamf/cli/channel_label.h"
40 #include "iamf/cli/codec/aac_encoder.h"
41 #include "iamf/cli/codec/encoder_base.h"
42 #include "iamf/cli/codec/flac_encoder.h"
43 #include "iamf/cli/codec/lpcm_encoder.h"
44 #include "iamf/cli/codec/opus_encoder.h"
45 #include "iamf/cli/demixing_module.h"
46 #include "iamf/cli/global_timing_module.h"
47 #include "iamf/cli/parameters_manager.h"
48 #include "iamf/cli/proto/audio_frame.pb.h"
49 #include "iamf/cli/proto/codec_config.pb.h"
50 #include "iamf/cli/proto/test_vector_metadata.pb.h"
51 #include "iamf/cli/proto_conversion/channel_label_utils.h"
52 #include "iamf/common/utils/macros.h"
53 #include "iamf/obu/audio_frame.h"
54 #include "iamf/obu/codec_config.h"
55 #include "iamf/obu/demixing_info_parameter_data.h"
56 #include "iamf/obu/recon_gain_info_parameter_data.h"
57 #include "iamf/obu/types.h"
58 #include "src/google/protobuf/repeated_ptr_field.h"
59 
60 namespace iamf_tools {
61 
62 namespace {
63 
// Always validate that the user's start trim covers the encoder's codec delay
// when initializing encoders in this translation unit.
constexpr bool kValidateCodecDelay = true;
65 
InitializeEncoder(const iamf_tools_cli_proto::CodecConfig & codec_config_metadata,const CodecConfigObu & codec_config,int num_channels,std::unique_ptr<EncoderBase> & encoder,bool validate_codec_delay,int substream_id=0)66 absl::Status InitializeEncoder(
67     const iamf_tools_cli_proto::CodecConfig& codec_config_metadata,
68     const CodecConfigObu& codec_config, int num_channels,
69     std::unique_ptr<EncoderBase>& encoder, bool validate_codec_delay,
70     int substream_id = 0) {
71   switch (codec_config.GetCodecConfig().codec_id) {
72     using enum CodecConfig::CodecId;
73     case kCodecIdLpcm:
74       encoder = std::make_unique<LpcmEncoder>(codec_config, num_channels);
75       break;
76     case kCodecIdOpus:
77       encoder = std::make_unique<OpusEncoder>(
78           codec_config_metadata.decoder_config_opus().opus_encoder_metadata(),
79           codec_config, num_channels, substream_id);
80       break;
81     case kCodecIdAacLc:
82       encoder = std::make_unique<AacEncoder>(
83           codec_config_metadata.decoder_config_aac().aac_encoder_metadata(),
84           codec_config, num_channels);
85       break;
86     case kCodecIdFlac:
87       encoder = std::make_unique<FlacEncoder>(
88           codec_config_metadata.decoder_config_flac().flac_encoder_metadata(),
89           codec_config, num_channels);
90       break;
91     default:
92       return absl::InvalidArgumentError(absl::StrCat(
93           "Unknown codec_id= ", codec_config.GetCodecConfig().codec_id));
94   }
95   RETURN_IF_NOT_OK(encoder->Initialize(validate_codec_delay));
96   return absl::OkStatus();
97 }
98 
99 // Gets data relevant to encoding (Codec Config OBU and AudioElementWithData)
100 // and initializes encoders.
GetEncodingDataAndInitializeEncoders(const absl::flat_hash_map<DecodedUleb128,iamf_tools_cli_proto::CodecConfig> codec_config_metadata,const AudioElementWithData & audio_element_with_data,absl::flat_hash_map<uint32_t,std::unique_ptr<EncoderBase>> & substream_id_to_encoder)101 absl::Status GetEncodingDataAndInitializeEncoders(
102     const absl::flat_hash_map<DecodedUleb128, iamf_tools_cli_proto::CodecConfig>
103         codec_config_metadata,
104     const AudioElementWithData& audio_element_with_data,
105     absl::flat_hash_map<uint32_t, std::unique_ptr<EncoderBase>>&
106         substream_id_to_encoder) {
107   for (const auto& [substream_id, labels] :
108        audio_element_with_data.substream_id_to_labels) {
109     const int num_channels = static_cast<int>(labels.size());
110     const CodecConfigObu& codec_config_obu =
111         *audio_element_with_data.codec_config;
112     auto codec_config_metadata_iter =
113         codec_config_metadata.find(codec_config_obu.GetCodecConfigId());
114     if (codec_config_metadata_iter == codec_config_metadata.end()) {
115       return absl::InvalidArgumentError(absl::StrCat(
116           "Failed to find codec config metadata for codec_config_id= ",
117           codec_config_obu.GetCodecConfigId()));
118     }
119 
120     RETURN_IF_NOT_OK(InitializeEncoder(codec_config_metadata_iter->second,
121                                        codec_config_obu, num_channels,
122                                        substream_id_to_encoder[substream_id],
123                                        kValidateCodecDelay, substream_id));
124   }
125 
126   return absl::OkStatus();
127 }
128 
129 // Validates that the user requested number of samples to trim at start is
130 // enough to cover the delay that the encoder needs.
ValidateUserStartTrimIncludesCodecDelay(uint32_t user_samples_to_trim_at_start,uint32_t & encoder_required_samples_to_delay)131 absl::Status ValidateUserStartTrimIncludesCodecDelay(
132     uint32_t user_samples_to_trim_at_start,
133     uint32_t& encoder_required_samples_to_delay) {
134   // Return an error. But obey the user when
135   // `-DIGNORE_ERRORS_USE_ONLY_FOR_IAMF_TEST_SUITE` is set.
136   if (user_samples_to_trim_at_start < encoder_required_samples_to_delay) {
137     // Only pad up to what the user requests.
138     const auto message =
139         absl::StrCat("The encoder requires ", encoder_required_samples_to_delay,
140                      " samples trimmed at the start but only ",
141                      user_samples_to_trim_at_start, " were requested");
142     encoder_required_samples_to_delay = user_samples_to_trim_at_start;
143     return absl::InvalidArgumentError(message);
144   }
145 
146   return absl::OkStatus();
147 }
148 
GetNumSamplesToPadAtEndAndValidate(const uint32_t required_samples_to_pad_at_end,bool increment_samples_to_trim_at_end_by_padding,int64_t & user_samples_to_trim_at_end,uint32_t & num_samples_to_pad_at_end)149 absl::Status GetNumSamplesToPadAtEndAndValidate(
150     const uint32_t required_samples_to_pad_at_end,
151     bool increment_samples_to_trim_at_end_by_padding,
152     int64_t& user_samples_to_trim_at_end, uint32_t& num_samples_to_pad_at_end) {
153   if (increment_samples_to_trim_at_end_by_padding) {
154     // In this mode, the user's requested `samples_to_trim_at_end` represents
155     // the samples trimmed from the input data. Add in the virtual padded
156     // samples that the encoder will insert, to reflect the total number of
157     // samples which are trimmed in the OBU.
158     user_samples_to_trim_at_end += required_samples_to_pad_at_end;
159   }
160 
161   num_samples_to_pad_at_end =
162       std::min(required_samples_to_pad_at_end,
163                static_cast<uint32_t>(user_samples_to_trim_at_end));
164   if (user_samples_to_trim_at_end < required_samples_to_pad_at_end) {
165     // Obey the user's request by setting `user_samples_to_trim_at_end`. But
166     // throw an error.
167     return absl::InvalidArgumentError(
168         absl::StrCat("User input requested ", user_samples_to_trim_at_end,
169                      " trimmed samples. But ", required_samples_to_pad_at_end,
170                      " samples are required to pad a full frame"));
171   }
172 
173   return absl::OkStatus();
174 }
175 
// Appends `num_samples_to_pad` ticks of silence — all-zero samples across
// `num_channels` channels — to the back of `samples`.
void PadSamples(const size_t num_samples_to_pad, const size_t num_channels,
                std::deque<std::vector<int32_t>>& samples) {
  const std::vector<int32_t> silent_tick(num_channels, 0);
  for (size_t i = 0; i < num_samples_to_pad; ++i) {
    samples.push_back(silent_tick);
  }
}
181 
MoveSamples(const size_t num_samples,std::deque<std::vector<int32_t>> & source_samples,std::vector<std::vector<int32_t>> & destination_samples)182 void MoveSamples(const size_t num_samples,
183                  std::deque<std::vector<int32_t>>& source_samples,
184                  std::vector<std::vector<int32_t>>& destination_samples) {
185   CHECK_GE(source_samples.size(), num_samples);
186   std::copy(source_samples.begin(), source_samples.begin() + num_samples,
187             destination_samples.begin());
188   source_samples.erase(source_samples.begin(),
189                        source_samples.begin() + num_samples);
190 }
191 
// Initializes one `SubstreamData` entry per substream of an audio element.
//
// For each substream: finds its encoder, validates that the user's start trim
// covers the encoder's codec delay (when the user declared the trim to
// include that delay), and seeds the substream's `samples_obu` queue with
// that many "virtual" zero samples.
//
// Returns an error if a substream has no encoder, or — unless errors are
// suppressed via `MAYBE_RETURN_IF_NOT_OK` — if the user trim is insufficient.
absl::Status InitializeSubstreamData(
    const SubstreamIdLabelsMap& substream_id_to_labels,
    const absl::flat_hash_map<uint32_t, std::unique_ptr<EncoderBase>>&
        substream_id_to_encoder,
    bool user_samples_to_trim_at_start_includes_codec_delay,
    const uint32_t user_samples_to_trim_at_start,
    absl::flat_hash_map<uint32_t, SubstreamData>&
        substream_id_to_substream_data) {
  // Validate user start trim is correct; it depends on the encoder. Insert
  // the "virtual samples" at the start up to the amount required by the codec
  // and encoder into the `samples_obu` queue. Trimming of additional optional
  // samples will occur later to keep trimming logic in one place as much as
  // possible.
  for (const auto& [substream_id, labels] : substream_id_to_labels) {
    const auto encoder_iter = substream_id_to_encoder.find(substream_id);
    if (encoder_iter == substream_id_to_encoder.end()) {
      return absl::InvalidArgumentError(absl::StrCat(
          "Failed to find encoder for substream ID= ", substream_id));
    }

    // May be clamped down to the user's request when validation fails but the
    // error is ignored (IAMF test-suite mode).
    uint32_t encoder_required_samples_to_delay =
        encoder_iter->second->GetNumberOfSamplesToDelayAtStart();
    if (user_samples_to_trim_at_start_includes_codec_delay) {
      MAYBE_RETURN_IF_NOT_OK(ValidateUserStartTrimIncludesCodecDelay(
          user_samples_to_trim_at_start, encoder_required_samples_to_delay));
    }

    // Initialize a `SubstreamData` with virtual samples for any delay
    // introduced by the encoder.
    auto& substream_data_for_id = substream_id_to_substream_data[substream_id];
    substream_data_for_id = {
        substream_id,
        /*samples_obu=*/{},
        /*samples_encode=*/{},
        /*output_gains_linear=*/{},
        /*num_samples_to_trim_at_end=*/0,
        /*num_samples_to_trim_at_start=*/encoder_required_samples_to_delay};

    PadSamples(encoder_required_samples_to_delay, labels.size(),
               substream_data_for_id.samples_obu);
  }

  return absl::OkStatus();
}
236 
237 // An audio element may contain many channels, denoted by their labels;
238 // this function returns whether all labels have their (same amount of)
239 // samples ready.
SamplesReadyForAudioElement(const LabelSamplesMap & label_to_samples,const absl::flat_hash_set<ChannelLabel::Label> & channel_labels_for_audio_element)240 bool SamplesReadyForAudioElement(const LabelSamplesMap& label_to_samples,
241                                  const absl::flat_hash_set<ChannelLabel::Label>&
242                                      channel_labels_for_audio_element) {
243   std::optional<size_t> common_num_samples;
244   for (const auto& label : channel_labels_for_audio_element) {
245     const auto label_to_samples_iter = label_to_samples.find(label);
246     if (label_to_samples_iter == label_to_samples.end()) {
247       return false;
248     }
249 
250     const auto num_samples = label_to_samples_iter->second.size();
251     if (!common_num_samples.has_value()) {
252       common_num_samples = num_samples;
253     }
254 
255     if (num_samples != *common_num_samples) {
256       return false;
257     }
258   }
259 
260   return true;
261 }
262 
DownMixSamples(const DecodedUleb128 audio_element_id,const DemixingModule & demixing_module,LabelSamplesMap & label_to_samples,ParametersManager & parameters_manager,absl::flat_hash_map<uint32_t,SubstreamData> & substream_id_to_substream_data,DownMixingParams & down_mixing_params)263 absl::Status DownMixSamples(const DecodedUleb128 audio_element_id,
264                             const DemixingModule& demixing_module,
265                             LabelSamplesMap& label_to_samples,
266                             ParametersManager& parameters_manager,
267                             absl::flat_hash_map<uint32_t, SubstreamData>&
268                                 substream_id_to_substream_data,
269                             DownMixingParams& down_mixing_params) {
270   RETURN_IF_NOT_OK(parameters_manager.GetDownMixingParameters(
271       audio_element_id, down_mixing_params));
272   LOG_FIRST_N(INFO, 10) << "Using alpha=" << down_mixing_params.alpha
273                         << " beta=" << down_mixing_params.beta
274                         << " gamma=" << down_mixing_params.gamma
275                         << " delta=" << down_mixing_params.delta
276                         << " w_idx_offset=" << down_mixing_params.w_idx_offset
277                         << " w_idx_used=" << down_mixing_params.w_idx_used
278                         << " w=" << down_mixing_params.w;
279 
280   // Down-mix OBU-aligned samples from input channels to substreams. May
281   // generate intermediate channels (e.g. L3 on the way of down-mixing L7 to L2)
282   // and expand `label_to_samples`.
283   RETURN_IF_NOT_OK(demixing_module.DownMixSamplesToSubstreams(
284       audio_element_id, down_mixing_params, label_to_samples,
285       substream_id_to_substream_data));
286 
287   return absl::OkStatus();
288 }
289 
// Gets the next frame of samples for all streams, either from "real" samples
// read from a file or from padding.
//
// If no new samples arrived and every substream's OBU queue is already empty,
// this is a no-op (the streams have ended). Otherwise the labeled samples are
// down-mixed into the per-substream queues; any substream shorter than a full
// frame is zero-padded, with the padded count recorded so it can be trimmed
// from the OBU later.
absl::Status GetNextFrameSubstreamData(
    const DecodedUleb128 audio_element_id,
    const DemixingModule& demixing_module, const size_t num_samples_per_frame,
    const SubstreamIdLabelsMap& substream_id_to_labels,
    absl::flat_hash_map<uint32_t, AudioFrameGenerator::TrimmingState>&
        substream_id_to_trimming_state,
    LabelSamplesMap& label_to_samples, ParametersManager& parameters_manager,
    absl::flat_hash_map<uint32_t, SubstreamData>&
        substream_id_to_substream_data,
    DownMixingParams& down_mixing_params) {
  // True when this push carried no samples for any label.
  const bool no_sample_added =
      (label_to_samples.empty() ||
       std::all_of(label_to_samples.begin(), label_to_samples.end(),
                   [](const auto& entry) { return entry.second.empty(); }));
  // Nothing new and nothing buffered from earlier pushes: nothing to do.
  if (no_sample_added &&
      (substream_id_to_substream_data.empty() ||
       std::all_of(substream_id_to_substream_data.begin(),
                   substream_id_to_substream_data.end(), [](const auto& entry) {
                     return entry.second.samples_obu.empty();
                   }))) {
    return absl::OkStatus();
  }

  RETURN_IF_NOT_OK(DownMixSamples(
      audio_element_id, demixing_module, label_to_samples, parameters_manager,
      substream_id_to_substream_data, down_mixing_params));

  // Padding.
  for (const auto& [substream_id, unused_labels] : substream_id_to_labels) {
    auto& substream_data = substream_id_to_substream_data.at(substream_id);
    const int num_channels = substream_data.samples_obu.front().size();
    if (substream_data.samples_obu.size() < num_samples_per_frame) {
      // Partial final frame: fill it out with zeros, drawing on the user's
      // end-trim budget tracked in `trimming_state`.
      uint32_t num_samples_to_pad_at_end;
      auto& trimming_state = substream_id_to_trimming_state.at(substream_id);
      RETURN_IF_NOT_OK(GetNumSamplesToPadAtEndAndValidate(
          num_samples_per_frame - substream_data.samples_obu.size(),
          trimming_state.increment_samples_to_trim_at_end_by_padding,
          trimming_state.user_samples_left_to_trim_at_end,
          num_samples_to_pad_at_end));

      PadSamples(num_samples_to_pad_at_end, num_channels,
                 substream_data.samples_obu);
      PadSamples(num_samples_to_pad_at_end, num_channels,
                 substream_data.samples_encode);

      // Record the number of padded samples to be trimmed later.
      substream_data.num_samples_to_trim_at_end = num_samples_to_pad_at_end;
    }

    if (no_sample_added &&
        substream_data.samples_encode.size() < num_samples_per_frame) {
      const uint32_t num_samples_to_pad =
          num_samples_per_frame - substream_data.samples_encode.size();

      // It's possible to be in this state for the final frame when there
      // are multiple padded frames at the start. Extra virtual samples
      // need to be added. These samples will be "left in" the decoder
      // after all OBUs are processed, but they should not count as being
      // trimmed.
      PadSamples(num_samples_to_pad, num_channels,
                 substream_data.samples_encode);
    }
  }

  return absl::OkStatus();
}
358 
// Takes as many samples as possible out of the running totals of samples to
// trim, up to the size of a full frame.
//
// Returns {samples trimmed from the start, samples trimmed from the end} for
// this frame; the running totals are decremented by the amounts consumed.
std::pair<uint32_t, uint32_t> GetNumSamplesToTrimForFrame(
    const uint32_t num_samples_in_frame, uint32_t& num_samples_to_trim_at_start,
    uint32_t& num_samples_to_trim_at_end) {
  const uint32_t from_start =
      std::min(num_samples_in_frame, num_samples_to_trim_at_start);
  const uint32_t from_end =
      std::min(num_samples_in_frame, num_samples_to_trim_at_end);
  num_samples_to_trim_at_start -= from_start;
  num_samples_to_trim_at_end -= from_end;

  return {from_start, from_end};
}
374 
// Encodes frames for an audio element once samples are ready.
//
// If not all channel labels of the audio element have the same number of
// samples buffered, returns OK immediately and waits for more input.
// Otherwise: down-mixes the labeled samples into per-substream queues, pops
// full frames off those queues, stamps each frame with timestamps and trim
// metadata, and hands it to the substream's encoder. The loop repeats —
// feeding empty label maps to drain buffered samples — until either a frame
// is encoded or no substream has samples left.
absl::Status MaybeEncodeFramesForAudioElement(
    const DecodedUleb128 audio_element_id,
    const AudioElementWithData& audio_element_with_data,
    const DemixingModule& demixing_module,
    const absl::flat_hash_set<ChannelLabel::Label>&
        channel_labels_for_audio_element,
    LabelSamplesMap& label_to_samples,
    absl::flat_hash_map<uint32_t, AudioFrameGenerator::TrimmingState>&
        substream_id_to_trimming_state,
    ParametersManager& parameters_manager,
    absl::flat_hash_map<uint32_t, std::unique_ptr<EncoderBase>>&
        substream_id_to_encoder,
    absl::flat_hash_map<uint32_t, SubstreamData>&
        substream_id_to_substream_data,
    GlobalTimingModule& global_timing_module) {
  if (!SamplesReadyForAudioElement(label_to_samples,
                                   channel_labels_for_audio_element)) {
    // Waiting for more samples belonging to the same audio element; return
    // for now.
    return absl::OkStatus();
  }

  const CodecConfigObu& codec_config = *audio_element_with_data.codec_config;

  // Get some common information about this stream.
  const size_t num_samples_per_frame =
      static_cast<size_t>(codec_config.GetNumSamplesPerFrame());
  // TODO(b/310906409): Lossy codecs do not use PCM for internal
  //                    representation of data. We may need to measure loudness
  //                    at a different bit-depth than the input when AAC is
  //                    updated to support higher bit-depths.
  const int encoder_input_pcm_bit_depth =
      static_cast<int>(codec_config.GetBitDepthToMeasureLoudness());

  const uint32_t encoder_input_sample_rate = codec_config.GetInputSampleRate();
  const uint32_t decoder_output_sample_rate =
      codec_config.GetOutputSampleRate();
  if (encoder_input_sample_rate != decoder_output_sample_rate) {
    // Prevent cases where resampling would occur. This allows later code to
    // simplify assumptions when considering the number of samples in a frame or
    // the trimming information.
    return absl::InvalidArgumentError(absl::StrCat(
        "Input sample rate and output sample rate differ: (",
        encoder_input_sample_rate, " vs ", decoder_output_sample_rate, ")"));
  }

  DownMixingParams down_mixing_params;

  // Save a dummy label-to-empty samples map. This is used when automatically
  // padding zero samples at the end of a frame.
  LabelSamplesMap label_to_empty_samples;
  for (const auto& [label, unused_samples] : label_to_samples) {
    label_to_empty_samples[label] = {};
  }

  // Set once any substream encodes a frame this call; all substreams of the
  // element must then share that start timestamp.
  std::optional<int32_t> encoded_timestamp;
  bool more_samples_to_encode = false;
  do {
    RETURN_IF_NOT_OK(GetNextFrameSubstreamData(
        audio_element_id, demixing_module, num_samples_per_frame,
        audio_element_with_data.substream_id_to_labels,
        substream_id_to_trimming_state, label_to_samples, parameters_manager,
        substream_id_to_substream_data, down_mixing_params));

    more_samples_to_encode = false;
    for (const auto& [substream_id, labels] :
         audio_element_with_data.substream_id_to_labels) {
      auto substream_data_iter =
          substream_id_to_substream_data.find(substream_id);
      if (substream_data_iter == substream_id_to_substream_data.end()) {
        // This substream has been fully consumed; all sibling substreams must
        // end together.
        if (more_samples_to_encode) {
          return absl::InvalidArgumentError(
              absl::StrCat("Within Audio Element ID= ", audio_element_id,
                           ", substream #", substream_id, " has ended but ",
                           " some other substreams have more samples to come"));
        }
        continue;
      }
      auto& substream_data = substream_data_iter->second;
      if (substream_data.samples_obu.empty()) {
        // It's possible the user signalled to flush the stream, but it was
        // already aligned. OK, there is nothing else to do.
        continue;
      }

      more_samples_to_encode = true;

      // Encode.
      if (substream_data.samples_encode.size() < num_samples_per_frame) {
        // Wait until there is a whole frame of samples to encode.
        LOG(INFO) << "Waiting for complete frame; samples_obu.size()="
                  << substream_data.samples_obu.size()
                  << " samples_encode.size()= "
                  << substream_data.samples_encode.size();

        // All frames corresponding to the same Audio Element should be skipped.
        CHECK(!encoded_timestamp.has_value());
        continue;
      }

      // Pop samples from the queues and arrange in (time, channel) axes.
      const size_t num_samples_to_encode =
          static_cast<size_t>(num_samples_per_frame);
      std::vector<std::vector<int32_t>> samples_encode(num_samples_to_encode);
      std::vector<std::vector<int32_t>> samples_obu(num_samples_to_encode);

      MoveSamples(num_samples_to_encode, substream_data.samples_obu,
                  samples_obu);
      MoveSamples(num_samples_to_encode, substream_data.samples_encode,
                  samples_encode);
      // Consume as much of the pending trim as fits in this frame.
      const auto [frame_samples_to_trim_at_start,
                  frame_samples_to_trim_at_end] =
          GetNumSamplesToTrimForFrame(
              num_samples_to_encode,
              substream_data.num_samples_to_trim_at_start,
              substream_data.num_samples_to_trim_at_end);

      // Both timestamps cover trimmed and regular samples.
      InternalTimestamp start_timestamp;
      InternalTimestamp end_timestamp;
      RETURN_IF_NOT_OK(global_timing_module.GetNextAudioFrameTimestamps(
          substream_id, samples_obu.size(), start_timestamp, end_timestamp));

      if (encoded_timestamp.has_value()) {
        // All frames corresponding to the same Audio Element should have
        // the same start timestamp.
        CHECK_EQ(*encoded_timestamp, start_timestamp);
      }

      auto partial_audio_frame_with_data =
          absl::WrapUnique(new AudioFrameWithData{
              .obu = AudioFrameObu(
                  {
                      .obu_trimming_status_flag =
                          (frame_samples_to_trim_at_end != 0 ||
                           frame_samples_to_trim_at_start != 0),
                      .num_samples_to_trim_at_end =
                          frame_samples_to_trim_at_end,
                      .num_samples_to_trim_at_start =
                          frame_samples_to_trim_at_start,
                  },
                  substream_id, {}),
              .start_timestamp = start_timestamp,
              .end_timestamp = end_timestamp,
              .pcm_samples = samples_obu,
              .down_mixing_params = down_mixing_params,
              .recon_gain_info_parameter_data = ReconGainInfoParameterData(),
              .audio_element_with_data = &audio_element_with_data});

      RETURN_IF_NOT_OK(
          substream_id_to_encoder.at(substream_id)
              ->EncodeAudioFrame(encoder_input_pcm_bit_depth, samples_encode,
                                 std::move(partial_audio_frame_with_data)));
      encoded_timestamp = start_timestamp;
    }

    // Clears the samples for the next iteration.
    label_to_samples = label_to_empty_samples;
  } while (!encoded_timestamp.has_value() && more_samples_to_encode);

  if (encoded_timestamp.has_value()) {
    // All audio frames corresponding to the audio element have been encoded;
    // update the parameter manager to use the next frame of parameters.
    RETURN_IF_NOT_OK(parameters_manager.UpdateDemixingState(
        audio_element_id, *encoded_timestamp + num_samples_per_frame));
  }

  return absl::OkStatus();
}
545 
546 // Validates that all substreams share the same trimming information.
ValidateSubstreamsShareTrimming(const iamf_tools_cli_proto::AudioFrameObuMetadata & audio_frame_metadata,bool common_samples_to_trim_at_end_includes_padding,bool common_samples_to_trim_at_start_includes_codec_delay,int64_t common_samples_to_trim_at_start,int64_t common_samples_to_trim_at_end)547 absl::Status ValidateSubstreamsShareTrimming(
548     const iamf_tools_cli_proto::AudioFrameObuMetadata& audio_frame_metadata,
549     bool common_samples_to_trim_at_end_includes_padding,
550     bool common_samples_to_trim_at_start_includes_codec_delay,
551     int64_t common_samples_to_trim_at_start,
552     int64_t common_samples_to_trim_at_end) {
553   if (audio_frame_metadata.samples_to_trim_at_end() !=
554           common_samples_to_trim_at_end ||
555       audio_frame_metadata.samples_to_trim_at_start() !=
556           common_samples_to_trim_at_start ||
557       audio_frame_metadata.samples_to_trim_at_end_includes_padding() !=
558           common_samples_to_trim_at_end_includes_padding ||
559       audio_frame_metadata.samples_to_trim_at_start_includes_codec_delay() !=
560           common_samples_to_trim_at_start_includes_codec_delay) {
561     return absl::InvalidArgumentError(
562         "Expected all substreams to have the same trimming information");
563   }
564 
565   return absl::OkStatus();
566 }
567 
568 // Applies additional user trimming to one audio frame.
ApplyUserTrimForFrame(const bool from_start,const uint32_t num_samples_in_frame,int64_t & user_trim_left,uint32_t & num_samples_trimmed_in_obu,bool & obu_trimming_status_flag)569 absl::Status ApplyUserTrimForFrame(const bool from_start,
570                                    const uint32_t num_samples_in_frame,
571                                    int64_t& user_trim_left,
572                                    uint32_t& num_samples_trimmed_in_obu,
573                                    bool& obu_trimming_status_flag) {
574   // Trim as many samples as the user requested. Up to the size of a full frame.
575   const uint32_t frame_samples_to_trim =
576       std::min(static_cast<uint32_t>(num_samples_in_frame),
577                static_cast<uint32_t>(user_trim_left));
578 
579   const std::string start_or_end_string = (from_start ? "start" : "end");
580 
581   // Some samples may already be trimmed due to prior processing, validate
582   // that the user requested enough samples to accommodate them.
583   if (num_samples_trimmed_in_obu > frame_samples_to_trim) {
584     return absl::InvalidArgumentError(
585         absl::StrCat("More samples were trimmed from the ", start_or_end_string,
586                      "than expected: (", num_samples_trimmed_in_obu, " vs ",
587                      frame_samples_to_trim, ")"));
588   }
589 
590   // Apply the trim for this frame.
591   num_samples_trimmed_in_obu = frame_samples_to_trim;
592   user_trim_left -= frame_samples_to_trim;
593 
594   // Ensure the `obu_trimming_status_flag` is accurate.
595   if (num_samples_trimmed_in_obu != 0) {
596     obu_trimming_status_flag = true;
597   }
598 
599   if (user_trim_left > 0 && !from_start) {
600     // Automatic padding, plus user requested trim, exceeds the size of a frame.
601     return absl::InvalidArgumentError(
602         "The spec disallows trimming multiple frames from the end.");
603   }
604 
605   return absl::OkStatus();
606 }
607 
608 // Apply user requested from the end to the input Audio Frames. The requested
609 // trim must be at least the amount that was needed to cover the
610 // padding in the final audio frame. Then the rest will be applied to
611 // consecutive OBUs from the end without modifying the underlying data.
ValidateAndApplyUserTrimming(const bool is_last_frame,AudioFrameGenerator::TrimmingState & trimming_state,AudioFrameWithData & audio_frame)612 absl::Status ValidateAndApplyUserTrimming(
613     const bool is_last_frame,
614     AudioFrameGenerator::TrimmingState& trimming_state,
615     AudioFrameWithData& audio_frame) {
616   CHECK_NE(audio_frame.audio_element_with_data, nullptr);
617   CHECK_NE(audio_frame.audio_element_with_data->codec_config, nullptr);
618   const uint32_t num_samples_in_frame =
619       audio_frame.audio_element_with_data->codec_config
620           ->GetNumSamplesPerFrame();
621 
622   RETURN_IF_NOT_OK(ApplyUserTrimForFrame(
623       /*from_start=*/true, num_samples_in_frame,
624       trimming_state.user_samples_left_to_trim_at_start,
625       audio_frame.obu.header_.num_samples_to_trim_at_start,
626       audio_frame.obu.header_.obu_trimming_status_flag));
627 
628   if (is_last_frame) {
629     RETURN_IF_NOT_OK(ApplyUserTrimForFrame(
630         /*from_start=*/false, num_samples_in_frame,
631         trimming_state.user_samples_left_to_trim_at_end,
632         audio_frame.obu.header_.num_samples_to_trim_at_end,
633         audio_frame.obu.header_.obu_trimming_status_flag));
634   }
635 
636   return absl::OkStatus();
637 }
638 
639 }  // namespace
640 
AudioFrameGenerator(const::google::protobuf::RepeatedPtrField<iamf_tools_cli_proto::AudioFrameObuMetadata> & audio_frame_metadata,const::google::protobuf::RepeatedPtrField<iamf_tools_cli_proto::CodecConfigObuMetadata> & codec_config_metadata,const absl::flat_hash_map<DecodedUleb128,AudioElementWithData> & audio_elements,const DemixingModule & demixing_module,ParametersManager & parameters_manager,GlobalTimingModule & global_timing_module)641 AudioFrameGenerator::AudioFrameGenerator(
642     const ::google::protobuf::RepeatedPtrField<
643         iamf_tools_cli_proto::AudioFrameObuMetadata>& audio_frame_metadata,
644     const ::google::protobuf::RepeatedPtrField<
645         iamf_tools_cli_proto::CodecConfigObuMetadata>& codec_config_metadata,
646     const absl::flat_hash_map<DecodedUleb128, AudioElementWithData>&
647         audio_elements,
648     const DemixingModule& demixing_module,
649     ParametersManager& parameters_manager,
650     GlobalTimingModule& global_timing_module)
651     : audio_elements_(audio_elements),
652       demixing_module_(demixing_module),
653       parameters_manager_(parameters_manager),
654       global_timing_module_(global_timing_module),
655       // Set to a state NOT taking samples at first; may be changed to
656       // `kTakingSamples` once `Initialize()` is called.
657       state_(kFlushingRemaining) {
658   for (const auto& audio_frame_obu_metadata : audio_frame_metadata) {
659     audio_frame_metadata_[audio_frame_obu_metadata.audio_element_id()] =
660         audio_frame_obu_metadata;
661   }
662 
663   for (const auto& codec_config_obu_metadata : codec_config_metadata) {
664     codec_config_metadata_[codec_config_obu_metadata.codec_config_id()] =
665         codec_config_obu_metadata.codec_config();
666   }
667 }
668 
GetNumberOfSamplesToDelayAtStart(const iamf_tools_cli_proto::CodecConfig & codec_config_metadata,const CodecConfigObu & codec_config)669 absl::StatusOr<uint32_t> AudioFrameGenerator::GetNumberOfSamplesToDelayAtStart(
670     const iamf_tools_cli_proto::CodecConfig& codec_config_metadata,
671     const CodecConfigObu& codec_config) {
672   // This function is useful when querying what the codec delay should be. We
673   // don't want it to fail if the user-provided codec delay is wrong.
674   constexpr bool kDontValidateCodecDelay = false;
675 
676   std::unique_ptr<EncoderBase> encoder;
677   RETURN_IF_NOT_OK(InitializeEncoder(codec_config_metadata, codec_config,
678                                      /*num_channels=*/1, encoder,
679                                      kDontValidateCodecDelay));
680   if (encoder == nullptr) {
681     return absl::InvalidArgumentError("Failed to initialize encoder");
682   }
683   return encoder->GetNumberOfSamplesToDelayAtStart();
684 }
685 
// Prepares the generator to accept samples: converts channel labels, creates
// one encoder per substream, validates that trimming metadata is consistent
// across all audio elements, and seeds the per-substream trimming state.
// On success (with at least one substream) the state advances to
// `kTakingSamples`.
absl::Status AudioFrameGenerator::Initialize() {
  absl::MutexLock lock(&mutex_);
  if (audio_frame_metadata_.empty()) {
    // Nothing to encode; the generator stays in a non-taking-samples state.
    return absl::OkStatus();
  }
  // Trimming is required to be identical for all substreams. Use the first
  // metadata entry as the common reference every other entry is validated
  // against below.
  const auto& first_audio_frame_metadata =
      audio_frame_metadata_.begin()->second;
  const int64_t common_samples_to_trim_at_start = static_cast<int64_t>(
      first_audio_frame_metadata.samples_to_trim_at_start());
  const int64_t common_samples_to_trim_at_end =
      static_cast<int64_t>(first_audio_frame_metadata.samples_to_trim_at_end());
  const bool common_samples_to_trim_at_end_includes_padding =
      first_audio_frame_metadata.samples_to_trim_at_end_includes_padding();
  const bool common_samples_to_trim_at_start_includes_codec_delay =
      first_audio_frame_metadata
          .samples_to_trim_at_start_includes_codec_delay();

  for (const auto& [audio_element_id, audio_frame_metadata] :
       audio_frame_metadata_) {
    // Precompute the `ChannelLabel::Label` for each channel label string.
    RETURN_IF_NOT_OK(ChannelLabelUtils::SelectConvertAndFillLabels(
        audio_frame_metadata, audio_element_id_to_labels_[audio_element_id]));

    // Find the Codec Config OBU for this mono or coupled stereo substream.
    const auto audio_elements_iter = audio_elements_.find(audio_element_id);
    if (audio_elements_iter == audio_elements_.end()) {
      return absl::InvalidArgumentError(absl::StrCat(
          "Audio Element with ID= ", audio_element_id, " not found"));
    }

    // Create an encoder for each substream.
    const AudioElementWithData& audio_element_with_data =
        audio_elements_iter->second;
    // Trimming more than one frame from the end cannot be represented.
    if (audio_frame_metadata.samples_to_trim_at_end() >
        audio_element_with_data.codec_config->GetNumSamplesPerFrame()) {
      return absl::InvalidArgumentError(
          "The spec disallows trimming multiple frames from the end.");
    }
    RETURN_IF_NOT_OK(GetEncodingDataAndInitializeEncoders(
        codec_config_metadata_, audio_element_with_data,
        substream_id_to_encoder_));

    // Intermediate data for all substreams belonging to an Audio Element.
    RETURN_IF_NOT_OK(InitializeSubstreamData(
        audio_element_with_data.substream_id_to_labels,
        substream_id_to_encoder_,
        audio_frame_metadata.samples_to_trim_at_start_includes_codec_delay(),
        audio_frame_metadata.samples_to_trim_at_start(),
        substream_id_to_substream_data_));

    // Validate that a `DemixingParamDefinition` is available if down-mixing
    // is needed.
    const std::list<Demixer>* down_mixers = nullptr;
    RETURN_IF_NOT_OK(
        demixing_module_.GetDownMixers(audio_element_id, down_mixers));
    if (!parameters_manager_.DemixingParamDefinitionAvailable(
            audio_element_id) &&
        !down_mixers->empty()) {
      return absl::InvalidArgumentError(
          "Must include `DemixingParamDefinition` in the Audio Element if "
          "down-mixers are required to produce audio substreams");
    }

    // Validate the assumption that trimming is the same for all substreams.
    RETURN_IF_NOT_OK(ValidateSubstreamsShareTrimming(
        audio_frame_metadata, common_samples_to_trim_at_end_includes_padding,
        common_samples_to_trim_at_start_includes_codec_delay,
        common_samples_to_trim_at_start, common_samples_to_trim_at_end));

    // Populate the map of trimming states with all substream ID.
    for (const auto& [substream_id, labels] :
         audio_element_with_data.substream_id_to_labels) {
      // Add in the codec delay when it was not included in the user input.
      const int64_t additional_samples_to_trim_at_start =
          common_samples_to_trim_at_start_includes_codec_delay
              ? 0
              : substream_id_to_encoder_[substream_id]
                    ->GetNumberOfSamplesToDelayAtStart();
      substream_id_to_trimming_state_[substream_id] = {
          .increment_samples_to_trim_at_end_by_padding =
              !audio_frame_metadata.samples_to_trim_at_end_includes_padding(),
          .user_samples_left_to_trim_at_end = common_samples_to_trim_at_end,
          .user_samples_left_to_trim_at_start =
              common_samples_to_trim_at_start +
              additional_samples_to_trim_at_start,
      };
    }
  }

  // If `substream_id_to_substream_data_` is not empty, meaning this generator
  // is expecting audio substreams and is ready to take audio samples.
  if (!substream_id_to_substream_data_.empty()) {
    state_ = kTakingSamples;
  }

  return absl::OkStatus();
}
783 
TakingSamples() const784 bool AudioFrameGenerator::TakingSamples() const {
785   absl::MutexLock lock(&mutex_);
786   return (state_ == kTakingSamples);
787 }
788 
AddSamples(const DecodedUleb128 audio_element_id,ChannelLabel::Label label,absl::Span<const InternalSampleType> samples)789 absl::Status AudioFrameGenerator::AddSamples(
790     const DecodedUleb128 audio_element_id, ChannelLabel::Label label,
791     absl::Span<const InternalSampleType> samples) {
792   absl::MutexLock lock(&mutex_);
793   if (state_ != kTakingSamples) {
794     LOG_FIRST_N(WARNING, 3)
795         << "Calling `AddSamples()` after `Finalize()` has no effect.";
796     return absl::OkStatus();
797   }
798 
799   if (samples.empty()) {
800     return absl::InvalidArgumentError(
801         absl::StrCat("Adding emptry frames is not allowed before `Finalize()` ",
802                      "has been called. audio_element_id= ", audio_element_id));
803   }
804 
805   const auto& audio_element_labels_iter =
806       audio_element_id_to_labels_.find(audio_element_id);
807   if (audio_element_labels_iter == audio_element_id_to_labels_.end()) {
808     return absl::InvalidArgumentError(
809         absl::StrCat("No audio frame metadata found for Audio Element ID= ",
810                      audio_element_id));
811   }
812 
813   auto& labeled_samples = id_to_labeled_samples_[audio_element_id];
814   labeled_samples[label] =
815       std::vector<InternalSampleType>(samples.begin(), samples.end());
816 
817   const auto audio_element_iter = audio_elements_.find(audio_element_id);
818   if (audio_element_iter == audio_elements_.end()) {
819     return absl::InvalidArgumentError(
820         absl::StrCat("No Audio Element found for ID= ", audio_element_id));
821   }
822   const auto& audio_element_with_data = audio_element_iter->second;
823 
824   RETURN_IF_NOT_OK(MaybeEncodeFramesForAudioElement(
825       audio_element_id, audio_element_with_data, demixing_module_,
826       audio_element_labels_iter->second, labeled_samples,
827       substream_id_to_trimming_state_, parameters_manager_,
828       substream_id_to_encoder_, substream_id_to_substream_data_,
829       global_timing_module_));
830 
831   return absl::OkStatus();
832 }
833 
Finalize()834 absl::Status AudioFrameGenerator::Finalize() {
835   absl::MutexLock lock(&mutex_);
836   if (state_ == kTakingSamples) {
837     state_ = kFinalizedCalled;
838   }
839 
840   return absl::OkStatus();
841 }
842 
GeneratingFrames() const843 bool AudioFrameGenerator::GeneratingFrames() const {
844   absl::MutexLock lock(&mutex_);
845   return !substream_id_to_encoder_.empty();
846 }
847 
// Pops all currently-available encoded frames into `audio_frames`, applying
// user trimming to each popped frame, and tears down substream data and
// encoders as they finish. Intended to be called repeatedly until
// `GeneratingFrames()` returns false.
absl::Status AudioFrameGenerator::OutputFrames(
    std::list<AudioFrameWithData>& audio_frames) {
  absl::MutexLock lock(&mutex_);

  if (state_ == kFlushingRemaining) {
    // In this state, there might be some remaining samples queued in the
    // encoders waiting to be encoded; continue to encode them one frame at a
    // time.
    for (const auto& [audio_element_id, audio_element_with_data] :
         audio_elements_) {
      RETURN_IF_NOT_OK(MaybeEncodeFramesForAudioElement(
          audio_element_id, audio_element_with_data, demixing_module_,
          audio_element_id_to_labels_.at(audio_element_id),
          id_to_labeled_samples_[audio_element_id],
          substream_id_to_trimming_state_, parameters_manager_,
          substream_id_to_encoder_, substream_id_to_substream_data_,
          global_timing_module_));
    }
  } else if (state_ == kFinalizedCalled) {
    // The `Finalize()` has just been called, advance the state so that the
    // remaining samples will be encoded in the next iteration.
    state_ = kFlushingRemaining;
  }

  // Pop encoded audio frames from encoders.
  for (auto substream_id_to_encoder_iter = substream_id_to_encoder_.begin();
       substream_id_to_encoder_iter != substream_id_to_encoder_.end();) {
    auto& [substream_id, encoder] = *substream_id_to_encoder_iter;

    // Remove the substream data when the generator is in the
    // `kFlushingRemaining` state and the encoder can be finalized.
    if (state_ == kFlushingRemaining) {
      auto substream_data_iter =
          substream_id_to_substream_data_.find(substream_id);
      if (substream_data_iter != substream_id_to_substream_data_.end() &&
          substream_data_iter->second.samples_obu.empty()) {
        // All queued samples for this substream have been handed to the
        // encoder; let the encoder flush its remaining frames.
        RETURN_IF_NOT_OK(encoder->Finalize());
        substream_id_to_substream_data_.erase(substream_data_iter);
      }
    }

    if (encoder->FramesAvailable()) {
      RETURN_IF_NOT_OK(encoder->Pop(audio_frames));
      // `Pop()` appends the frame at the back of the list; apply user
      // trimming to that frame in place. The final frame of a substream
      // additionally takes the end trim.
      RETURN_IF_NOT_OK(ValidateAndApplyUserTrimming(
          /*is_last_frame=*/encoder->Finished(),
          substream_id_to_trimming_state_.at(substream_id),
          audio_frames.back()));
    }

    // Remove finished encoder or advance the iterator. The post-increment
    // erase pattern advances the iterator before the element is erased, so
    // the loop iterator stays valid.
    if (encoder->Finished()) {
      substream_id_to_encoder_.erase(substream_id_to_encoder_iter++);
    } else {
      ++substream_id_to_encoder_iter;
    }
  }

  return absl::OkStatus();
}
907 
908 }  // namespace iamf_tools
909