1 /*
2 * Copyright (c) 2021 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "api/video/rtp_video_frame_assembler.h"
12
13 #include <algorithm>
14 #include <cstdint>
15 #include <map>
16 #include <memory>
17 #include <utility>
18 #include <vector>
19
20 #include "absl/container/inlined_vector.h"
21 #include "absl/types/optional.h"
22 #include "modules/rtp_rtcp/source/rtp_dependency_descriptor_extension.h"
23 #include "modules/rtp_rtcp/source/rtp_generic_frame_descriptor_extension.h"
24 #include "modules/rtp_rtcp/source/rtp_packet_received.h"
25 #include "modules/rtp_rtcp/source/video_rtp_depacketizer_av1.h"
26 #include "modules/rtp_rtcp/source/video_rtp_depacketizer_generic.h"
27 #include "modules/rtp_rtcp/source/video_rtp_depacketizer_h264.h"
28 #include "modules/rtp_rtcp/source/video_rtp_depacketizer_raw.h"
29 #include "modules/rtp_rtcp/source/video_rtp_depacketizer_vp8.h"
30 #include "modules/rtp_rtcp/source/video_rtp_depacketizer_vp9.h"
31 #include "modules/video_coding/frame_object.h"
32 #include "modules/video_coding/packet_buffer.h"
33 #include "modules/video_coding/rtp_frame_reference_finder.h"
34 #include "rtc_base/logging.h"
35
36 namespace webrtc {
37 namespace {
CreateDepacketizer(RtpVideoFrameAssembler::PayloadFormat payload_format)38 std::unique_ptr<VideoRtpDepacketizer> CreateDepacketizer(
39 RtpVideoFrameAssembler::PayloadFormat payload_format) {
40 switch (payload_format) {
41 case RtpVideoFrameAssembler::kRaw:
42 return std::make_unique<VideoRtpDepacketizerRaw>();
43 case RtpVideoFrameAssembler::kH264:
44 return std::make_unique<VideoRtpDepacketizerH264>();
45 case RtpVideoFrameAssembler::kVp8:
46 return std::make_unique<VideoRtpDepacketizerVp8>();
47 case RtpVideoFrameAssembler::kVp9:
48 return std::make_unique<VideoRtpDepacketizerVp9>();
49 case RtpVideoFrameAssembler::kAv1:
50 return std::make_unique<VideoRtpDepacketizerAv1>();
51 case RtpVideoFrameAssembler::kGeneric:
52 return std::make_unique<VideoRtpDepacketizerGeneric>();
53 }
54 RTC_DCHECK_NOTREACHED();
55 return nullptr;
56 }
57 } // namespace
58
// Private implementation of RtpVideoFrameAssembler. Owns the depacketizer, the
// packet buffer and the frame reference finder, and drives a received RTP
// packet through them to produce assembled frames.
class RtpVideoFrameAssembler::Impl {
 public:
  // Takes ownership of `depacketizer`, which is used both to parse packet
  // payloads and to assemble the bitstream of a frame.
  explicit Impl(std::unique_ptr<VideoRtpDepacketizer> depacketizer);
  ~Impl() = default;

  // Inserts one received RTP packet and returns the frames that became
  // available as a result. The returned vector is empty when the packet is
  // dropped or does not complete any frame.
  FrameVector InsertPacket(const RtpPacketReceived& packet);

 private:
  // Frames assembled from packets, before they are handed to the reference
  // finder.
  using RtpFrameVector =
      absl::InlinedVector<std::unique_ptr<RtpFrameObject>, 3>;

  // Builds an RtpFrameObject for each frame found in the packets of
  // `insert_result`.
  RtpFrameVector AssembleFrames(
      video_coding::PacketBuffer::InsertResult insert_result);
  // Passes `frames` to the reference finder and wraps each frame it returns
  // together with that frame's first and last RTP sequence numbers.
  FrameVector FindReferences(RtpFrameVector frames);
  // Reports a padding packet with sequence number `seq_num` to the packet
  // buffer and to the reference finder; returns the frames this releases.
  FrameVector UpdateWithPadding(uint16_t seq_num);
  // Fills `video_header` from the dependency descriptor extension of
  // `rtp_packet`. Returns false if the packet must be dropped.
  bool ParseDependenciesDescriptorExtension(const RtpPacketReceived& rtp_packet,
                                            RTPVideoHeader& video_header);
  // Fills `video_header` from the generic frame descriptor (00) extension of
  // `rtp_packet`. Returns false if the extension cannot be read.
  bool ParseGenericDescriptorExtension(const RtpPacketReceived& rtp_packet,
                                       RTPVideoHeader& video_header);
  // Clears packet buffer and reference finder state up to a sequence number
  // that lies a fixed distance before `incoming_seq_num`.
  void ClearOldData(uint16_t incoming_seq_num);

  // Dependency structure taken from the most recently accepted dependency
  // descriptor that had one attached; handed to the extension parser for
  // later packets. Null until the first structure is received.
  std::unique_ptr<FrameDependencyStructure> video_structure_;
  // Unwraps the 16-bit frame numbers / frame ids carried in the descriptor
  // extensions into 64-bit frame ids.
  SeqNumUnwrapper<uint16_t> frame_id_unwrapper_;
  // Unwrapped frame id of the frame that delivered `video_structure_`.
  absl::optional<int64_t> video_structure_frame_id_;
  std::unique_ptr<VideoRtpDepacketizer> depacketizer_;
  video_coding::PacketBuffer packet_buffer_;
  RtpFrameReferenceFinder reference_finder_;
};
87
// Takes ownership of `depacketizer` and creates the packet buffer with a start
// size and a max size of 2048 each. The remaining members are
// default-initialized.
RtpVideoFrameAssembler::Impl::Impl(
    std::unique_ptr<VideoRtpDepacketizer> depacketizer)
    : depacketizer_(std::move(depacketizer)),
      packet_buffer_(/*start_buffer_size=*/2048, /*max_buffer_size=*/2048) {}
92
InsertPacket(const RtpPacketReceived & rtp_packet)93 RtpVideoFrameAssembler::FrameVector RtpVideoFrameAssembler::Impl::InsertPacket(
94 const RtpPacketReceived& rtp_packet) {
95 if (rtp_packet.payload_size() == 0) {
96 ClearOldData(rtp_packet.SequenceNumber());
97 return UpdateWithPadding(rtp_packet.SequenceNumber());
98 }
99
100 absl::optional<VideoRtpDepacketizer::ParsedRtpPayload> parsed_payload =
101 depacketizer_->Parse(rtp_packet.PayloadBuffer());
102
103 if (parsed_payload == absl::nullopt) {
104 return {};
105 }
106
107 if (rtp_packet.HasExtension<RtpDependencyDescriptorExtension>()) {
108 if (!ParseDependenciesDescriptorExtension(rtp_packet,
109 parsed_payload->video_header)) {
110 return {};
111 }
112 } else if (rtp_packet.HasExtension<RtpGenericFrameDescriptorExtension00>()) {
113 if (!ParseGenericDescriptorExtension(rtp_packet,
114 parsed_payload->video_header)) {
115 return {};
116 }
117 }
118
119 parsed_payload->video_header.is_last_packet_in_frame |= rtp_packet.Marker();
120
121 auto packet = std::make_unique<video_coding::PacketBuffer::Packet>(
122 rtp_packet, parsed_payload->video_header);
123 packet->video_payload = std::move(parsed_payload->video_payload);
124
125 ClearOldData(rtp_packet.SequenceNumber());
126 return FindReferences(
127 AssembleFrames(packet_buffer_.InsertPacket(std::move(packet))));
128 }
129
ClearOldData(uint16_t incoming_seq_num)130 void RtpVideoFrameAssembler::Impl::ClearOldData(uint16_t incoming_seq_num) {
131 constexpr uint16_t kOldSeqNumThreshold = 2000;
132 uint16_t old_seq_num = incoming_seq_num - kOldSeqNumThreshold;
133 packet_buffer_.ClearTo(old_seq_num);
134 reference_finder_.ClearTo(old_seq_num);
135 }
136
137 RtpVideoFrameAssembler::Impl::RtpFrameVector
AssembleFrames(video_coding::PacketBuffer::InsertResult insert_result)138 RtpVideoFrameAssembler::Impl::AssembleFrames(
139 video_coding::PacketBuffer::InsertResult insert_result) {
140 video_coding::PacketBuffer::Packet* first_packet = nullptr;
141 std::vector<rtc::ArrayView<const uint8_t>> payloads;
142 RtpFrameVector result;
143
144 for (auto& packet : insert_result.packets) {
145 if (packet->is_first_packet_in_frame()) {
146 first_packet = packet.get();
147 payloads.clear();
148 }
149 payloads.emplace_back(packet->video_payload);
150
151 if (packet->is_last_packet_in_frame()) {
152 rtc::scoped_refptr<EncodedImageBuffer> bitstream =
153 depacketizer_->AssembleFrame(payloads);
154
155 if (!bitstream) {
156 continue;
157 }
158
159 const video_coding::PacketBuffer::Packet& last_packet = *packet;
160 result.push_back(std::make_unique<RtpFrameObject>(
161 first_packet->seq_num, //
162 last_packet.seq_num, //
163 last_packet.marker_bit, //
164 /*times_nacked=*/0, //
165 /*first_packet_received_time=*/0, //
166 /*last_packet_received_time=*/0, //
167 first_packet->timestamp, //
168 /*ntp_time_ms=*/0, //
169 /*timing=*/VideoSendTiming(), //
170 first_packet->payload_type, //
171 first_packet->codec(), //
172 last_packet.video_header.rotation, //
173 last_packet.video_header.content_type, //
174 first_packet->video_header, //
175 last_packet.video_header.color_space, //
176 /*packet_infos=*/RtpPacketInfos(), //
177 std::move(bitstream)));
178 }
179 }
180
181 return result;
182 }
183
184 RtpVideoFrameAssembler::FrameVector
FindReferences(RtpFrameVector frames)185 RtpVideoFrameAssembler::Impl::FindReferences(RtpFrameVector frames) {
186 FrameVector res;
187 for (auto& frame : frames) {
188 auto complete_frames = reference_finder_.ManageFrame(std::move(frame));
189 for (std::unique_ptr<RtpFrameObject>& complete_frame : complete_frames) {
190 uint16_t rtp_seq_num_start = complete_frame->first_seq_num();
191 uint16_t rtp_seq_num_end = complete_frame->last_seq_num();
192 res.emplace_back(rtp_seq_num_start, rtp_seq_num_end,
193 std::move(complete_frame));
194 }
195 }
196 return res;
197 }
198
199 RtpVideoFrameAssembler::FrameVector
UpdateWithPadding(uint16_t seq_num)200 RtpVideoFrameAssembler::Impl::UpdateWithPadding(uint16_t seq_num) {
201 auto res =
202 FindReferences(AssembleFrames(packet_buffer_.InsertPadding(seq_num)));
203 auto ref_finder_update = reference_finder_.PaddingReceived(seq_num);
204
205 for (std::unique_ptr<RtpFrameObject>& complete_frame : ref_finder_update) {
206 uint16_t rtp_seq_num_start = complete_frame->first_seq_num();
207 uint16_t rtp_seq_num_end = complete_frame->last_seq_num();
208 res.emplace_back(rtp_seq_num_start, rtp_seq_num_end,
209 std::move(complete_frame));
210 }
211
212 return res;
213 }
214
ParseDependenciesDescriptorExtension(const RtpPacketReceived & rtp_packet,RTPVideoHeader & video_header)215 bool RtpVideoFrameAssembler::Impl::ParseDependenciesDescriptorExtension(
216 const RtpPacketReceived& rtp_packet,
217 RTPVideoHeader& video_header) {
218 webrtc::DependencyDescriptor dependency_descriptor;
219
220 if (!rtp_packet.GetExtension<RtpDependencyDescriptorExtension>(
221 video_structure_.get(), &dependency_descriptor)) {
222 // Descriptor is either malformed, or the template referenced is not in
223 // the `video_structure_` currently being held.
224 // TODO(bugs.webrtc.org/10342): Improve packet reordering behavior.
225 RTC_LOG(LS_WARNING) << "ssrc: " << rtp_packet.Ssrc()
226 << " Failed to parse dependency descriptor.";
227 return false;
228 }
229
230 if (dependency_descriptor.attached_structure != nullptr &&
231 !dependency_descriptor.first_packet_in_frame) {
232 RTC_LOG(LS_WARNING) << "ssrc: " << rtp_packet.Ssrc()
233 << "Invalid dependency descriptor: structure "
234 "attached to non first packet of a frame.";
235 return false;
236 }
237
238 video_header.is_first_packet_in_frame =
239 dependency_descriptor.first_packet_in_frame;
240 video_header.is_last_packet_in_frame =
241 dependency_descriptor.last_packet_in_frame;
242
243 int64_t frame_id =
244 frame_id_unwrapper_.Unwrap(dependency_descriptor.frame_number);
245 auto& generic_descriptor_info = video_header.generic.emplace();
246 generic_descriptor_info.frame_id = frame_id;
247 generic_descriptor_info.spatial_index =
248 dependency_descriptor.frame_dependencies.spatial_id;
249 generic_descriptor_info.temporal_index =
250 dependency_descriptor.frame_dependencies.temporal_id;
251
252 for (int fdiff : dependency_descriptor.frame_dependencies.frame_diffs) {
253 generic_descriptor_info.dependencies.push_back(frame_id - fdiff);
254 }
255 for (int cdiff : dependency_descriptor.frame_dependencies.chain_diffs) {
256 generic_descriptor_info.chain_diffs.push_back(frame_id - cdiff);
257 }
258 generic_descriptor_info.decode_target_indications =
259 dependency_descriptor.frame_dependencies.decode_target_indications;
260 if (dependency_descriptor.resolution) {
261 video_header.width = dependency_descriptor.resolution->Width();
262 video_header.height = dependency_descriptor.resolution->Height();
263 }
264 if (dependency_descriptor.active_decode_targets_bitmask.has_value()) {
265 generic_descriptor_info.active_decode_targets =
266 *dependency_descriptor.active_decode_targets_bitmask;
267 }
268
269 // FrameDependencyStructure is sent in the dependency descriptor of the first
270 // packet of a key frame and is required to parse all subsequent packets until
271 // the next key frame.
272 if (dependency_descriptor.attached_structure) {
273 RTC_DCHECK(dependency_descriptor.first_packet_in_frame);
274 if (video_structure_frame_id_ > frame_id) {
275 RTC_LOG(LS_WARNING)
276 << "Arrived key frame with id " << frame_id << " and structure id "
277 << dependency_descriptor.attached_structure->structure_id
278 << " is older than the latest received key frame with id "
279 << *video_structure_frame_id_ << " and structure id "
280 << video_structure_->structure_id;
281 return false;
282 }
283 video_structure_ = std::move(dependency_descriptor.attached_structure);
284 video_structure_frame_id_ = frame_id;
285 video_header.frame_type = VideoFrameType::kVideoFrameKey;
286 } else {
287 video_header.frame_type = VideoFrameType::kVideoFrameDelta;
288 }
289 return true;
290 }
291
ParseGenericDescriptorExtension(const RtpPacketReceived & rtp_packet,RTPVideoHeader & video_header)292 bool RtpVideoFrameAssembler::Impl::ParseGenericDescriptorExtension(
293 const RtpPacketReceived& rtp_packet,
294 RTPVideoHeader& video_header) {
295 RtpGenericFrameDescriptor generic_frame_descriptor;
296 if (!rtp_packet.GetExtension<RtpGenericFrameDescriptorExtension00>(
297 &generic_frame_descriptor)) {
298 return false;
299 }
300
301 video_header.is_first_packet_in_frame =
302 generic_frame_descriptor.FirstPacketInSubFrame();
303 video_header.is_last_packet_in_frame =
304 generic_frame_descriptor.LastPacketInSubFrame();
305
306 if (generic_frame_descriptor.FirstPacketInSubFrame()) {
307 video_header.frame_type =
308 generic_frame_descriptor.FrameDependenciesDiffs().empty()
309 ? VideoFrameType::kVideoFrameKey
310 : VideoFrameType::kVideoFrameDelta;
311
312 auto& generic_descriptor_info = video_header.generic.emplace();
313 int64_t frame_id =
314 frame_id_unwrapper_.Unwrap(generic_frame_descriptor.FrameId());
315 generic_descriptor_info.frame_id = frame_id;
316 generic_descriptor_info.spatial_index =
317 generic_frame_descriptor.SpatialLayer();
318 generic_descriptor_info.temporal_index =
319 generic_frame_descriptor.TemporalLayer();
320 for (uint16_t fdiff : generic_frame_descriptor.FrameDependenciesDiffs()) {
321 generic_descriptor_info.dependencies.push_back(frame_id - fdiff);
322 }
323 }
324 video_header.width = generic_frame_descriptor.Width();
325 video_header.height = generic_frame_descriptor.Height();
326 return true;
327 }
328
// Creates an assembler for packets of the given `payload_format`. The
// depacketizer matching the format is created here and handed to the
// implementation object.
RtpVideoFrameAssembler::RtpVideoFrameAssembler(PayloadFormat payload_format)
    : impl_(std::make_unique<Impl>(CreateDepacketizer(payload_format))) {}
331
// Defined in this file, where `Impl` is a complete type.
// NOTE(review): presumably required because `impl_` is a
// std::unique_ptr<Impl> declared in the header — confirm there.
RtpVideoFrameAssembler::~RtpVideoFrameAssembler() = default;
333
InsertPacket(const RtpPacketReceived & packet)334 RtpVideoFrameAssembler::FrameVector RtpVideoFrameAssembler::InsertPacket(
335 const RtpPacketReceived& packet) {
336 return impl_->InsertPacket(packet);
337 }
338
339 } // namespace webrtc
340