1 // Copyright 2020 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef CAST_STANDALONE_SENDER_STREAMING_VP8_ENCODER_H_ 6 #define CAST_STANDALONE_SENDER_STREAMING_VP8_ENCODER_H_ 7 8 #include <vpx/vpx_encoder.h> 9 #include <vpx/vpx_image.h> 10 11 #include <algorithm> 12 #include <condition_variable> // NOLINT 13 #include <functional> 14 #include <memory> 15 #include <mutex> 16 #include <queue> 17 #include <thread> 18 #include <vector> 19 20 #include "absl/base/thread_annotations.h" 21 #include "cast/streaming/frame_id.h" 22 #include "cast/streaming/rtp_time.h" 23 #include "platform/api/task_runner.h" 24 #include "platform/api/time.h" 25 26 namespace openscreen { 27 28 class TaskRunner; 29 30 namespace cast { 31 32 class Sender; 33 34 // Uses libvpx to encode VP8 video and streams it to a Sender. Includes 35 // extensive logic for fine-tuning the encoder parameters in real-time, to 36 // provide the best quality results given external, uncontrollable factors: 37 // CPU/network availability, and the complexity of the video frame content. 38 // 39 // Internally, a separate encode thread is created and used to prevent blocking 40 // the main thread while frames are being encoded. All public API methods are 41 // assumed to be called on the same sequence/thread as the main TaskRunner 42 // (injected via the constructor). 43 // 44 // Usage: 45 // 46 // 1. EncodeAndSend() is used to queue-up video frames for encoding and sending, 47 // which will be done on a best-effort basis. 48 // 49 // 2. The client is expected to call SetTargetBitrate() frequently based on its 50 // own bandwidth estimates and congestion control logic. In addition, a client 51 // may provide a callback for each frame's encode statistics, which can be used 52 // to further optimize the user experience. For example, the stats can be used 53 // as a signal to reduce the data volume (i.e., resolution and/or frame rate) 54 // coming from the video capture source. 55 class StreamingVp8Encoder { 56 public: 57 // Configurable parameters passed to the StreamingVp8Encoder constructor. 58 struct Parameters { 59 // Number of threads to parallelize frame encoding. This should be set based 60 // on the number of CPU cores available for encoding, but no more than 8. 61 int num_encode_threads = 62 std::min(std::max<int>(std::thread::hardware_concurrency(), 1), 8); 63 64 // Best-quality quantizer (lower is better quality). Range: [0,63] 65 int min_quantizer = 4; 66 67 // Worst-quality quantizer (lower is better quality). Range: [0,63] 68 int max_quantizer = 63; 69 70 // Worst-quality quantizer to use when the CPU is extremely constrained. 71 // Range: [min_quantizer,max_quantizer] 72 int max_cpu_saver_quantizer = 25; 73 74 // Maximum amount of wall-time a frame's encode can take, relative to the 75 // frame's duration, before the CPU-saver logic is activated. The default 76 // (70%) is appropriate for systems with four or more cores, but should be 77 // reduced (e.g., 50%) for systems with fewer than three cores. 78 // 79 // Example: For 30 FPS (continuous) video, the frame duration is ~33.3ms, 80 // and a value of 0.5 here would mean that the CPU-saver logic starts 81 // sacrificing quality when frame encodes start taking longer than ~16.7ms. 82 double max_time_utilization = 0.7; 83 }; 84 85 // Represents an input VideoFrame, passed to EncodeAndSend(). 86 struct VideoFrame { 87 // Image width and height. 88 int width; 89 int height; 90 91 // I420 format image pointers and row strides (the number of bytes between 92 // the start of successive rows). The pointers only need to remain valid 93 // until the EncodeAndSend() call returns. 94 const uint8_t* yuv_planes[3]; 95 int yuv_strides[3]; 96 97 // How long this frame will be held before the next frame will be displayed, 98 // or zero if unknown. The frame duration is passed to the VP8 codec, 99 // affecting a number of important behaviors, including: per-frame 100 // bandwidth, CPU time spent encoding, temporal quality trade-offs, and 101 // key/golden/alt-ref frame generation intervals. 102 Clock::duration duration; 103 }; 104 105 // Performance statistics for a single frame's encode. 106 // 107 // For full details on how to use these stats in an end-to-end system, see: 108 // https://www.chromium.org/developers/design-documents/ 109 // auto-throttled-screen-capture-and-mirroring 110 // and https://source.chromium.org/chromium/chromium/src/+/master: 111 // media/cast/sender/performance_metrics_overlay.h 112 struct Stats { 113 // The Cast Streaming ID that was assigned to the frame. 114 FrameId frame_id; 115 116 // The RTP timestamp of the frame. 117 RtpTimeTicks rtp_timestamp; 118 119 // How long the frame took to encode. This is wall time, not CPU time or 120 // some other load metric. 121 Clock::duration encode_wall_time; 122 123 // The frame's predicted duration; or, the actual duration if it was 124 // provided in the VideoFrame. 125 Clock::duration frame_duration; 126 127 // The encoded frame's size in bytes. 128 int encoded_size; 129 130 // The average size of an encoded frame in bytes, having this 131 // |frame_duration| and current target bitrate. 132 double target_size; 133 134 // The actual quantizer the VP8 encoder used, in the range [0,63]. 135 int quantizer; 136 137 // The "hindsight" quantizer value that would have produced the best quality 138 // encoding of the frame at the current target bitrate. The nominal range is 139 // [0.0,63.0]. If it is larger than 63.0, then it was impossible for VP8 to 140 // encode the frame within the current target bitrate (e.g., too much 141 // "entropy" in the image, or too low a target bitrate). 142 double perfect_quantizer; 143 144 // Utilization feedback metrics. The nominal range for each of these is 145 // [0.0,1.0] where 1.0 means "the entire budget available for the frame was 146 // exhausted." Going above 1.0 is okay for one or a few frames, since it's 147 // the average over many frames that matters before the system is considered 148 // "redlining." 149 // 150 // The max of these three provides an overall utilization control signal. 151 // The usual approach is for upstream control logic to increase/decrease the 152 // data volume (e.g., video resolution and/or frame rate) to maintain a good 153 // target point. time_utilizationStats154 double time_utilization() const { 155 return static_cast<double>(encode_wall_time.count()) / 156 frame_duration.count(); 157 } space_utilizationStats158 double space_utilization() const { return encoded_size / target_size; } entropy_utilizationStats159 double entropy_utilization() const { 160 return perfect_quantizer / kMaxQuantizer; 161 } 162 }; 163 164 StreamingVp8Encoder(const Parameters& params, 165 TaskRunner* task_runner, 166 Sender* sender); 167 168 ~StreamingVp8Encoder(); 169 170 // Get/Set the target bitrate. This may be changed at any time, as frequently 171 // as desired, and it will take effect internally as soon as possible. 172 int GetTargetBitrate() const; 173 void SetTargetBitrate(int new_bitrate); 174 175 // Encode |frame| using the VP8 encoder, assemble an EncodedFrame, and enqueue 176 // into the Sender. The frame may be dropped if too many frames are in-flight. 177 // If provided, the |stats_callback| is run after the frame is enqueued in the 178 // Sender (via the main TaskRunner). 179 void EncodeAndSend(const VideoFrame& frame, 180 Clock::time_point reference_time, 181 std::function<void(Stats)> stats_callback); 182 183 static constexpr int kMinQuantizer = 0; 184 static constexpr int kMaxQuantizer = 63; 185 186 private: 187 // Syntactic convenience to wrap the vpx_image_t alloc/free API in a smart 188 // pointer. 189 struct VpxImageDeleter { operatorVpxImageDeleter190 void operator()(vpx_image_t* ptr) const { vpx_img_free(ptr); } 191 }; 192 using VpxImageUniquePtr = std::unique_ptr<vpx_image_t, VpxImageDeleter>; 193 194 // Represents the state of one frame encode. This is created in 195 // EncodeAndSend(), and passed to the encode thread via the |encode_queue_|. 196 struct WorkUnit { 197 VpxImageUniquePtr image; 198 Clock::duration duration; 199 Clock::time_point reference_time; 200 RtpTimeTicks rtp_timestamp; 201 std::function<void(Stats)> stats_callback; 202 }; 203 204 // Same as WorkUnit, but with additional fields to carry the encode results. 205 struct WorkUnitWithResults : public WorkUnit { 206 std::vector<uint8_t> payload; 207 bool is_key_frame; 208 Stats stats; 209 }; 210 is_encoder_initialized()211 bool is_encoder_initialized() const { return config_.g_threads != 0; } 212 213 // Destroys the VP8 encoder context if it has been initialized. 214 void DestroyEncoder(); 215 216 // The procedure for the |encode_thread_| that loops, processing work units 217 // from the |encode_queue_| by calling Encode() until it's time to end the 218 // thread. 219 void ProcessWorkUnitsUntilTimeToQuit(); 220 221 // If the |encoder_| is live, attempt reconfiguration to allow it to encode 222 // frames at a new frame size, target bitrate, or "CPU encoding speed." If 223 // reconfiguration is not possible, destroy the existing instance and 224 // re-create a new |encoder_| instance. 225 void PrepareEncoder(int width, int height, int target_bitrate); 226 227 // Wraps the complex libvpx vpx_codec_encode() call using inputs from 228 // |work_unit| and populating results there. 229 void EncodeFrame(bool force_key_frame, WorkUnitWithResults* work_unit); 230 231 // Computes and populates |work_unit.stats| after the last call to 232 // EncodeFrame(). 233 void ComputeFrameEncodeStats(Clock::duration encode_wall_time, 234 int target_bitrate, 235 WorkUnitWithResults* work_unit); 236 237 // Updates the |ideal_speed_setting_|, to take effect with the next frame 238 // encode, based on the given performance |stats|. 239 void UpdateSpeedSettingForNextFrame(const Stats& stats); 240 241 // Assembles and enqueues an EncodedFrame with the Sender on the main thread. 242 void SendEncodedFrame(WorkUnitWithResults results); 243 244 // Allocates a vpx_image_t and copies the content from |frame| to it. 245 static VpxImageUniquePtr CloneAsVpxImage(const VideoFrame& frame); 246 247 const Parameters params_; 248 TaskRunner* const main_task_runner_; 249 Sender* const sender_; 250 251 // The reference time of the first frame passed to EncodeAndSend(). 252 Clock::time_point start_time_ = Clock::time_point::min(); 253 254 // The RTP timestamp of the last frame that was pushed into the 255 // |encode_queue_| by EncodeAndSend(). This is used to check whether 256 // timestamps are monotonically increasing. 257 RtpTimeTicks last_enqueued_rtp_timestamp_; 258 259 // Guards a few members shared by both the main and encode threads. 260 std::mutex mutex_; 261 262 // Used by the encode thread to sleep until more work is available. 263 std::condition_variable cv_ ABSL_GUARDED_BY(mutex_); 264 265 // These encode parameters not passed in the WorkUnit struct because it is 266 // desirable for them to be applied as soon as possible, with the very next 267 // WorkUnit popped from the |encode_queue_| on the encode thread, and not to 268 // wait until some later WorkUnit is processed. 269 bool needs_key_frame_ ABSL_GUARDED_BY(mutex_) = true; 270 int target_bitrate_ ABSL_GUARDED_BY(mutex_) = 2 << 20; // Default: 2 Mbps. 271 272 // The queue of frame encodes. The size of this queue is implicitly bounded by 273 // EncodeAndSend(), where it checks for the total in-flight media duration and 274 // maybe drops a frame. 275 std::queue<WorkUnit> encode_queue_ ABSL_GUARDED_BY(mutex_); 276 277 // Current VP8 encoder configuration. Most of the fields are unchanging, and 278 // are populated in the ctor; but thereafter, only the encode thread accesses 279 // this struct. 280 // 281 // The speed setting is controlled via a separate libvpx API (see members 282 // below). 283 vpx_codec_enc_cfg_t config_{}; 284 285 // These represent the magnitude of the VP8 speed setting, where larger values 286 // (i.e., faster speed) request less CPU usage but will provide lower video 287 // quality. Only the encode thread accesses these. 288 double ideal_speed_setting_; // A time-weighted average, from measurements. 289 int current_speed_setting_; // Current |encoder_| speed setting. 290 291 // libvpx VP8 encoder instance. Only the encode thread accesses this. 292 vpx_codec_ctx_t encoder_; 293 294 // This member should be last in the class since the thread should not start 295 // until all above members have been initialized by the constructor. 296 std::thread encode_thread_; 297 }; 298 299 } // namespace cast 300 } // namespace openscreen 301 302 #endif // CAST_STANDALONE_SENDER_STREAMING_VP8_ENCODER_H_ 303