1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/threading_strategy.h"
16
17 #include <algorithm>
18 #include <cassert>
19 #include <memory>
20
21 #include "src/frame_scratch_buffer.h"
22 #include "src/utils/constants.h"
23 #include "src/utils/logging.h"
24 #include "src/utils/vector.h"
25
26 namespace libgav1 {
27 namespace {
28
29 #if !defined(LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER)
30 constexpr int kFrameParallelThresholdMultiplier = 3;
31 #else
32 constexpr int kFrameParallelThresholdMultiplier =
33 LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER;
34 #endif
35
36 // Computes the number of frame threads to be used based on the following
37 // heuristic:
38 // * If |thread_count| == 1, return 0.
39 // * If |thread_count| <= |tile_count| * kFrameParallelThresholdMultiplier,
40 // return 0.
41 // * Otherwise, return the largest value of i which satisfies the following
42 // condition: i + i * tile_columns <= thread_count. This ensures that there
43 // are at least |tile_columns| worker threads for each frame thread.
44 // * This function will never return 1 or a value > |thread_count|.
45 //
46 // This heuristic is based on empirical performance data. The in-frame
47 // threading model (combination of tile multithreading, superblock row
48 // multithreading and post filter multithreading) performs better than the
49 // frame parallel model until we reach the threshold of |thread_count| >
50 // |tile_count| * kFrameParallelThresholdMultiplier.
51 //
52 // It is a function of |tile_count| since tile threading and superblock row
53 // multithreading will scale only as a factor of |tile_count|. The threshold
54 // kFrameParallelThresholdMultiplier is arrived at based on empirical data.
55 // The general idea is that superblock row multithreading plateaus at 4 *
56 // |tile_count| because in most practical cases there aren't more than that
57 // many superblock rows and columns available to work on in parallel.
ComputeFrameThreadCount(int thread_count,int tile_count,int tile_columns)58 int ComputeFrameThreadCount(int thread_count, int tile_count,
59 int tile_columns) {
60 assert(thread_count > 0);
61 if (thread_count == 1) return 0;
62 return (thread_count <= tile_count * kFrameParallelThresholdMultiplier)
63 ? 0
64 : std::max(2, thread_count / (1 + tile_columns));
65 }
66
67 } // namespace
68
Reset(const ObuFrameHeader & frame_header,int thread_count)69 bool ThreadingStrategy::Reset(const ObuFrameHeader& frame_header,
70 int thread_count) {
71 assert(thread_count > 0);
72 frame_parallel_ = false;
73
74 if (thread_count == 1) {
75 thread_pool_.reset(nullptr);
76 tile_thread_count_ = 0;
77 max_tile_index_for_row_threads_ = 0;
78 return true;
79 }
80
81 // We do work in the current thread, so it is sufficient to create
82 // |thread_count|-1 threads in the threadpool.
83 thread_count = std::min(thread_count, static_cast<int>(kMaxThreads)) - 1;
84
85 if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
86 thread_pool_ = ThreadPool::Create("libgav1", thread_count);
87 if (thread_pool_ == nullptr) {
88 LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
89 thread_count);
90 tile_thread_count_ = 0;
91 max_tile_index_for_row_threads_ = 0;
92 return false;
93 }
94 }
95
96 // Prefer tile threads first (but only if there is more than one tile).
97 const int tile_count = frame_header.tile_info.tile_count;
98 if (tile_count > 1) {
99 // We want 1 + tile_thread_count_ <= tile_count because the current thread
100 // is also used to decode tiles. This is equivalent to
101 // tile_thread_count_ <= tile_count - 1.
102 tile_thread_count_ = std::min(thread_count, tile_count - 1);
103 thread_count -= tile_thread_count_;
104 if (thread_count == 0) {
105 max_tile_index_for_row_threads_ = 0;
106 return true;
107 }
108 } else {
109 tile_thread_count_ = 0;
110 }
111
112 #if defined(__ANDROID__)
113 // Assign the remaining threads for each Tile. The heuristic used here is that
114 // we will assign two threads for each Tile. So for example, if |thread_count|
115 // is 2, for a stream with 2 tiles the first tile would get both the threads
116 // and the second tile would have row multi-threading turned off. This
117 // heuristic is based on the fact that row multi-threading is fast enough only
118 // when there are at least two threads to do the decoding (since one thread
119 // always does the parsing).
120 //
121 // This heuristic might stop working when SIMD optimizations make the decoding
122 // much faster and the parsing thread is only as fast as the decoding threads.
123 // So we will have to revisit this later to make sure that this is still
124 // optimal.
125 //
126 // Note that while this heuristic significantly improves performance on high
127 // end devices (like the Pixel 3), there are some performance regressions in
128 // some lower end devices (in some cases) and that needs to be revisited as we
129 // bring in more optimizations. Overall, the gains because of this heuristic
130 // seems to be much larger than the regressions.
131 for (int i = 0; i < tile_count; ++i) {
132 max_tile_index_for_row_threads_ = i + 1;
133 thread_count -= 2;
134 if (thread_count <= 0) break;
135 }
136 #else // !defined(__ANDROID__)
137 // Assign the remaining threads to each Tile.
138 for (int i = 0; i < tile_count; ++i) {
139 const int count = thread_count / tile_count +
140 static_cast<int>(i < thread_count % tile_count);
141 if (count == 0) {
142 // Once we see a 0 value, all subsequent values will be 0 since it is
143 // supposed to be assigned in a round-robin fashion.
144 break;
145 }
146 max_tile_index_for_row_threads_ = i + 1;
147 }
148 #endif // defined(__ANDROID__)
149 return true;
150 }
151
Reset(int thread_count)152 bool ThreadingStrategy::Reset(int thread_count) {
153 assert(thread_count > 0);
154 frame_parallel_ = true;
155
156 // In frame parallel mode, we simply access the underlying |thread_pool_|
157 // directly. So ensure all the other threadpool getter functions return
158 // nullptr. Also, superblock row multithreading is always disabled in frame
159 // parallel mode.
160 tile_thread_count_ = 0;
161 max_tile_index_for_row_threads_ = 0;
162
163 if (thread_pool_ == nullptr || thread_pool_->num_threads() != thread_count) {
164 thread_pool_ = ThreadPool::Create("libgav1-fp", thread_count);
165 if (thread_pool_ == nullptr) {
166 LIBGAV1_DLOG(ERROR, "Failed to create a thread pool with %d threads.",
167 thread_count);
168 return false;
169 }
170 }
171 return true;
172 }
173
InitializeThreadPoolsForFrameParallel(int thread_count,int tile_count,int tile_columns,std::unique_ptr<ThreadPool> * const frame_thread_pool,FrameScratchBufferPool * const frame_scratch_buffer_pool)174 bool InitializeThreadPoolsForFrameParallel(
175 int thread_count, int tile_count, int tile_columns,
176 std::unique_ptr<ThreadPool>* const frame_thread_pool,
177 FrameScratchBufferPool* const frame_scratch_buffer_pool) {
178 assert(*frame_thread_pool == nullptr);
179 thread_count = std::min(thread_count, static_cast<int>(kMaxThreads));
180 const int frame_threads =
181 ComputeFrameThreadCount(thread_count, tile_count, tile_columns);
182 if (frame_threads == 0) return true;
183 *frame_thread_pool = ThreadPool::Create(frame_threads);
184 if (*frame_thread_pool == nullptr) {
185 LIBGAV1_DLOG(ERROR, "Failed to create frame thread pool with %d threads.",
186 frame_threads);
187 return false;
188 }
189 int remaining_threads = thread_count - frame_threads;
190 if (remaining_threads == 0) return true;
191 int threads_per_frame = remaining_threads / frame_threads;
192 const int extra_threads = remaining_threads % frame_threads;
193 Vector<std::unique_ptr<FrameScratchBuffer>> frame_scratch_buffers;
194 if (!frame_scratch_buffers.reserve(frame_threads)) return false;
195 // Create the tile thread pools.
196 for (int i = 0; i < frame_threads && remaining_threads > 0; ++i) {
197 std::unique_ptr<FrameScratchBuffer> frame_scratch_buffer =
198 frame_scratch_buffer_pool->Get();
199 if (frame_scratch_buffer == nullptr) {
200 return false;
201 }
202 // If the number of tile threads cannot be divided equally amongst all the
203 // frame threads, assign one extra thread to the first |extra_threads| frame
204 // threads.
205 const int current_frame_thread_count =
206 threads_per_frame + static_cast<int>(i < extra_threads);
207 if (!frame_scratch_buffer->threading_strategy.Reset(
208 current_frame_thread_count)) {
209 return false;
210 }
211 remaining_threads -= current_frame_thread_count;
212 frame_scratch_buffers.push_back_unchecked(std::move(frame_scratch_buffer));
213 }
214 // We release the frame scratch buffers in reverse order so that the extra
215 // threads are allocated to buffers in the top of the stack.
216 for (int i = static_cast<int>(frame_scratch_buffers.size()) - 1; i >= 0;
217 --i) {
218 frame_scratch_buffer_pool->Release(std::move(frame_scratch_buffers[i]));
219 }
220 return true;
221 }
222
223 } // namespace libgav1
224