1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/lite/delegates/gpu/common/task/work_group_picking.h"
17
18 #include <algorithm>
19 #include <limits>
20 #include <set>
21 #include <vector>
22
23 #include "tensorflow/lite/delegates/gpu/common/util.h"
24
25 namespace tflite {
26 namespace gpu {
27
28 namespace {
29
Get2DWorkgroupsEqualTo128()30 std::vector<int2> Get2DWorkgroupsEqualTo128() {
31 return {{128, 1}, {64, 2}, {32, 4}, {16, 8},
32 {8, 16}, {4, 32}, {2, 64}, {1, 128}};
33 }
34
GenerateWorkGroupSizesXYMultipleOf(int multiplier,int3 grid,const KernelInfo & kernel_info,const GpuInfo & gpu_info,WorkGroupSizeAlignment z_alignment)35 std::vector<int3> GenerateWorkGroupSizesXYMultipleOf(
36 int multiplier, int3 grid, const KernelInfo& kernel_info,
37 const GpuInfo& gpu_info, WorkGroupSizeAlignment z_alignment) {
38 std::vector<int3> work_groups;
39 work_groups.reserve(32);
40
41 std::vector<int> possible_z_sizes = GetPossibleSizes(grid.z, z_alignment);
42
43 for (int x = 1; x <= kernel_info.max_work_group_size; x *= 2) {
44 for (int y = 1; y <= kernel_info.max_work_group_size; y *= 2) {
45 int work_group_size_xy = x * y;
46 if (work_group_size_xy % multiplier != 0 ||
47 work_group_size_xy > kernel_info.max_work_group_size) {
48 continue;
49 }
50 for (auto z : possible_z_sizes) {
51 if (work_group_size_xy * z > kernel_info.max_work_group_size) {
52 continue;
53 }
54 if (x <= gpu_info.GetMaxWorkGroupSizeForX() &&
55 y <= gpu_info.GetMaxWorkGroupSizeForY() &&
56 z <= gpu_info.GetMaxWorkGroupSizeForZ()) {
57 work_groups.push_back({x, y, z});
58 }
59 }
60 }
61 }
62 return work_groups;
63 }
64
GenerateWorkGroupSizesXMultipleOf(int multiplier,int3 grid,const KernelInfo & kernel_info,const GpuInfo & gpu_info,WorkGroupSizeAlignment z_alignment)65 std::vector<int3> GenerateWorkGroupSizesXMultipleOf(
66 int multiplier, int3 grid, const KernelInfo& kernel_info,
67 const GpuInfo& gpu_info, WorkGroupSizeAlignment z_alignment) {
68 std::vector<int3> work_groups;
69 work_groups.reserve(32);
70
71 std::vector<int> possible_z_sizes = GetPossibleSizes(grid.z, z_alignment);
72 std::vector<int> possible_y_sizes =
73 GetPossibleSizes(grid.y, WorkGroupSizeAlignment::PRECISE);
74
75 for (int x = multiplier;
76 x <= kernel_info.max_work_group_size && x < grid.x + multiplier;
77 x += multiplier) {
78 for (auto y : possible_y_sizes) {
79 for (auto z : possible_z_sizes) {
80 if (x <= gpu_info.GetMaxWorkGroupSizeForX() &&
81 y <= gpu_info.GetMaxWorkGroupSizeForY() &&
82 z <= gpu_info.GetMaxWorkGroupSizeForZ() &&
83 x * y * z <= kernel_info.max_work_group_size) {
84 work_groups.push_back({x, y, z});
85 }
86 }
87 }
88 }
89 return work_groups;
90 }
91
GetWorkGroupsAlignedToGrid(const GpuInfo & gpu_info,const KernelInfo & kernel_info,const int3 & grid,std::vector<int3> * work_groups)92 void GetWorkGroupsAlignedToGrid(const GpuInfo& gpu_info,
93 const KernelInfo& kernel_info, const int3& grid,
94 std::vector<int3>* work_groups) {
95 int3 max_wg_size;
96 max_wg_size.x = gpu_info.GetMaxWorkGroupSizeForX();
97 max_wg_size.y = gpu_info.GetMaxWorkGroupSizeForY();
98 max_wg_size.z = gpu_info.GetMaxWorkGroupSizeForZ();
99 GenerateWorkGroupSizesAlignedToGrid(
100 grid, max_wg_size, kernel_info.max_work_group_size, work_groups);
101 }
102
// Returns the number of idle (wasted) work items when a grid of `grid_size`
// items is covered by groups of `group_size`: zero when the group size
// divides the grid evenly, otherwise the padding needed to reach the next
// multiple of `group_size`.
int GetPenalty(int grid_size, int group_size) {
  // Fix: local was misspelled "reminder"; renamed to "remainder".
  const int remainder = grid_size % group_size;
  return remainder == 0 ? 0 : group_size - remainder;
}
107
GetPenalty(int2 grid_size,int2 group_size)108 int GetPenalty(int2 grid_size, int2 group_size) {
109 const int p_x = GetPenalty(grid_size.x, group_size.x);
110 const int p_y = GetPenalty(grid_size.y, group_size.y);
111 return p_x * grid_size.y + p_y * grid_size.x + p_x * p_y;
112 }
113
GetMaxSizeWithMinPenalty(int size,int max_size)114 int GetMaxSizeWithMinPenalty(int size, int max_size) {
115 int best_size = 128;
116 int min_penalty = GetPenalty(size, best_size);
117 for (int i = 2; i * 128 <= max_size; ++i) {
118 if (GetPenalty(size, i * 128) == min_penalty) {
119 best_size = i * 128;
120 }
121 }
122 return best_size;
123 }
124
// 2D analogue of GetMaxSizeWithMinPenalty: scales the 128-item base shapes
// by integer factors and returns the first scaled shape (in scan order) that
// fits within `max_size` total items and matches the minimal penalty
// achievable by the unscaled base shapes. Returns (0, 0) when no scaled
// shape reaches that minimal penalty.
int2 GetMaxSizeWithMinPenalty(int2 size, int max_size) {
  std::vector<int2> base_groups = Get2DWorkgroupsEqualTo128();
  // Pass 1: smallest penalty achievable with the unscaled 128-item shapes.
  int min_penalty = std::numeric_limits<int>::max();
  for (const auto& group : base_groups) {
    min_penalty = std::min(GetPenalty(size, group), min_penalty);
  }
  // Pass 2: grow each base shape by independent x/y factors; the first
  // scaled shape that still achieves `min_penalty` wins (scan order matters
  // to callers, so do not reorder these loops).
  for (const auto& group : base_groups) {
    for (int y = 1; y * group.y <= max_size; ++y) {
      int new_group_y = y * group.y;
      for (int x = 1; x * group.x <= max_size; ++x) {
        int new_group_x = x * group.x;
        if (new_group_x * new_group_y > max_size) {
          // Larger x factors only increase the total size further.
          break;
        }
        if (GetPenalty(size, int2(new_group_x, new_group_y)) == min_penalty) {
          return int2(new_group_x, new_group_y);
        }
      }
    }
  }
  // No scaled shape matched the minimal penalty within max_size.
  return int2(0, 0);
}
147
// Returns a divider of `number` no greater than `max_divider`, preferring
// 8, then 4, then 2 when they divide `number` exactly; otherwise falls back
// to the largest exact divider, and finally to 1.
int GetBiggestDividerWithPriority(int number, int max_divider) {
  const int preferred[] = {8, 4, 2};
  for (int p : preferred) {
    if (p <= max_divider && number % p == 0) {
      return p;
    }
  }
  for (int candidate = max_divider; candidate != 0; candidate--) {
    if (number % candidate == 0) {
      return candidate;
    }
  }
  return 1;
}
165
// Returns the largest integer in [1, max_divider] that divides `number`
// exactly; returns 1 when the loop finds nothing (e.g. max_divider == 0).
int GetBiggestDivider(int number, int max_divider) {
  for (int candidate = max_divider; candidate != 0; candidate--) {
    const bool divides_evenly = (number % candidate == 0);
    if (divides_evenly) {
      return candidate;
    }
  }
  return 1;
}
174
// Heuristic per-dimension work group extent (1, 2, 4 or 8) for Apple GPUs:
// prefer 8 when the grid divides evenly by 8, leaves a remainder of at least
// half a group, or the grid is large; then try 4 and 2 with the same logic.
int GetOptimalSizeForApple(int grid_size) {
  const bool take_8 =
      grid_size % 8 == 0 || grid_size % 8 >= 4 || grid_size >= 16;
  if (take_8) {
    return 8;
  }
  const bool take_4 =
      grid_size % 4 == 0 || grid_size % 4 >= 2 || grid_size >= 8;
  if (take_4) {
    return 4;
  }
  return (grid_size % 2 == 0 || grid_size >= 4) ? 2 : 1;
}
187
GetWorkGroupSizeForApple(const uint3 & grid_size)188 int3 GetWorkGroupSizeForApple(const uint3& grid_size) {
189 int x_size = GetOptimalSizeForApple(grid_size.x);
190 int y_size = GetOptimalSizeForApple(grid_size.y);
191 int z_size = std::max(1, 32 / (x_size * y_size));
192 z_size = std::min(z_size, static_cast<int>(grid_size.z));
193 return {x_size, y_size, z_size};
194 }
195
196 } // namespace
197
// Work group for linear-layout conv kernels: pick z (preferring dividers
// 4/2), use a full 128-wide row for small grids, otherwise the widest
// multiple of 128 that fits a 512-item budget shared with z.
int3 GetWorkGroupXY128ConvLinear(const int3& grid) {
  const int wg_z = GetBiggestDividerWithPriority(grid.z, 4);
  if (grid.x <= 128) {
    return int3(128, 1, wg_z);
  }
  const int wg_x = GetMaxSizeWithMinPenalty(grid.x, 512 / wg_z);
  return int3(wg_x, 1, wg_z);
}
206
// Work group for 2D conv kernels: pick z (preferring dividers 4/2), use a
// fixed 16x8 plane for small grids, otherwise the min-penalty 2D shape that
// fits a 512-item budget shared with z.
int3 GetWorkGroupXY128Conv(const int3& grid) {
  const int wg_z = GetBiggestDividerWithPriority(grid.z, 4);
  if (grid.x <= 16 && grid.y <= 8) {
    return int3(16, 8, wg_z);
  }
  const int2 wg_xy = GetMaxSizeWithMinPenalty(int2(grid.x, grid.y), 512 / wg_z);
  return int3(wg_xy.x, wg_xy.y, wg_z);
}
215
// Fixed 16x8x1 work group (128 items); the grid shape is intentionally
// ignored.
int3 GetWorkGroupXY128Simple(const int3& grid) {
  return int3(16, 8, 1);
}
217
// Generic fast-mode work group: take the best z divider (preferring 8/4/2),
// then split the remaining budget between x (about half the grid width,
// rounded up) and y (capped at the grid height).
int3 GetWorkGroup(const int3& grid, int max_size) {
  const int wg_z = GetBiggestDividerWithPriority(grid.z, 8);
  const int xy_budget = max_size / wg_z;
  const int wg_x = std::min(DivideRoundUp(grid.x, 2), xy_budget);
  const int wg_y = std::min(xy_budget / wg_x, grid.y);
  return int3(wg_x, wg_y, wg_z);
}
225
// Fast-mode conv work group: take the largest z divider up to `max_z_size`,
// cap the xy budget at 256 items, fill x first then y.
int3 GetWorkGroupConv(const int3& grid, int max_size, int max_z_size) {
  const int wg_z = GetBiggestDivider(grid.z, max_z_size);
  const int xy_budget = std::min(256, max_size) / wg_z;
  const int wg_x = std::min(grid.x, xy_budget);
  int wg_y = std::min(xy_budget / wg_x, grid.y);
  // When y would cover the whole (even) grid height, halve it.
  if (wg_y == grid.y && grid.y % 2 == 0) {
    wg_y = grid.y / 2;
  }
  return int3(wg_x, wg_y, wg_z);
}
236
GetPossibleWorkGroupsXYMultipleOf(int multiplier,const GpuInfo & gpu_info,const KernelInfo & kernel_info,const int3 & grid,WorkGroupSizeAlignment z_alignment,std::vector<int3> * work_groups)237 void GetPossibleWorkGroupsXYMultipleOf(int multiplier, const GpuInfo& gpu_info,
238 const KernelInfo& kernel_info,
239 const int3& grid,
240 WorkGroupSizeAlignment z_alignment,
241 std::vector<int3>* work_groups) {
242 *work_groups = GenerateWorkGroupSizesXYMultipleOf(
243 multiplier, grid, kernel_info, gpu_info, z_alignment);
244 }
245
GetPossibleWorkGroupsXMultipleOf(int multiplier,const GpuInfo & gpu_info,const KernelInfo & kernel_info,const int3 & grid,WorkGroupSizeAlignment z_alignment,std::vector<int3> * work_groups)246 void GetPossibleWorkGroupsXMultipleOf(int multiplier, const GpuInfo& gpu_info,
247 const KernelInfo& kernel_info,
248 const int3& grid,
249 WorkGroupSizeAlignment z_alignment,
250 std::vector<int3>* work_groups) {
251 *work_groups = GenerateWorkGroupSizesXMultipleOf(
252 multiplier, grid, kernel_info, gpu_info, z_alignment);
253 }
254
XY128RequiresMoreWorkGroupsThenXY128Linear(int width,int height)255 bool XY128RequiresMoreWorkGroupsThenXY128Linear(int width, int height) {
256 int planar_work_groups = DivideRoundUp(width * height, 128);
257 auto base_work_groups = Get2DWorkgroupsEqualTo128();
258 bool have_equal_work_groups = false;
259 for (auto& work_group : base_work_groups) {
260 int x_groups = DivideRoundUp(width, work_group.x);
261 int y_groups = DivideRoundUp(height, work_group.y);
262 int xy_groups = x_groups * y_groups;
263 if (xy_groups == planar_work_groups) {
264 have_equal_work_groups = true;
265 break;
266 }
267 }
268 return !have_equal_work_groups;
269 }
270
GetPossibleWorkGroups(TuningType tuning_type,const GpuInfo & gpu_info,const KernelInfo & kernel_info,const int3 & grid,std::vector<int3> * work_groups)271 void GetPossibleWorkGroups(TuningType tuning_type, const GpuInfo& gpu_info,
272 const KernelInfo& kernel_info, const int3& grid,
273 std::vector<int3>* work_groups) {
274 if (gpu_info.IsApple()) {
275 work_groups->push_back(GetWorkGroupSizeForApple(grid));
276 return;
277 }
278 switch (tuning_type) {
279 case TuningType::kFast:
280 work_groups->push_back(
281 GetWorkGroup(grid, kernel_info.max_work_group_size));
282 return;
283 case TuningType::kExhaustive: {
284 GetWorkGroupsAlignedToGrid(gpu_info, kernel_info, grid, work_groups);
285 return;
286 }
287 default:
288 work_groups->push_back({8, 4, 1});
289 return;
290 }
291 }
292
GetPossibleWorkGroupsConv(TuningType tuning_type,const GpuInfo & gpu_info,const KernelInfo & kernel_info,const int3 & grid,std::vector<int3> * work_groups)293 void GetPossibleWorkGroupsConv(TuningType tuning_type, const GpuInfo& gpu_info,
294 const KernelInfo& kernel_info, const int3& grid,
295 std::vector<int3>* work_groups) {
296 if (gpu_info.IsApple()) {
297 work_groups->push_back(GetWorkGroupSizeForApple(grid));
298 return;
299 }
300 switch (tuning_type) {
301 case TuningType::kFast: {
302 int max_z_size = 16;
303 if (gpu_info.IsAdreno()) {
304 max_z_size = gpu_info.adreno_info.IsAdreno3xx() ? 16 : 64;
305 }
306 max_z_size = std::min(max_z_size, gpu_info.GetMaxWorkGroupSizeForZ());
307 work_groups->push_back(
308 GetWorkGroupConv(grid, kernel_info.max_work_group_size, max_z_size));
309 return;
310 }
311 case TuningType::kExhaustive: {
312 GetWorkGroupsAlignedToGrid(gpu_info, kernel_info, grid, work_groups);
313 return;
314 }
315 default:
316 work_groups->push_back({8, 4, 1});
317 return;
318 }
319 }
320
GetFirstSuitableWorkGroup(const std::vector<int3> & wgs,int max_wg_size)321 int3 GetFirstSuitableWorkGroup(const std::vector<int3>& wgs, int max_wg_size) {
322 for (const auto& wg : wgs) {
323 const int wg_size = wg.x * wg.y * wg.z;
324 if (wg_size <= max_wg_size) {
325 return wg;
326 }
327 }
328 return {1, 1, 1};
329 }
330
331 } // namespace gpu
332 } // namespace tflite
333