1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/lite/delegates/gpu/gl/workgroups/ideal_workgroup_picker.h"
17
18 #include <map>
19 #include <vector>
20
21 #include "tensorflow/lite/delegates/gpu/common/gpu_info.h"
22 #include "tensorflow/lite/delegates/gpu/common/operations.h"
23 #include "tensorflow/lite/delegates/gpu/common/shape.h"
24 #include "tensorflow/lite/delegates/gpu/common/types.h"
25 #include "tensorflow/lite/delegates/gpu/gl/workgroups/calculator.h"
26
27 namespace tflite {
28 namespace gpu {
29 namespace gl {
30 namespace {
31
// This code employs the results of the workgroup performance research
// (b/117291356).
34
// Describes the ideal workgroup for a specific operation case.
// "Case" here means a specific "kernel + strides" combination for a specific
// operation type; the sizes of the input and output tensors can be anything.
38 struct IdealByCase {
ParamsAcceptedtflite::gpu::gl::__anon054730a20111::IdealByCase39 bool ParamsAccepted(OperationType in_op_type, HW in_kernel,
40 HW in_strides) const {
41 return operation_type == in_op_type && kernel == in_kernel &&
42 strides == in_strides;
43 }
44 OperationType operation_type;
45 HW kernel;
46 HW strides;
47 uint3 ideal_workgroup;
48 };
49
// Describes the ideal workgroup for an operation type. It means that any
// configuration of an operation of this type is expected to run within the
// top 10% of performance on the particular GPU.
53 struct IdealByType {
ParamsAcceptedtflite::gpu::gl::__anon054730a20111::IdealByType54 bool ParamsAccepted(OperationType in_op_type) const {
55 return operation_type == in_op_type;
56 }
57 OperationType operation_type;
58 uint3 ideal_workgroup;
59 };
60
61 // Describes ideal workgroups for the particular GPU model.
62 struct IdealWorkgroups {
63 std::vector<IdealByType> by_type;
64 std::vector<IdealByCase> by_case;
65 };
66
// List of ideal workgroups obtained from the research mentioned above.
69
70 // Ideal workgroups for Adreno 630.
71 std::vector<IdealByType>* kIdealByTypeAdreno630Ptr =
72 new std::vector<IdealByType>{
73 {OperationType::CONVOLUTION_2D, uint3(4, 8, 4)},
74 {OperationType::DEPTHWISE_CONVOLUTION, uint3(4, 4, 8)},
75 };
76
77 std::vector<IdealByCase>* kIdealByCaseAdreno630Ptr =
78 new std::vector<IdealByCase>{
79 {OperationType::CONVOLUTION_2D, HW(1, 1), HW(1, 1), uint3(4, 8, 4)},
80 {OperationType::CONVOLUTION_2D, HW(3, 3), HW(2, 2), uint3(8, 4, 4)},
81 {OperationType::DEPTHWISE_CONVOLUTION, HW(1, 1), HW(1, 1),
82 uint3(8, 4, 4)},
83 {OperationType::DEPTHWISE_CONVOLUTION, HW(3, 3), HW(2, 2),
84 uint3(4, 4, 4)},
85 };
86
87 // Ideal workgroups for Adreno 540.
88 std::vector<IdealByType>* kIdealByTypeAdreno540Ptr =
89 new std::vector<IdealByType>{
90 {OperationType::CONVOLUTION_2D, uint3(8, 2, 2)},
91 {OperationType::DEPTHWISE_CONVOLUTION, uint3(8, 8, 2)},
92 };
93
94 std::vector<IdealByCase>* kIdealByCaseAdreno540Ptr =
95 new std::vector<IdealByCase>{
96 {OperationType::CONVOLUTION_2D, HW(1, 1), HW(1, 1), uint3(4, 2, 8)},
97 {OperationType::CONVOLUTION_2D, HW(3, 3), HW(2, 2), uint3(8, 2, 8)},
98 {OperationType::DEPTHWISE_CONVOLUTION, HW(1, 1), HW(1, 1),
99 uint3(8, 4, 8)},
100 {OperationType::DEPTHWISE_CONVOLUTION, HW(3, 3), HW(2, 2),
101 uint3(4, 4, 8)},
102 };
103
104 // Ideal workgroups for Adreno 510.
105 std::vector<IdealByType>* kIdealByTypeAdreno510Ptr =
106 new std::vector<IdealByType>{
107 {OperationType::CONVOLUTION_2D, uint3(8, 4, 4)},
108 {OperationType::DEPTHWISE_CONVOLUTION, uint3(8, 4, 4)},
109 };
110
111 std::vector<IdealByCase>* kIdealByCaseAdreno510Ptr =
112 new std::vector<IdealByCase>{
113 {OperationType::CONVOLUTION_2D, HW(1, 1), HW(1, 1), uint3(4, 2, 8)},
114 {OperationType::CONVOLUTION_2D, HW(3, 3), HW(2, 2), uint3(8, 2, 8)},
115 {OperationType::DEPTHWISE_CONVOLUTION, HW(1, 1), HW(1, 1),
116 uint3(8, 4, 8)},
117 {OperationType::DEPTHWISE_CONVOLUTION, HW(3, 3), HW(2, 2),
118 uint3(4, 4, 8)},
119 };
120
121 // Ideal workgroups for Adreno 509.
122 std::vector<IdealByType>* kIdealByTypeAdreno509Ptr =
123 new std::vector<IdealByType>{
124 {OperationType::CONVOLUTION_2D, uint3(8, 4, 8)},
125 {OperationType::DEPTHWISE_CONVOLUTION, uint3(8, 8, 2)},
126 };
127
128 // Ideal workgroups for Adreno 508, 506, 505, 418, 405
129 std::vector<IdealByType>* kIdealByTypeAdreno508Ptr =
130 new std::vector<IdealByType>{
131 {OperationType::CONVOLUTION_2D, uint3(8, 4, 8)},
132 {OperationType::DEPTHWISE_CONVOLUTION, uint3(8, 4, 8)},
133 };
134 std::vector<IdealByType>* kIdealByTypeAdreno506Ptr = kIdealByTypeAdreno508Ptr;
135 std::vector<IdealByType>* kIdealByTypeAdreno505Ptr = kIdealByTypeAdreno508Ptr;
136 std::vector<IdealByType>* kIdealByTypeAdreno418Ptr = kIdealByTypeAdreno508Ptr;
137 std::vector<IdealByType>* kIdealByTypeAdreno405Ptr = kIdealByTypeAdreno508Ptr;
138
139 // Put all ideal workgroups from the list together.
140 const std::map<AdrenoGpu, IdealWorkgroups>* kIdealAdrenoWorkgroupsInfoPtr =
141 new std::map<AdrenoGpu, IdealWorkgroups>{
142 {AdrenoGpu::kAdreno630,
143 {*kIdealByTypeAdreno630Ptr, *kIdealByCaseAdreno630Ptr}},
144 {AdrenoGpu::kAdreno540, {*kIdealByTypeAdreno540Ptr, {}}},
145 {AdrenoGpu::kAdreno510,
146 {*kIdealByTypeAdreno510Ptr, *kIdealByCaseAdreno510Ptr}},
147 {AdrenoGpu::kAdreno509, {*kIdealByTypeAdreno509Ptr, {}}},
148 {AdrenoGpu::kAdreno508, {*kIdealByTypeAdreno508Ptr, {}}},
149 {AdrenoGpu::kAdreno506, {*kIdealByTypeAdreno506Ptr, {}}},
150 {AdrenoGpu::kAdreno505, {*kIdealByTypeAdreno505Ptr, {}}},
151 {AdrenoGpu::kAdreno418, {*kIdealByTypeAdreno418Ptr, {}}},
152 {AdrenoGpu::kAdreno405, {*kIdealByTypeAdreno405Ptr, {}}},
153 };
154
155 } // namespace
156
GetIdealWorkgroupIfPossible(const GpuInfo & gpu_info,OperationType op_type,HW kernel,HW strides,uint3 default_wg,OHWI workload)157 uint3 GetIdealWorkgroupIfPossible(const GpuInfo& gpu_info,
158 OperationType op_type, HW kernel, HW strides,
159 uint3 default_wg, OHWI workload) {
160 // Research showed that ideal workgroup approach doesn't work well with
161 // convolutions, which have small amount of output channels or output
162 // height/width dimensions
163 if (workload.o < 32 || workload.h <= 5 || workload.w <= 5) return default_wg;
164
165 if (!gpu_info.IsAdreno()) {
166 return default_wg;
167 }
168 auto adreno_gpu_version = gpu_info.adreno_info.adreno_gpu;
169
170 // If GPU was investigated
171 if (!kIdealAdrenoWorkgroupsInfoPtr->count(adreno_gpu_version)) {
172 return default_wg;
173 }
174
175 // Try to find the ideal workgroup by the specific operation case, cause they
176 // are expected to be better tuned than default "by type" cases
177 for (const auto& specific_case :
178 kIdealAdrenoWorkgroupsInfoPtr->at(adreno_gpu_version).by_case) {
179 if (specific_case.ParamsAccepted(op_type, kernel, strides)) {
180 return specific_case.ideal_workgroup;
181 }
182 }
183
184 // Try to find the ideal workgroup by the operation type
185 for (const auto& default_case :
186 kIdealAdrenoWorkgroupsInfoPtr->at(adreno_gpu_version).by_type) {
187 if (default_case.ParamsAccepted(op_type)) {
188 return default_case.ideal_workgroup;
189 }
190 }
191
192 // If no ideal workgroup is found, use the default workgroup suggested by each
193 // operation.
194 return default_wg;
195 }
196
GetIdealWorkgroupIfPossible(const GpuInfo & gpu_info,OperationType op_type,HW kernel,HW strides,OHWI workload)197 uint3 GetIdealWorkgroupIfPossible(const GpuInfo& gpu_info,
198 OperationType op_type, HW kernel, HW strides,
199 OHWI workload) {
200 return GetIdealWorkgroupIfPossible(gpu_info, op_type, kernel, strides,
201 kEmptyWorkgroupSize, workload);
202 }
203
204 } // namespace gl
205 } // namespace gpu
206 } // namespace tflite
207