/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"

#include <array>
#include <cstdint>
#include <limits>
#include <map>
#include <string>
#include <utility>
#include <vector>

#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"

namespace tflite {
namespace gpu {
namespace cl {

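// RAII wrapper around a cl_command_queue. The underlying queue is released in
// the destructor only when the wrapper was constructed with
// has_ownership == true.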
CLCommandQueue::CLCommandQueue(cl_command_queue queue, bool has_ownership)
    : queue_(queue), has_ownership_(has_ownership) {}

CLCommandQueue::CLCommandQueue(CLCommandQueue&& queue)
    : queue_(queue.queue_), has_ownership_(queue.has_ownership_) {
  queue.queue_ = nullptr;
}

CLCommandQueue& CLCommandQueue::operator=(CLCommandQueue&& queue) {
  if (this != &queue) {
    Release();
    std::swap(queue_, queue.queue_);
    has_ownership_ = queue.has_ownership_;
  }
  return *this;
}

CLCommandQueue::~CLCommandQueue() { Release(); }

void CLCommandQueue::Release() {
  if (has_ownership_ && queue_) {
    clReleaseCommandQueue(queue_);
    queue_ = nullptr;
  }
}

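// Enqueues `kernel` with a global size of work_groups_count * work_group_size
// in each dimension. If `event` is non-null, it receives the event associated
// with the dispatch.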
absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel,
                                      const int3& work_groups_count,
                                      const int3& work_group_size,
                                      CLEvent* event) {
  std::array<size_t, 3> local;
  std::array<size_t, 3> global;
  for (int i = 0; i < 3; ++i) {
    local[i] = work_group_size[i];
    global[i] = work_groups_count[i] * work_group_size[i];
  }
  cl_event resulting_event;
  const int error_code = clEnqueueNDRangeKernel(
      queue_, kernel.kernel(), 3, nullptr, global.data(), local.data(), 0,
      nullptr, event ? &resulting_event : nullptr);
  if (event) {
    *event = CLEvent(resulting_event);
  }
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
        absl::StrCat("Failed to clEnqueueNDRangeKernel - ",
                     CLErrorCodeToString(error_code)));
  }
  return absl::OkStatus();
}

absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel,
                                      const int3& work_groups_count,
                                      const int3& work_group_size) {
  return Dispatch(kernel, work_groups_count, work_group_size, nullptr);
}

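// Enqueues a marker into the command queue and returns its event via `event`.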
absl::Status CLCommandQueue::EnqueueEvent(CLEvent* event) {
  cl_event resulting_event;
  const int error_code = clEnqueueMarker(queue_, &resulting_event);
  *event = CLEvent(resulting_event);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(absl::StrCat("Failed to clEnqueueMarker - ",
                                           CLErrorCodeToString(error_code)));
  }
  return absl::OkStatus();
}

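// The EnqueueWriteImage/EnqueueReadImage and EnqueueWriteBuffer/
// EnqueueReadBuffer helpers below use blocking transfers (CL_TRUE), so they
// return only after the copy has completed.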
absl::Status CLCommandQueue::EnqueueWriteImage(cl_mem memory, int3 region,
                                               const void* data) {
  const size_t origin[] = {0, 0, 0};
  const size_t r[] = {static_cast<size_t>(region.x),
                      static_cast<size_t>(region.y),
                      static_cast<size_t>(region.z)};
  auto error_code = clEnqueueWriteImage(queue_, memory, CL_TRUE, origin, r, 0,
                                        0, data, 0, nullptr, nullptr);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
        absl::StrCat("Failed to upload data to GPU (clEnqueueWriteImage) - ",
                     CLErrorCodeToString(error_code)));
  }

  return absl::OkStatus();
}

absl::Status CLCommandQueue::EnqueueReadImage(cl_mem memory, int3 region,
                                              void* data) {
  const size_t origin[] = {0, 0, 0};
  const size_t r[] = {static_cast<size_t>(region.x),
                      static_cast<size_t>(region.y),
                      static_cast<size_t>(region.z)};
  auto error_code = clEnqueueReadImage(queue_, memory, CL_TRUE, origin, r, 0, 0,
                                       data, 0, nullptr, nullptr);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
        absl::StrCat("Failed to read data from GPU (clEnqueueReadImage) - ",
                     CLErrorCodeToString(error_code)));
  }

  return absl::OkStatus();
}

absl::Status CLCommandQueue::EnqueueWriteBuffer(cl_mem memory,
                                                size_t size_in_bytes,
                                                const void* data) {
  auto error_code = clEnqueueWriteBuffer(
      queue_, memory, CL_TRUE, 0, size_in_bytes, data, 0, nullptr, nullptr);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
        absl::StrCat("Failed to upload data to GPU (clEnqueueWriteBuffer) - ",
                     CLErrorCodeToString(error_code)));
  }
  return absl::OkStatus();
}

absl::Status CLCommandQueue::EnqueueReadBuffer(cl_mem memory,
                                               size_t size_in_bytes,
                                               void* data) {
  auto error_code = clEnqueueReadBuffer(
      queue_, memory, CL_TRUE, 0, size_in_bytes, data, 0, nullptr, nullptr);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
        absl::StrCat("Failed to read data from GPU (clEnqueueReadBuffer) - ",
                     CLErrorCodeToString(error_code)));
  }
  return absl::OkStatus();
}

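// Blocks until all previously enqueued commands have finished (clFinish).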
absl::Status CLCommandQueue::WaitForCompletion() {
  auto error_code = clFinish(queue_);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
        absl::StrCat("Failed to clFinish - ", CLErrorCodeToString(error_code)));
  }
  return absl::OkStatus();
}

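// ProfilingCommandQueue stores a CLEvent per dispatch so that per-kernel GPU
// timings can be queried after execution.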
ProfilingCommandQueue::ProfilingCommandQueue(cl_command_queue queue)
    : CLCommandQueue(queue, true) {
  events_.reserve(128);
}

ProfilingCommandQueue::ProfilingCommandQueue(ProfilingCommandQueue&& queue)
    : CLCommandQueue(std::move(queue)),
      events_(std::move(queue.events_)),
      current_label_(std::move(queue.current_label_)) {}

ProfilingCommandQueue& ProfilingCommandQueue::operator=(
    ProfilingCommandQueue&& queue) {
  if (this != &queue) {
    events_ = std::move(queue.events_);
    current_label_ = std::move(queue.current_label_);
    CLCommandQueue::operator=(std::move(queue));
  }
  return *this;
}

void ProfilingCommandQueue::SetEventsLabel(const std::string& name) {
  current_label_ = name;
}

void ProfilingCommandQueue::ResetMeasurements() { events_.clear(); }

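// Dispatches the kernel and records its event, tagged with the current label.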
absl::Status ProfilingCommandQueue::Dispatch(const CLKernel& kernel,
                                             const int3& work_groups_count,
                                             const int3& work_group_size) {
  events_.push_back(CLEvent());
  RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
                                           work_group_size,
                                           &events_[events_.size() - 1]));
  events_.back().SetName(current_label_);
  return absl::OkStatus();
}

ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
  ProfilingInfo result;
  result.dispatches.resize(events_.size());
  for (int i = 0; i < events_.size(); ++i) {
    result.dispatches[i].label = events_[i].GetName();
    result.dispatches[i].duration =
        absl::Nanoseconds(events_[i].GetEventTimeNs());
  }
  return result;
}

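// Benchmarks every candidate work group size by dispatching the kernel once
// per candidate and writes the index of the fastest one to `index`.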
absl::Status ProfilingCommandQueue::GetBestWorkGroupIndex(
    const CLKernel& kernel, const GpuInfo& gpu_info,
    const std::vector<int3>& work_groups_count,
    const std::vector<int3>& work_group_sizes, int* index) {
  // Some Adreno 3xx GPUs can report incorrect timings for some events.
  const bool possible_bug_with_events =
      gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx();
  events_.resize(work_group_sizes.size());
  for (int i = 0; i < work_group_sizes.size(); ++i) {
    RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count[i],
                                             work_group_sizes[i], &events_[i]));

    // Periodically wait on an earlier event to slow down memory growth (a
    // known leak) on Mali for some kernels.
    if (gpu_info.IsMali() && i % 8 == 7) {
      events_[i - 7].Wait();
    }
    if (possible_bug_with_events) {
      // Serialize dispatches to increase the probability of correct timings.
      RETURN_IF_ERROR(WaitForCompletion());
    }
  }

  RETURN_IF_ERROR(WaitForCompletion());

  // Re-initialize the kernel to release memory held by the kernel pool on
  // Mali.
  if (gpu_info.IsMali()) {
    RETURN_IF_ERROR(kernel.ReInit());
  }

  int minimum_index = 0;
  double minimum_time = std::numeric_limits<double>::max();
  if (possible_bug_with_events) {  // Try to filter out suspicious timings.
    double average_time = 0.0;
    int average_samples_count = 0;
    for (int i = 0; i < work_group_sizes.size(); ++i) {
      if (events_[i].GetEventTimeMs() < 100 * 1000) {  // 100 sec
        average_time += events_[i].GetEventTimeMs();
        average_samples_count++;
      }
    }
    average_time /= average_samples_count;
    for (int i = 0; i < work_group_sizes.size(); ++i) {
      double time = events_[i].GetEventTimeMs();
      if (time < minimum_time && time >= 0.1 * average_time) {
        minimum_index = i;
        minimum_time = time;
      }
    }
  } else {
    for (int i = 0; i < work_group_sizes.size(); ++i) {
      double time = events_[i].GetEventTimeMs();
      if (time < minimum_time) {
        minimum_index = i;
        minimum_time = time;
      }
    }
  }

  *index = minimum_index;

  return absl::OkStatus();
}

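// Creates a default in-order command queue without profiling enabled.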
absl::Status CreateCLCommandQueue(const CLDevice& device,
                                  const CLContext& context,
                                  CLCommandQueue* result) {
  int error_code;
  cl_command_queue queue =
      clCreateCommandQueue(context.context(), device.id(), 0, &error_code);
  if (!queue) {
    return absl::UnknownError(
        absl::StrCat("Failed to create a command queue - ",
                     CLErrorCodeToString(error_code)));
  }
  *result = CLCommandQueue(queue, true);
  return absl::OkStatus();
}

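// Wall-clock time between the start of the first recorded event and the end
// of the last one, in milliseconds.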
double ProfilingCommandQueue::GetQueueExecutionTimeMs() const {
  const uint64_t start = events_.front().GetStartedTimeNs();
  const uint64_t end = events_.back().GetFinishedTimeNs();
  const uint64_t time_ns = (end - start);

  return static_cast<double>(time_ns) / 1000000.0;
}

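// Sum of the individual event durations, in milliseconds. Unlike
// GetQueueExecutionTimeMs, this ignores gaps between dispatches.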
double ProfilingCommandQueue::GetSumOfEventsTimeMs() const {
  double sum = 0.0;
  for (int i = 0; i < events_.size(); ++i) {
    sum += events_[i].GetEventTimeMs();
  }
  return sum;
}

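// Creates a command queue with CL_QUEUE_PROFILING_ENABLE so that per-event
// timings are available.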
absl::Status CreateProfilingCommandQueue(const CLDevice& device,
                                         const CLContext& context,
                                         ProfilingCommandQueue* result) {
  int error_code;
  cl_command_queue queue = clCreateCommandQueue(
      context.context(), device.id(), CL_QUEUE_PROFILING_ENABLE, &error_code);
  if (!queue) {
    return absl::UnknownError(
        absl::StrCat("Failed to create a command queue - ",
                     CLErrorCodeToString(error_code)));
  }

  *result = ProfilingCommandQueue(queue);
  return absl::OkStatus();
}

}  // namespace cl
}  // namespace gpu
}  // namespace tflite