1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
17
#include <array>
#include <cstdint>
#include <limits>
#include <map>
#include <string>
#include <utility>
#include <vector>
23
24 #include "absl/strings/str_cat.h"
25 #include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
26 #include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
27 #include "tensorflow/lite/delegates/gpu/cl/util.h"
28 #include "tensorflow/lite/delegates/gpu/common/status.h"
29 #include "tensorflow/lite/delegates/gpu/common/types.h"
30
31 namespace tflite {
32 namespace gpu {
33 namespace cl {
34
CLCommandQueue(cl_command_queue queue,bool has_ownership)35 CLCommandQueue::CLCommandQueue(cl_command_queue queue, bool has_ownership)
36 : queue_(queue), has_ownership_(has_ownership) {}
37
CLCommandQueue(CLCommandQueue && queue)38 CLCommandQueue::CLCommandQueue(CLCommandQueue&& queue)
39 : queue_(queue.queue_), has_ownership_(queue.has_ownership_) {
40 queue.queue_ = nullptr;
41 }
42
operator =(CLCommandQueue && queue)43 CLCommandQueue& CLCommandQueue::operator=(CLCommandQueue&& queue) {
44 if (this != &queue) {
45 Release();
46 std::swap(queue_, queue.queue_);
47 has_ownership_ = queue.has_ownership_;
48 }
49 return *this;
50 }
51
~CLCommandQueue()52 CLCommandQueue::~CLCommandQueue() { Release(); }
53
Release()54 void CLCommandQueue::Release() {
55 if (has_ownership_ && queue_) {
56 clReleaseCommandQueue(queue_);
57 queue_ = nullptr;
58 }
59 }
60
Dispatch(const CLKernel & kernel,const int3 & work_groups_count,const int3 & work_group_size,CLEvent * event)61 absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel,
62 const int3& work_groups_count,
63 const int3& work_group_size,
64 CLEvent* event) {
65 std::array<size_t, 3> local;
66 std::array<size_t, 3> global;
67 for (int i = 0; i < 3; ++i) {
68 local[i] = work_group_size[i];
69 global[i] = work_groups_count[i] * work_group_size[i];
70 }
71 cl_event resulting_event;
72 const int error_code = clEnqueueNDRangeKernel(
73 queue_, kernel.kernel(), 3, nullptr, global.data(), local.data(), 0,
74 nullptr, event ? &resulting_event : nullptr);
75 if (event) {
76 *event = CLEvent(resulting_event);
77 }
78 if (error_code != CL_SUCCESS) {
79 return absl::UnknownError(
80 absl::StrCat("Failed to clEnqueueNDRangeKernel - ",
81 CLErrorCodeToString(error_code)));
82 }
83 return absl::OkStatus();
84 }
85
Dispatch(const CLKernel & kernel,const int3 & work_groups_count,const int3 & work_group_size)86 absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel,
87 const int3& work_groups_count,
88 const int3& work_group_size) {
89 return Dispatch(kernel, work_groups_count, work_group_size, nullptr);
90 }
91
EnqueueEvent(CLEvent * event)92 absl::Status CLCommandQueue::EnqueueEvent(CLEvent* event) {
93 cl_event resulting_event;
94 const int error_code = clEnqueueMarker(queue_, &resulting_event);
95 *event = CLEvent(resulting_event);
96 if (error_code != CL_SUCCESS) {
97 return absl::UnknownError(absl::StrCat("Failed to clEnqueueMarker - ",
98 CLErrorCodeToString(error_code)));
99 }
100 return absl::OkStatus();
101 }
102
EnqueueWriteImage(cl_mem memory,int3 region,const void * data,bool async)103 absl::Status CLCommandQueue::EnqueueWriteImage(cl_mem memory, int3 region,
104 const void* data, bool async) {
105 const size_t origin[] = {0, 0, 0};
106 const size_t r[] = {static_cast<size_t>(region.x),
107 static_cast<size_t>(region.y),
108 static_cast<size_t>(region.z)};
109 const cl_bool blocking = async ? CL_FALSE : CL_TRUE;
110 auto error_code = clEnqueueWriteImage(queue_, memory, blocking, origin, r, 0,
111 0, data, 0, nullptr, nullptr);
112 if (error_code != CL_SUCCESS) {
113 return absl::UnknownError(
114 absl::StrCat("Failed to upload data to GPU (clEnqueueWriteImage) - ",
115 CLErrorCodeToString(error_code)));
116 }
117
118 return absl::OkStatus();
119 }
120
EnqueueReadImage(cl_mem memory,int3 region,void * data,bool async)121 absl::Status CLCommandQueue::EnqueueReadImage(cl_mem memory, int3 region,
122 void* data, bool async) {
123 const size_t origin[] = {0, 0, 0};
124 const size_t r[] = {static_cast<size_t>(region.x),
125 static_cast<size_t>(region.y),
126 static_cast<size_t>(region.z)};
127 const cl_bool blocking = async ? CL_FALSE : CL_TRUE;
128 auto error_code = clEnqueueReadImage(queue_, memory, blocking, origin, r, 0,
129 0, data, 0, nullptr, nullptr);
130 if (error_code != CL_SUCCESS) {
131 return absl::UnknownError(
132 absl::StrCat("Failed to read data from GPU (clEnqueueReadImage) - ",
133 CLErrorCodeToString(error_code)));
134 }
135
136 return absl::OkStatus();
137 }
138
EnqueueWriteBuffer(cl_mem memory,size_t size_in_bytes,const void * data,bool async)139 absl::Status CLCommandQueue::EnqueueWriteBuffer(cl_mem memory,
140 size_t size_in_bytes,
141 const void* data, bool async) {
142 const cl_bool blocking = async ? CL_FALSE : CL_TRUE;
143 auto error_code = clEnqueueWriteBuffer(
144 queue_, memory, blocking, 0, size_in_bytes, data, 0, nullptr, nullptr);
145 if (error_code != CL_SUCCESS) {
146 return absl::UnknownError(
147 absl::StrCat("Failed to upload data to GPU (clEnqueueWriteBuffer) - ",
148 CLErrorCodeToString(error_code)));
149 }
150 return absl::OkStatus();
151 }
152
EnqueueReadBuffer(cl_mem memory,size_t size_in_bytes,void * data,bool async)153 absl::Status CLCommandQueue::EnqueueReadBuffer(cl_mem memory,
154 size_t size_in_bytes, void* data,
155 bool async) {
156 const cl_bool blocking = async ? CL_FALSE : CL_TRUE;
157 auto error_code = clEnqueueReadBuffer(
158 queue_, memory, blocking, 0, size_in_bytes, data, 0, nullptr, nullptr);
159 if (error_code != CL_SUCCESS) {
160 return absl::UnknownError(
161 absl::StrCat("Failed to read data from GPU (clEnqueueReadBuffer) - ",
162 CLErrorCodeToString(error_code)));
163 }
164 return absl::OkStatus();
165 }
166
WaitForCompletion()167 absl::Status CLCommandQueue::WaitForCompletion() {
168 auto error_code = clFinish(queue_);
169 if (error_code != CL_SUCCESS) {
170 return absl::UnknownError(
171 absl::StrCat("Failed to clFinish - ", CLErrorCodeToString(error_code)));
172 }
173 return absl::OkStatus();
174 }
175
ProfilingCommandQueue(cl_command_queue queue)176 ProfilingCommandQueue::ProfilingCommandQueue(cl_command_queue queue)
177 : CLCommandQueue(queue, true) {
178 events_.reserve(128);
179 }
180
ProfilingCommandQueue(ProfilingCommandQueue && queue)181 ProfilingCommandQueue::ProfilingCommandQueue(ProfilingCommandQueue&& queue)
182 : CLCommandQueue(std::move(queue)),
183 events_(std::move(queue.events_)),
184 number_of_dispatches_(std::move(queue.number_of_dispatches_)),
185 current_label_(std::move(queue.current_label_)) {}
186
operator =(ProfilingCommandQueue && queue)187 ProfilingCommandQueue& ProfilingCommandQueue::operator=(
188 ProfilingCommandQueue&& queue) {
189 if (this != &queue) {
190 events_ = std::move(queue.events_);
191 number_of_dispatches_ = std::move(queue.number_of_dispatches_);
192 current_label_ = std::move(queue.current_label_);
193 CLCommandQueue::operator=(std::move(queue));
194 }
195 return *this;
196 }
197
SetEventsLabel(const std::string & name)198 void ProfilingCommandQueue::SetEventsLabel(const std::string& name) {
199 current_label_ = name;
200 }
201
ResetMeasurements()202 void ProfilingCommandQueue::ResetMeasurements() {
203 events_.clear();
204 number_of_dispatches_.clear();
205 }
206
Dispatch(const CLKernel & kernel,const int3 & work_groups_count,const int3 & work_group_size)207 absl::Status ProfilingCommandQueue::Dispatch(const CLKernel& kernel,
208 const int3& work_groups_count,
209 const int3& work_group_size) {
210 events_.push_back(CLEvent());
211 number_of_dispatches_.push_back(1);
212 RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
213 work_group_size,
214 &events_[events_.size() - 1]));
215 events_.back().SetName(current_label_);
216 return absl::OkStatus();
217 }
218
DispatchNTimes(const CLKernel & kernel,const int3 & work_groups_count,const int3 & work_group_size,int n,int flush_period)219 absl::Status ProfilingCommandQueue::DispatchNTimes(
220 const CLKernel& kernel, const int3& work_groups_count,
221 const int3& work_group_size, int n, int flush_period) {
222 number_of_dispatches_.push_back(n);
223 if (n == 1) {
224 events_.push_back(CLEvent());
225 RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
226 work_group_size,
227 &events_[events_.size() - 1]));
228 events_.back().SetName(current_label_);
229 } else {
230 events_.push_back(CLEvent());
231 events_.push_back(CLEvent());
232 RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
233 work_group_size,
234 &events_[events_.size() - 2]));
235 for (int i = 1; i < n - 1; ++i) {
236 RETURN_IF_ERROR(
237 CLCommandQueue::Dispatch(kernel, work_groups_count, work_group_size));
238 if (flush_period && i % flush_period == 0) {
239 clFlush(queue_);
240 }
241 }
242 RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
243 work_group_size,
244 &events_[events_.size() - 1]));
245 clFlush(queue_);
246 events_[events_.size() - 2].SetName(current_label_);
247 events_[events_.size() - 1].SetName(current_label_);
248 }
249 return absl::OkStatus();
250 }
251
GetProfilingInfo() const252 ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
253 ProfilingInfo result;
254 result.dispatches.resize(number_of_dispatches_.size());
255 int events_counter = 0;
256 for (int i = 0; i < number_of_dispatches_.size(); ++i) {
257 result.dispatches[i].label = events_[events_counter].GetName();
258 if (number_of_dispatches_[i] == 1) {
259 result.dispatches[i].duration =
260 absl::Nanoseconds(events_[events_counter].GetEventTimeNs());
261 events_counter += 1;
262 } else {
263 result.dispatches[i].duration =
264 absl::Nanoseconds(events_[events_counter + 1].GetFinishedTimeNs() -
265 events_[events_counter].GetStartedTimeNs()) /
266 number_of_dispatches_[i];
267 events_counter += 2;
268 }
269 }
270 return result;
271 }
272
GetBestWorkGroupIndex(const CLKernel & kernel,const GpuInfo & gpu_info,const std::vector<int3> & work_groups_count,const std::vector<int3> & work_group_sizes,int * index)273 absl::Status ProfilingCommandQueue::GetBestWorkGroupIndex(
274 const CLKernel& kernel, const GpuInfo& gpu_info,
275 const std::vector<int3>& work_groups_count,
276 const std::vector<int3>& work_group_sizes, int* index) {
277 // Some Adreno 3xx can have wrong numbers for some events
278 const bool possible_bug_with_events =
279 gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx();
280 events_.resize(work_group_sizes.size());
281 for (int i = 0; i < work_group_sizes.size(); ++i) {
282 RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count[i],
283 work_group_sizes[i], &events_[i]));
284
285 // reducing the speed of memory leak on Mali for some kernels
286 if (gpu_info.IsMali() && i % 8 == 7) {
287 events_[i - 7].Wait();
288 }
289 if (possible_bug_with_events) {
290 // We are trying to increase probability for correct result.
291 RETURN_IF_ERROR(WaitForCompletion());
292 }
293 }
294
295 RETURN_IF_ERROR(WaitForCompletion());
296
297 // To release memory of some kernel pool on Mali.
298 if (gpu_info.IsMali()) {
299 RETURN_IF_ERROR(kernel.ReInit());
300 }
301
302 int minimum_index = 0;
303 double minimum_time = std::numeric_limits<double>::max();
304 if (possible_bug_with_events) { // we will try to cut out suspicious results
305 double average_time = 0.0;
306 int average_samples_count = 0;
307 for (int i = 0; i < work_group_sizes.size(); ++i) {
308 if (events_[i].GetEventTimeMs() < 100 * 1000) { // 100 sec
309 average_time += events_[i].GetEventTimeMs();
310 average_samples_count++;
311 }
312 }
313 average_time /= average_samples_count;
314 for (int i = 0; i < work_group_sizes.size(); ++i) {
315 double time = events_[i].GetEventTimeMs();
316 if (time < minimum_time && time >= 0.1 * average_time) {
317 minimum_index = i;
318 minimum_time = time;
319 }
320 }
321 } else {
322 for (int i = 0; i < work_group_sizes.size(); ++i) {
323 double time = events_[i].GetEventTimeMs();
324 if (time < minimum_time) {
325 minimum_index = i;
326 minimum_time = time;
327 }
328 }
329 }
330
331 *index = minimum_index;
332
333 return absl::OkStatus();
334 }
335
CreateCLCommandQueue(const CLDevice & device,const CLContext & context,CLCommandQueue * result)336 absl::Status CreateCLCommandQueue(const CLDevice& device,
337 const CLContext& context,
338 CLCommandQueue* result) {
339 int error_code;
340 cl_command_queue queue =
341 clCreateCommandQueue(context.context(), device.id(), 0, &error_code);
342 if (!queue) {
343 return absl::UnknownError(
344 absl::StrCat("Failed to create a command queue - ",
345 CLErrorCodeToString(error_code)));
346 }
347 *result = CLCommandQueue(queue, true);
348 return absl::OkStatus();
349 }
350
GetQueueExecutionTimeMs() const351 double ProfilingCommandQueue::GetQueueExecutionTimeMs() const {
352 const uint64_t start = events_.front().GetStartedTimeNs();
353 const uint64_t end = events_.back().GetFinishedTimeNs();
354 const uint64_t time_ns = (end - start);
355
356 return static_cast<double>(time_ns) / 1000000.0;
357 }
358
GetSumOfEventsTimeMs() const359 double ProfilingCommandQueue::GetSumOfEventsTimeMs() const {
360 double sum = 0.0;
361 for (int i = 0; i < events_.size(); ++i) {
362 sum += events_[i].GetEventTimeMs();
363 }
364 return sum;
365 }
366
CreateProfilingCommandQueue(const CLDevice & device,const CLContext & context,ProfilingCommandQueue * result)367 absl::Status CreateProfilingCommandQueue(const CLDevice& device,
368 const CLContext& context,
369 ProfilingCommandQueue* result) {
370 int error_code;
371 cl_command_queue queue = clCreateCommandQueue(
372 context.context(), device.id(), CL_QUEUE_PROFILING_ENABLE, &error_code);
373 if (!queue) {
374 return absl::UnknownError(
375 absl::StrCat("Failed to create a command queue - ",
376 CLErrorCodeToString(error_code)));
377 }
378
379 *result = ProfilingCommandQueue(queue);
380 return absl::OkStatus();
381 }
382
383 } // namespace cl
384 } // namespace gpu
385 } // namespace tflite
386