/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"

#include <array>
#include <cstdint>
#include <limits>
#include <map>
#include <string>
#include <utility>
#include <vector>

#include "absl/strings/str_cat.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
#include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
#include "tensorflow/lite/delegates/gpu/cl/util.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"

namespace tflite {
namespace gpu {
namespace cl {

CLCommandQueue::CLCommandQueue(cl_command_queue queue, bool has_ownership)
    : queue_(queue), has_ownership_(has_ownership) {}

CLCommandQueue::CLCommandQueue(CLCommandQueue&& queue)
    : queue_(queue.queue_), has_ownership_(queue.has_ownership_) {
  queue.queue_ = nullptr;
}

CLCommandQueue& CLCommandQueue::operator=(CLCommandQueue&& queue) {
  if (this != &queue) {
    Release();
    std::swap(queue_, queue.queue_);
    has_ownership_ = queue.has_ownership_;
  }
  return *this;
}

CLCommandQueue::~CLCommandQueue() { Release(); }

void CLCommandQueue::Release() {
  if (has_ownership_ && queue_) {
    clReleaseCommandQueue(queue_);
    queue_ = nullptr;
  }
}

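// Enqueues `kernel` for execution. The global NDRange is computed per
// dimension as work_groups_count * work_group_size, i.e. the caller passes
// the number of work groups rather than the total number of work items.
// If `event` is non-null, it receives an event that tracks the dispatch.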
absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel,
                                      const int3& work_groups_count,
                                      const int3& work_group_size,
                                      CLEvent* event) {
  std::array<size_t, 3> local;
  std::array<size_t, 3> global;
  for (int i = 0; i < 3; ++i) {
    local[i] = work_group_size[i];
    global[i] = work_groups_count[i] * work_group_size[i];
  }
  cl_event resulting_event;
  const int error_code = clEnqueueNDRangeKernel(
      queue_, kernel.kernel(), 3, nullptr, global.data(), local.data(), 0,
      nullptr, event ? &resulting_event : nullptr);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
        absl::StrCat("Failed to clEnqueueNDRangeKernel - ",
                     CLErrorCodeToString(error_code)));
  }
  // Only hand the event back on success; on failure resulting_event is not a
  // valid cl_event.
  if (event) {
    *event = CLEvent(resulting_event);
  }
  return absl::OkStatus();
}

absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel,
                                      const int3& work_groups_count,
                                      const int3& work_group_size) {
  return Dispatch(kernel, work_groups_count, work_group_size, nullptr);
}

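// Enqueues a marker into the queue. The returned event completes once all
// previously enqueued commands have finished, which makes it usable for
// synchronization and timing.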
absl::Status CLCommandQueue::EnqueueEvent(CLEvent* event) {
  cl_event resulting_event;
  const int error_code = clEnqueueMarker(queue_, &resulting_event);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(absl::StrCat("Failed to clEnqueueMarker - ",
                                           CLErrorCodeToString(error_code)));
  }
  *event = CLEvent(resulting_event);
  return absl::OkStatus();
}

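// Copies `region` texels of host data into an image object. With async=false
// the call blocks until the copy has finished; with async=true it returns
// immediately, and `data` must stay valid until the queue has processed the
// write (e.g. after WaitForCompletion()).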
absl::Status CLCommandQueue::EnqueueWriteImage(cl_mem memory, int3 region,
                                               const void* data, bool async) {
  const size_t origin[] = {0, 0, 0};
  const size_t r[] = {static_cast<size_t>(region.x),
                      static_cast<size_t>(region.y),
                      static_cast<size_t>(region.z)};
  const cl_bool blocking = async ? CL_FALSE : CL_TRUE;
  auto error_code = clEnqueueWriteImage(queue_, memory, blocking, origin, r, 0,
                                        0, data, 0, nullptr, nullptr);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
        absl::StrCat("Failed to upload data to GPU (clEnqueueWriteImage) - ",
                     CLErrorCodeToString(error_code)));
  }

  return absl::OkStatus();
}

absl::Status CLCommandQueue::EnqueueReadImage(cl_mem memory, int3 region,
                                              void* data, bool async) {
  const size_t origin[] = {0, 0, 0};
  const size_t r[] = {static_cast<size_t>(region.x),
                      static_cast<size_t>(region.y),
                      static_cast<size_t>(region.z)};
  const cl_bool blocking = async ? CL_FALSE : CL_TRUE;
  auto error_code = clEnqueueReadImage(queue_, memory, blocking, origin, r, 0,
                                       0, data, 0, nullptr, nullptr);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
        absl::StrCat("Failed to read data from GPU (clEnqueueReadImage) - ",
                     CLErrorCodeToString(error_code)));
  }

  return absl::OkStatus();
}

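// Buffer transfers follow the same convention as the image transfers above:
// async=false maps to a blocking OpenCL call, async=true enqueues the copy
// and returns without waiting.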
absl::Status CLCommandQueue::EnqueueWriteBuffer(cl_mem memory,
                                                size_t size_in_bytes,
                                                const void* data, bool async) {
  const cl_bool blocking = async ? CL_FALSE : CL_TRUE;
  auto error_code = clEnqueueWriteBuffer(
      queue_, memory, blocking, 0, size_in_bytes, data, 0, nullptr, nullptr);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
        absl::StrCat("Failed to upload data to GPU (clEnqueueWriteBuffer) - ",
                     CLErrorCodeToString(error_code)));
  }
  return absl::OkStatus();
}

absl::Status CLCommandQueue::EnqueueReadBuffer(cl_mem memory,
                                               size_t size_in_bytes, void* data,
                                               bool async) {
  const cl_bool blocking = async ? CL_FALSE : CL_TRUE;
  auto error_code = clEnqueueReadBuffer(
      queue_, memory, blocking, 0, size_in_bytes, data, 0, nullptr, nullptr);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
        absl::StrCat("Failed to read data from GPU (clEnqueueReadBuffer) - ",
                     CLErrorCodeToString(error_code)));
  }
  return absl::OkStatus();
}

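// Blocks the calling thread until every command enqueued so far has finished
// (thin wrapper around clFinish).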
absl::Status CLCommandQueue::WaitForCompletion() {
  auto error_code = clFinish(queue_);
  if (error_code != CL_SUCCESS) {
    return absl::UnknownError(
        absl::StrCat("Failed to clFinish - ", CLErrorCodeToString(error_code)));
  }
  return absl::OkStatus();
}

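// ProfilingCommandQueue expects a queue created with
// CL_QUEUE_PROFILING_ENABLE (see CreateProfilingCommandQueue below); it keeps
// one CLEvent per dispatch so that kernel timings can be queried afterwards.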
ProfilingCommandQueue::ProfilingCommandQueue(cl_command_queue queue)
    : CLCommandQueue(queue, true) {
  events_.reserve(128);
}

ProfilingCommandQueue::ProfilingCommandQueue(ProfilingCommandQueue&& queue)
    : CLCommandQueue(std::move(queue)),
      events_(std::move(queue.events_)),
      current_label_(std::move(queue.current_label_)) {}

ProfilingCommandQueue& ProfilingCommandQueue::operator=(
    ProfilingCommandQueue&& queue) {
  if (this != &queue) {
    events_ = std::move(queue.events_);
    current_label_ = std::move(queue.current_label_);
    CLCommandQueue::operator=(std::move(queue));
  }
  return *this;
}

void ProfilingCommandQueue::SetEventsLabel(const std::string& name) {
  current_label_ = name;
}

void ProfilingCommandQueue::ResetMeasurements() { events_.clear(); }

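// Dispatches the kernel and records an event for it, tagged with the label
// set via SetEventsLabel(), so the timing shows up in GetProfilingInfo().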
absl::Status ProfilingCommandQueue::Dispatch(const CLKernel& kernel,
                                             const int3& work_groups_count,
                                             const int3& work_group_size) {
  events_.push_back(CLEvent());
  RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
                                           work_group_size, &events_.back()));
  events_.back().SetName(current_label_);
  return absl::OkStatus();
}

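// Converts the recorded events into a ProfilingInfo with one
// (label, duration) entry per dispatch.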
ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
  ProfilingInfo result;
  result.dispatches.resize(events_.size());
  for (int i = 0; i < events_.size(); ++i) {
    result.dispatches[i].label = events_[i].GetName();
    result.dispatches[i].duration =
        absl::Nanoseconds(events_[i].GetEventTimeNs());
  }
  return result;
}

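// Benchmarks the kernel once per candidate work group size and returns (via
// `index`) the candidate with the smallest measured execution time. Contains
// workarounds for Adreno 3xx (unreliable event timestamps) and Mali (memory
// growth while many dispatches are in flight).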
absl::Status ProfilingCommandQueue::GetBestWorkGroupIndex(
    const CLKernel& kernel, const GpuInfo& gpu_info,
    const std::vector<int3>& work_groups_count,
    const std::vector<int3>& work_group_sizes, int* index) {
  // Some Adreno 3xx GPUs can report wrong timestamps for some events.
  const bool possible_bug_with_events =
      gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx();
  events_.resize(work_group_sizes.size());
  for (int i = 0; i < work_group_sizes.size(); ++i) {
    RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count[i],
                                             work_group_sizes[i], &events_[i]));

    // Periodically waiting here reduces the rate of a memory leak on Mali for
    // some kernels.
    if (gpu_info.IsMali() && i % 8 == 7) {
      events_[i - 7].Wait();
    }
    if (possible_bug_with_events) {
      // Draining the queue increases the probability of getting correct
      // timestamps.
      RETURN_IF_ERROR(WaitForCompletion());
    }
  }

  RETURN_IF_ERROR(WaitForCompletion());

  // Releases memory of the kernel pool on Mali.
  if (gpu_info.IsMali()) {
    RETURN_IF_ERROR(kernel.ReInit());
  }

  int minimum_index = 0;
  double minimum_time = std::numeric_limits<double>::max();
  if (possible_bug_with_events) {
    // Filter out suspicious results: average the plausible timings and ignore
    // candidates that are implausibly fast relative to that average.
    double average_time = 0.0;
    int average_samples_count = 0;
    for (int i = 0; i < work_group_sizes.size(); ++i) {
      if (events_[i].GetEventTimeMs() < 100 * 1000) {  // 100 sec
        average_time += events_[i].GetEventTimeMs();
        average_samples_count++;
      }
    }
    average_time /= average_samples_count;
    for (int i = 0; i < work_group_sizes.size(); ++i) {
      double time = events_[i].GetEventTimeMs();
      if (time < minimum_time && time >= 0.1 * average_time) {
        minimum_index = i;
        minimum_time = time;
      }
    }
  } else {
    for (int i = 0; i < work_group_sizes.size(); ++i) {
      double time = events_[i].GetEventTimeMs();
      if (time < minimum_time) {
        minimum_index = i;
        minimum_time = time;
      }
    }
  }

  *index = minimum_index;

  return absl::OkStatus();
}

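// Creates a plain in-order command queue without profiling enabled.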
absl::Status CreateCLCommandQueue(const CLDevice& device,
                                  const CLContext& context,
                                  CLCommandQueue* result) {
  int error_code;
  cl_command_queue queue =
      clCreateCommandQueue(context.context(), device.id(), 0, &error_code);
  if (!queue) {
    return absl::UnknownError(
        absl::StrCat("Failed to create a command queue - ",
                     CLErrorCodeToString(error_code)));
  }
  *result = CLCommandQueue(queue, true);
  return absl::OkStatus();
}

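// Wall-clock time of the profiled queue in milliseconds: from the start of
// the first recorded event to the end of the last one. Expects at least one
// recorded dispatch.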
double ProfilingCommandQueue::GetQueueExecutionTimeMs() const {
  const uint64_t start = events_.front().GetStartedTimeNs();
  const uint64_t end = events_.back().GetFinishedTimeNs();
  const uint64_t time_ns = (end - start);

  return static_cast<double>(time_ns) / 1000000.0;
}

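// Sum of the individual kernel execution times in milliseconds. This can
// differ from GetQueueExecutionTimeMs() because it ignores gaps between
// dispatches.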
double ProfilingCommandQueue::GetSumOfEventsTimeMs() const {
  double sum = 0.0;
  for (int i = 0; i < events_.size(); ++i) {
    sum += events_[i].GetEventTimeMs();
  }
  return sum;
}

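// Creates a queue with CL_QUEUE_PROFILING_ENABLE so that event timestamps are
// available. Typical tuning flow (illustrative sketch only; `device`,
// `context`, `kernel`, `groups_count`, and `group_size` are caller-provided):
//
//   ProfilingCommandQueue queue;
//   RETURN_IF_ERROR(CreateProfilingCommandQueue(device, context, &queue));
//   queue.SetEventsLabel("conv2d");
//   RETURN_IF_ERROR(queue.Dispatch(kernel, groups_count, group_size));
//   RETURN_IF_ERROR(queue.WaitForCompletion());
//   ProfilingInfo info = queue.GetProfilingInfo();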
absl::Status CreateProfilingCommandQueue(const CLDevice& device,
                                         const CLContext& context,
                                         ProfilingCommandQueue* result) {
  int error_code;
  cl_command_queue queue = clCreateCommandQueue(
      context.context(), device.id(), CL_QUEUE_PROFILING_ENABLE, &error_code);
  if (!queue) {
    return absl::UnknownError(
        absl::StrCat("Failed to create a command queue - ",
                     CLErrorCodeToString(error_code)));
  }

  *result = ProfilingCommandQueue(queue);
  return absl::OkStatus();
}

}  // namespace cl
}  // namespace gpu
}  // namespace tflite