• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
17 
18 #include <array>
19 #include <map>
20 #include <string>
21 #include <vector>
22 
23 #include "absl/strings/str_cat.h"
24 #include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
25 #include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
26 #include "tensorflow/lite/delegates/gpu/cl/util.h"
27 #include "tensorflow/lite/delegates/gpu/common/status.h"
28 #include "tensorflow/lite/delegates/gpu/common/types.h"
29 
30 namespace tflite {
31 namespace gpu {
32 namespace cl {
33 
CLCommandQueue(cl_command_queue queue,bool has_ownership)34 CLCommandQueue::CLCommandQueue(cl_command_queue queue, bool has_ownership)
35     : queue_(queue), has_ownership_(has_ownership) {}
36 
CLCommandQueue(CLCommandQueue && queue)37 CLCommandQueue::CLCommandQueue(CLCommandQueue&& queue)
38     : queue_(queue.queue_), has_ownership_(queue.has_ownership_) {
39   queue.queue_ = nullptr;
40 }
41 
operator =(CLCommandQueue && queue)42 CLCommandQueue& CLCommandQueue::operator=(CLCommandQueue&& queue) {
43   if (this != &queue) {
44     Release();
45     std::swap(queue_, queue.queue_);
46     has_ownership_ = queue.has_ownership_;
47   }
48   return *this;
49 }
50 
~CLCommandQueue()51 CLCommandQueue::~CLCommandQueue() { Release(); }
52 
Release()53 void CLCommandQueue::Release() {
54   if (has_ownership_ && queue_) {
55     clReleaseCommandQueue(queue_);
56     queue_ = nullptr;
57   }
58 }
59 
Dispatch(const CLKernel & kernel,const int3 & work_groups_count,const int3 & work_group_size,CLEvent * event)60 absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel,
61                                       const int3& work_groups_count,
62                                       const int3& work_group_size,
63                                       CLEvent* event) {
64   std::array<size_t, 3> local;
65   std::array<size_t, 3> global;
66   for (int i = 0; i < 3; ++i) {
67     local[i] = work_group_size[i];
68     global[i] = work_groups_count[i] * work_group_size[i];
69   }
70   cl_event resulting_event;
71   const int error_code = clEnqueueNDRangeKernel(
72       queue_, kernel.kernel(), 3, nullptr, global.data(), local.data(), 0,
73       nullptr, event ? &resulting_event : nullptr);
74   if (event) {
75     *event = CLEvent(resulting_event);
76   }
77   if (error_code != CL_SUCCESS) {
78     return absl::UnknownError(
79         absl::StrCat("Failed to clEnqueueNDRangeKernel - ",
80                      CLErrorCodeToString(error_code)));
81   }
82   return absl::OkStatus();
83 }
84 
Dispatch(const CLKernel & kernel,const int3 & work_groups_count,const int3 & work_group_size)85 absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel,
86                                       const int3& work_groups_count,
87                                       const int3& work_group_size) {
88   return Dispatch(kernel, work_groups_count, work_group_size, nullptr);
89 }
90 
EnqueueEvent(CLEvent * event)91 absl::Status CLCommandQueue::EnqueueEvent(CLEvent* event) {
92   cl_event resulting_event;
93   const int error_code = clEnqueueMarker(queue_, &resulting_event);
94   *event = CLEvent(resulting_event);
95   if (error_code != CL_SUCCESS) {
96     return absl::UnknownError(absl::StrCat("Failed to clEnqueueMarker - ",
97                                            CLErrorCodeToString(error_code)));
98   }
99   return absl::OkStatus();
100 }
101 
EnqueueWriteImage(cl_mem memory,int3 region,const void * data,bool async)102 absl::Status CLCommandQueue::EnqueueWriteImage(cl_mem memory, int3 region,
103                                                const void* data, bool async) {
104   const size_t origin[] = {0, 0, 0};
105   const size_t r[] = {static_cast<size_t>(region.x),
106                       static_cast<size_t>(region.y),
107                       static_cast<size_t>(region.z)};
108   const cl_bool blocking = async ? CL_FALSE : CL_TRUE;
109   auto error_code = clEnqueueWriteImage(queue_, memory, blocking, origin, r, 0,
110                                         0, data, 0, nullptr, nullptr);
111   if (error_code != CL_SUCCESS) {
112     return absl::UnknownError(
113         absl::StrCat("Failed to upload data to GPU (clEnqueueWriteImage) - ",
114                      CLErrorCodeToString(error_code)));
115   }
116 
117   return absl::OkStatus();
118 }
119 
EnqueueReadImage(cl_mem memory,int3 region,void * data,bool async)120 absl::Status CLCommandQueue::EnqueueReadImage(cl_mem memory, int3 region,
121                                               void* data, bool async) {
122   const size_t origin[] = {0, 0, 0};
123   const size_t r[] = {static_cast<size_t>(region.x),
124                       static_cast<size_t>(region.y),
125                       static_cast<size_t>(region.z)};
126   const cl_bool blocking = async ? CL_FALSE : CL_TRUE;
127   auto error_code = clEnqueueReadImage(queue_, memory, blocking, origin, r, 0,
128                                        0, data, 0, nullptr, nullptr);
129   if (error_code != CL_SUCCESS) {
130     return absl::UnknownError(
131         absl::StrCat("Failed to read data from GPU (clEnqueueReadImage) - ",
132                      CLErrorCodeToString(error_code)));
133   }
134 
135   return absl::OkStatus();
136 }
137 
EnqueueWriteBuffer(cl_mem memory,size_t size_in_bytes,const void * data,bool async)138 absl::Status CLCommandQueue::EnqueueWriteBuffer(cl_mem memory,
139                                                 size_t size_in_bytes,
140                                                 const void* data, bool async) {
141   const cl_bool blocking = async ? CL_FALSE : CL_TRUE;
142   auto error_code = clEnqueueWriteBuffer(
143       queue_, memory, blocking, 0, size_in_bytes, data, 0, nullptr, nullptr);
144   if (error_code != CL_SUCCESS) {
145     return absl::UnknownError(
146         absl::StrCat("Failed to upload data to GPU (clEnqueueWriteBuffer) - ",
147                      CLErrorCodeToString(error_code)));
148   }
149   return absl::OkStatus();
150 }
151 
EnqueueReadBuffer(cl_mem memory,size_t size_in_bytes,void * data,bool async)152 absl::Status CLCommandQueue::EnqueueReadBuffer(cl_mem memory,
153                                                size_t size_in_bytes, void* data,
154                                                bool async) {
155   const cl_bool blocking = async ? CL_FALSE : CL_TRUE;
156   auto error_code = clEnqueueReadBuffer(
157       queue_, memory, blocking, 0, size_in_bytes, data, 0, nullptr, nullptr);
158   if (error_code != CL_SUCCESS) {
159     return absl::UnknownError(
160         absl::StrCat("Failed to read data from GPU (clEnqueueReadBuffer) - ",
161                      CLErrorCodeToString(error_code)));
162   }
163   return absl::OkStatus();
164 }
165 
WaitForCompletion()166 absl::Status CLCommandQueue::WaitForCompletion() {
167   auto error_code = clFinish(queue_);
168   if (error_code != CL_SUCCESS) {
169     return absl::UnknownError(
170         absl::StrCat("Failed to clFinish - ", CLErrorCodeToString(error_code)));
171   }
172   return absl::OkStatus();
173 }
174 
ProfilingCommandQueue(cl_command_queue queue)175 ProfilingCommandQueue::ProfilingCommandQueue(cl_command_queue queue)
176     : CLCommandQueue(queue, true) {
177   events_.reserve(128);
178 }
179 
ProfilingCommandQueue(ProfilingCommandQueue && queue)180 ProfilingCommandQueue::ProfilingCommandQueue(ProfilingCommandQueue&& queue)
181     : CLCommandQueue(std::move(queue)),
182       events_(std::move(queue.events_)),
183       current_label_(std::move(queue.current_label_)) {}
184 
operator =(ProfilingCommandQueue && queue)185 ProfilingCommandQueue& ProfilingCommandQueue::operator=(
186     ProfilingCommandQueue&& queue) {
187   if (this != &queue) {
188     events_ = std::move(queue.events_);
189     current_label_ = std::move(queue.current_label_);
190     CLCommandQueue::operator=(std::move(queue));
191   }
192   return *this;
193 }
194 
SetEventsLabel(const std::string & name)195 void ProfilingCommandQueue::SetEventsLabel(const std::string& name) {
196   current_label_ = name;
197 }
198 
ResetMeasurements()199 void ProfilingCommandQueue::ResetMeasurements() { events_.clear(); }
200 
Dispatch(const CLKernel & kernel,const int3 & work_groups_count,const int3 & work_group_size)201 absl::Status ProfilingCommandQueue::Dispatch(const CLKernel& kernel,
202                                              const int3& work_groups_count,
203                                              const int3& work_group_size) {
204   events_.push_back(CLEvent());
205   RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
206                                            work_group_size,
207                                            &events_[events_.size() - 1]));
208   events_.back().SetName(current_label_);
209   return absl::OkStatus();
210 }
211 
GetProfilingInfo() const212 ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
213   ProfilingInfo result;
214   result.dispatches.resize(events_.size());
215   for (int i = 0; i < events_.size(); ++i) {
216     result.dispatches[i].label = events_[i].GetName();
217     result.dispatches[i].duration =
218         absl::Nanoseconds(events_[i].GetEventTimeNs());
219   }
220   return result;
221 }
222 
GetBestWorkGroupIndex(const CLKernel & kernel,const GpuInfo & gpu_info,const std::vector<int3> & work_groups_count,const std::vector<int3> & work_group_sizes,int * index)223 absl::Status ProfilingCommandQueue::GetBestWorkGroupIndex(
224     const CLKernel& kernel, const GpuInfo& gpu_info,
225     const std::vector<int3>& work_groups_count,
226     const std::vector<int3>& work_group_sizes, int* index) {
227   // Some Adreno 3xx can have wrong numbers for some events
228   const bool possible_bug_with_events =
229       gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx();
230   events_.resize(work_group_sizes.size());
231   for (int i = 0; i < work_group_sizes.size(); ++i) {
232     RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count[i],
233                                              work_group_sizes[i], &events_[i]));
234 
235     // reducing the speed of memory leak on Mali for some kernels
236     if (gpu_info.IsMali() && i % 8 == 7) {
237       events_[i - 7].Wait();
238     }
239     if (possible_bug_with_events) {
240       // We are trying to increase probability for correct result.
241       RETURN_IF_ERROR(WaitForCompletion());
242     }
243   }
244 
245   RETURN_IF_ERROR(WaitForCompletion());
246 
247   // To release memory of some kernel pool on Mali.
248   if (gpu_info.IsMali()) {
249     RETURN_IF_ERROR(kernel.ReInit());
250   }
251 
252   int minimum_index = 0;
253   double minimum_time = std::numeric_limits<double>::max();
254   if (possible_bug_with_events) {  // we will try to cut out suspicious results
255     double average_time = 0.0;
256     int average_samples_count = 0;
257     for (int i = 0; i < work_group_sizes.size(); ++i) {
258       if (events_[i].GetEventTimeMs() < 100 * 1000) {  // 100 sec
259         average_time += events_[i].GetEventTimeMs();
260         average_samples_count++;
261       }
262     }
263     average_time /= average_samples_count;
264     for (int i = 0; i < work_group_sizes.size(); ++i) {
265       double time = events_[i].GetEventTimeMs();
266       if (time < minimum_time && time >= 0.1 * average_time) {
267         minimum_index = i;
268         minimum_time = time;
269       }
270     }
271   } else {
272     for (int i = 0; i < work_group_sizes.size(); ++i) {
273       double time = events_[i].GetEventTimeMs();
274       if (time < minimum_time) {
275         minimum_index = i;
276         minimum_time = time;
277       }
278     }
279   }
280 
281   *index = minimum_index;
282 
283   return absl::OkStatus();
284 }
285 
CreateCLCommandQueue(const CLDevice & device,const CLContext & context,CLCommandQueue * result)286 absl::Status CreateCLCommandQueue(const CLDevice& device,
287                                   const CLContext& context,
288                                   CLCommandQueue* result) {
289   int error_code;
290   cl_command_queue queue =
291       clCreateCommandQueue(context.context(), device.id(), 0, &error_code);
292   if (!queue) {
293     return absl::UnknownError(
294         absl::StrCat("Failed to create a command queue - ",
295                      CLErrorCodeToString(error_code)));
296   }
297   *result = CLCommandQueue(queue, true);
298   return absl::OkStatus();
299 }
300 
GetQueueExecutionTimeMs() const301 double ProfilingCommandQueue::GetQueueExecutionTimeMs() const {
302   const uint64_t start = events_.front().GetStartedTimeNs();
303   const uint64_t end = events_.back().GetFinishedTimeNs();
304   const uint64_t time_ns = (end - start);
305 
306   return static_cast<double>(time_ns) / 1000000.0;
307 }
308 
GetSumOfEventsTimeMs() const309 double ProfilingCommandQueue::GetSumOfEventsTimeMs() const {
310   double sum = 0.0;
311   for (int i = 0; i < events_.size(); ++i) {
312     sum += events_[i].GetEventTimeMs();
313   }
314   return sum;
315 }
316 
CreateProfilingCommandQueue(const CLDevice & device,const CLContext & context,ProfilingCommandQueue * result)317 absl::Status CreateProfilingCommandQueue(const CLDevice& device,
318                                          const CLContext& context,
319                                          ProfilingCommandQueue* result) {
320   int error_code;
321   cl_command_queue queue = clCreateCommandQueue(
322       context.context(), device.id(), CL_QUEUE_PROFILING_ENABLE, &error_code);
323   if (!queue) {
324     return absl::UnknownError(
325         absl::StrCat("Failed to create a command queue - ",
326                      CLErrorCodeToString(error_code)));
327   }
328 
329   *result = ProfilingCommandQueue(queue);
330   return absl::OkStatus();
331 }
332 
333 }  // namespace cl
334 }  // namespace gpu
335 }  // namespace tflite
336