• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h"
17 
18 #include <array>
19 #include <map>
20 #include <string>
21 #include <utility>
22 #include <vector>
23 
24 #include "absl/strings/str_cat.h"
25 #include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
26 #include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
27 #include "tensorflow/lite/delegates/gpu/cl/util.h"
28 #include "tensorflow/lite/delegates/gpu/common/status.h"
29 #include "tensorflow/lite/delegates/gpu/common/types.h"
30 
31 namespace tflite {
32 namespace gpu {
33 namespace cl {
34 
CLCommandQueue(cl_command_queue queue,bool has_ownership)35 CLCommandQueue::CLCommandQueue(cl_command_queue queue, bool has_ownership)
36     : queue_(queue), has_ownership_(has_ownership) {}
37 
CLCommandQueue(CLCommandQueue && queue)38 CLCommandQueue::CLCommandQueue(CLCommandQueue&& queue)
39     : queue_(queue.queue_), has_ownership_(queue.has_ownership_) {
40   queue.queue_ = nullptr;
41 }
42 
operator =(CLCommandQueue && queue)43 CLCommandQueue& CLCommandQueue::operator=(CLCommandQueue&& queue) {
44   if (this != &queue) {
45     Release();
46     std::swap(queue_, queue.queue_);
47     has_ownership_ = queue.has_ownership_;
48   }
49   return *this;
50 }
51 
~CLCommandQueue()52 CLCommandQueue::~CLCommandQueue() { Release(); }
53 
Release()54 void CLCommandQueue::Release() {
55   if (has_ownership_ && queue_) {
56     clReleaseCommandQueue(queue_);
57     queue_ = nullptr;
58   }
59 }
60 
Dispatch(const CLKernel & kernel,const int3 & work_groups_count,const int3 & work_group_size,CLEvent * event)61 absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel,
62                                       const int3& work_groups_count,
63                                       const int3& work_group_size,
64                                       CLEvent* event) {
65   std::array<size_t, 3> local;
66   std::array<size_t, 3> global;
67   for (int i = 0; i < 3; ++i) {
68     local[i] = work_group_size[i];
69     global[i] = work_groups_count[i] * work_group_size[i];
70   }
71   cl_event resulting_event;
72   const int error_code = clEnqueueNDRangeKernel(
73       queue_, kernel.kernel(), 3, nullptr, global.data(), local.data(), 0,
74       nullptr, event ? &resulting_event : nullptr);
75   if (event) {
76     *event = CLEvent(resulting_event);
77   }
78   if (error_code != CL_SUCCESS) {
79     return absl::UnknownError(
80         absl::StrCat("Failed to clEnqueueNDRangeKernel - ",
81                      CLErrorCodeToString(error_code)));
82   }
83   return absl::OkStatus();
84 }
85 
Dispatch(const CLKernel & kernel,const int3 & work_groups_count,const int3 & work_group_size)86 absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel,
87                                       const int3& work_groups_count,
88                                       const int3& work_group_size) {
89   return Dispatch(kernel, work_groups_count, work_group_size, nullptr);
90 }
91 
EnqueueEvent(CLEvent * event)92 absl::Status CLCommandQueue::EnqueueEvent(CLEvent* event) {
93   cl_event resulting_event;
94   const int error_code = clEnqueueMarker(queue_, &resulting_event);
95   *event = CLEvent(resulting_event);
96   if (error_code != CL_SUCCESS) {
97     return absl::UnknownError(absl::StrCat("Failed to clEnqueueMarker - ",
98                                            CLErrorCodeToString(error_code)));
99   }
100   return absl::OkStatus();
101 }
102 
EnqueueWriteImage(cl_mem memory,int3 region,const void * data,bool async)103 absl::Status CLCommandQueue::EnqueueWriteImage(cl_mem memory, int3 region,
104                                                const void* data, bool async) {
105   const size_t origin[] = {0, 0, 0};
106   const size_t r[] = {static_cast<size_t>(region.x),
107                       static_cast<size_t>(region.y),
108                       static_cast<size_t>(region.z)};
109   const cl_bool blocking = async ? CL_FALSE : CL_TRUE;
110   auto error_code = clEnqueueWriteImage(queue_, memory, blocking, origin, r, 0,
111                                         0, data, 0, nullptr, nullptr);
112   if (error_code != CL_SUCCESS) {
113     return absl::UnknownError(
114         absl::StrCat("Failed to upload data to GPU (clEnqueueWriteImage) - ",
115                      CLErrorCodeToString(error_code)));
116   }
117 
118   return absl::OkStatus();
119 }
120 
EnqueueReadImage(cl_mem memory,int3 region,void * data,bool async)121 absl::Status CLCommandQueue::EnqueueReadImage(cl_mem memory, int3 region,
122                                               void* data, bool async) {
123   const size_t origin[] = {0, 0, 0};
124   const size_t r[] = {static_cast<size_t>(region.x),
125                       static_cast<size_t>(region.y),
126                       static_cast<size_t>(region.z)};
127   const cl_bool blocking = async ? CL_FALSE : CL_TRUE;
128   auto error_code = clEnqueueReadImage(queue_, memory, blocking, origin, r, 0,
129                                        0, data, 0, nullptr, nullptr);
130   if (error_code != CL_SUCCESS) {
131     return absl::UnknownError(
132         absl::StrCat("Failed to read data from GPU (clEnqueueReadImage) - ",
133                      CLErrorCodeToString(error_code)));
134   }
135 
136   return absl::OkStatus();
137 }
138 
EnqueueWriteBuffer(cl_mem memory,size_t size_in_bytes,const void * data,bool async)139 absl::Status CLCommandQueue::EnqueueWriteBuffer(cl_mem memory,
140                                                 size_t size_in_bytes,
141                                                 const void* data, bool async) {
142   const cl_bool blocking = async ? CL_FALSE : CL_TRUE;
143   auto error_code = clEnqueueWriteBuffer(
144       queue_, memory, blocking, 0, size_in_bytes, data, 0, nullptr, nullptr);
145   if (error_code != CL_SUCCESS) {
146     return absl::UnknownError(
147         absl::StrCat("Failed to upload data to GPU (clEnqueueWriteBuffer) - ",
148                      CLErrorCodeToString(error_code)));
149   }
150   return absl::OkStatus();
151 }
152 
EnqueueReadBuffer(cl_mem memory,size_t size_in_bytes,void * data,bool async)153 absl::Status CLCommandQueue::EnqueueReadBuffer(cl_mem memory,
154                                                size_t size_in_bytes, void* data,
155                                                bool async) {
156   const cl_bool blocking = async ? CL_FALSE : CL_TRUE;
157   auto error_code = clEnqueueReadBuffer(
158       queue_, memory, blocking, 0, size_in_bytes, data, 0, nullptr, nullptr);
159   if (error_code != CL_SUCCESS) {
160     return absl::UnknownError(
161         absl::StrCat("Failed to read data from GPU (clEnqueueReadBuffer) - ",
162                      CLErrorCodeToString(error_code)));
163   }
164   return absl::OkStatus();
165 }
166 
WaitForCompletion()167 absl::Status CLCommandQueue::WaitForCompletion() {
168   auto error_code = clFinish(queue_);
169   if (error_code != CL_SUCCESS) {
170     return absl::UnknownError(
171         absl::StrCat("Failed to clFinish - ", CLErrorCodeToString(error_code)));
172   }
173   return absl::OkStatus();
174 }
175 
ProfilingCommandQueue(cl_command_queue queue)176 ProfilingCommandQueue::ProfilingCommandQueue(cl_command_queue queue)
177     : CLCommandQueue(queue, true) {
178   events_.reserve(128);
179 }
180 
ProfilingCommandQueue(ProfilingCommandQueue && queue)181 ProfilingCommandQueue::ProfilingCommandQueue(ProfilingCommandQueue&& queue)
182     : CLCommandQueue(std::move(queue)),
183       events_(std::move(queue.events_)),
184       number_of_dispatches_(std::move(queue.number_of_dispatches_)),
185       current_label_(std::move(queue.current_label_)) {}
186 
operator =(ProfilingCommandQueue && queue)187 ProfilingCommandQueue& ProfilingCommandQueue::operator=(
188     ProfilingCommandQueue&& queue) {
189   if (this != &queue) {
190     events_ = std::move(queue.events_);
191     number_of_dispatches_ = std::move(queue.number_of_dispatches_);
192     current_label_ = std::move(queue.current_label_);
193     CLCommandQueue::operator=(std::move(queue));
194   }
195   return *this;
196 }
197 
SetEventsLabel(const std::string & name)198 void ProfilingCommandQueue::SetEventsLabel(const std::string& name) {
199   current_label_ = name;
200 }
201 
ResetMeasurements()202 void ProfilingCommandQueue::ResetMeasurements() {
203   events_.clear();
204   number_of_dispatches_.clear();
205 }
206 
Dispatch(const CLKernel & kernel,const int3 & work_groups_count,const int3 & work_group_size)207 absl::Status ProfilingCommandQueue::Dispatch(const CLKernel& kernel,
208                                              const int3& work_groups_count,
209                                              const int3& work_group_size) {
210   events_.push_back(CLEvent());
211   number_of_dispatches_.push_back(1);
212   RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
213                                            work_group_size,
214                                            &events_[events_.size() - 1]));
215   events_.back().SetName(current_label_);
216   return absl::OkStatus();
217 }
218 
DispatchNTimes(const CLKernel & kernel,const int3 & work_groups_count,const int3 & work_group_size,int n,int flush_period)219 absl::Status ProfilingCommandQueue::DispatchNTimes(
220     const CLKernel& kernel, const int3& work_groups_count,
221     const int3& work_group_size, int n, int flush_period) {
222   number_of_dispatches_.push_back(n);
223   if (n == 1) {
224     events_.push_back(CLEvent());
225     RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
226                                              work_group_size,
227                                              &events_[events_.size() - 1]));
228     events_.back().SetName(current_label_);
229   } else {
230     events_.push_back(CLEvent());
231     events_.push_back(CLEvent());
232     RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
233                                              work_group_size,
234                                              &events_[events_.size() - 2]));
235     for (int i = 1; i < n - 1; ++i) {
236       RETURN_IF_ERROR(
237           CLCommandQueue::Dispatch(kernel, work_groups_count, work_group_size));
238       if (flush_period && i % flush_period == 0) {
239         clFlush(queue_);
240       }
241     }
242     RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
243                                              work_group_size,
244                                              &events_[events_.size() - 1]));
245     clFlush(queue_);
246     events_[events_.size() - 2].SetName(current_label_);
247     events_[events_.size() - 1].SetName(current_label_);
248   }
249   return absl::OkStatus();
250 }
251 
GetProfilingInfo() const252 ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
253   ProfilingInfo result;
254   result.dispatches.resize(number_of_dispatches_.size());
255   int events_counter = 0;
256   for (int i = 0; i < number_of_dispatches_.size(); ++i) {
257     result.dispatches[i].label = events_[events_counter].GetName();
258     if (number_of_dispatches_[i] == 1) {
259       result.dispatches[i].duration =
260           absl::Nanoseconds(events_[events_counter].GetEventTimeNs());
261       events_counter += 1;
262     } else {
263       result.dispatches[i].duration =
264           absl::Nanoseconds(events_[events_counter + 1].GetFinishedTimeNs() -
265                             events_[events_counter].GetStartedTimeNs()) /
266           number_of_dispatches_[i];
267       events_counter += 2;
268     }
269   }
270   return result;
271 }
272 
GetBestWorkGroupIndex(const CLKernel & kernel,const GpuInfo & gpu_info,const std::vector<int3> & work_groups_count,const std::vector<int3> & work_group_sizes,int * index)273 absl::Status ProfilingCommandQueue::GetBestWorkGroupIndex(
274     const CLKernel& kernel, const GpuInfo& gpu_info,
275     const std::vector<int3>& work_groups_count,
276     const std::vector<int3>& work_group_sizes, int* index) {
277   // Some Adreno 3xx can have wrong numbers for some events
278   const bool possible_bug_with_events =
279       gpu_info.IsAdreno() && gpu_info.adreno_info.IsAdreno3xx();
280   events_.resize(work_group_sizes.size());
281   for (int i = 0; i < work_group_sizes.size(); ++i) {
282     RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count[i],
283                                              work_group_sizes[i], &events_[i]));
284 
285     // reducing the speed of memory leak on Mali for some kernels
286     if (gpu_info.IsMali() && i % 8 == 7) {
287       events_[i - 7].Wait();
288     }
289     if (possible_bug_with_events) {
290       // We are trying to increase probability for correct result.
291       RETURN_IF_ERROR(WaitForCompletion());
292     }
293   }
294 
295   RETURN_IF_ERROR(WaitForCompletion());
296 
297   // To release memory of some kernel pool on Mali.
298   if (gpu_info.IsMali()) {
299     RETURN_IF_ERROR(kernel.ReInit());
300   }
301 
302   int minimum_index = 0;
303   double minimum_time = std::numeric_limits<double>::max();
304   if (possible_bug_with_events) {  // we will try to cut out suspicious results
305     double average_time = 0.0;
306     int average_samples_count = 0;
307     for (int i = 0; i < work_group_sizes.size(); ++i) {
308       if (events_[i].GetEventTimeMs() < 100 * 1000) {  // 100 sec
309         average_time += events_[i].GetEventTimeMs();
310         average_samples_count++;
311       }
312     }
313     average_time /= average_samples_count;
314     for (int i = 0; i < work_group_sizes.size(); ++i) {
315       double time = events_[i].GetEventTimeMs();
316       if (time < minimum_time && time >= 0.1 * average_time) {
317         minimum_index = i;
318         minimum_time = time;
319       }
320     }
321   } else {
322     for (int i = 0; i < work_group_sizes.size(); ++i) {
323       double time = events_[i].GetEventTimeMs();
324       if (time < minimum_time) {
325         minimum_index = i;
326         minimum_time = time;
327       }
328     }
329   }
330 
331   *index = minimum_index;
332 
333   return absl::OkStatus();
334 }
335 
CreateCLCommandQueue(const CLDevice & device,const CLContext & context,CLCommandQueue * result)336 absl::Status CreateCLCommandQueue(const CLDevice& device,
337                                   const CLContext& context,
338                                   CLCommandQueue* result) {
339   int error_code;
340   cl_command_queue queue =
341       clCreateCommandQueue(context.context(), device.id(), 0, &error_code);
342   if (!queue) {
343     return absl::UnknownError(
344         absl::StrCat("Failed to create a command queue - ",
345                      CLErrorCodeToString(error_code)));
346   }
347   *result = CLCommandQueue(queue, true);
348   return absl::OkStatus();
349 }
350 
GetQueueExecutionTimeMs() const351 double ProfilingCommandQueue::GetQueueExecutionTimeMs() const {
352   const uint64_t start = events_.front().GetStartedTimeNs();
353   const uint64_t end = events_.back().GetFinishedTimeNs();
354   const uint64_t time_ns = (end - start);
355 
356   return static_cast<double>(time_ns) / 1000000.0;
357 }
358 
GetSumOfEventsTimeMs() const359 double ProfilingCommandQueue::GetSumOfEventsTimeMs() const {
360   double sum = 0.0;
361   for (int i = 0; i < events_.size(); ++i) {
362     sum += events_[i].GetEventTimeMs();
363   }
364   return sum;
365 }
366 
CreateProfilingCommandQueue(const CLDevice & device,const CLContext & context,ProfilingCommandQueue * result)367 absl::Status CreateProfilingCommandQueue(const CLDevice& device,
368                                          const CLContext& context,
369                                          ProfilingCommandQueue* result) {
370   int error_code;
371   cl_command_queue queue = clCreateCommandQueue(
372       context.context(), device.id(), CL_QUEUE_PROFILING_ENABLE, &error_code);
373   if (!queue) {
374     return absl::UnknownError(
375         absl::StrCat("Failed to create a command queue - ",
376                      CLErrorCodeToString(error_code)));
377   }
378 
379   *result = ProfilingCommandQueue(queue);
380   return absl::OkStatus();
381 }
382 
383 }  // namespace cl
384 }  // namespace gpu
385 }  // namespace tflite
386