• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_COMMAND_QUEUE_H_
17 #define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_COMMAND_QUEUE_H_
18 
19 #include <cstdint>
20 #include <string>
21 #include <vector>
22 
23 #include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
24 #include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
25 #include "tensorflow/lite/delegates/gpu/cl/cl_event.h"
26 #include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
27 #include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h"
28 #include "tensorflow/lite/delegates/gpu/common/status.h"
29 #include "tensorflow/lite/delegates/gpu/common/task/profiling_info.h"
30 #include "tensorflow/lite/delegates/gpu/common/types.h"
31 
32 namespace tflite {
33 namespace gpu {
34 namespace cl {
35 
36 // A wrapper around opencl command queue
37 class CLCommandQueue {
38  public:
CLCommandQueue()39   CLCommandQueue() {}
40   CLCommandQueue(cl_command_queue queue, bool has_ownership);
41 
42   // Move only
43   CLCommandQueue(CLCommandQueue&& queue);
44   CLCommandQueue& operator=(CLCommandQueue&& queue);
45   CLCommandQueue(const CLCommandQueue&) = delete;
46   CLCommandQueue& operator=(const CLCommandQueue&) = delete;
47 
48   virtual ~CLCommandQueue();
49 
queue()50   cl_command_queue queue() const { return queue_; }
51 
52   virtual absl::Status Dispatch(const CLKernel& kernel,
53                                 const int3& work_groups_count,
54                                 const int3& work_group_size);
55 
56   absl::Status Dispatch(const CLKernel& kernel, const int3& work_groups_count,
57                         const int3& work_group_size, CLEvent* event);
58 
59   absl::Status EnqueueEvent(CLEvent* event);
60 
61   absl::Status EnqueueWriteImage(cl_mem memory, int3 region, const void* data,
62                                  bool async = false);
63   absl::Status EnqueueReadImage(cl_mem memory, int3 region, void* data,
64                                 bool async = false);
65 
66   absl::Status EnqueueWriteBuffer(cl_mem memory, size_t size_in_bytes,
67                                   const void* data, bool async = false);
68   absl::Status EnqueueReadBuffer(cl_mem memory, size_t size_in_bytes,
69                                  void* data, bool async = false);
70 
71   absl::Status WaitForCompletion();
72 
73  protected:
74   void Release();
75 
76   cl_command_queue queue_ = nullptr;
77   bool has_ownership_ = false;
78 };
79 
80 class ProfilingCommandQueue : public CLCommandQueue {
81  public:
ProfilingCommandQueue()82   ProfilingCommandQueue() {}
83   explicit ProfilingCommandQueue(cl_command_queue queue);
84 
85   // Move only
86   ProfilingCommandQueue(ProfilingCommandQueue&& queue);
87   ProfilingCommandQueue& operator=(ProfilingCommandQueue&& queue);
88   ProfilingCommandQueue(const ProfilingCommandQueue&) = delete;
89   ProfilingCommandQueue& operator=(const ProfilingCommandQueue&) = delete;
90 
91   absl::Status Dispatch(const CLKernel& kernel, const int3& work_groups_count,
92                         const int3& work_group_size) override;
93 
94   // for better profiling
95   absl::Status DispatchNTimes(const CLKernel& kernel,
96                               const int3& work_groups_count,
97                               const int3& work_group_size, int n,
98                               int flush_period = 0);
99 
100   // will write index for fastest work_group among work_group_sizes
101   absl::Status GetBestWorkGroupIndex(const CLKernel& kernel,
102                                      const GpuInfo& gpu_info,
103                                      const std::vector<int3>& work_groups_count,
104                                      const std::vector<int3>& work_group_sizes,
105                                      int* index);
106 
107   // call ResetMeasurements() to start new seriese of measurements
108   void ResetMeasurements();
109 
110   double GetQueueExecutionTimeMs() const;
111 
112   // Difference from GetQueueExecutionTimeMs is that this number doesn't include
113   // time between kernels(kernels launches or preparing) on GPU. Usually, this
114   // time should be 5-10% better than GetQueueExecutionTimeMs, because 5-10%
115   // spend on something else(maybe kernels launches or preparing)
116   double GetSumOfEventsTimeMs() const;
117 
118   // This label will be used for all subsequent dispatches.
119   void SetEventsLabel(const std::string& name);
120 
121   ProfilingInfo GetProfilingInfo() const;
122 
123  private:
124   std::vector<CLEvent> events_;
125   std::vector<int> number_of_dispatches_;
126   std::string current_label_;
127 };
128 
129 absl::Status CreateCLCommandQueue(const CLDevice& device,
130                                   const CLContext& context,
131                                   CLCommandQueue* result);
132 
133 absl::Status CreateProfilingCommandQueue(const CLDevice& device,
134                                          const CLContext& context,
135                                          ProfilingCommandQueue* result);
136 
137 }  // namespace cl
138 }  // namespace gpu
139 }  // namespace tflite
140 
141 #endif  // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_COMMAND_QUEUE_H_
142