1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_COMMAND_QUEUE_H_ 17 #define TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_COMMAND_QUEUE_H_ 18 19 #include <cstdint> 20 #include <string> 21 #include <vector> 22 23 #include "tensorflow/lite/delegates/gpu/cl/cl_context.h" 24 #include "tensorflow/lite/delegates/gpu/cl/cl_device.h" 25 #include "tensorflow/lite/delegates/gpu/cl/cl_event.h" 26 #include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h" 27 #include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h" 28 #include "tensorflow/lite/delegates/gpu/common/status.h" 29 #include "tensorflow/lite/delegates/gpu/common/task/profiling_info.h" 30 #include "tensorflow/lite/delegates/gpu/common/types.h" 31 32 namespace tflite { 33 namespace gpu { 34 namespace cl { 35 36 // A wrapper around opencl command queue 37 class CLCommandQueue { 38 public: CLCommandQueue()39 CLCommandQueue() {} 40 CLCommandQueue(cl_command_queue queue, bool has_ownership); 41 42 // Move only 43 CLCommandQueue(CLCommandQueue&& queue); 44 CLCommandQueue& operator=(CLCommandQueue&& queue); 45 CLCommandQueue(const CLCommandQueue&) = delete; 46 CLCommandQueue& operator=(const CLCommandQueue&) = delete; 47 48 virtual ~CLCommandQueue(); 49 queue()50 cl_command_queue queue() const { return queue_; } 51 52 virtual absl::Status Dispatch(const CLKernel& kernel, 53 const int3& work_groups_count, 54 const int3& work_group_size); 55 56 absl::Status Dispatch(const CLKernel& kernel, const int3& work_groups_count, 57 const int3& work_group_size, CLEvent* event); 58 59 absl::Status EnqueueEvent(CLEvent* event); 60 61 absl::Status EnqueueWriteImage(cl_mem memory, int3 region, const void* data, 62 bool async = false); 63 absl::Status EnqueueReadImage(cl_mem memory, int3 region, void* data, 64 bool async = false); 65 66 absl::Status EnqueueWriteBuffer(cl_mem memory, size_t size_in_bytes, 67 const void* data, bool async = false); 68 absl::Status EnqueueReadBuffer(cl_mem memory, size_t size_in_bytes, 69 void* data, bool async = false); 70 71 absl::Status WaitForCompletion(); 72 73 protected: 74 void Release(); 75 76 cl_command_queue queue_ = nullptr; 77 bool has_ownership_ = false; 78 }; 79 80 class ProfilingCommandQueue : public CLCommandQueue { 81 public: ProfilingCommandQueue()82 ProfilingCommandQueue() {} 83 explicit ProfilingCommandQueue(cl_command_queue queue); 84 85 // Move only 86 ProfilingCommandQueue(ProfilingCommandQueue&& queue); 87 ProfilingCommandQueue& operator=(ProfilingCommandQueue&& queue); 88 ProfilingCommandQueue(const ProfilingCommandQueue&) = delete; 89 ProfilingCommandQueue& operator=(const ProfilingCommandQueue&) = delete; 90 91 absl::Status Dispatch(const CLKernel& kernel, const int3& work_groups_count, 92 const int3& work_group_size) override; 93 94 // for better profiling 95 absl::Status DispatchNTimes(const CLKernel& kernel, 96 const int3& work_groups_count, 97 const int3& work_group_size, int n, 98 int flush_period = 0); 99 100 // will write index for fastest work_group among work_group_sizes 101 absl::Status GetBestWorkGroupIndex(const CLKernel& kernel, 102 const GpuInfo& gpu_info, 103 const std::vector<int3>& work_groups_count, 104 const std::vector<int3>& work_group_sizes, 105 int* index); 106 107 // call ResetMeasurements() to start new seriese of measurements 108 void ResetMeasurements(); 109 110 double GetQueueExecutionTimeMs() const; 111 112 // Difference from GetQueueExecutionTimeMs is that this number doesn't include 113 // time between kernels(kernels launches or preparing) on GPU. Usually, this 114 // time should be 5-10% better than GetQueueExecutionTimeMs, because 5-10% 115 // spend on something else(maybe kernels launches or preparing) 116 double GetSumOfEventsTimeMs() const; 117 118 // This label will be used for all subsequent dispatches. 119 void SetEventsLabel(const std::string& name); 120 121 ProfilingInfo GetProfilingInfo() const; 122 123 private: 124 std::vector<CLEvent> events_; 125 std::vector<int> number_of_dispatches_; 126 std::string current_label_; 127 }; 128 129 absl::Status CreateCLCommandQueue(const CLDevice& device, 130 const CLContext& context, 131 CLCommandQueue* result); 132 133 absl::Status CreateProfilingCommandQueue(const CLDevice& device, 134 const CLContext& context, 135 ProfilingCommandQueue* result); 136 137 } // namespace cl 138 } // namespace gpu 139 } // namespace tflite 140 141 #endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_CL_COMMAND_QUEUE_H_ 142