• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2017-2019 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "OpenCLTimer.h"
25 
26 #include "../Framework.h"
27 #include "../Utils.h"
28 
29 #include "arm_compute/graph/INode.h"
30 #include "arm_compute/runtime/CL/CLScheduler.h"
31 
32 #ifndef ARM_COMPUTE_CL
33 #error "You can't use OpenCLTimer without OpenCL"
34 #endif /* ARM_COMPUTE_CL */
35 
36 namespace arm_compute
37 {
38 namespace test
39 {
40 namespace framework
41 {
42 template <bool output_timestamps>
id() const43 std::string    OpenCLClock<output_timestamps>::id() const
44 {
45     if(output_timestamps)
46     {
47         return "OpenCLTimestamps";
48     }
49     else
50     {
51         return "OpenCLTimer";
52     }
53 }
54 
55 template <bool output_timestamps>
OpenCLClock(ScaleFactor scale_factor)56 OpenCLClock<output_timestamps>::OpenCLClock(ScaleFactor scale_factor)
57     : _kernels(), _real_function(nullptr), _real_graph_function(nullptr), _prefix(), _timer_enabled(false)
58 {
59     auto                        q     = CLScheduler::get().queue();
60     cl_command_queue_properties props = q.getInfo<CL_QUEUE_PROPERTIES>();
61     if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
62     {
63         CLScheduler::get().set_queue(cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE));
64     }
65 
66     switch(scale_factor)
67     {
68         case ScaleFactor::NONE:
69             _scale_factor = 1.f;
70             _unit         = "ns";
71             break;
72         case ScaleFactor::TIME_US:
73             _scale_factor = 1000.f;
74             _unit         = "us";
75             break;
76         case ScaleFactor::TIME_MS:
77             _scale_factor = 1000000.f;
78             _unit         = "ms";
79             break;
80         case ScaleFactor::TIME_S:
81             _scale_factor = 1000000000.f;
82             _unit         = "s";
83             break;
84         default:
85             ARM_COMPUTE_ERROR("Invalid scale");
86     }
87 }
88 
89 template <bool output_timestamps>
test_start()90 void           OpenCLClock<output_timestamps>::test_start()
91 {
92     // Start intercepting enqueues:
93     ARM_COMPUTE_ERROR_ON(_real_function != nullptr);
94     ARM_COMPUTE_ERROR_ON(_real_graph_function != nullptr);
95     _real_function       = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
96     _real_graph_function = graph::TaskExecutor::get().execute_function;
97     auto interceptor     = [this](
98                                cl_command_queue command_queue,
99                                cl_kernel        kernel,
100                                cl_uint          work_dim,
101                                const size_t    *gwo,
102                                const size_t    *gws,
103                                const size_t    *lws,
104                                cl_uint          num_events_in_wait_list,
105                                const cl_event * event_wait_list,
106                                cl_event *       event)
107     {
108         if(this->_timer_enabled)
109         {
110             kernel_info       info;
111             cl::Kernel        cpp_kernel(kernel, true);
112             std::stringstream ss;
113             ss << this->_prefix << cpp_kernel.getInfo<CL_KERNEL_FUNCTION_NAME>();
114             if(gws != nullptr)
115             {
116                 ss << " GWS[" << gws[0] << "," << gws[1] << "," << gws[2] << "]";
117             }
118             if(lws != nullptr)
119             {
120                 ss << " LWS[" << lws[0] << "," << lws[1] << "," << lws[2] << "]";
121             }
122             info.name = ss.str();
123             cl_event tmp;
124             cl_int   retval = this->_real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
125             info.event      = tmp;
126             this->_kernels.push_back(std::move(info));
127 
128             if(event != nullptr)
129             {
130                 //return cl_event from the intercepted call
131                 clRetainEvent(tmp);
132                 *event = tmp;
133             }
134             return retval;
135         }
136         else
137         {
138             return this->_real_function(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, event);
139         }
140     };
141 
142     // Start intercepting tasks:
143     auto task_interceptor = [this](graph::ExecutionTask & task)
144     {
145         if(task.node != nullptr && !task.node->name().empty())
146         {
147             this->_prefix = task.node->name() + "/";
148         }
149         else
150         {
151             this->_prefix = "";
152         }
153         this->_real_graph_function(task);
154         this->_prefix = "";
155     };
156 
157     CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor;
158     graph::TaskExecutor::get().execute_function = task_interceptor;
159 }
160 
161 template <bool output_timestamps>
start()162 void           OpenCLClock<output_timestamps>::start()
163 {
164     _kernels.clear();
165     _timer_enabled = true;
166 }
167 template <bool output_timestamps>
stop()168 void           OpenCLClock<output_timestamps>::stop()
169 {
170     _timer_enabled = false;
171 }
172 
173 template <bool output_timestamps>
test_stop()174 void           OpenCLClock<output_timestamps>::test_stop()
175 {
176     // Restore real function
177     CLSymbols::get().clEnqueueNDRangeKernel_ptr = _real_function;
178     graph::TaskExecutor::get().execute_function = _real_graph_function;
179     _real_graph_function                        = nullptr;
180     _real_function                              = nullptr;
181 }
182 
183 template <bool              output_timestamps>
measurements() const184 Instrument::MeasurementsMap OpenCLClock<output_timestamps>::measurements() const
185 {
186     MeasurementsMap measurements;
187     unsigned int    kernel_number = 0;
188     for(auto const &kernel : _kernels)
189     {
190         cl_ulong queued;
191         cl_ulong flushed;
192         cl_ulong start;
193         cl_ulong end;
194         kernel.event.getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &queued);
195         kernel.event.getProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, &flushed);
196         kernel.event.getProfilingInfo(CL_PROFILING_COMMAND_START, &start);
197         kernel.event.getProfilingInfo(CL_PROFILING_COMMAND_END, &end);
198         std::string name = kernel.name + " #" + support::cpp11::to_string(kernel_number++);
199 
200         if(output_timestamps)
201         {
202             measurements.emplace("[start]" + name, Measurement(start / static_cast<cl_ulong>(_scale_factor), _unit));
203             measurements.emplace("[queued]" + name, Measurement(queued / static_cast<cl_ulong>(_scale_factor), _unit));
204             measurements.emplace("[flushed]" + name, Measurement(flushed / static_cast<cl_ulong>(_scale_factor), _unit));
205             measurements.emplace("[end]" + name, Measurement(end / static_cast<cl_ulong>(_scale_factor), _unit));
206         }
207         else
208         {
209             measurements.emplace(name, Measurement((end - start) / _scale_factor, _unit));
210         }
211     }
212 
213     return measurements;
214 }
215 
216 template <bool              output_timestamps>
test_measurements() const217 Instrument::MeasurementsMap OpenCLClock<output_timestamps>::test_measurements() const
218 {
219     MeasurementsMap measurements;
220 
221     if(output_timestamps)
222     {
223         // The OpenCL clock and the wall clock are not in sync, so we use
224         // this trick to calculate the offset between the two clocks:
225         ::cl::Event event;
226         cl_ulong    now_gpu;
227 
228         // Enqueue retrieve current CPU clock and enqueue a dummy marker
229         std::chrono::high_resolution_clock::time_point now_cpu = std::chrono::high_resolution_clock::now();
230         CLScheduler::get().queue().enqueueMarker(&event);
231 
232         CLScheduler::get().queue().finish();
233         //Access the time at which the marker was enqueued:
234         event.getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &now_gpu);
235 
236         measurements.emplace("Now Wall clock", Measurement(now_cpu.time_since_epoch().count() / 1000, "us"));
237         measurements.emplace("Now OpenCL", Measurement(now_gpu / static_cast<cl_ulong>(_scale_factor), _unit));
238     }
239 
240     return measurements;
241 }
242 
243 } // namespace framework
244 } // namespace test
245 } // namespace arm_compute
246 
247 template class arm_compute::test::framework::OpenCLClock<true>;
248 template class arm_compute::test::framework::OpenCLClock<false>;
249