• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #ifndef ARM_COMPUTE_ICLKERNEL_H
25 #define ARM_COMPUTE_ICLKERNEL_H
26 
27 #include "arm_compute/core/CL/CLKernelLibrary.h"
28 #include "arm_compute/core/CL/CLTypes.h"
29 #include "arm_compute/core/CL/OpenCL.h"
30 #include "arm_compute/core/GPUTarget.h"
31 #include "arm_compute/core/IKernel.h"
32 #include "arm_compute/core/Validate.h"
33 #include "arm_compute/core/experimental/Types.h"
34 
35 #include <string>
36 
37 namespace arm_compute
38 {
39 template <typename T>
40 class ICLArray;
41 class ICLTensor;
42 class Window;
43 
44 /** Common interface for all the OpenCL kernels */
45 class ICLKernel : public IKernel
46 {
47 private:
48     /** Returns the number of arguments enqueued per array object.
49      *
50      * @return The number of arguments enqueued per array object.
51      */
52     template <unsigned int        dimension_size>
num_arguments_per_array()53     constexpr static unsigned int num_arguments_per_array()
54     {
55         return num_arguments_per_tensor<dimension_size>();
56     }
57     /** Returns the number of arguments enqueued per tensor object.
58      *
59      * @return The number of arguments enqueued per tensor object.
60      */
61     template <unsigned int        dimension_size>
num_arguments_per_tensor()62     constexpr static unsigned int num_arguments_per_tensor()
63     {
64         return 2 + 2 * dimension_size;
65     }
66     using IKernel::configure; //Prevent children from calling IKernel::configure() directly
67 protected:
68     /** Configure the kernel's window and local workgroup size hint.
69      *
70      * @param[in] window   The maximum window which will be returned by window()
71      * @param[in] lws_hint (Optional) Local-Workgroup-Size to use.
72      */
73     void configure_internal(const Window &window, cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange())
74     {
75         _lws_hint = lws_hint;
76         IKernel::configure(window);
77     }
78 
79 public:
80     /** Constructor */
ICLKernel()81     ICLKernel()
82         : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _lws_hint()
83     {
84     }
85     /** Returns a reference to the OpenCL kernel of this object.
86      *
87      * @return A reference to the OpenCL kernel of this object.
88      */
kernel()89     cl::Kernel &kernel()
90     {
91         return _kernel;
92     }
93     /** Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx.
94      *
95      * @param[in,out] idx            Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
96      * @param[in]     array          Array to set as an argument of the object's kernel.
97      * @param[in]     strides        @ref Strides object containing stride of each dimension in bytes.
98      * @param[in]     num_dimensions Number of dimensions of the @p array.
99      * @param[in]     window         Window the kernel will be executed on.
100      */
101     template <typename T>
add_1D_array_argument(unsigned int & idx,const ICLArray<T> * array,const Strides & strides,unsigned int num_dimensions,const Window & window)102     void add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
103     {
104         add_array_argument<T, 1>(idx, array, strides, num_dimensions, window);
105     }
106     /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
107      *
108      * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
109      * @param[in]     tensor Tensor to set as an argument of the object's kernel.
110      * @param[in]     window Window the kernel will be executed on.
111      */
add_1D_tensor_argument(unsigned int & idx,const ICLTensor * tensor,const Window & window)112     void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
113     {
114         add_tensor_argument<1>(idx, tensor, window);
115     }
116     /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true.
117      *
118      * @param[in]     cond   Condition to check
119      * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
120      * @param[in]     tensor Tensor to set as an argument of the object's kernel.
121      * @param[in]     window Window the kernel will be executed on.
122      */
add_1D_tensor_argument_if(bool cond,unsigned int & idx,const ICLTensor * tensor,const Window & window)123     void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
124     {
125         if(cond)
126         {
127             add_1D_tensor_argument(idx, tensor, window);
128         }
129     }
130     /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
131      *
132      * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
133      * @param[in]     tensor Tensor to set as an argument of the object's kernel.
134      * @param[in]     window Window the kernel will be executed on.
135      */
add_2D_tensor_argument(unsigned int & idx,const ICLTensor * tensor,const Window & window)136     void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
137     {
138         add_tensor_argument<2>(idx, tensor, window);
139     }
140     /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true.
141      *
142      * @param[in]     cond   Condition to check
143      * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
144      * @param[in]     tensor Tensor to set as an argument of the object's kernel.
145      * @param[in]     window Window the kernel will be executed on.
146      */
add_2D_tensor_argument_if(bool cond,unsigned int & idx,const ICLTensor * tensor,const Window & window)147     void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
148     {
149         if(cond)
150         {
151             add_2D_tensor_argument(idx, tensor, window);
152         }
153     }
154     /** Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
155      *
156      * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
157      * @param[in]     tensor Tensor to set as an argument of the object's kernel.
158      * @param[in]     window Window the kernel will be executed on.
159      */
add_3D_tensor_argument(unsigned int & idx,const ICLTensor * tensor,const Window & window)160     void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
161     {
162         add_tensor_argument<3>(idx, tensor, window);
163     }
164     /** Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx.
165      *
166      * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
167      * @param[in]     tensor Tensor to set as an argument of the object's kernel.
168      * @param[in]     window Window the kernel will be executed on.
169      */
add_4D_tensor_argument(unsigned int & idx,const ICLTensor * tensor,const Window & window)170     void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
171     {
172         add_tensor_argument<4>(idx, tensor, window);
173     }
174     /** Returns the number of arguments enqueued per 1D array object.
175      *
176      * @return The number of arguments enqueues per 1D array object.
177      */
num_arguments_per_1D_array()178     constexpr static unsigned int num_arguments_per_1D_array()
179     {
180         return num_arguments_per_array<1>();
181     }
182     /** Returns the number of arguments enqueued per 1D tensor object.
183      *
184      * @return The number of arguments enqueues per 1D tensor object.
185      */
num_arguments_per_1D_tensor()186     constexpr static unsigned int num_arguments_per_1D_tensor()
187     {
188         return num_arguments_per_tensor<1>();
189     }
190     /** Returns the number of arguments enqueued per 2D tensor object.
191      *
192      * @return The number of arguments enqueues per 2D tensor object.
193      */
num_arguments_per_2D_tensor()194     constexpr static unsigned int num_arguments_per_2D_tensor()
195     {
196         return num_arguments_per_tensor<2>();
197     }
198     /** Returns the number of arguments enqueued per 3D tensor object.
199      *
200      * @return The number of arguments enqueues per 3D tensor object.
201      */
num_arguments_per_3D_tensor()202     constexpr static unsigned int num_arguments_per_3D_tensor()
203     {
204         return num_arguments_per_tensor<3>();
205     }
206     /** Returns the number of arguments enqueued per 4D tensor object.
207      *
208      * @return The number of arguments enqueues per 4D tensor object.
209      */
num_arguments_per_4D_tensor()210     constexpr static unsigned int num_arguments_per_4D_tensor()
211     {
212         return num_arguments_per_tensor<4>();
213     }
214     /** Enqueue the OpenCL kernel to process the given window  on the passed OpenCL command queue.
215      *
216      * @note The queue is *not* flushed by this method, and therefore the kernel will not have been executed by the time this method returns.
217      *
218      * @param[in]     window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
219      * @param[in,out] queue  Command queue on which to enqueue the kernel.
220      */
run(const Window & window,cl::CommandQueue & queue)221     virtual void run(const Window &window, cl::CommandQueue &queue)
222     {
223         ARM_COMPUTE_UNUSED(window, queue);
224     }
225     /** Enqueue the OpenCL kernel to process the given window  on the passed OpenCL command queue.
226      *
227      * @note The queue is *not* flushed by this method, and therefore the kernel will not have been executed by the time this method returns.
228      *
229      * @param[in]     tensors A vector containing the tensors to operato on.
230      * @param[in]     window  Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
231      * @param[in,out] queue   Command queue on which to enqueue the kernel.
232      */
run_op(ITensorPack & tensors,const Window & window,cl::CommandQueue & queue)233     virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
234     {
235         ARM_COMPUTE_UNUSED(tensors, window, queue);
236     }
237     /** Add the passed parameters to the object's kernel's arguments starting from the index idx.
238      *
239      * @param[in,out] idx   Index at which to start adding the arguments. Will be incremented by the number of kernel arguments set.
240      * @param[in]     value Value to set as an argument of the object's kernel.
241      */
242     template <typename T>
add_argument(unsigned int & idx,T value)243     void add_argument(unsigned int &idx, T value)
244     {
245         _kernel.setArg(idx++, value);
246     }
247 
248     /** Set the Local-Workgroup-Size hint
249      *
250      * @note This method should be called after the configuration of the kernel
251      *
252      * @param[in] lws_hint Local-Workgroup-Size to use
253      */
set_lws_hint(const cl::NDRange & lws_hint)254     void set_lws_hint(const cl::NDRange &lws_hint)
255     {
256         ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); // lws_hint will be overwritten by configure()
257         _lws_hint = lws_hint;
258     }
259 
260     /** Return the Local-Workgroup-Size hint
261      *
262      * @return Current lws hint
263      */
lws_hint()264     cl::NDRange lws_hint() const
265     {
266         return _lws_hint;
267     }
268 
269     /** Get the configuration ID
270      *
271      * @note The configuration ID can be used by the caller to distinguish different calls of the same OpenCL kernel
272      *       In particular, this method can be used by CLScheduler to keep track of the best LWS for each configuration of the same kernel.
273      *       The configuration ID should be provided only for the kernels potentially affected by the LWS geometry
274      *
275      * @note This method should be called after the configuration of the kernel
276      *
277      * @return configuration id string
278      */
config_id()279     const std::string &config_id() const
280     {
281         return _config_id;
282     }
283 
284     /** Set the targeted GPU architecture
285      *
286      * @param[in] target The targeted GPU architecture
287      */
set_target(GPUTarget target)288     void set_target(GPUTarget target)
289     {
290         _target = target;
291     }
292 
293     /** Set the targeted GPU architecture according to the CL device
294      *
295      * @param[in] device A CL device
296      */
297     void set_target(cl::Device &device);
298 
299     /** Get the targeted GPU architecture
300      *
301      * @return The targeted GPU architecture.
302      */
get_target()303     GPUTarget get_target() const
304     {
305         return _target;
306     }
307 
308     /** Get the maximum workgroup size for the device the CLKernelLibrary uses.
309      *
310      * @return The maximum workgroup size value.
311      */
312     size_t get_max_workgroup_size();
313     /** Get the global work size given an execution window
314      *
315      * @param[in] window Execution window
316      *
317      * @return Global work size of the given execution window
318      */
319     static cl::NDRange gws_from_window(const Window &window);
320 
321 private:
322     /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
323      *
324      * @param[in,out] idx            Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
325      * @param[in]     array          Array to set as an argument of the object's kernel.
326      * @param[in]     strides        @ref Strides object containing stride of each dimension in bytes.
327      * @param[in]     num_dimensions Number of dimensions of the @p array.
328      * @param[in]     window         Window the kernel will be executed on.
329      */
330     template <typename T, unsigned int dimension_size>
331     void add_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window);
332     /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
333      *
334      * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
335      * @param[in]     tensor Tensor to set as an argument of the object's kernel.
336      * @param[in]     window Window the kernel will be executed on.
337      */
338     template <unsigned int dimension_size>
339     void add_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
340 
341 protected:
342     cl::Kernel  _kernel;             /**< OpenCL kernel to run */
343     GPUTarget   _target;             /**< The targeted GPU */
344     std::string _config_id;          /**< Configuration ID */
345     size_t      _max_workgroup_size; /**< The maximum workgroup size for this kernel */
346 private:
347     cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */
348 };
349 
350 /** Add the kernel to the command queue with the given window.
351  *
352  * @note Depending on the size of the window, this might translate into several jobs being enqueued.
353  *
354  * @note If kernel->kernel() is empty then the function will return without adding anything to the queue.
355  *
356  * @param[in,out] queue                OpenCL command queue.
357  * @param[in]     kernel               Kernel to enqueue
358  * @param[in]     window               Window the kernel has to process.
359  * @param[in]     lws_hint             (Optional) Local workgroup size requested. Default is based on the device target.
360  * @param[in]     use_dummy_work_items (Optional) Use dummy work items in order to have two dimensional power of two NDRange. Default is false
361  *                                     Note: it is kernel responsibility to check if the work-item is out-of-range
362  *
363  * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
364  */
365 void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items = false);
366 
367 /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
368  *
369  * @param[in,out] idx            Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
370  * @param[in]     array          Array to set as an argument of the object's kernel.
371  * @param[in]     strides        @ref Strides object containing stride of each dimension in bytes.
372  * @param[in]     num_dimensions Number of dimensions of the @p array.
373  * @param[in]     window         Window the kernel will be executed on.
374  */
375 template <typename T, unsigned int dimension_size>
add_array_argument(unsigned & idx,const ICLArray<T> * array,const Strides & strides,unsigned int num_dimensions,const Window & window)376 void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
377 {
378     ARM_COMPUTE_ERROR_ON(array == nullptr);
379 
380     // Calculate offset to the start of the window
381     unsigned int offset_first_element = 0;
382 
383     for(unsigned int n = 0; n < num_dimensions; ++n)
384     {
385         offset_first_element += window[n].start() * strides[n];
386     }
387 
388     unsigned int idx_start = idx;
389     _kernel.setArg(idx++, array->cl_buffer());
390 
391     for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
392     {
393         _kernel.setArg<cl_uint>(idx++, strides[dimension]);
394         _kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step());
395     }
396 
397     _kernel.setArg<cl_uint>(idx++, offset_first_element);
398 
399     ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_array<dimension_size>() != idx,
400                                  "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>());
401     ARM_COMPUTE_UNUSED(idx_start);
402 }
403 }
404 #endif /*ARM_COMPUTE_ICLKERNEL_H */
405