/*
 * Copyright (c) 2018-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/CL/tuners/BifrostTuner.h"

#include "arm_compute/core/CL/CLHelpers.h"
#include "src/core/CL/CLKernels.h"
#include "support/Cast.h"

namespace arm_compute
{
namespace tuners
{
namespace
{
/** Tunes a @ref CLDirectConvolutionLayerKernel for a Bifrost target
 *
 * @param[in, out] k Kernel to tune
 */
void tune_direct_convolution_kernel(CLDirectConvolutionLayerKernel &k)
{
    cl::NDRange lws_hint = k.lws_hint();

    const GPUTarget    gpu_target    = k.get_target();
    const DataType     dt            = k._input->info()->data_type();
    const TensorShape  weights_shape = k._weights->info()->tensor_shape();
    const TensorShape  inputs_shape  = k._input->info()->tensor_shape();
    const size_t       kernel_size   = weights_shape.x();
    const unsigned int stride_x      = k._conv_stride_x;
    const unsigned int stride_y      = k._conv_stride_y;

    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && (kernel_size <= 5) && (stride_x == 1) && (stride_y == 1) && (dt == DataType::F32))
    {
        // Through extensive experimentation with over 30 representative tensor
        // shapes, we found a small number of local work size configurations
        // that result in nearly optimal execution times. Selecting the right
        // lws for a given shape, however, required a complex decision tree,
        // until we constructed a simple feature as described below.
        //
        // We started from the number of multiply-accumulate operations for a
        // convolution layer, which is equal to the product of the input
        // dimensions 0..2 and the weights dimensions 0..2. Unfortunately,
        // this resulted in ties between distinct shapes that required distinct
        // lws configurations. Replacing the width of the input with the kernel
        // size, however, resulted in nearly optimal predictions. We use underscores
        // in variable names to indicate when they are intentionally misleading.
        const size_t product_of_weights_dimensions = weights_shape[0] * weights_shape[1] * weights_shape[2];
        const size_t product_of_input_dimensions_  = inputs_shape[0] * inputs_shape[1] * inputs_shape[2];
        const float  mega_ops_                     = 1e-6 * product_of_weights_dimensions * product_of_input_dimensions_;

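        // Purely illustrative example (a hypothetical shape, not one of the tuned networks):
        // a 3x3x64 kernel applied to a 56x56x64 input gives
        // product_of_weights_dimensions = 3 * 3 * 64 = 576 and
        // product_of_input_dimensions_  = 56 * 56 * 64 = 200704, so
        // mega_ops_ = 1e-6 * 576 * 200704 ~= 115.6. That value reaches the final branch of the
        // kernel_size == 3 case below (mega_ops_ >= 50) and selects lws_hint = cl::NDRange(2, 1, 6).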
        switch(kernel_size)
        {
            case 1:
            {
                if(mega_ops_ < 1.f)
                {
                    lws_hint = cl::NDRange(1, 1, 8);
                }
                else if(mega_ops_ < 7.f)
                {
                    lws_hint = cl::NDRange(1, 1, 4);
                }
                else
                {
                    lws_hint = cl::NDRange(1, 1, 2);
                }
                break;
            }
            case 3:
            {
                if(mega_ops_ < 1.f)
                {
                    lws_hint = cl::NDRange(1, 1, 8);
                }
                else if(mega_ops_ < 13.f)
                {
                    lws_hint = cl::NDRange(2, 1, 4);
                }
                else if(mega_ops_ < 50.f)
                {
                    lws_hint = cl::NDRange(3, 1, 4);
                }
                else
                {
                    lws_hint = cl::NDRange(2, 1, 6);
                }
                break;
            }
            case 5:
            {
                if(mega_ops_ < 2.f || mega_ops_ > 80.f)
                {
                    lws_hint = cl::NDRange(2, 1, 4);
                }
                else
                {
                    lws_hint = cl::NDRange(2, 1, 8);
                }
                break;
            }
            default:
                break;
        }
        k.set_lws_hint(lws_hint);
    }
}

void tune_col2im_kernel(CLCol2ImKernel &k)
{
    cl::NDRange     lws_hint   = k.lws_hint();
    const GPUTarget gpu_target = k.get_target();

    // Configure the local work size for Bifrost with a value obtained
    // via exhaustive autotuning over 30 representative tensor shapes.
    if(gpu_target_is_in(gpu_target,
                        GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
                        GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
                        GPUTarget::G52, GPUTarget::G52LIT))
    {
        if((k._convolved_dims.width == 7) || (k._convolved_dims.width == 14))
        {
            lws_hint = cl::NDRange(1, 7, 1);
        }
        else
        {
            lws_hint = cl::NDRange(1, 8, 1);
        }
    }

    k.set_lws_hint(lws_hint);
}

void tune_im2col_kernel(CLIm2ColKernel &k)
{
    cl::NDRange     lws_hint   = k.lws_hint();
    const GPUTarget gpu_target = k.get_target();

    // Local work size optimized for the 11x11 AlexNet convolution on Bifrost.
    if(gpu_target_is_in(gpu_target,
                        GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
                        GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
                        GPUTarget::G52, GPUTarget::G52LIT)
       && k._kernel_dims.width == 11)
    {
        const bool is_square_kernel = (k._kernel_dims.width == k._kernel_dims.height);
        if(!is_square_kernel && k._kernel_dims.width > 1 && !k._conv_info.has_padding())
        {
            lws_hint = cl::NDRange(1, 1, 1);
        }
    }
    k.set_lws_hint(lws_hint);
}

void tune_gemv_kernel(CLGEMMMatrixVectorMultiplyKernel &k)
{
    cl::NDRange     lws_hint   = k.lws_hint();
    const GPUTarget gpu_target = k.get_target();

    // Configure the local work size for Bifrost with a value obtained
    // via exhaustive autotuning for the MobileNets tensor shapes.
    if(gpu_target_is_in(gpu_target,
                        GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
                        GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
                        GPUTarget::G52, GPUTarget::G52LIT))
    {
        lws_hint = cl::NDRange(1, 1, 1);
    }

    k.set_lws_hint(lws_hint);
}

void tune_gemm_kernel(CLGEMMMatrixMultiplyKernel &k)
{
    cl::NDRange     lws_hint   = k.lws_hint();
    const GPUTarget gpu_target = k.get_target();

    // Configure LWS hint
    switch(gpu_target)
    {
        case GPUTarget::G71:
        case GPUTarget::G72:
        case GPUTarget::G51:
        case GPUTarget::G51BIG:
        case GPUTarget::G51LIT:
        case GPUTarget::G52:
        case GPUTarget::G52LIT:
        case GPUTarget::G76:
            if(k._input1->info()->dimension(1) == 24)
            {
                // LWS optimized for the 11x11 AlexNet convolution on Bifrost.
                lws_hint = cl::NDRange(2, 2);
            }
            else if(k._output->info()->dimension(1) == 196)
            {
                lws_hint = cl::NDRange(1, 7);
            }
            else
            {
                lws_hint = cl::NDRange(8, 8);
            }
            break;
        default:
            lws_hint = cl::NullRange;
    }

    k.set_lws_hint(lws_hint);
}

void tune_pooling_kernel(CLPoolingLayerKernel &k)
{
    cl::NDRange     lws_hint   = k.lws_hint();
    const GPUTarget gpu_target = k.get_target();

    // Configure the local work size (hint) from the first two dimensions of the global work size.
    // On Bifrost, this works for up to 35x35xC filters, for which the pooling_layer_3_optimized
    // kernel is launched with gws=(9, 33, C). In any case, the hint will be ignored if it is
    // invalid (e.g. exceeds the maximum workgroup size that the kernel can be launched with).
    if(k._input->info()->data_layout() == DataLayout::NCHW)
    {
        if(gpu_target_is_in(gpu_target,
                            GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
                            GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
                            GPUTarget::G52, GPUTarget::G52LIT))
        {
            cl::NDRange gws = ICLKernel::gws_from_window(k.window());
            lws_hint        = cl::NDRange(gws[0], gws[1], 1);
        }
    }

    k.set_lws_hint(lws_hint);
}

void tune_scale_kernel(CLScaleKernel &k)
{
    cl::NDRange               lws_hint      = k.lws_hint();
    const GPUTarget           gpu_target    = k.get_target();
    const DataType            dt            = k.input()->info()->data_type();
    const InterpolationPolicy interpolation = k.get_interpolation_policy();

    // Configure the local work size for Bifrost with bilinear interpolation and data type F32.
    // The values were obtained via exhaustive autotuning.
    if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && (dt == DataType::F32) && (interpolation == InterpolationPolicy::BILINEAR))
    {
        auto dim_0 = k.output()->info()->dimension(0);
        if(dim_0 == 480)
        {
            lws_hint = cl::NDRange(2, 1);
        }
        else if(dim_0 == 3120)
        {
            lws_hint = cl::NDRange(2, 8);
        }
        else if(dim_0 == 4160)
        {
            lws_hint = cl::NDRange(4, 8);
        }
        k.set_lws_hint(lws_hint);
    }
}
} // namespace

void BifrostTuner::tune_kernel_static(ICLKernel &kernel)
{
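    // Identify the concrete kernel type at runtime via dynamic_cast and forward it to the
    // matching static tuning routine; kernels with no Bifrost-specific tuning are left untouched.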
    if(dynamic_cast<CLDirectConvolutionLayerKernel *>(&kernel) != nullptr)
    {
        tune_direct_convolution_kernel(*utils::cast::polymorphic_downcast<CLDirectConvolutionLayerKernel *>(&kernel));
    }
    else if(dynamic_cast<CLCol2ImKernel *>(&kernel) != nullptr)
    {
        tune_col2im_kernel(*utils::cast::polymorphic_downcast<CLCol2ImKernel *>(&kernel));
    }
    else if(dynamic_cast<CLIm2ColKernel *>(&kernel) != nullptr)
    {
        tune_im2col_kernel(*utils::cast::polymorphic_downcast<CLIm2ColKernel *>(&kernel));
    }
    else if(dynamic_cast<CLGEMMMatrixVectorMultiplyKernel *>(&kernel) != nullptr)
    {
        tune_gemv_kernel(*utils::cast::polymorphic_downcast<CLGEMMMatrixVectorMultiplyKernel *>(&kernel));
    }
    else if(dynamic_cast<CLGEMMMatrixMultiplyKernel *>(&kernel) != nullptr)
    {
        tune_gemm_kernel(*utils::cast::polymorphic_downcast<CLGEMMMatrixMultiplyKernel *>(&kernel));
    }
    else if(dynamic_cast<CLPoolingLayerKernel *>(&kernel) != nullptr)
    {
        tune_pooling_kernel(*utils::cast::polymorphic_downcast<CLPoolingLayerKernel *>(&kernel));
    }
    else if(dynamic_cast<CLScaleKernel *>(&kernel) != nullptr)
    {
        tune_scale_kernel(*utils::cast::polymorphic_downcast<CLScaleKernel *>(&kernel));
    }
}

void BifrostTuner::tune_kernel_dynamic(ICLKernel &kernel)
{
    ARM_COMPUTE_UNUSED(kernel);
}

void BifrostTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
{
    ARM_COMPUTE_UNUSED(kernel, tensors);
}
} // namespace tuners
} // namespace arm_compute
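
// Minimal usage sketch (illustrative only, not part of this translation unit): the tuner is
// typically installed on the CLScheduler so that tune_kernel_static() runs when kernels are
// configured. The exact default_init() signature may differ between library versions.
//
//     arm_compute::tuners::BifrostTuner tuner;
//     arm_compute::CLScheduler::get().default_init(&tuner);
//     // CL functions configured afterwards will have their kernels' lws hints tuned.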