/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24 #include "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h"
25 
26 #include "arm_compute/core/CL/CLHelpers.h"
27 #include "arm_compute/core/GPUTarget.h"
28 #include "arm_compute/core/TensorInfo.h"
29 #include "arm_compute/core/TensorShape.h"
30 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
31 
32 namespace arm_compute
33 {
34 namespace cl_indirect_conv
35 {
36 using namespace arm_compute::misc::shape_calculator;
37 
ClIndirectConvDefaultConfigValhall(GPUTarget gpu)38 ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu)
39     : IClIndirectConvKernelConfig(gpu)
40 {
41 }
42 
configure(const ITensorInfo * src,const ITensorInfo * wei,const PadStrideInfo & conv_info)43 DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
44 {
45     using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
46 
47     ClIndirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClIndirectConvDefaultConfigValhall::configure_G77_f32,
48                                                                             &ClIndirectConvDefaultConfigValhall::configure_G77_f16);
49 
50     // Important note: Indirect convolution should not be used when the kernel size is 1x1 (pointwise). The reason is because the indirect buffer makes
51     // indirect convolution less efficient than direct convolution or gemm. For this reason, the heuristic of indirect convolution has not been tuned
52     // for the pointwise convolution cases.
53 
54     ConfigurationFunctionExecutorPtr func = configs_G77.get_function(src->data_type());
55 
56     ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for indirect convolution");
57     return (this->*func)(src, wei, conv_info);
58 }
59 
configure_G77_f32(const ITensorInfo * src,const ITensorInfo * wei,const PadStrideInfo & conv_info)60 DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
61 {
62     DirectConvComputeKernelInfo desc;
63 
64     if(src->data_layout() == DataLayout::NHWC)
65     {
66         const TensorShape dst_shape                  = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
67         const bool        export_weights_to_cl_image = export_to_cl_image(wei);
68         const int32_t stride_x     = conv_info.stride().first;
69         const int32_t stride_y     = conv_info.stride().second;
70         const int32_t ofm          = dst_shape[0];
71         const int32_t m            = (dst_shape[1]/ stride_x) * (dst_shape[2] / stride_y);
72 
73         desc.export_weights_to_cl_image = export_weights_to_cl_image;
74 
75         if(ofm <= 4)
76         {
77             desc.m0 = 1;
78             desc.n0 = 2;
79             desc.k0 = 16;
80         }
81         else
82         {
83             // The 16000 threshold value has been identified as the right
84             // one for using the biggest block size allowed on F32: 5x4x4
85             if(m < 16000)
86             {
87                 desc.m0 = 4;
88                 desc.n0 = 4;
89                 desc.k0 = 4;
90             }
91             else
92             {
93                 desc.m0 = 5;
94                 desc.n0 = 4;
95                 desc.k0 = 4;
96             }
97         }
98     }
99 
100     return desc;
101 }
102 
configure_G77_f16(const ITensorInfo * src,const ITensorInfo * wei,const PadStrideInfo & conv_info)103 DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
104 {
105     DirectConvComputeKernelInfo desc;
106 
107     if(src->data_layout() == DataLayout::NHWC)
108     {
109         const TensorShape wei_shape                  = wei->tensor_shape();
110         const TensorShape dst_shape                  = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
111         const bool        export_weights_to_cl_image = export_to_cl_image(wei);
112 
113         const int32_t ofm          = dst_shape[0];
114         const int32_t m            = dst_shape[1] * dst_shape[2];
115         const int32_t k            = wei_shape[0];
116 
117         desc.export_weights_to_cl_image = export_weights_to_cl_image;
118 
119         if(ofm <= 4)
120         {
121             // k0 should be as larger as possible. However, we should avoid
122             // having left-over for loops that make the implementation slower.
123             if((k % 16) == 0)
124             {
125                 desc.k0 = 16;
126             }
127             else if((k % 8) == 0)
128             {
129                 desc.k0 = 8;
130             }
131             else
132             {
133                 desc.k0 = 4;
134             }
135 
136             desc.m0 = 1;
137             desc.n0 = ofm;
138         }
139         else
140         {
141             // The 16000 threshold value has been identified as the right
142             // one for using the biggest block size allowed on F16: 8x4
143             if(m >= 16000 && k < 4)
144             {
145                 desc.m0 = 8;
146                 desc.n0 = 4;
147                 desc.k0 = 4;    // k0 is clamped to k inside the kernel when k is less than 4
148             }
149             else
150             {
151                 desc.m0 = 5;
152                 desc.n0 = 4;
153                 desc.k0 = 8;
154             }
155         }
156     }
157 
158     return desc;
159 }
160 } // namespace cl_indirect_conv
161 } // namespace arm_compute
162