1 /**
2 * Copyright 2023 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #if defined(ENABLE_AVX) || defined(ENABLE_ARM64)
18 #include "nnacl/kernel/convolution_slidewindow.h"
19 #include "nnacl/fp32/conv_depthwise_fp32.h"
20 #include "nnacl/fp32/pack_fp32.h"
21 #include "nnacl/tensor_c.h"
22 #include "nnacl/tensor_c_utils.h"
23
/*
 * Sets up the per-run input and output working buffers of a sliding-window convolution.
 *
 * Input:  when ic_res_ != 0 and the kernel is 1x1, a scratch buffer sized for in_c_ rounded
 *         up to a multiple of in_tile_ is allocated and filled via PackNHWCToNHWCXFp32;
 *         otherwise input_data_ simply aliases the input tensor's data.
 * Output: when oc_res_ != 0, a scratch buffer sized for out_c_ rounded up to a multiple of
 *         oc_tile_ is allocated; otherwise output_data_ aliases the output tensor's data.
 *
 * Returns NNACL_OK on success, or the error code produced by the failing check macro.
 * If the output allocation fails, an input scratch buffer allocated earlier in this call
 * is left in place; ConvolutionSWCompute releases it through ConvSWFreeTmpBuffer.
 */
int ConvSWInitTmpBuffer(ConvolutionSWStruct *conv_sw) {
  TensorC *input_tensor = conv_sw->conv_.base_.in_[FIRST_INPUT];
  NNACL_CHECK_NULL_RETURN_ERR(input_tensor);
  float *input_data = (float *)input_tensor->data_;
  NNACL_CHECK_NULL_RETURN_ERR(input_data);

  // Validate the output side before anything is allocated, so a bad output tensor
  // cannot leave a half-initialised state behind.
  TensorC *output_tensor = conv_sw->conv_.base_.out_[OUTPUT_INDEX];
  NNACL_CHECK_NULL_RETURN_ERR(output_tensor);
  float *out_data = (float *)output_tensor->data_;
  NNACL_CHECK_NULL_RETURN_ERR(out_data);

  ConvComputeParam *compute = &conv_sw->conv_.compute_;

  if (conv_sw->ic_res_ != 0 && compute->kernel_h_ == 1 && compute->kernel_w_ == 1) {
    int ic_block_num = UP_DIV(compute->in_c_, conv_sw->in_tile_);
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(ic_block_num, conv_sw->in_tile_, NNACL_ERR);
    int ic_align = ic_block_num * conv_sw->in_tile_;
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(compute->in_n_, compute->in_hw_, NNACL_ERR);
    int input_bhw = compute->in_n_ * compute->in_hw_;
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(input_bhw, ic_align, NNACL_ERR);

    conv_sw->input_data_ = (float *)conv_sw->conv_.base_.env_->Alloc(
      conv_sw->conv_.base_.env_->allocator_, (size_t)input_bhw * (size_t)ic_align * sizeof(float));
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv_sw->input_data_);

    // Pack with the same tile that sized the buffer. Using oc_tile_ here would write past
    // the allocation whenever oc_tile_ rounds in_c_ up further than in_tile_ does.
    PackNHWCToNHWCXFp32(input_data, conv_sw->input_data_, compute->in_n_, compute->in_hw_, compute->in_c_,
                        conv_sw->in_tile_);
  } else {
    conv_sw->input_data_ = input_data;
  }

  if (conv_sw->oc_res_ == 0) {  // output channels already tile-aligned: write straight into the tensor
    conv_sw->output_data_ = out_data;
  } else {  // need a channel-padded destination buffer
    int oc_block_num = UP_DIV(compute->out_c_, conv_sw->oc_tile_);
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(oc_block_num, conv_sw->oc_tile_, NNACL_ERR);
    int oc_align = oc_block_num * conv_sw->oc_tile_;
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(compute->out_n_, compute->out_hw_, NNACL_ERR);
    int output_bhw = compute->out_n_ * compute->out_hw_;
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(output_bhw, oc_align, NNACL_ERR);

    conv_sw->output_data_ = (float *)conv_sw->conv_.base_.env_->Alloc(
      conv_sw->conv_.base_.env_->allocator_, (size_t)output_bhw * (size_t)oc_align * sizeof(float));
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv_sw->output_data_);
  }

  return NNACL_OK;
}
64
/*
 * Releases the scratch buffers that ConvSWInitTmpBuffer may have allocated.
 *
 * A buffer is freed only under the condition under which ConvSWInitTmpBuffer allocates it
 * (output: oc_res_ != 0; input: ic_res_ != 0 with a 1x1 kernel, read from compute_).
 * In every other case the pointer aliases tensor data and is left untouched.
 * Freed pointers are reset to NULL, so calling this again is harmless.
 *
 * The function does not depend on the kernel's ConvParameter, so a missing parameter
 * block cannot cause the buffers to be leaked.
 */
void ConvSWFreeTmpBuffer(ConvolutionSWStruct *conv_sw) {
  if (conv_sw->output_data_ != NULL && conv_sw->oc_res_ != 0) {
    conv_sw->conv_.base_.env_->Free(conv_sw->conv_.base_.env_->allocator_, conv_sw->output_data_);
    conv_sw->output_data_ = NULL;
  }

  // Same kernel-size source (compute_) as the allocation path, so allocation and release
  // can never disagree about ownership of input_data_.
  if (conv_sw->input_data_ != NULL && conv_sw->ic_res_ != 0 && conv_sw->conv_.compute_.kernel_w_ == 1 &&
      conv_sw->conv_.compute_.kernel_h_ == 1) {
    conv_sw->conv_.base_.env_->Free(conv_sw->conv_.base_.env_->allocator_, conv_sw->input_data_);
    conv_sw->input_data_ = NULL;
  }
}
79
/*
 * Repacks the filter into the blocked layout produced by PackNHWCToNXHWCXFp32 and stores
 * the result in conv->packed_weight_.
 *
 * The source is the filter tensor's data in a train session, otherwise conv->origin_weight_.
 * Returns without packing when the kernel, the filter tensor or the source weights are NULL.
 */
void ConvSWPackWeight(ConvolutionBaseStruct *conv) {
  ConvolutionSWStruct *sw = (ConvolutionSWStruct *)conv;
  NNACL_CHECK_NULL_RETURN_VOID(sw);
  TensorC *weight = conv->base_.in_[SECOND_INPUT];
  NNACL_CHECK_NULL_RETURN_VOID(weight);

  // Filter dimensions are read from the tensor shape: batch = output channels.
  const int in_c = GetChannel(weight);
  const int out_c = GetBatch(weight);
  const int k_h = GetHeight(weight);
  const int k_w = GetWidth(weight);
  const int oc_blocks = UP_DIV(out_c, sw->oc_tile_);

  void *src = NULL;
  if (conv->base_.train_session_) {
    src = weight->data_;
  } else {
    src = conv->origin_weight_;
  }
  NNACL_CHECK_NULL_RETURN_VOID(src);

  float *dst = (float *)conv->packed_weight_;
  PackNHWCToNXHWCXFp32(k_h, k_w, out_c, oc_blocks, in_c, dst, (float *)src);
}
97
/*
 * Allocates the packed-weight buffer and the bias buffer for a sliding-window convolution.
 *
 * - Writes the filter's channel counts into the ConvParameter (input_channel_, output_channel_).
 * - Outside a train session, obtains packed_weight_ through ConvBaseGetConvPackWeightData,
 *   sized for output channels rounded up to a multiple of oc_tile_.
 * - When the kernel has three inputs, allocates bias_data_ with the same channel padding
 *   and zero-fills it. The bias values themselves are not copied here.
 *
 * Returns NNACL_OK on success, NNACL_ERR if a size computation would overflow int, or the
 * error code produced by the NULL / allocation check macros.
 */
int ConvSWMallocWeightBiasData(ConvolutionBaseStruct *conv) {
  ConvolutionSWStruct *conv_sw = (ConvolutionSWStruct *)conv;
  NNACL_CHECK_NULL_RETURN_ERR(conv_sw);
  ConvParameter *conv_param = (ConvParameter *)conv->base_.param_;
  NNACL_CHECK_NULL_RETURN_ERR(conv_param);
  TensorC *filter_tensor = conv->base_.in_[SECOND_INPUT];
  NNACL_CHECK_NULL_RETURN_ERR(filter_tensor);

  int input_channel = GetChannel(filter_tensor);
  int output_channel = GetBatch(filter_tensor);
  int kernel_h = GetHeight(filter_tensor);
  int kernel_w = GetWidth(filter_tensor);

  conv_param->input_channel_ = input_channel;
  conv_param->output_channel_ = output_channel;

  // Guard every int multiplication, matching the checks ConvolutionSWPrepare performs
  // for the same size computation.
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(kernel_h, kernel_w, NNACL_ERR);
  int kernel_plane = kernel_h * kernel_w;
  int oc_block_num = UP_DIV(output_channel, conv_sw->oc_tile_);
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(oc_block_num, conv_sw->oc_tile_, NNACL_ERR);
  int oc_align = oc_block_num * conv_sw->oc_tile_;
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(input_channel, kernel_plane, NNACL_ERR);
  int kernel_chw = input_channel * kernel_plane;
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(oc_align, kernel_chw, NNACL_ERR);
  int pack_weight_size = oc_align * kernel_chw;

  if (!conv_sw->conv_.base_.train_session_) {
    conv_sw->conv_.packed_weight_ = ConvBaseGetConvPackWeightData(conv, pack_weight_size * sizeof(float));
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv_sw->conv_.packed_weight_);
  }

  if (conv_sw->conv_.base_.in_size_ == THREE_TENSOR) {
    // One byte count for both the allocation and the clear, kept in size_t so it is
    // not narrowed through int.
    size_t bias_bytes = (size_t)oc_align * sizeof(float);
    conv->bias_data_ = conv->base_.env_->Alloc(conv->base_.env_->allocator_, bias_bytes);
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv->bias_data_);
    memset(conv->bias_data_, 0, bias_bytes);
  }
  return NNACL_OK;
}
129
ConvSWImpl(void * cdata,int task_id,float l,float r)130 int ConvSWImpl(void *cdata, int task_id, float l, float r) {
131 ConvolutionSWStruct *conv_sw = (ConvolutionSWStruct *)cdata;
132 NNACL_CHECK_NULL_RETURN_ERR(conv_sw);
133 return conv_sw->conv_.run_impl_(&conv_sw->conv_, task_id);
134 }
135
/*
 * Runs one sliding-window convolution.
 *
 * Steps: set up the temporary buffers, repack the weights via ConvBaseRepackWeight, launch
 * ConvSWImpl on the thread pool, and - when oc_res_ != 0 - unpack the channel-padded result
 * into the output tensor with PackNHWCXToNHWCFp32.
 *
 * Everything the unpack step needs is validated before any buffer is allocated, and the
 * function has a single exit that always calls ConvSWFreeTmpBuffer. No early return can
 * therefore leak the scratch buffers.
 *
 * Returns NNACL_OK on success, otherwise the first error reported by a step or check.
 */
int ConvolutionSWCompute(KernelBase *self) {
  ConvolutionSWStruct *conv_sw = (ConvolutionSWStruct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_sw);

  const int need_unpack = (conv_sw->oc_res_ != 0);
  ConvParameter *conv_param = (ConvParameter *)self->param_;
  float *out_data = NULL;
  if (need_unpack) {
    // These macros return immediately, which is only safe while nothing is allocated yet.
    NNACL_CHECK_NULL_RETURN_ERR(conv_param);
    NNACL_CHECK_NULL_RETURN_ERR(self->out_[OUTPUT_INDEX]);
    out_data = (float *)self->out_[OUTPUT_INDEX]->data_;
    NNACL_CHECK_NULL_RETURN_ERR(out_data);
  }

  int ret = ConvSWInitTmpBuffer(conv_sw);
  if (ret == NNACL_OK) {
    ret = ConvBaseRepackWeight(&conv_sw->conv_);
  }
  if (ret == NNACL_OK) {
    ret = self->env_->ParallelLaunch(self->env_->thread_pool_, ConvSWImpl, self, self->thread_nr_);
  }
  if (ret == NNACL_OK && need_unpack) {
    PackNHWCXToNHWCFp32(conv_sw->output_data_, out_data, conv_param->output_batch_,
                        conv_param->output_h_ * conv_param->output_w_, conv_param->output_channel_, conv_sw->oc_tile_);
  }

  // Single cleanup point for both the success path and every failure path.
  ConvSWFreeTmpBuffer(conv_sw);
  return ret;
}
170
/*
 * Release hook of the kernel: delegates to ConvBaseRelease.
 *
 * Returns NNACL_OK, or the NULL-check macro's error code when self is NULL.
 */
int ConvolutionSWRelease(KernelBase *self) {
  NNACL_CHECK_NULL_RETURN_ERR(self);
  ConvBaseRelease((ConvolutionBaseStruct *)self);
  return NNACL_OK;
}
177
/*
 * Resize hook of the kernel: validates the new shapes, runs the common convolution
 * preparation, then recomputes the sliding-window parameters for the current tiles.
 *
 * Returns NNACL_OK on success, or the first error returned by ConvBaseCheckResizeValid /
 * ConvBasePrepare (sliding parameters are left untouched in that case).
 */
int ConvolutionSWResize(KernelBase *self) {
  ConvolutionSWStruct *sw = (ConvolutionSWStruct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(sw);
  ConvParameter *param = (ConvParameter *)self->param_;
  NNACL_CHECK_NULL_RETURN_ERR(param);

  int status = ConvBaseCheckResizeValid(&sw->conv_);
  if (status == NNACL_OK) {
    // Only prepare once the resize has been accepted.
    status = ConvBasePrepare(&sw->conv_);
  }
  if (status != NNACL_OK) {
    return status;
  }

  InitSlidingParamConv(&sw->sw_param_, param, sw->in_tile_, sw->oc_tile_);
  return NNACL_OK;
}
197
/*
 * Prepare hook of the kernel.
 *
 * Calls the init_global_variable_ hook, then - in a train session only - validates that the
 * filter is 4-D and records in work_size_ the byte size of the packed weights (output
 * channels rounded up to a multiple of oc_tile_). Finishes by delegating to
 * ConvBaseInitConvWeightBias.
 *
 * Returns the result of ConvBaseInitConvWeightBias, NNACL_CONVOLUTION_WEIGHT_SHAPE_INVALID
 * for a non-4-D filter, NNACL_ERR when a size multiplication would overflow, or the
 * NULL-check macro's error code.
 */
int ConvolutionSWPrepare(KernelBase *self) {
  ConvolutionSWStruct *conv_sw = (ConvolutionSWStruct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_sw);

  conv_sw->conv_.init_global_variable_(&conv_sw->conv_);

  // Inference sessions need no workspace sizing.
  if (!self->train_session_) {
    return ConvBaseInitConvWeightBias(&conv_sw->conv_);
  }

  TensorC *weight = self->in_[SECOND_INPUT];
  NNACL_CHECK_NULL_RETURN_ERR(weight);
  NNACL_CHECK_FALSE(weight->shape_size_ != DIMENSION_4D, NNACL_CONVOLUTION_WEIGHT_SHAPE_INVALID);

  const int in_c = GetChannel(weight);
  const int out_c = GetBatch(weight);
  const int k_h = GetHeight(weight);
  const int k_w = GetWidth(weight);

  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(k_h, k_w, NNACL_ERR);
  const int k_hw = k_h * k_w;
  const int oc_blocks = UP_DIV(out_c, conv_sw->oc_tile_);
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(in_c, k_hw, NNACL_ERR);
  const int k_chw = in_c * k_hw;
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(oc_blocks * conv_sw->oc_tile_, k_chw, NNACL_ERR);
  const int packed_elems = oc_blocks * conv_sw->oc_tile_ * k_chw;

  conv_sw->conv_.base_.work_size_ = packed_elems * sizeof(float);

  return ConvBaseInitConvWeightBias(&conv_sw->conv_);
}
227 #endif
228