1 /**
2 * Copyright 2023 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #ifdef ENABLE_AVX
17 #include "nnacl/kernel/convolution_depthwise_sw_avx.h"
18 #include "nnacl/kernel/convolution_base.h"
19 #include "nnacl/fp32/conv_depthwise_fp32.h"
20 #include "nnacl/fp32/pack_fp32.h"
21 #include "nnacl/tensor_c.h"
22
// Allocate channel-aligned packing buffers for input/output when the channel
// count is not a multiple of the AVX tile width; otherwise the raw tensor
// buffers are used directly (no packing needed).
int ConvDwSWAVXInitPackedInputOutput(ConvolutionDepthwiseSWAVXStruct *conv_dw) {
  int tile = conv_dw->oc_tile_;
  conv_dw->input_need_align_ = (conv_dw->conv_.compute_.in_c_ % tile != 0);
  conv_dw->output_need_align_ = (conv_dw->conv_.compute_.out_c_ % tile != 0);

  ExecEnv *env = conv_dw->conv_.base_.env_;
  NNACL_CHECK_NULL_RETURN_ERR(env);

  if (conv_dw->input_need_align_) {
    // Buffer holds N*H*W spatial positions, each with in_c rounded up to the tile.
    int in_blocks = UP_DIV(conv_dw->conv_.compute_.in_c_, tile);
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_dw->conv_.compute_.in_n_, conv_dw->conv_.compute_.in_hw_, NNACL_ERR);
    int in_batch_hw = conv_dw->conv_.compute_.in_n_ * conv_dw->conv_.compute_.in_hw_;
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(in_batch_hw, tile * in_blocks, NNACL_ERR);
    int in_elements = in_batch_hw * tile * in_blocks;
    conv_dw->packed_input_ = (float *)env->Alloc(env->allocator_, in_elements * sizeof(float));
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv_dw->packed_input_);
  }

  if (conv_dw->output_need_align_) {
    // Same layout for the output side, with out_c rounded up to the tile.
    int out_blocks = UP_DIV(conv_dw->conv_.compute_.out_c_, tile);
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_dw->conv_.compute_.out_n_, conv_dw->conv_.compute_.out_hw_, NNACL_ERR);
    int out_batch_hw = conv_dw->conv_.compute_.out_n_ * conv_dw->conv_.compute_.out_hw_;
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(out_batch_hw, tile * out_blocks, NNACL_ERR);
    int out_elements = out_batch_hw * tile * out_blocks;
    conv_dw->packed_output_ = (float *)env->Alloc(env->allocator_, out_elements * sizeof(float));
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv_dw->packed_output_);
  }

  return NNACL_OK;
}
52
// Repack the depthwise weight from NHWC layout into the NXHWCX layout the
// AVX sliding-window kernel consumes.
void ConvDwSWAVXPackWeight(ConvolutionBaseStruct *conv) {
  ConvolutionDepthwiseSWAVXStruct *conv_dw = (ConvolutionDepthwiseSWAVXStruct *)conv;
  NNACL_CHECK_NULL_RETURN_VOID(conv_dw);

  int oc_algin = UP_DIV(conv->compute_.out_c_, conv_dw->oc_tile_);
  // In a training session the weight can change between steps, so read it from
  // the kernel's second input tensor; otherwise use the cached origin weight.
  void *origin_weight = conv->base_.train_session_ ? conv->base_.in_[SECOND_INPUT]->data_ : conv->origin_weight_;
  NNACL_CHECK_NULL_RETURN_VOID(origin_weight);

  // Fix: pack from the weight source selected above. The previous code passed
  // conv->origin_weight_ directly, ignoring the train-session weight tensor.
  PackNHWCToNXHWCXFp32(conv->compute_.kernel_h_, conv->compute_.kernel_w_, conv->compute_.out_c_, oc_algin, 1,
                       (float *)conv->packed_weight_, (float *)origin_weight);
}
64
// Allocate the packed-weight buffer (inference path only; training reuses the
// shared workspace) and a zero-filled, tile-aligned bias buffer when the
// kernel has a bias input.
int ConvDwSWAVXMallocWeightBiasData(ConvolutionBaseStruct *conv) {
  ConvolutionDepthwiseSWAVXStruct *conv_dw = (ConvolutionDepthwiseSWAVXStruct *)conv;
  NNACL_CHECK_NULL_RETURN_ERR(conv_dw);

  int oc_blocks = UP_DIV(conv->compute_.out_c_, conv_dw->oc_tile_);
  int packed_weight_count = oc_blocks * conv_dw->oc_tile_ * conv->compute_.kernel_hw_;

  if (!conv->base_.train_session_) {
    NNACL_CHECK_MALLOC_SIZE(packed_weight_count * sizeof(float));
    conv->packed_weight_ = ConvBaseGetConvPackWeightData(conv, packed_weight_count * sizeof(float));
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv->packed_weight_);
  }

  if (conv->base_.in_size_ == THREE_TENSOR) {
    // Bias is padded up to a whole number of tiles; the tail stays zero.
    int bias_count = oc_blocks * conv_dw->oc_tile_;
    NNACL_CHECK_MALLOC_SIZE(bias_count * sizeof(float));
    conv->bias_data_ = conv->base_.env_->Alloc(conv->base_.env_->allocator_, bias_count * sizeof(float));
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv->bias_data_);
    memset(conv->bias_data_, 0, bias_count * sizeof(float));
  }
  return NNACL_OK;
}
87
ConvDwSWAvxRun(void * cdata,int task_id,float l,float r)88 int ConvDwSWAvxRun(void *cdata, int task_id, float l, float r) {
89 ConvolutionDepthwiseSWAVXStruct *conv_dw = (ConvolutionDepthwiseSWAVXStruct *)cdata;
90 NNACL_CHECK_NULL_RETURN_ERR(conv_dw);
91 ConvParameter *conv_param = (ConvParameter *)conv_dw->conv_.base_.param_;
92 NNACL_CHECK_NULL_RETURN_ERR(conv_param);
93
94 DepthwiseSWAvxFp32(conv_dw->packed_output_, conv_dw->packed_input_, (float *)conv_dw->conv_.packed_weight_,
95 (float *)conv_dw->conv_.bias_data_, conv_param, &conv_dw->sliding_param_, task_id);
96 return NNACL_OK;
97 }
98
// Release any packing buffers allocated by ConvDwSWAVXInitPackedInputOutput.
// The need_align_ flags double as ownership markers: when a flag is false the
// packed pointer aliases the tensor data and must not be freed.
void ConvDwSWAVXFreePackedInputOutput(ConvolutionDepthwiseSWAVXStruct *conv_dw) {
  ExecEnv *env = conv_dw->conv_.base_.env_;
  if (conv_dw->input_need_align_) {
    env->Free(env->allocator_, conv_dw->packed_input_);
    conv_dw->packed_input_ = NULL;
    conv_dw->input_need_align_ = false;
  }
  if (conv_dw->output_need_align_) {
    env->Free(env->allocator_, conv_dw->packed_output_);
    conv_dw->packed_output_ = NULL;
    conv_dw->output_need_align_ = false;
  }
}
111
// Execute one depthwise convolution: (optionally) pack input/output into
// tile-aligned buffers, repack weights if needed, run the AVX kernel across
// the thread pool, then unpack and free the temporary buffers.
int ConvolutionDepthwiseSWAVXCompute(KernelBase *self) {
  ConvolutionDepthwiseSWAVXStruct *conv_dw = (ConvolutionDepthwiseSWAVXStruct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_dw);

  int ret = ConvDwSWAVXInitPackedInputOutput(conv_dw);
  if (ret != NNACL_OK) {
    // Free handles partial allocation: one buffer may have succeeded.
    ConvDwSWAVXFreePackedInputOutput(conv_dw);
    return ret;
  }

  ret = ConvBaseRepackWeight(&conv_dw->conv_);
  if (ret != NNACL_OK) {
    ConvDwSWAVXFreePackedInputOutput(conv_dw);
    return ret;
  }

  TensorC *input_tensor = self->in_[FIRST_INPUT];
  NNACL_CHECK_NULL_RETURN_ERR(input_tensor);
  float *input_ptr = (float *)input_tensor->data_;
  NNACL_CHECK_NULL_RETURN_ERR(input_ptr);

  if (conv_dw->input_need_align_) {
    // Channel count is not tile-aligned: copy into the padded buffer.
    PackNHWCToNHWCXFp32(input_ptr, conv_dw->packed_input_, conv_dw->conv_.compute_.in_n_,
                        conv_dw->conv_.compute_.in_hw_, conv_dw->conv_.compute_.in_c_, conv_dw->oc_tile_);
  } else {
    // Already aligned: compute directly from the tensor data (no copy).
    conv_dw->packed_input_ = input_ptr;
  }

  TensorC *output_tensor = self->out_[OUTPUT_INDEX];
  NNACL_CHECK_NULL_RETURN_ERR(output_tensor);
  float *output_ptr = (float *)output_tensor->data_;
  NNACL_CHECK_NULL_RETURN_ERR(output_ptr);

  if (!conv_dw->output_need_align_) {
    // Aligned output: the kernel writes straight into the output tensor.
    conv_dw->packed_output_ = output_ptr;
  }

  ret = self->env_->ParallelLaunch(self->env_->thread_pool_, ConvDwSWAvxRun, self, self->thread_nr_);

  if (conv_dw->output_need_align_) {
    // Strip the channel padding back out into the caller-visible layout.
    PackNHWCXToNHWCFp32(conv_dw->packed_output_, output_ptr, conv_dw->conv_.compute_.out_n_,
                        conv_dw->conv_.compute_.out_hw_, conv_dw->conv_.compute_.out_c_, conv_dw->oc_tile_);
  }

  // Buffers are freed unconditionally so the packed pointers never dangle
  // into tensor data across calls; the launch result is returned as-is.
  ConvDwSWAVXFreePackedInputOutput(conv_dw);
  return ret;
}
159
// One-time setup: validate tensor counts, fix the AVX channel tile (8 floats),
// reserve the training workspace for the packed weight when applicable, and
// initialize the packed weight/bias buffers.
int ConvolutionDepthwiseSWAVXPrepare(KernelBase *self) {
  NNACL_CHECK_FALSE(self->in_size_ < TWO_TENSOR, NNACL_INPUT_TENSOR_ERROR);
  NNACL_CHECK_FALSE(self->out_size_ < ONE_TENSOR, NNACL_OUTPUT_TENSOR_ERROR);

  ConvolutionDepthwiseSWAVXStruct *conv_dw = (ConvolutionDepthwiseSWAVXStruct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_dw);
  ConvParameter *conv_param = (ConvParameter *)self->param_;
  NNACL_CHECK_NULL_RETURN_ERR(conv_param);

  conv_dw->oc_tile_ = C8NUM;  // AVX processes 8 output channels per lane group
  ConvBaseUpdateOriginWeightAndBias(&conv_dw->conv_);

  if (self->train_session_) {
    // Training repacks weights every step, so the packed-weight buffer comes
    // from the shared workspace instead of a one-off allocation.
    int oc_blocks = UP_DIV(conv_dw->conv_.compute_.out_c_, conv_dw->oc_tile_);
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(oc_blocks * conv_dw->oc_tile_, conv_dw->conv_.compute_.kernel_hw_, NNACL_ERR);
    int packed_weight_count = oc_blocks * conv_dw->oc_tile_ * conv_dw->conv_.compute_.kernel_hw_;
    self->work_size_ = packed_weight_count * sizeof(float);
  }

  return ConvBaseInitConvWeightBias(&conv_dw->conv_);
}
181
// Recompute shape-dependent state after a tensor resize: refresh the base
// compute info and rebuild the sliding-window parameters for the new shapes.
int ConvolutionDepthwiseSWAVXResize(KernelBase *self) {
  ConvolutionDepthwiseSWAVXStruct *conv_dw = (ConvolutionDepthwiseSWAVXStruct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_dw);
  ConvParameter *conv_param = (ConvParameter *)self->param_;
  NNACL_CHECK_NULL_RETURN_ERR(conv_param);

  // Fix: propagate a base-prepare failure instead of discarding it. The
  // previous code ignored the return value and reported NNACL_OK even when
  // the base preparation failed.
  int ret = ConvBasePrepare(&conv_dw->conv_);
  if (ret != NNACL_OK) {
    return ret;
  }

  InitSlidingParamConvDw(&conv_dw->sliding_param_, conv_param, conv_dw->oc_tile_);
  return NNACL_OK;
}
193
// Release kernel-owned resources (packed weight / bias) via the shared
// convolution-base teardown.
int ConvolutionDepthwiseSWAVXRelease(KernelBase *self) {
  ConvolutionBaseStruct *conv_base = (ConvolutionBaseStruct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_base);
  ConvBaseRelease(conv_base);
  return NNACL_OK;
}
200
CreateConvDwSWAVX(ConvParameter * conv_param)201 KernelBase *CreateConvDwSWAVX(ConvParameter *conv_param) {
202 ConvolutionDepthwiseSWAVXStruct *conv_dw =
203 (ConvolutionDepthwiseSWAVXStruct *)malloc(sizeof(ConvolutionDepthwiseSWAVXStruct));
204 NNACL_MALLOC_CHECK_NULL_RETURN_NULL(conv_dw);
205 memset(conv_dw, 0, sizeof(ConvolutionDepthwiseSWAVXStruct));
206
207 conv_dw->conv_.pack_weight_ = ConvDwSWAVXPackWeight;
208 conv_dw->conv_.malloc_weight_bias_ = ConvDwSWAVXMallocWeightBiasData;
209
210 conv_dw->conv_.base_.Prepare = ConvolutionDepthwiseSWAVXPrepare;
211 conv_dw->conv_.base_.Compute = ConvolutionDepthwiseSWAVXCompute;
212 conv_dw->conv_.base_.Resize = ConvolutionDepthwiseSWAVXResize;
213 conv_dw->conv_.base_.Release = ConvolutionDepthwiseSWAVXRelease;
214 return (KernelBase *)conv_dw;
215 }
216 #endif
217