/**
 * Copyright 2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifdef ENABLE_AVX
#include "nnacl/kernel/convolution_depthwise_sw_avx.h"
#include "nnacl/kernel/convolution_base.h"
#include "nnacl/fp32/conv_depthwise_fp32.h"
#include "nnacl/fp32/pack_fp32.h"
#include "nnacl/tensor_c.h"

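/* Allocate channel-aligned packing buffers for the input and output tensors when their
 * channel counts are not multiples of the tile size; aligned tensors are used as-is. */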
int ConvDwSWAVXInitPackedInputOutput(ConvolutionDepthwiseSWAVXStruct *conv_dw) {
  conv_dw->input_need_align_ = (conv_dw->conv_.compute_.in_c_ % conv_dw->oc_tile_ != 0);
  conv_dw->output_need_align_ = (conv_dw->conv_.compute_.out_c_ % conv_dw->oc_tile_ != 0);

  ExecEnv *env = conv_dw->conv_.base_.env_;
  NNACL_CHECK_NULL_RETURN_ERR(env);

  if (conv_dw->input_need_align_) {
    int ic_algin = UP_DIV(conv_dw->conv_.compute_.in_c_, conv_dw->oc_tile_);
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_dw->conv_.compute_.in_n_, conv_dw->conv_.compute_.in_hw_, NNACL_ERR);
    int input_bhw = conv_dw->conv_.compute_.in_n_ * conv_dw->conv_.compute_.in_hw_;
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(input_bhw, conv_dw->oc_tile_ * ic_algin, NNACL_ERR);
    int pack_input_size = input_bhw * conv_dw->oc_tile_ * ic_algin;
    conv_dw->packed_input_ = (float *)env->Alloc(env->allocator_, pack_input_size * sizeof(float));
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv_dw->packed_input_);
  }

  if (conv_dw->output_need_align_) {
    int oc_algin = UP_DIV(conv_dw->conv_.compute_.out_c_, conv_dw->oc_tile_);
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(conv_dw->conv_.compute_.out_n_, conv_dw->conv_.compute_.out_hw_, NNACL_ERR);
    int output_bhw = conv_dw->conv_.compute_.out_n_ * conv_dw->conv_.compute_.out_hw_;
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(output_bhw, conv_dw->oc_tile_ * oc_algin, NNACL_ERR);
    int pack_output_size = output_bhw * conv_dw->oc_tile_ * oc_algin;
    conv_dw->packed_output_ = (float *)env->Alloc(env->allocator_, pack_output_size * sizeof(float));
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv_dw->packed_output_);
  }

  return NNACL_OK;
}

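/* Repack the depthwise weight from NHWC into the NXHWCX layout expected by the
 * AVX sliding-window kernel, padding the output channel up to the tile size. */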
void ConvDwSWAVXPackWeight(ConvolutionBaseStruct *conv) {
  ConvolutionDepthwiseSWAVXStruct *conv_dw = (ConvolutionDepthwiseSWAVXStruct *)conv;
  NNACL_CHECK_NULL_RETURN_VOID(conv_dw);

  int oc_algin = UP_DIV(conv->compute_.out_c_, conv_dw->oc_tile_);
  void *origin_weight = conv->base_.train_session_ ? conv->base_.in_[SECOND_INPUT]->data_ : conv->origin_weight_;
  NNACL_CHECK_NULL_RETURN_VOID(origin_weight);

  PackNHWCToNXHWCXFp32(conv->compute_.kernel_h_, conv->compute_.kernel_w_, conv->compute_.out_c_, oc_algin, 1,
                       (float *)conv->packed_weight_, (float *)origin_weight);
}

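/* Allocate the tile-aligned packed weight buffer (skipped for training sessions)
 * and a zero-initialized bias buffer when a bias input tensor is present. */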
int ConvDwSWAVXMallocWeightBiasData(ConvolutionBaseStruct *conv) {
  ConvolutionDepthwiseSWAVXStruct *conv_dw = (ConvolutionDepthwiseSWAVXStruct *)conv;
  NNACL_CHECK_NULL_RETURN_ERR(conv_dw);

  int oc_algin = UP_DIV(conv->compute_.out_c_, conv_dw->oc_tile_);
  int pack_weight_size = oc_algin * conv_dw->oc_tile_ * conv->compute_.kernel_hw_;

  if (!conv->base_.train_session_) {
    NNACL_CHECK_MALLOC_SIZE(pack_weight_size * sizeof(float));
    conv->packed_weight_ = ConvBaseGetConvPackWeightData(conv, pack_weight_size * sizeof(float));
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv->packed_weight_);
  }

  if (conv->base_.in_size_ == THREE_TENSOR) {
    int bias_size = oc_algin * conv_dw->oc_tile_;
    NNACL_CHECK_MALLOC_SIZE(bias_size * sizeof(float));
    conv->bias_data_ = conv->base_.env_->Alloc(conv->base_.env_->allocator_, bias_size * sizeof(float));
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv->bias_data_);
    memset(conv->bias_data_, 0, bias_size * sizeof(float));
  }
  return NNACL_OK;
}

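/* Per-thread worker invoked by ParallelLaunch: runs the AVX sliding-window
 * depthwise convolution on the slice assigned to task_id. */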
int ConvDwSWAvxRun(void *cdata, int task_id, float l, float r) {
  ConvolutionDepthwiseSWAVXStruct *conv_dw = (ConvolutionDepthwiseSWAVXStruct *)cdata;
  NNACL_CHECK_NULL_RETURN_ERR(conv_dw);
  ConvParameter *conv_param = (ConvParameter *)conv_dw->conv_.base_.param_;
  NNACL_CHECK_NULL_RETURN_ERR(conv_param);

  DepthwiseSWAvxFp32(conv_dw->packed_output_, conv_dw->packed_input_, (float *)conv_dw->conv_.packed_weight_,
                     (float *)conv_dw->conv_.bias_data_, conv_param, &conv_dw->sliding_param_, task_id);
  return NNACL_OK;
}

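/* Free the staging buffers allocated in ConvDwSWAVXInitPackedInputOutput and reset
 * the alignment flags; pointers that alias tensor data are left untouched. */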
void ConvDwSWAVXFreePackedInputOutput(ConvolutionDepthwiseSWAVXStruct *conv_dw) {
  if (conv_dw->input_need_align_) {
    conv_dw->conv_.base_.env_->Free(conv_dw->conv_.base_.env_->allocator_, conv_dw->packed_input_);
    conv_dw->packed_input_ = NULL;
    conv_dw->input_need_align_ = false;
  }
  if (conv_dw->output_need_align_) {
    conv_dw->conv_.base_.env_->Free(conv_dw->conv_.base_.env_->allocator_, conv_dw->packed_output_);
    conv_dw->packed_output_ = NULL;
    conv_dw->output_need_align_ = false;
  }
}

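/* Main compute entry: pack the input if its channels need alignment, repack the weight,
 * launch the threaded depthwise kernel, then unpack the output and free the staging buffers. */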
int ConvolutionDepthwiseSWAVXCompute(KernelBase *self) {
  ConvolutionDepthwiseSWAVXStruct *conv_dw = (ConvolutionDepthwiseSWAVXStruct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_dw);

  int ret = ConvDwSWAVXInitPackedInputOutput(conv_dw);
  if (ret != NNACL_OK) {
    ConvDwSWAVXFreePackedInputOutput(conv_dw);
    return ret;
  }

  ret = ConvBaseRepackWeight(&conv_dw->conv_);
  if (ret != NNACL_OK) {
    ConvDwSWAVXFreePackedInputOutput(conv_dw);
    return ret;
  }

  TensorC *input_tensor = self->in_[FIRST_INPUT];
  NNACL_CHECK_NULL_RETURN_ERR(input_tensor);
  float *input_ptr = (float *)input_tensor->data_;
  NNACL_CHECK_NULL_RETURN_ERR(input_ptr);

  if (conv_dw->input_need_align_) {
    PackNHWCToNHWCXFp32(input_ptr, conv_dw->packed_input_, conv_dw->conv_.compute_.in_n_,
                        conv_dw->conv_.compute_.in_hw_, conv_dw->conv_.compute_.in_c_, conv_dw->oc_tile_);
  } else {
    conv_dw->packed_input_ = input_ptr;
  }

  TensorC *output_tensor = self->out_[OUTPUT_INDEX];
  NNACL_CHECK_NULL_RETURN_ERR(output_tensor);
  float *output_ptr = (float *)output_tensor->data_;
  NNACL_CHECK_NULL_RETURN_ERR(output_ptr);

  if (!conv_dw->output_need_align_) {
    conv_dw->packed_output_ = output_ptr;
  }

  ret = self->env_->ParallelLaunch(self->env_->thread_pool_, ConvDwSWAvxRun, self, self->thread_nr_);

  if (conv_dw->output_need_align_) {
    PackNHWCXToNHWCFp32(conv_dw->packed_output_, output_ptr, conv_dw->conv_.compute_.out_n_,
                        conv_dw->conv_.compute_.out_hw_, conv_dw->conv_.compute_.out_c_, conv_dw->oc_tile_);
  }

  ConvDwSWAVXFreePackedInputOutput(conv_dw);
  return ret;
}

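/* Validate tensor counts, select the C8NUM output-channel tile, size the packed-weight
 * workspace for training sessions, and initialize the packed weight and bias. */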
int ConvolutionDepthwiseSWAVXPrepare(KernelBase *self) {
  NNACL_CHECK_FALSE(self->in_size_ < TWO_TENSOR, NNACL_INPUT_TENSOR_ERROR);
  NNACL_CHECK_FALSE(self->out_size_ < ONE_TENSOR, NNACL_OUTPUT_TENSOR_ERROR);

  ConvolutionDepthwiseSWAVXStruct *conv_dw = (ConvolutionDepthwiseSWAVXStruct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_dw);
  ConvParameter *conv_param = (ConvParameter *)self->param_;
  NNACL_CHECK_NULL_RETURN_ERR(conv_param);

  conv_dw->oc_tile_ = C8NUM;
  ConvBaseUpdateOriginWeightAndBias(&conv_dw->conv_);

  if (self->train_session_) {
    int oc_algin = UP_DIV(conv_dw->conv_.compute_.out_c_, conv_dw->oc_tile_);
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(oc_algin * conv_dw->oc_tile_, conv_dw->conv_.compute_.kernel_hw_, NNACL_ERR);
    int pack_weight_size = oc_algin * conv_dw->oc_tile_ * conv_dw->conv_.compute_.kernel_hw_;
    self->work_size_ = pack_weight_size * sizeof(float);
  }

  return ConvBaseInitConvWeightBias(&conv_dw->conv_);
}

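/* Recompute shape-dependent state and the sliding-window parameters after the
 * input dimensions change. */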
int ConvolutionDepthwiseSWAVXResize(KernelBase *self) {
  ConvolutionDepthwiseSWAVXStruct *conv_dw = (ConvolutionDepthwiseSWAVXStruct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_dw);
  ConvParameter *conv_param = (ConvParameter *)self->param_;
  NNACL_CHECK_NULL_RETURN_ERR(conv_param);

  ConvBasePrepare(&conv_dw->conv_);

  InitSlidingParamConvDw(&conv_dw->sliding_param_, conv_param, conv_dw->oc_tile_);
  return NNACL_OK;
}

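/* Release the packed weight and bias buffers owned by the convolution base. */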
int ConvolutionDepthwiseSWAVXRelease(KernelBase *self) {
  ConvolutionBaseStruct *conv = (ConvolutionBaseStruct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv);
  ConvBaseRelease(conv);
  return NNACL_OK;
}

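/* Factory for the AVX sliding-window depthwise convolution kernel: allocates the
 * struct and wires up the base-class callbacks. */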
KernelBase *CreateConvDwSWAVX(ConvParameter *conv_param) {
  ConvolutionDepthwiseSWAVXStruct *conv_dw =
    (ConvolutionDepthwiseSWAVXStruct *)malloc(sizeof(ConvolutionDepthwiseSWAVXStruct));
  NNACL_MALLOC_CHECK_NULL_RETURN_NULL(conv_dw);
  memset(conv_dw, 0, sizeof(ConvolutionDepthwiseSWAVXStruct));

  conv_dw->conv_.pack_weight_ = ConvDwSWAVXPackWeight;
  conv_dw->conv_.malloc_weight_bias_ = ConvDwSWAVXMallocWeightBiasData;

  conv_dw->conv_.base_.Prepare = ConvolutionDepthwiseSWAVXPrepare;
  conv_dw->conv_.base_.Compute = ConvolutionDepthwiseSWAVXCompute;
  conv_dw->conv_.base_.Resize = ConvolutionDepthwiseSWAVXResize;
  conv_dw->conv_.base_.Release = ConvolutionDepthwiseSWAVXRelease;
  return (KernelBase *)conv_dw;
}
#endif