/**
 * Copyright 2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#if defined(ENABLE_ARM) || (defined(ENABLE_SSE) && !defined(ENABLE_AVX))
#include "nnacl/kernel/convolution_depthwise_3x3.h"
#include "nnacl/kernel/convolution_base.h"
#include "nnacl/fp32/conv_depthwise_fp32.h"
#include "nnacl/fp32/pack_fp32.h"

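/* Per-thread worker passed to ParallelLaunch: each task takes its own slice of the shared
 * transform buffer and a contiguous range of output rows, then runs the F(2, 3) depthwise
 * 3x3 kernel on that row range. */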
int ConvDw3x3Run(void *cdata, int task_id, float l, float r) {
  ConvolutionDepthwise3x3Struct *conv_dw = (ConvolutionDepthwise3x3Struct *)cdata;
  NNACL_CHECK_NULL_RETURN_ERR(conv_dw);

  int units = UP_DIV(conv_dw->conv_.compute_.out_w_, C2NUM);  // F(2, 3) contains 2 conv units
  int c4 = UP_ROUND(conv_dw->conv_.compute_.in_c_, C4NUM);
  int c12c4_units = C12NUM * c4 * units;
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(c12c4_units, task_id, NNACL_ERR);
  float *buffer = conv_dw->buffer_ + c12c4_units * task_id;
  NNACL_CHECK_ZERO_RETURN_ERR(conv_dw->conv_.base_.thread_nr_);

  int step_oh = UP_DIV(conv_dw->conv_.compute_.out_h_, conv_dw->conv_.base_.thread_nr_);
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(step_oh, task_id, NNACL_ERR);
  int start_oh = step_oh * task_id;
  int end_oh = MSMIN(start_oh + step_oh, conv_dw->conv_.compute_.out_h_);

  ConvParameter *conv_param = (ConvParameter *)conv_dw->conv_.base_.param_;
  NNACL_CHECK_NULL_RETURN_ERR(conv_param);
  ConvDw3x3(conv_dw->output_ptr_, buffer, conv_dw->input_ptr_, (float *)conv_dw->conv_.packed_weight_,
            (float *)conv_dw->conv_.bias_data_, conv_param, start_oh, end_oh);
  return NNACL_OK;
}

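/* Repacks the original depthwise weights into the layout expected by the 3x3 kernel.
 * In a training session the weights are re-read from the second input tensor. */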
void ConvDw3x3PackWeight(ConvolutionBaseStruct *conv) {
  void *origin_weight = (conv->base_.train_session_) ? conv->base_.in_[SECOND_INPUT]->data_ : conv->origin_weight_;
  NNACL_CHECK_NULL_RETURN_VOID(origin_weight);
  PackWeightConvDw3x3Fp32((float *)origin_weight, (float *)conv->packed_weight_, conv->compute_.out_c_);
}

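/* Allocates the packed weight buffer (only outside training sessions) and a C4-aligned
 * bias buffer, which is zero-filled before use. */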
int ConvDw3x3MallocWeightBiasData(ConvolutionBaseStruct *conv) {
  int c4 = UP_ROUND(conv->compute_.out_c_, C4NUM);
  if (!conv->base_.train_session_) {
    if (conv->packed_weight_ == NULL) {
      int pack_weight_size = c4 * C12NUM;
      NNACL_CHECK_MALLOC_SIZE(pack_weight_size * sizeof(float));
      conv->packed_weight_ = ConvBaseGetConvPackWeightData(conv, pack_weight_size * sizeof(float));
      NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv->packed_weight_);
    }
  }

  if (conv->bias_data_ == NULL) {
    NNACL_CHECK_MALLOC_SIZE(c4 * sizeof(float));
    conv->bias_data_ = conv->base_.env_->Alloc(conv->base_.env_->allocator_, c4 * sizeof(float));
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv->bias_data_);
  }
  memset(conv->bias_data_, 0, c4 * sizeof(float));
  return NNACL_OK;
}

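/* Refreshes the compute shape via the base prepare step and caps the thread count at the
 * output height, since work is split by output rows. */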
int ConvolutionDepthwise3x3Resize(KernelBase *self) {
  ConvolutionBaseStruct *conv = (ConvolutionBaseStruct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv);
  int ret = ConvBasePrepare(conv);
  if (ret != NNACL_OK) {
    return ret;
  }
  self->thread_nr_ = NNACL_MIN(self->thread_nr_, conv->compute_.out_h_);
  return NNACL_OK;
}

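/* Validates tensor counts, records the origin weight and bias, reserves workspace for the
 * packed weights in training sessions, and initializes the weight/bias buffers. */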
int ConvolutionDepthwise3x3Prepare(KernelBase *self) {
  NNACL_CHECK_FALSE(self->in_size_ < TWO_TENSOR, NNACL_INPUT_TENSOR_ERROR);
  NNACL_CHECK_FALSE(self->out_size_ < ONE_TENSOR, NNACL_OUTPUT_TENSOR_ERROR);

  ConvolutionDepthwise3x3Struct *conv_dw = (ConvolutionDepthwise3x3Struct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_dw);

  ConvBaseUpdateOriginWeightAndBias(&conv_dw->conv_);

  if (self->train_session_) {
    int c4 = UP_ROUND(conv_dw->conv_.compute_.out_c_, C4NUM);
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(c4, C12NUM, NNACL_ERR);
    int pack_weight_size = c4 * C12NUM;
    self->work_size_ = pack_weight_size * sizeof(float);
  }

  return ConvBaseInitConvWeightBias(&conv_dw->conv_);
}

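/* Allocates the per-thread transform buffer, repacks weights if needed, then launches
 * ConvDw3x3Run across the thread pool and frees the buffer afterwards. */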
int ConvolutionDepthwise3x3Compute(KernelBase *self) {
  ConvolutionDepthwise3x3Struct *conv_dw = (ConvolutionDepthwise3x3Struct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_dw);

  int units = UP_DIV(conv_dw->conv_.compute_.out_w_, C2NUM);  // F(2, 3) contains 2 conv units
  int c4 = UP_ROUND(conv_dw->conv_.compute_.in_c_, C4NUM);
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(C12NUM, c4, NNACL_ERR);
  int c12c4 = C12NUM * c4;
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(c12c4, units, NNACL_ERR);
  int c12c4_units = c12c4 * units;
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(c12c4_units, self->thread_nr_, NNACL_ERR);
  int buffer_size = c12c4_units * self->thread_nr_;

  conv_dw->buffer_ = self->env_->Alloc(self->env_->allocator_, buffer_size * sizeof(float));
  NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv_dw->buffer_);

  int ret = ConvBaseRepackWeight(&conv_dw->conv_);
  if (ret != NNACL_OK) {
    self->env_->Free(self->env_->allocator_, conv_dw->buffer_);
    return ret;
  }

  conv_dw->input_ptr_ = self->in_[FIRST_INPUT]->data_;
  NNACL_CHECK_NULL_RETURN_ERR(conv_dw->input_ptr_);
  conv_dw->output_ptr_ = self->out_[OUTPUT_INDEX]->data_;
  NNACL_CHECK_NULL_RETURN_ERR(conv_dw->output_ptr_);

  ret = self->env_->ParallelLaunch(self->env_->thread_pool_, ConvDw3x3Run, self, self->thread_nr_);
  self->env_->Free(self->env_->allocator_, conv_dw->buffer_);
  return ret;
}

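/* Delegates resource cleanup to the convolution base release helper. */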
int ConvolutionDepthwise3x3Release(KernelBase *self) {
  ConvolutionBaseStruct *conv = (ConvolutionBaseStruct *)self;
  ConvBaseRelease(conv);
  return NNACL_OK;
}

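/* Factory for the depthwise 3x3 kernel: zero-initializes the heap-allocated struct and
 * wires up the packing and lifecycle callbacks. */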
KernelBase *CreateConvDw3x3(ConvParameter *conv_param) {
  ConvolutionDepthwise3x3Struct *conv_dw =
    (ConvolutionDepthwise3x3Struct *)malloc(sizeof(ConvolutionDepthwise3x3Struct));
  NNACL_CHECK_NULL_RETURN_NULL(conv_dw);
  memset(conv_dw, 0, sizeof(ConvolutionDepthwise3x3Struct));
  conv_dw->conv_.pack_weight_ = ConvDw3x3PackWeight;
  conv_dw->conv_.malloc_weight_bias_ = ConvDw3x3MallocWeightBiasData;
  conv_dw->conv_.base_.Resize = ConvolutionDepthwise3x3Resize;
  conv_dw->conv_.base_.Prepare = ConvolutionDepthwise3x3Prepare;
  conv_dw->conv_.base_.Compute = ConvolutionDepthwise3x3Compute;
  conv_dw->conv_.base_.Release = ConvolutionDepthwise3x3Release;

  return (KernelBase *)conv_dw;
}
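
/* Sketch of the expected call sequence (an assumption inferred from the callbacks wired
 * up above, not taken from this file): the caller is assumed to populate base_.env_,
 * base_.param_, base_.in_/in_size_ and base_.out_/out_size_ elsewhere, then drive the
 * kernel roughly as
 *
 *   KernelBase *kernel = CreateConvDw3x3(conv_param);
 *   kernel->Prepare(kernel);   // weight/bias init, workspace sizing
 *   kernel->Resize(kernel);    // shape-dependent setup, thread capping
 *   kernel->Compute(kernel);   // runs ConvDw3x3Run across the thread pool
 *   kernel->Release(kernel);
 *   free(kernel);
 */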
#endif