1 /**
2 * Copyright 2023 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #if defined(ENABLE_ARM) || (defined(ENABLE_SSE) && !defined(ENABLE_AVX))
18 #include "nnacl/kernel/convolution_depthwise_3x3.h"
19 #include "nnacl/kernel/convolution_base.h"
20 #include "nnacl/fp32/conv_depthwise_fp32.h"
21 #include "nnacl/fp32/pack_fp32.h"
22
ConvDw3x3Run(void * cdata,int task_id,float l,float r)23 int ConvDw3x3Run(void *cdata, int task_id, float l, float r) {
24 ConvolutionDepthwise3x3Struct *conv_dw = (ConvolutionDepthwise3x3Struct *)cdata;
25 NNACL_CHECK_NULL_RETURN_ERR(conv_dw);
26
27 int units = UP_DIV(conv_dw->conv_.compute_.out_w_, C2NUM); // F(2, 3) contains 2 conv units
28 int c4 = UP_ROUND(conv_dw->conv_.compute_.in_c_, C4NUM);
29 int c12c4_units = C12NUM * c4 * units;
30 NNACL_CHECK_INT_MUL_NOT_OVERFLOW(c12c4_units, task_id, NNACL_ERR);
31 float *buffer = conv_dw->buffer_ + c12c4_units * task_id;
32 NNACL_CHECK_ZERO_RETURN_ERR(conv_dw->conv_.base_.thread_nr_);
33
34 int step_oh = UP_DIV(conv_dw->conv_.compute_.out_h_, conv_dw->conv_.base_.thread_nr_);
35 NNACL_CHECK_INT_MUL_NOT_OVERFLOW(step_oh, task_id, NNACL_ERR);
36 int start_oh = step_oh * task_id;
37 int end_oh = MSMIN(start_oh + step_oh, conv_dw->conv_.compute_.out_h_);
38
39 ConvParameter *conv_param = (ConvParameter *)conv_dw->conv_.base_.param_;
40 NNACL_CHECK_NULL_RETURN_ERR(conv_param);
41 ConvDw3x3(conv_dw->output_ptr_, buffer, conv_dw->input_ptr_, (float *)conv_dw->conv_.packed_weight_,
42 (float *)conv_dw->conv_.bias_data_, conv_param, start_oh, end_oh);
43 return NNACL_OK;
44 }
45
/**
 * Packs the origin weight into conv->packed_weight_ with PackWeightConvDw3x3Fp32.
 *
 * In a train session the weight is read from the second input tensor, otherwise from
 * conv->origin_weight_. Returns without packing when the selected source is NULL.
 *
 * @param conv convolution base struct holding the weight pointers and out_c_
 */
void ConvDw3x3PackWeight(ConvolutionBaseStruct *conv) {
  void *src_weight;
  if (conv->base_.train_session_) {
    src_weight = conv->base_.in_[SECOND_INPUT]->data_;
  } else {
    src_weight = conv->origin_weight_;
  }
  NNACL_CHECK_NULL_RETURN_VOID(src_weight);

  float *dst_weight = (float *)conv->packed_weight_;
  PackWeightConvDw3x3Fp32((float *)src_weight, dst_weight, conv->compute_.out_c_);
}
51
/**
 * Allocates the packed-weight and bias buffers of the kernel.
 *
 * - Packed weight: only outside train sessions and only when not allocated yet;
 *   UP_ROUND(out_c_, C4NUM) * C12NUM floats obtained through ConvBaseGetConvPackWeightData.
 * - Bias: UP_ROUND(out_c_, C4NUM) floats obtained from the environment allocator when not
 *   allocated yet; the bias buffer is always zero-filled before returning.
 *
 * @param conv convolution base struct that receives the buffers
 * @return NNACL_OK on success, otherwise the error code of the failing check
 */
int ConvDw3x3MallocWeightBiasData(ConvolutionBaseStruct *conv) {
  int c4 = UP_ROUND(conv->compute_.out_c_, C4NUM);
  if (!conv->base_.train_session_ && conv->packed_weight_ == NULL) {
    // Guard the element count before multiplying, as ConvolutionDepthwise3x3Prepare does.
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(c4, C12NUM, NNACL_ERR);
    int pack_weight_size = c4 * C12NUM;
    NNACL_CHECK_MALLOC_SIZE(pack_weight_size * sizeof(float));
    conv->packed_weight_ = ConvBaseGetConvPackWeightData(conv, pack_weight_size * sizeof(float));
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv->packed_weight_);
  }

  if (conv->bias_data_ == NULL) {
    NNACL_CHECK_MALLOC_SIZE(c4 * sizeof(float));
    conv->bias_data_ = conv->base_.env_->Alloc(conv->base_.env_->allocator_, c4 * sizeof(float));
    NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv->bias_data_);
  }
  // Zero the bias buffer, whether it was just allocated or already present.
  memset(conv->bias_data_, 0, c4 * sizeof(float));
  return NNACL_OK;
}
71
/**
 * Resize hook: runs ConvBasePrepare and then limits the thread count to the output height.
 *
 * @param self kernel instance
 * @return NNACL_OK on success, otherwise the error of the null check or of ConvBasePrepare
 */
int ConvolutionDepthwise3x3Resize(KernelBase *self) {
  ConvolutionBaseStruct *conv = (ConvolutionBaseStruct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv);

  int ret = ConvBasePrepare(conv);
  if (ret == NNACL_OK) {
    // Work is split by output rows, so more tasks than rows are never needed.
    self->thread_nr_ = NNACL_MIN(self->thread_nr_, conv->compute_.out_h_);
  }
  return ret;
}
82
/**
 * Prepare hook: validates the tensor counts, refreshes the origin weight/bias and
 * initializes the packed weight and bias of the kernel.
 *
 * In a train session the byte size of the packed weight
 * (UP_ROUND(out_c_, C4NUM) * C12NUM floats) is stored in self->work_size_.
 *
 * @param self kernel instance
 * @return result of ConvBaseInitConvWeightBias, or the error code of the first failing check
 */
int ConvolutionDepthwise3x3Prepare(KernelBase *self) {
  // The null check must precede the first dereference of self.
  NNACL_CHECK_NULL_RETURN_ERR(self);
  NNACL_CHECK_FALSE(self->in_size_ < TWO_TENSOR, NNACL_INPUT_TENSOR_ERROR);
  NNACL_CHECK_FALSE(self->out_size_ < ONE_TENSOR, NNACL_OUTPUT_TENSOR_ERROR);

  ConvolutionDepthwise3x3Struct *conv_dw = (ConvolutionDepthwise3x3Struct *)self;

  ConvBaseUpdateOriginWeightAndBias(&conv_dw->conv_);

  if (self->train_session_) {
    int c4 = UP_ROUND(conv_dw->conv_.compute_.out_c_, C4NUM);
    NNACL_CHECK_INT_MUL_NOT_OVERFLOW(c4, C12NUM, NNACL_ERR);
    int pack_weight_size = c4 * C12NUM;
    self->work_size_ = pack_weight_size * sizeof(float);
  }

  return ConvBaseInitConvWeightBias(&conv_dw->conv_);
}
101
/**
 * Compute hook: runs the 3x3 depthwise convolution on the first input tensor and writes
 * the result to the output tensor.
 *
 * A temporary buffer of C12NUM * UP_ROUND(in_c_, C4NUM) * UP_DIV(out_w_, C2NUM) floats per
 * task is allocated from the environment allocator for the duration of the call and is
 * released on every path that follows the allocation.
 *
 * @param self kernel instance
 * @return result of ParallelLaunch, or the error code of the first failing step
 */
int ConvolutionDepthwise3x3Compute(KernelBase *self) {
  ConvolutionDepthwise3x3Struct *conv_dw = (ConvolutionDepthwise3x3Struct *)self;
  NNACL_CHECK_NULL_RETURN_ERR(conv_dw);

  // Validate the tensor data before allocating anything, so that these early returns
  // cannot leak the temporary buffer.
  conv_dw->input_ptr_ = self->in_[FIRST_INPUT]->data_;
  NNACL_CHECK_NULL_RETURN_ERR(conv_dw->input_ptr_);
  conv_dw->output_ptr_ = self->out_[OUTPUT_INDEX]->data_;
  NNACL_CHECK_NULL_RETURN_ERR(conv_dw->output_ptr_);

  int units = UP_DIV(conv_dw->conv_.compute_.out_w_, C2NUM);  // F(2, 3) contains 2 conv units
  int c4 = UP_ROUND(conv_dw->conv_.compute_.in_c_, C4NUM);
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(C12NUM, c4, NNACL_ERR);
  int c12c4 = C12NUM * c4;
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(c12c4, units, NNACL_ERR);
  int c12c4_units = c12c4 * units;
  NNACL_CHECK_INT_MUL_NOT_OVERFLOW(c12c4_units, self->thread_nr_, NNACL_ERR);
  int buffer_size = c12c4_units * self->thread_nr_;

  conv_dw->buffer_ = self->env_->Alloc(self->env_->allocator_, buffer_size * sizeof(float));
  NNACL_MALLOC_CHECK_NULL_RETURN_ERR(conv_dw->buffer_);

  int ret = ConvBaseRepackWeight(&conv_dw->conv_);
  if (ret == NNACL_OK) {
    ret = self->env_->ParallelLaunch(self->env_->thread_pool_, ConvDw3x3Run, self, self->thread_nr_);
  }

  // Single release point; clear the member so no dangling pointer outlives the call.
  self->env_->Free(self->env_->allocator_, conv_dw->buffer_);
  conv_dw->buffer_ = NULL;
  return ret;
}
133
/**
 * Release hook: forwards the kernel to ConvBaseRelease.
 *
 * @param self kernel instance
 * @return always NNACL_OK
 */
int ConvolutionDepthwise3x3Release(KernelBase *self) {
  ConvBaseRelease((ConvolutionBaseStruct *)self);
  return NNACL_OK;
}
139
CreateConvDw3x3(ConvParameter * conv_param)140 KernelBase *CreateConvDw3x3(ConvParameter *conv_param) {
141 ConvolutionDepthwise3x3Struct *conv_dw =
142 (ConvolutionDepthwise3x3Struct *)malloc(sizeof(ConvolutionDepthwise3x3Struct));
143 NNACL_CHECK_NULL_RETURN_NULL(conv_dw);
144 memset(conv_dw, 0, sizeof(ConvolutionDepthwise3x3Struct));
145 conv_dw->conv_.pack_weight_ = ConvDw3x3PackWeight;
146 conv_dw->conv_.malloc_weight_bias_ = ConvDw3x3MallocWeightBiasData;
147 conv_dw->conv_.base_.Resize = ConvolutionDepthwise3x3Resize;
148 conv_dw->conv_.base_.Prepare = ConvolutionDepthwise3x3Prepare;
149 conv_dw->conv_.base_.Compute = ConvolutionDepthwise3x3Compute;
150 conv_dw->conv_.base_.Release = ConvolutionDepthwise3x3Release;
151
152 return (KernelBase *)conv_dw;
153 }
154 #endif
155