/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifdef ENABLE_AVX
#include "src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow_x86_fp32.h"
#include "include/errorcode.h"

using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_INFER_INVALID;
using mindspore::lite::RET_NULL_PTR;
using mindspore::lite::RET_OK;

namespace mindspore::kernel {
ConvolutionDepthwiseSWCPUKernelX86::~ConvolutionDepthwiseSWCPUKernelX86() {
  if (sliding_ != nullptr) {
    delete sliding_;
    sliding_ = nullptr;
  }
}

int ConvolutionDepthwiseSWCPUKernelX86::InitPackedInputOutput() {
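  // Packing buffers are only needed when the channel count is not a multiple of the
  // AVX channel tile; otherwise the kernel reads/writes the tensor memory directly.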
  CHECK_NULL_RETURN(conv_param_);
  if (conv_param_->input_channel_ % oc_tile_ != 0) {
    input_need_align_ = true;
    int ic_align = UP_DIV(conv_param_->input_channel_, oc_tile_);
    int pack_input_size =
      conv_param_->input_batch_ * conv_param_->input_h_ * conv_param_->input_w_ * oc_tile_ * ic_align;
    packed_input_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(pack_input_size * sizeof(float)));
    if (packed_input_ == nullptr) {
      MS_LOG(ERROR) << "Malloc packed_input_ buffer failed.";
      return RET_NULL_PTR;
    }
  }
  if (conv_param_->output_channel_ % oc_tile_ != 0) {
    output_need_align_ = true;
    int oc_align = UP_DIV(conv_param_->output_channel_, oc_tile_);
    int pack_output_size =
      conv_param_->output_batch_ * conv_param_->output_h_ * conv_param_->output_w_ * oc_tile_ * oc_align;
    packed_output_ = reinterpret_cast<float *>(ms_context_->allocator->Malloc(pack_output_size * sizeof(float)));
    if (packed_output_ == nullptr) {
      MS_LOG(ERROR) << "Malloc packed_output_ buffer failed.";
      return RET_NULL_PTR;
    }
  }
  return RET_OK;
}

int ConvolutionDepthwiseSWCPUKernelX86::Init() {
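  // One-time initialization: pick the AVX channel tile, reserve packing workspace for
  // training sessions, create the sliding-window parameter, and pack weight/bias.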
  CHECK_LESS_RETURN(in_tensors_.size(), C2NUM);
  CHECK_LESS_RETURN(out_tensors_.size(), 1);
#ifdef ENABLE_AVX
  oc_tile_ = C8NUM;
#endif
  if (op_parameter_->is_train_session_) {
    auto weight_tensor = in_tensors_.at(kWeightIndex);
    int oc_align = UP_DIV(weight_tensor->Batch(), oc_tile_);
    int pack_weight_size = oc_align * oc_tile_ * weight_tensor->Height() * weight_tensor->Width();
    set_workspace_size(pack_weight_size * sizeof(float));
  }
  sliding_ = new (std::nothrow) SlidingWindowParam;
  if (sliding_ == nullptr) {
    MS_LOG(ERROR) << "new SlidingWindowParam failed.";
    return RET_ERROR;
  }

  auto ret = InitConvWeightBias();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Convolution depthwise fp32 InitConvWeightBias failed.";
    return RET_ERROR;
  }
  if (!InferShapeDone()) {
    return RET_OK;
  }
  return ReSize();
}

int ConvolutionDepthwiseSWCPUKernelX86::ReSize() {
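  // Refresh the base convolution configuration and the sliding-window plan for the current shapes.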
  auto ret = ConvolutionBaseCPUKernel::Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvolutionBaseCPUKernel::Init() failed.";
    return ret;
  }
  InitSlidingParamConvDw(sliding_, conv_param_, oc_tile_);
  return RET_OK;
}

int ConvolutionDepthwiseSWCPUKernelX86::Execute(int task_id) {
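  // Invoked once per thread task; the AVX sliding-window kernel uses task_id to select
  // its share of the output.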
  DepthwiseSWAvxFp32(packed_output_, packed_input_, reinterpret_cast<float *>(packed_weight_),
                     reinterpret_cast<float *>(bias_data_), conv_param_, sliding_, task_id);
  return RET_OK;
}

int ConvDwSWAvxRun(void *cdata, int task_id, float lhs_scale, float rhs_scale) {
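  // ParallelLaunch callback: forwards each task to the kernel's Execute().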
  auto conv_dw = reinterpret_cast<ConvolutionDepthwiseSWCPUKernelX86 *>(cdata);
  auto ret = conv_dw->Execute(task_id);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvolutionDepthwiseSW x86 Execute error task_id[" << task_id << "] error_code[" << ret << "]";
    return RET_ERROR;
  }
  return RET_OK;
}

int ConvolutionDepthwiseSWCPUKernelX86::Run() {
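  // Pack the input if alignment is required, run the depthwise computation in parallel,
  // then unpack the output and release the temporary buffers.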
  auto ret = InitPackedInputOutput();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Convolution depthwise x86 fp32 InitPackedInputOutput failed.";
    FreePackedInputOutput();
    return RET_ERROR;
  }
  if (RepackWeight() != RET_OK) {
    MS_LOG(ERROR) << "Repack weight failed.";
    FreePackedInputOutput();
    return RET_ERROR;
  }
  auto input_tensor = in_tensors_.at(kInputIndex);
  CHECK_NULL_RETURN(input_tensor);
  auto input_ptr = reinterpret_cast<float *>(input_tensor->data());
  CHECK_NULL_RETURN(input_ptr);

  if (input_need_align_) {
    PackNHWCToNHWCXFp32(input_ptr, packed_input_, conv_param_->input_batch_,
                        conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_, oc_tile_);
  } else {
    packed_input_ = input_ptr;
  }

  auto output_tensor = out_tensors_.at(kOutputIndex);
  CHECK_NULL_RETURN(output_tensor);
  auto output_ptr = reinterpret_cast<float *>(output_tensor->data());
  CHECK_NULL_RETURN(output_ptr);

  if (!output_need_align_) {
    packed_output_ = output_ptr;
  }

  ret = ParallelLaunch(this->ms_context_, ConvDwSWAvxRun, this, conv_param_->thread_num_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ConvDwSWAvxRun error: error_code[" << ret << "]";
  }

  if (output_need_align_) {
    PackNHWCXToNHWCFp32(packed_output_, output_ptr, conv_param_->output_batch_,
                        conv_param_->output_h_ * conv_param_->output_w_, conv_param_->output_channel_, oc_tile_);
  }
  FreePackedInputOutput();
  return ret;
}

void ConvolutionDepthwiseSWCPUKernelX86::FreePackedInputOutput() {
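  // Release the temporary aligned buffers allocated in InitPackedInputOutput().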
  if (input_need_align_) {
    ms_context_->allocator->Free(packed_input_);
    packed_input_ = nullptr;
  }
  if (output_need_align_) {
    ms_context_->allocator->Free(packed_output_);
    packed_output_ = nullptr;
  }
}

void ConvolutionDepthwiseSWCPUKernelX86::PackWeight() {
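  // Repack the depthwise weight from NHWC into the channel-tiled layout used by the AVX kernel.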
  auto weight_tensor = in_tensors_.at(kWeightIndex);
  int oc_align = UP_DIV(weight_tensor->Batch(), oc_tile_);
  void *origin_weight = IsTrainable() ? weight_tensor->data() : origin_weight_;
  MS_ASSERT(origin_weight != nullptr);
  PackNHWCToNXHWCXFp32(weight_tensor->Height(), weight_tensor->Width(), weight_tensor->Batch(), oc_align,
                       weight_tensor->Channel(), reinterpret_cast<float *>(packed_weight_),
                       reinterpret_cast<float *>(origin_weight));
}

int ConvolutionDepthwiseSWCPUKernelX86::MallocWeightBiasData() {
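  // Allocate the packed weight buffer (training sessions use the workspace instead)
  // and a zero-initialized, tile-aligned bias buffer.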
  auto weight_tensor = in_tensors_.at(kWeightIndex);
  int oc_align = UP_DIV(weight_tensor->Batch(), oc_tile_);
  int pack_weight_size = oc_align * oc_tile_ * weight_tensor->Height() * weight_tensor->Width();
  if (!op_parameter_->is_train_session_) {
    packed_weight_ = malloc(pack_weight_size * sizeof(float));
    if (packed_weight_ == nullptr) {
      MS_LOG(ERROR) << "Malloc packed_weight_ failed!";
      return RET_NULL_PTR;
    }
  }

  if (in_tensors_.size() == kInputSize2) {
    auto bias_size = oc_align * oc_tile_;
    bias_data_ = malloc(bias_size * sizeof(float));
    if (bias_data_ == nullptr) {
      MS_LOG(ERROR) << "Malloc bias_data buffer failed.";
      return RET_NULL_PTR;
    }
    memset(bias_data_, 0, bias_size * sizeof(float));
  }
  return RET_OK;
}
}  // namespace mindspore::kernel
#endif