1 /**
2 * Copyright 2020 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "nnacl/fp32/scale_fp32.h"
18 #include "nnacl/intrinsics/ms_simd_instructions.h"
19
/* Applies out = in * scale[axis] + offset[axis] over a [outer, axis, inner]
 * layout: every element sharing the same axis position uses the same
 * scale/offset pair. Processes outer rows in [outer_start, outer_end). */
void ScaleInner(const float *in_data, float *out_data, const float *scale, const float *offset, int outer_start,
                int outer_end, int axis_size, int inner_size) {
  for (int outer_idx = outer_start; outer_idx < outer_end; outer_idx++) {
    int outer_base = outer_idx * axis_size * inner_size;
    for (int axis_idx = 0; axis_idx < axis_size; axis_idx++) {
      int base = outer_base + axis_idx * inner_size;
      float s = scale[axis_idx];
      float b = offset[axis_idx];
      int idx = 0;
#ifdef ENABLE_AVX
      /* 8-lane fused multiply-add body. */
      MS_FLOAT32X8 s_vec8 = MS_MOV256_F32(s);
      MS_FLOAT32X8 b_vec8 = MS_MOV256_F32(b);
      for (; idx <= inner_size - C8NUM; idx += C8NUM) {
        MS_FLOAT32X8 v = MS_LD256_F32(in_data + base + idx);
        MS_ST256_F32(out_data + base + idx, MS_MLA256_F32(b_vec8, v, s_vec8));
      }
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
      /* 4-lane fused multiply-add body. */
      MS_FLOAT32X4 s_vec4 = MS_MOVQ_F32(s);
      MS_FLOAT32X4 b_vec4 = MS_MOVQ_F32(b);
      for (; idx <= inner_size - C4NUM; idx += C4NUM) {
        MS_FLOAT32X4 v = MS_LDQ_F32(in_data + base + idx);
        MS_STQ_F32(out_data + base + idx, MS_MLAQ_F32(b_vec4, v, s_vec4));
      }
#endif
      /* Scalar tail for elements not covered by the vector loops. */
      for (; idx < inner_size; idx++) {
        out_data[base + idx] = in_data[base + idx] * s + b;
      }
    }
  }
}
54
/* Scale for the inner_size == 1 case: each position within the axis has its
 * own scale/offset pair, so coefficients are loaded per element rather than
 * splatted. Processes outer rows in [outer_start, outer_end). */
void ScaleAxis(const float *in_data, float *out_data, const float *scale, const float *offset, int outer_start,
               int outer_end, int axis_size) {
  for (int row = outer_start; row < outer_end; row++) {
    const float *src = in_data + row * axis_size;
    float *dst = out_data + row * axis_size;
    int idx = 0;
#if defined(ENABLE_AVX)
    /* 8-lane body: coefficients are loaded alongside the data. */
    for (; idx <= axis_size - C8NUM; idx += C8NUM) {
      MS_FLOAT32X8 s_vec = MS_LD256_F32(scale + idx);
      MS_FLOAT32X8 b_vec = MS_LD256_F32(offset + idx);
      MS_FLOAT32X8 v = MS_LD256_F32(src + idx);
      MS_ST256_F32(dst + idx, MS_MLA256_F32(b_vec, v, s_vec));
    }
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
    /* 4-lane body. */
    for (; idx <= axis_size - C4NUM; idx += C4NUM) {
      MS_FLOAT32X4 s_vec = MS_LDQ_F32(scale + idx);
      MS_FLOAT32X4 b_vec = MS_LDQ_F32(offset + idx);
      MS_FLOAT32X4 v = MS_LDQ_F32(src + idx);
      MS_STQ_F32(dst + idx, MS_MLAQ_F32(b_vec, v, s_vec));
    }
#endif
    /* Scalar tail. */
    for (; idx < axis_size; idx++) {
      dst[idx] = src[idx] * scale[idx] + offset[idx];
    }
  }
}
86
DoScale(const float * in_data,float * out_data,const float * scale,const float * offset,int task_id,const ScaleStruct * scale_param)87 void DoScale(const float *in_data, float *out_data, const float *scale, const float *offset, int task_id,
88 const ScaleStruct *scale_param) {
89 NNACL_CHECK_ZERO_RETURN(scale_param->base_.thread_nr_);
90 int outer_step = UP_DIV(scale_param->outer_size_, scale_param->base_.thread_nr_);
91 int outer_start = task_id * outer_step;
92 int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);
93
94 if (scale_param->inner_size_ == 1) {
95 ScaleAxis(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_);
96 } else {
97 ScaleInner(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_,
98 scale_param->inner_size_);
99 }
100 }
101
/* Same as ScaleInner but applies ReLU to the result:
 * out = max(in * scale[axis] + offset[axis], 0). */
void ScaleInnerRelu(const float *in_data, float *out_data, const float *scale, const float *offset, int outer_start,
                    int outer_end, int axis_size, int inner_size) {
#ifdef ENABLE_AVX
  MS_FLOAT32X8 zero8 = {0, 0, 0, 0, 0, 0, 0, 0};
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
  MS_FLOAT32X4 zero4 = {0, 0, 0, 0};
#endif
  for (int outer_idx = outer_start; outer_idx < outer_end; outer_idx++) {
    int outer_base = outer_idx * axis_size * inner_size;
    for (int axis_idx = 0; axis_idx < axis_size; axis_idx++) {
      int base = outer_base + axis_idx * inner_size;
      float s = scale[axis_idx];
      float b = offset[axis_idx];
      int idx = 0;
#ifdef ENABLE_AVX
      /* 8-lane body: fused multiply-add then clamp below at zero. */
      MS_FLOAT32X8 s_vec8 = MS_MOV256_F32(s);
      MS_FLOAT32X8 b_vec8 = MS_MOV256_F32(b);
      for (; idx <= inner_size - C8NUM; idx += C8NUM) {
        MS_FLOAT32X8 v = MS_LD256_F32(in_data + base + idx);
        MS_FLOAT32X8 fma = MS_MLA256_F32(b_vec8, v, s_vec8);
        MS_ST256_F32(out_data + base + idx, MS_MAX256_F32(fma, zero8));
      }
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
      /* 4-lane body. */
      MS_FLOAT32X4 s_vec4 = MS_MOVQ_F32(s);
      MS_FLOAT32X4 b_vec4 = MS_MOVQ_F32(b);
      for (; idx <= inner_size - C4NUM; idx += C4NUM) {
        MS_FLOAT32X4 v = MS_LDQ_F32(in_data + base + idx);
        MS_FLOAT32X4 fma = MS_MLAQ_F32(b_vec4, v, s_vec4);
        MS_STQ_F32(out_data + base + idx, MS_MAXQ_F32(fma, zero4));
      }
#endif
      /* Scalar tail with the same ReLU clamp. */
      for (; idx < inner_size; idx++) {
        float v = in_data[base + idx] * s + b;
        out_data[base + idx] = v > 0.0f ? v : 0.0f;
      }
    }
  }
}
145
/* Same as ScaleAxis (inner_size == 1) but applies ReLU to the result:
 * out = max(in * scale[i] + offset[i], 0). */
void ScaleAxisRelu(const float *in_data, float *out_data, const float *scale, const float *offset, int outer_start,
                   int outer_end, int axis_size) {
#ifdef ENABLE_AVX
  MS_FLOAT32X8 zero8 = {0, 0, 0, 0, 0, 0, 0, 0};
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
  MS_FLOAT32X4 zero4 = {0, 0, 0, 0};
#endif
  for (int row = outer_start; row < outer_end; row++) {
    const float *src = in_data + row * axis_size;
    float *dst = out_data + row * axis_size;
    int idx = 0;
#ifdef ENABLE_AVX
    /* 8-lane body: per-element coefficients, then clamp below at zero. */
    for (; idx <= axis_size - C8NUM; idx += C8NUM) {
      MS_FLOAT32X8 s_vec = MS_LD256_F32(scale + idx);
      MS_FLOAT32X8 b_vec = MS_LD256_F32(offset + idx);
      MS_FLOAT32X8 v = MS_LD256_F32(src + idx);
      MS_FLOAT32X8 fma = MS_MLA256_F32(b_vec, v, s_vec);
      MS_ST256_F32(dst + idx, MS_MAX256_F32(fma, zero8));
    }
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
    /* 4-lane body. */
    for (; idx <= axis_size - C4NUM; idx += C4NUM) {
      MS_FLOAT32X4 s_vec = MS_LDQ_F32(scale + idx);
      MS_FLOAT32X4 b_vec = MS_LDQ_F32(offset + idx);
      MS_FLOAT32X4 v = MS_LDQ_F32(src + idx);
      MS_FLOAT32X4 fma = MS_MLAQ_F32(b_vec, v, s_vec);
      MS_STQ_F32(dst + idx, MS_MAXQ_F32(fma, zero4));
    }
#endif
    /* Scalar tail with the same ReLU clamp. */
    for (; idx < axis_size; idx++) {
      float v = src[idx] * scale[idx] + offset[idx];
      dst[idx] = v > 0.0f ? v : 0.0f;
    }
  }
}
186
DoScaleRelu(const float * in_data,float * out_data,const float * scale,const float * offset,int task_id,const ScaleStruct * scale_param)187 void DoScaleRelu(const float *in_data, float *out_data, const float *scale, const float *offset, int task_id,
188 const ScaleStruct *scale_param) {
189 NNACL_CHECK_ZERO_RETURN(scale_param->base_.thread_nr_);
190 int outer_step = UP_DIV(scale_param->outer_size_, scale_param->base_.thread_nr_);
191 int outer_start = task_id * outer_step;
192 int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);
193
194 if (scale_param->inner_size_ == 1) {
195 ScaleAxisRelu(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_);
196 } else {
197 ScaleInnerRelu(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_,
198 scale_param->inner_size_);
199 }
200 }
201
/* Same as ScaleInner but applies ReLU6 to the result:
 * out = min(max(in * scale[axis] + offset[axis], 0), 6).
 *
 * Fixes: the ARM64/SSE vector loop previously used `in_index < inner_size -
 * C4NUM` while every other vector loop in this file uses `<=`; with `<` a
 * remainder of exactly C4NUM elements fell through to the scalar tail
 * instead of taking one more full vector iteration (same results, wasted
 * lanes). The per-iteration scale/offset splats are also hoisted out of the
 * loop, matching ScaleInnerRelu. */
void ScaleInnerRelu6(const float *in_data, float *out_data, const float *scale, const float *offset, int outer_start,
                     int outer_end, int axis_size, int inner_size) {
#ifdef ENABLE_AVX
  MS_FLOAT32X8 zeros_8 = {0, 0, 0, 0, 0, 0, 0, 0};
  MS_FLOAT32X8 bounds_8 = {6, 6, 6, 6, 6, 6, 6, 6};
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
  MS_FLOAT32X4 zeros = {0, 0, 0, 0};
  MS_FLOAT32X4 bounds = {6, 6, 6, 6};
#endif
  for (int out = outer_start; out < outer_end; out++) {
    int out_offset = out * axis_size * inner_size;
    for (int i = 0; i < axis_size; i++) {
      int axis_offset = out_offset + i * inner_size;
      int in_index = 0;
#if defined(ENABLE_AVX)
      MS_FLOAT32X8 scale_8 = MS_MOV256_F32(scale[i]);
      MS_FLOAT32X8 offset_8 = MS_MOV256_F32(offset[i]);
      for (; in_index <= inner_size - C8NUM; in_index += C8NUM) {
        int in_offset = axis_offset + in_index;
        MS_FLOAT32X8 data = MS_LD256_F32(in_data + in_offset);
        MS_FLOAT32X8 tmp = MS_MLA256_F32(offset_8, data, scale_8);
        MS_FLOAT32X8 result = MS_MIN256_F32(MS_MAX256_F32(tmp, zeros_8), bounds_8);
        MS_ST256_F32(out_data + in_offset, result);
      }
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
      /* Splats are loop-invariant; load them once per axis position. */
      MS_FLOAT32X4 scale_4 = MS_MOVQ_F32(scale[i]);
      MS_FLOAT32X4 offset_4 = MS_MOVQ_F32(offset[i]);
      /* `<=` so an exact multiple of C4NUM is fully vectorized. */
      for (; in_index <= inner_size - C4NUM; in_index += C4NUM) {
        int in_offset = axis_offset + in_index;
        MS_FLOAT32X4 data = MS_LDQ_F32(in_data + in_offset);
        MS_FLOAT32X4 tmp = MS_MLAQ_F32(offset_4, data, scale_4);
        MS_FLOAT32X4 result = MS_MINQ_F32(MS_MAXQ_F32(tmp, zeros), bounds);
        MS_STQ_F32(out_data + in_offset, result);
      }
#endif
      /* Scalar tail with the same clamp to [0, 6]. */
      for (; in_index < inner_size; in_index++) {
        int in_offset = axis_offset + in_index;
        float tmp = in_data[in_offset] * scale[i] + offset[i];
        tmp = tmp > 0.0f ? tmp : 0.0f;
        out_data[in_offset] = tmp < 6.0f ? tmp : 6.0f;
      }
    }
  }
}
247
/* Same as ScaleAxis (inner_size == 1) but applies ReLU6 to the result:
 * out = min(max(in * scale[i] + offset[i], 0), 6). */
void ScaleAxisRelu6(const float *in_data, float *out_data, const float *scale, const float *offset, int outer_start,
                    int outer_end, int axis_size) {
#ifdef ENABLE_AVX
  MS_FLOAT32X8 zero8 = {0, 0, 0, 0, 0, 0, 0, 0};
  MS_FLOAT32X8 six8 = {6, 6, 6, 6, 6, 6, 6, 6};
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
  MS_FLOAT32X4 zero4 = {0, 0, 0, 0};
  MS_FLOAT32X4 six4 = {6, 6, 6, 6};
#endif
  for (int row = outer_start; row < outer_end; row++) {
    const float *src = in_data + row * axis_size;
    float *dst = out_data + row * axis_size;
    int idx = 0;
#ifdef ENABLE_AVX
    /* 8-lane body: per-element coefficients, then clamp to [0, 6]. */
    for (; idx <= axis_size - C8NUM; idx += C8NUM) {
      MS_FLOAT32X8 v = MS_LD256_F32(src + idx);
      MS_FLOAT32X8 s_vec = MS_LD256_F32(scale + idx);
      MS_FLOAT32X8 b_vec = MS_LD256_F32(offset + idx);
      MS_FLOAT32X8 fma = MS_MLA256_F32(b_vec, v, s_vec);
      MS_ST256_F32(dst + idx, MS_MIN256_F32(MS_MAX256_F32(fma, zero8), six8));
    }
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
    /* 4-lane body. */
    for (; idx <= axis_size - C4NUM; idx += C4NUM) {
      MS_FLOAT32X4 v = MS_LDQ_F32(src + idx);
      MS_FLOAT32X4 s_vec = MS_LDQ_F32(scale + idx);
      MS_FLOAT32X4 b_vec = MS_LDQ_F32(offset + idx);
      MS_FLOAT32X4 fma = MS_MLAQ_F32(b_vec, v, s_vec);
      MS_STQ_F32(dst + idx, MS_MINQ_F32(MS_MAXQ_F32(fma, zero4), six4));
    }
#endif
    /* Scalar tail with the same clamp to [0, 6]. */
    for (; idx < axis_size; idx++) {
      float v = src[idx] * scale[idx] + offset[idx];
      v = v > 0.0f ? v : 0.0f;
      dst[idx] = v < 6.0f ? v : 6.0f;
    }
  }
}
290
DoScaleRelu6(const float * in_data,float * out_data,const float * scale,const float * offset,int task_id,const ScaleStruct * scale_param)291 void DoScaleRelu6(const float *in_data, float *out_data, const float *scale, const float *offset, int task_id,
292 const ScaleStruct *scale_param) {
293 NNACL_CHECK_ZERO_RETURN(scale_param->base_.thread_nr_);
294 int outer_step = UP_DIV(scale_param->outer_size_, scale_param->base_.thread_nr_);
295 int outer_start = task_id * outer_step;
296 int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);
297
298 if (scale_param->inner_size_ == 1) {
299 ScaleAxisRelu6(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_);
300 } else {
301 ScaleInnerRelu6(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_,
302 scale_param->inner_size_);
303 }
304 }
305