• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "nnacl/fp32/scale_fp32.h"
18 #include "nnacl/intrinsics/ms_simd_instructions.h"
19 
/* Scale with per-axis broadcast: for every outer row and axis position i,
 * out = in * scale[i] + offset[i] across the inner dimension.
 * Vectorized (AVX 8-lane, then NEON/SSE 4-lane) with a scalar tail. */
void ScaleInner(const float *in_data, float *out_data, const float *scale, const float *offset, int outer_start,
                int outer_end, int axis_size, int inner_size) {
  for (int outer = outer_start; outer < outer_end; outer++) {
    int outer_base = outer * axis_size * inner_size;
    for (int axis = 0; axis < axis_size; axis++) {
      int base = outer_base + axis * inner_size;
      int idx = 0;
#ifdef ENABLE_AVX
      /* Splat the per-axis scalar once, then stream 8 floats per step. */
      MS_FLOAT32X8 s8 = MS_MOV256_F32(scale[axis]);
      MS_FLOAT32X8 o8 = MS_MOV256_F32(offset[axis]);
      for (; idx <= inner_size - C8NUM; idx += C8NUM) {
        MS_FLOAT32X8 v = MS_LD256_F32(in_data + base + idx);
        MS_ST256_F32(out_data + base + idx, MS_MLA256_F32(o8, v, s8));
      }
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
      /* 4-lane path continues from wherever the 8-lane loop stopped. */
      MS_FLOAT32X4 s4 = MS_MOVQ_F32(scale[axis]);
      MS_FLOAT32X4 o4 = MS_MOVQ_F32(offset[axis]);
      for (; idx <= inner_size - C4NUM; idx += C4NUM) {
        MS_FLOAT32X4 v = MS_LDQ_F32(in_data + base + idx);
        MS_STQ_F32(out_data + base + idx, MS_MLAQ_F32(o4, v, s4));
      }
#endif
      /* Scalar tail for the remaining < 4 elements. */
      for (; idx < inner_size; idx++) {
        out_data[base + idx] = in_data[base + idx] * scale[axis] + offset[axis];
      }
    }
  }
}
54 
/* Element-wise scale along the axis dimension (inner_size == 1 case):
 * for each outer row, out[k] = in[k] * scale[k] + offset[k]. */
void ScaleAxis(const float *in_data, float *out_data, const float *scale, const float *offset, int outer_start,
               int outer_end, int axis_size) {
  for (int row = outer_start; row < outer_end; row++) {
    int row_base = row * axis_size;
    int k = 0;
#if defined(ENABLE_AVX)
    /* scale/offset vary per position here, so they are loaded, not splatted. */
    for (; k <= axis_size - C8NUM; k += C8NUM) {
      MS_FLOAT32X8 s8 = MS_LD256_F32(scale + k);
      MS_FLOAT32X8 o8 = MS_LD256_F32(offset + k);
      MS_FLOAT32X8 v = MS_LD256_F32(in_data + row_base + k);
      MS_ST256_F32(out_data + row_base + k, MS_MLA256_F32(o8, v, s8));
    }
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
    for (; k <= axis_size - C4NUM; k += C4NUM) {
      MS_FLOAT32X4 s4 = MS_LDQ_F32(scale + k);
      MS_FLOAT32X4 o4 = MS_LDQ_F32(offset + k);
      MS_FLOAT32X4 v = MS_LDQ_F32(in_data + row_base + k);
      MS_STQ_F32(out_data + row_base + k, MS_MLAQ_F32(o4, v, s4));
    }
#endif
    /* Scalar tail. */
    for (; k < axis_size; k++) {
      out_data[row_base + k] = in_data[row_base + k] * scale[k] + offset[k];
    }
  }
}
86 
DoScale(const float * in_data,float * out_data,const float * scale,const float * offset,int task_id,const ScaleStruct * scale_param)87 void DoScale(const float *in_data, float *out_data, const float *scale, const float *offset, int task_id,
88              const ScaleStruct *scale_param) {
89   NNACL_CHECK_ZERO_RETURN(scale_param->base_.thread_nr_);
90   int outer_step = UP_DIV(scale_param->outer_size_, scale_param->base_.thread_nr_);
91   int outer_start = task_id * outer_step;
92   int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);
93 
94   if (scale_param->inner_size_ == 1) {
95     ScaleAxis(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_);
96   } else {
97     ScaleInner(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_,
98                scale_param->inner_size_);
99   }
100 }
101 
/* Same broadcast scale as ScaleInner, followed by ReLU:
 * out = max(in * scale[axis] + offset[axis], 0). */
void ScaleInnerRelu(const float *in_data, float *out_data, const float *scale, const float *offset, int outer_start,
                    int outer_end, int axis_size, int inner_size) {
#ifdef ENABLE_AVX
  MS_FLOAT32X8 zero_8 = {0, 0, 0, 0, 0, 0, 0, 0};
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
  MS_FLOAT32X4 zero_4 = {0, 0, 0, 0};
#endif
  for (int outer = outer_start; outer < outer_end; outer++) {
    int outer_base = outer * axis_size * inner_size;
    for (int axis = 0; axis < axis_size; axis++) {
      int base = outer_base + axis * inner_size;
      int idx = 0;
#ifdef ENABLE_AVX
      MS_FLOAT32X8 s8 = MS_MOV256_F32(scale[axis]);
      MS_FLOAT32X8 o8 = MS_MOV256_F32(offset[axis]);
      for (; idx <= inner_size - C8NUM; idx += C8NUM) {
        MS_FLOAT32X8 v = MS_LD256_F32(in_data + base + idx);
        MS_FLOAT32X8 scaled = MS_MLA256_F32(o8, v, s8);
        MS_ST256_F32(out_data + base + idx, MS_MAX256_F32(scaled, zero_8));
      }
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
      MS_FLOAT32X4 s4 = MS_MOVQ_F32(scale[axis]);
      MS_FLOAT32X4 o4 = MS_MOVQ_F32(offset[axis]);
      for (; idx <= inner_size - C4NUM; idx += C4NUM) {
        MS_FLOAT32X4 v = MS_LDQ_F32(in_data + base + idx);
        MS_FLOAT32X4 scaled = MS_MLAQ_F32(o4, v, s4);
        MS_STQ_F32(out_data + base + idx, MS_MAXQ_F32(scaled, zero_4));
      }
#endif
      /* Scalar tail; predicate kept as `> 0.0f` to match the vector max. */
      for (; idx < inner_size; idx++) {
        float scaled = in_data[base + idx] * scale[axis] + offset[axis];
        out_data[base + idx] = scaled > 0.0f ? scaled : 0.0f;
      }
    }
  }
}
145 
/* Element-wise scale along the axis dimension followed by ReLU:
 * out[k] = max(in[k] * scale[k] + offset[k], 0) per outer row. */
void ScaleAxisRelu(const float *in_data, float *out_data, const float *scale, const float *offset, int outer_start,
                   int outer_end, int axis_size) {
#ifdef ENABLE_AVX
  MS_FLOAT32X8 zero_8 = {0, 0, 0, 0, 0, 0, 0, 0};
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
  MS_FLOAT32X4 zero_4 = {0, 0, 0, 0};
#endif
  for (int row = outer_start; row < outer_end; row++) {
    int row_base = row * axis_size;
    int k = 0;
#ifdef ENABLE_AVX
    for (; k <= axis_size - C8NUM; k += C8NUM) {
      MS_FLOAT32X8 s8 = MS_LD256_F32(scale + k);
      MS_FLOAT32X8 o8 = MS_LD256_F32(offset + k);
      MS_FLOAT32X8 v = MS_LD256_F32(in_data + row_base + k);
      MS_FLOAT32X8 scaled = MS_MLA256_F32(o8, v, s8);
      MS_ST256_F32(out_data + row_base + k, MS_MAX256_F32(scaled, zero_8));
    }
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
    for (; k <= axis_size - C4NUM; k += C4NUM) {
      MS_FLOAT32X4 s4 = MS_LDQ_F32(scale + k);
      MS_FLOAT32X4 o4 = MS_LDQ_F32(offset + k);
      MS_FLOAT32X4 v = MS_LDQ_F32(in_data + row_base + k);
      MS_FLOAT32X4 scaled = MS_MLAQ_F32(o4, v, s4);
      MS_STQ_F32(out_data + row_base + k, MS_MAXQ_F32(scaled, zero_4));
    }
#endif
    /* Scalar tail. */
    for (; k < axis_size; k++) {
      float scaled = in_data[row_base + k] * scale[k] + offset[k];
      out_data[row_base + k] = scaled > 0.0f ? scaled : 0.0f;
    }
  }
}
186 
DoScaleRelu(const float * in_data,float * out_data,const float * scale,const float * offset,int task_id,const ScaleStruct * scale_param)187 void DoScaleRelu(const float *in_data, float *out_data, const float *scale, const float *offset, int task_id,
188                  const ScaleStruct *scale_param) {
189   NNACL_CHECK_ZERO_RETURN(scale_param->base_.thread_nr_);
190   int outer_step = UP_DIV(scale_param->outer_size_, scale_param->base_.thread_nr_);
191   int outer_start = task_id * outer_step;
192   int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);
193 
194   if (scale_param->inner_size_ == 1) {
195     ScaleAxisRelu(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_);
196   } else {
197     ScaleInnerRelu(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_,
198                    scale_param->inner_size_);
199   }
200 }
201 
/* Broadcast scale followed by ReLU6:
 * out = min(max(in * scale[axis] + offset[axis], 0), 6) over the inner dim. */
void ScaleInnerRelu6(const float *in_data, float *out_data, const float *scale, const float *offset, int outer_start,
                     int outer_end, int axis_size, int inner_size) {
#ifdef ENABLE_AVX
  MS_FLOAT32X8 zeros_8 = {0, 0, 0, 0, 0, 0, 0, 0};
  MS_FLOAT32X8 bounds_8 = {6, 6, 6, 6, 6, 6, 6, 6};
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
  MS_FLOAT32X4 zeros = {0, 0, 0, 0};
  MS_FLOAT32X4 bounds = {6, 6, 6, 6};
#endif
  for (int out = outer_start; out < outer_end; out++) {
    int out_offset = out * axis_size * inner_size;
    for (int i = 0; i < axis_size; i++) {
      int axis_offset = out_offset + i * inner_size;
      int in_index = 0;
#if defined(ENABLE_AVX)
      MS_FLOAT32X8 scale_8 = MS_MOV256_F32(scale[i]);
      MS_FLOAT32X8 offset_8 = MS_MOV256_F32(offset[i]);
      for (; in_index <= inner_size - C8NUM; in_index += C8NUM) {
        int in_offset = axis_offset + in_index;
        MS_FLOAT32X8 data = MS_LD256_F32(in_data + in_offset);
        MS_FLOAT32X8 tmp = MS_MLA256_F32(offset_8, data, scale_8);
        MS_FLOAT32X8 result = MS_MIN256_F32(MS_MAX256_F32(tmp, zeros_8), bounds_8);
        MS_ST256_F32(out_data + in_offset, result);
      }
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
      /* Fix: loop bound was `<`, which left a full final 4-lane chunk to the
       * scalar tail; `<=` matches every other SIMD loop in this file. Also
       * hoist the per-axis splats out of the loop like ScaleInnerRelu does. */
      MS_FLOAT32X4 scale_4 = MS_MOVQ_F32(scale[i]);
      MS_FLOAT32X4 offset_4 = MS_MOVQ_F32(offset[i]);
      for (; in_index <= inner_size - C4NUM; in_index += C4NUM) {
        int in_offset = axis_offset + in_index;
        MS_FLOAT32X4 data = MS_LDQ_F32(in_data + in_offset);
        MS_FLOAT32X4 tmp = MS_MLAQ_F32(offset_4, data, scale_4);
        MS_FLOAT32X4 result = MS_MINQ_F32(MS_MAXQ_F32(tmp, zeros), bounds);
        MS_STQ_F32(out_data + in_offset, result);
      }
#endif
      /* Scalar tail; the ternaries are exactly what MSMIN/MSMAX expand to. */
      for (; in_index < inner_size; in_index++) {
        int in_offset = axis_offset + in_index;
        float tmp = in_data[in_offset] * scale[i] + offset[i];
        tmp = tmp > 0.0f ? tmp : 0.0f;
        out_data[in_offset] = tmp < 6.0f ? tmp : 6.0f;
      }
    }
  }
}
247 
/* Element-wise scale along the axis dimension followed by ReLU6:
 * out[k] = min(max(in[k] * scale[k] + offset[k], 0), 6) per outer row. */
void ScaleAxisRelu6(const float *in_data, float *out_data, const float *scale, const float *offset, int outer_start,
                    int outer_end, int axis_size) {
#ifdef ENABLE_AVX
  MS_FLOAT32X8 zero_8 = {0, 0, 0, 0, 0, 0, 0, 0};
  MS_FLOAT32X8 six_8 = {6, 6, 6, 6, 6, 6, 6, 6};
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
  MS_FLOAT32X4 zero_4 = {0, 0, 0, 0};
  MS_FLOAT32X4 six_4 = {6, 6, 6, 6};
#endif
  for (int row = outer_start; row < outer_end; row++) {
    int row_base = row * axis_size;
    int k = 0;
#ifdef ENABLE_AVX
    for (; k <= axis_size - C8NUM; k += C8NUM) {
      MS_FLOAT32X8 v = MS_LD256_F32(in_data + row_base + k);
      MS_FLOAT32X8 s8 = MS_LD256_F32(scale + k);
      MS_FLOAT32X8 o8 = MS_LD256_F32(offset + k);
      MS_FLOAT32X8 scaled = MS_MLA256_F32(o8, v, s8);
      MS_ST256_F32(out_data + row_base + k, MS_MIN256_F32(MS_MAX256_F32(scaled, zero_8), six_8));
    }
#endif
#if defined(ENABLE_ARM64) || defined(ENABLE_SSE)
    for (; k <= axis_size - C4NUM; k += C4NUM) {
      MS_FLOAT32X4 v = MS_LDQ_F32(in_data + row_base + k);
      MS_FLOAT32X4 s4 = MS_LDQ_F32(scale + k);
      MS_FLOAT32X4 o4 = MS_LDQ_F32(offset + k);
      MS_FLOAT32X4 scaled = MS_MLAQ_F32(o4, v, s4);
      MS_STQ_F32(out_data + row_base + k, MS_MINQ_F32(MS_MAXQ_F32(scaled, zero_4), six_4));
    }
#endif
    /* Scalar tail; clamp spelled with the ternaries MSMIN/MSMAX expand to. */
    for (; k < axis_size; k++) {
      float scaled = in_data[row_base + k] * scale[k] + offset[k];
      scaled = scaled > 0.0f ? scaled : 0.0f;
      out_data[row_base + k] = scaled < 6.0f ? scaled : 6.0f;
    }
  }
}
290 
DoScaleRelu6(const float * in_data,float * out_data,const float * scale,const float * offset,int task_id,const ScaleStruct * scale_param)291 void DoScaleRelu6(const float *in_data, float *out_data, const float *scale, const float *offset, int task_id,
292                   const ScaleStruct *scale_param) {
293   NNACL_CHECK_ZERO_RETURN(scale_param->base_.thread_nr_);
294   int outer_step = UP_DIV(scale_param->outer_size_, scale_param->base_.thread_nr_);
295   int outer_start = task_id * outer_step;
296   int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);
297 
298   if (scale_param->inner_size_ == 1) {
299     ScaleAxisRelu6(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_);
300   } else {
301     ScaleInnerRelu6(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_,
302                     scale_param->inner_size_);
303   }
304 }
305