• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "nnacl/fp16/scale_fp16.h"
18 
/* Computes out = in * scale + offset for outer slices [outer_start, outer_end).
 * scale/offset are indexed along the axis dimension (axis_size entries) and
 * broadcast across inner_size contiguous elements.
 * Improvement: the per-axis NEON splats of scale[i]/offset[i] are invariant
 * across the inner loop, so they are hoisted out of it. */
void Fp16ScaleInner(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
                    int outer_start, int outer_end, int axis_size, int inner_size) {
  for (int out = outer_start; out < outer_end; out++) {
    int out_offset = out * axis_size * inner_size;
    for (int i = 0; i < axis_size; i++) {
      int axis_offset = out_offset + i * inner_size;
      int in_index = 0;
#ifdef ENABLE_NEON
      // scale[i] / offset[i] do not change inside the vector loop; splat once.
      float16x8_t scale_8 = vdupq_n_f16(scale[i]);
      float16x8_t offset_8 = vdupq_n_f16(offset[i]);
      for (; in_index < inner_size - 8; in_index += 8) {
        int in_offset = axis_offset + in_index;
        float16x8_t data = vld1q_f16(in_data + in_offset);
        // fused multiply-add: offset + data * scale
        float16x8_t result = vfmaq_f16(offset_8, data, scale_8);
        vst1q_f16(out_data + in_offset, result);
      }
#endif
      // Scalar tail (entire loop when NEON is disabled).
      for (; in_index < inner_size; in_index++) {
        int in_offset = axis_offset + in_index;
        out_data[in_offset] = in_data[in_offset] * scale[i] + offset[i];
      }
    }
  }
}
44 
/* Computes out = in * scale + offset where scale/offset vary element-wise
 * along the axis dimension (inner_size == 1 case of the scale op). */
void Fp16ScaleAxis(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
                   int outer_start, int outer_end, int axis_size) {
  for (int out = outer_start; out < outer_end; out++) {
    const int base = out * axis_size;
    int idx = 0;
#ifdef ENABLE_NEON
    // Process 8 fp16 lanes at a time; the tail falls through to the scalar loop.
    for (; idx < axis_size - 8; idx += 8) {
      const int pos = base + idx;
      float16x8_t v_in = vld1q_f16(in_data + pos);
      float16x8_t v_scale = vld1q_f16(scale + idx);
      float16x8_t v_offset = vld1q_f16(offset + idx);
      vst1q_f16(out_data + pos, vfmaq_f16(v_offset, v_in, v_scale));
    }
#endif
    // Scalar remainder (entire loop when NEON is disabled).
    for (; idx < axis_size; idx++) {
      const int pos = base + idx;
      out_data[pos] = in_data[pos] * scale[idx] + offset[idx];
    }
  }
}
66 
/* Thread entry for the fp16 scale op (no activation). Splits the outer
 * dimension across threads, then dispatches to the axis or inner kernel
 * depending on whether the scale broadcasts over an inner dimension. */
void DoScaleFp16(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
                 int task_id, const ScaleParameter *scale_param) {
  NNACL_CHECK_ZERO_RETURN(scale_param->op_parameter_.thread_num_);
  const int step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_);
  const int begin = task_id * step;
  const int end = MSMIN(begin + step, scale_param->outer_size_);

  if (scale_param->inner_size_ != 1) {
    Fp16ScaleInner(in_data, out_data, scale, offset, begin, end, scale_param->axis_size_, scale_param->inner_size_);
  } else {
    Fp16ScaleAxis(in_data, out_data, scale, offset, begin, end, scale_param->axis_size_);
  }
}
81 
/* Computes out = max(in * scale + offset, 0) with scale/offset broadcast over
 * inner_size contiguous elements per axis entry.
 * Improvement: the per-axis NEON splats of scale[i]/offset[i] are invariant
 * across the inner loop, so they are hoisted out of it. */
void Fp16ScaleInnerRelu(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
                        int outer_start, int outer_end, int axis_size, int inner_size) {
#ifdef ENABLE_NEON
  float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
#endif
  for (int out = outer_start; out < outer_end; out++) {
    int out_offset = out * axis_size * inner_size;
    for (int i = 0; i < axis_size; i++) {
      int axis_offset = out_offset + i * inner_size;
      int in_index = 0;
#ifdef ENABLE_NEON
      // scale[i] / offset[i] do not change inside the vector loop; splat once.
      float16x8_t scale_8 = vdupq_n_f16(scale[i]);
      float16x8_t offset_8 = vdupq_n_f16(offset[i]);
      for (; in_index < inner_size - 8; in_index += 8) {
        int in_offset = axis_offset + in_index;
        float16x8_t data = vld1q_f16(in_data + in_offset);
        float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8);
        // ReLU: clamp negatives to zero.
        float16x8_t result = vmaxq_f16(tmp, zeros);
        vst1q_f16(out_data + in_offset, result);
      }
#endif
      // Scalar tail (entire loop when NEON is disabled).
      for (; in_index < inner_size; in_index++) {
        int in_offset = axis_offset + in_index;
        float tmp = in_data[in_offset] * scale[i] + offset[i];
        out_data[in_offset] = tmp > 0.0f ? tmp : 0.0f;
      }
    }
  }
}
111 
/* Computes out = max(in * scale + offset, 0) with element-wise scale/offset
 * along the axis dimension (inner_size == 1 case). */
void Fp16ScaleAxisRelu(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
                       int outer_start, int outer_end, int axis_size) {
#ifdef ENABLE_NEON
  float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
#endif
  for (int out = outer_start; out < outer_end; out++) {
    const int base = out * axis_size;
    int idx = 0;
#ifdef ENABLE_NEON
    // 8-lane fp16 vector loop; remainder handled by the scalar loop below.
    for (; idx < axis_size - 8; idx += 8) {
      const int pos = base + idx;
      float16x8_t v_in = vld1q_f16(in_data + pos);
      float16x8_t v_scale = vld1q_f16(scale + idx);
      float16x8_t v_offset = vld1q_f16(offset + idx);
      float16x8_t fma = vfmaq_f16(v_offset, v_in, v_scale);
      vst1q_f16(out_data + pos, vmaxq_f16(fma, zeros));
    }
#endif
    // Scalar remainder (entire loop when NEON is disabled).
    for (; idx < axis_size; idx++) {
      const int pos = base + idx;
      float val = in_data[pos] * scale[idx] + offset[idx];
      out_data[pos] = val > 0.0f ? val : 0.0f;
    }
  }
}
138 
/* Thread entry for the fp16 scale op with ReLU activation. Splits the outer
 * dimension across threads and dispatches to the matching kernel. */
void Fp16DoScaleRelu(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
                     int task_id, const ScaleParameter *scale_param) {
  NNACL_CHECK_ZERO_RETURN(scale_param->op_parameter_.thread_num_);
  const int step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_);
  const int begin = task_id * step;
  const int end = MSMIN(begin + step, scale_param->outer_size_);

  if (scale_param->inner_size_ != 1) {
    Fp16ScaleInnerRelu(in_data, out_data, scale, offset, begin, end, scale_param->axis_size_,
                       scale_param->inner_size_);
  } else {
    Fp16ScaleAxisRelu(in_data, out_data, scale, offset, begin, end, scale_param->axis_size_);
  }
}
153 
/* Computes out = min(max(in * scale + offset, 0), 6) with scale/offset
 * broadcast over inner_size contiguous elements per axis entry.
 * Improvement: the per-axis NEON splats of scale[i]/offset[i] are invariant
 * across the inner loop, so they are hoisted out of it. */
void Fp16ScaleInnerRelu6(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
                         int outer_start, int outer_end, int axis_size, int inner_size) {
#ifdef ENABLE_NEON
  float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
  float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6};
#endif
  for (int out = outer_start; out < outer_end; out++) {
    int out_offset = out * axis_size * inner_size;
    for (int i = 0; i < axis_size; i++) {
      int axis_offset = out_offset + i * inner_size;
      int in_index = 0;
#ifdef ENABLE_NEON
      // scale[i] / offset[i] do not change inside the vector loop; splat once.
      float16x8_t scale_8 = vdupq_n_f16(scale[i]);
      float16x8_t offset_8 = vdupq_n_f16(offset[i]);
      for (; in_index < inner_size - 8; in_index += 8) {
        int in_offset = axis_offset + in_index;
        float16x8_t data = vld1q_f16(in_data + in_offset);
        float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8);
        // ReLU6: clamp to [0, 6].
        float16x8_t result = vminq_f16(vmaxq_f16(tmp, zeros), bounds);
        vst1q_f16(out_data + in_offset, result);
      }
#endif
      // Scalar tail (entire loop when NEON is disabled).
      for (; in_index < inner_size; in_index++) {
        int in_offset = axis_offset + in_index;
        float tmp = in_data[in_offset] * scale[i] + offset[i];
        out_data[in_offset] = MSMIN(MSMAX(tmp, 0.0f), 6.0f);
      }
    }
  }
}
184 
/* Computes out = min(max(in * scale + offset, 0), 6) with element-wise
 * scale/offset along the axis dimension (inner_size == 1 case). */
void Fp16ScaleAxisRelu6(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
                        int outer_start, int outer_end, int axis_size) {
#ifdef ENABLE_NEON
  float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
  float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6};
#endif
  for (int out = outer_start; out < outer_end; out++) {
    const int base = out * axis_size;
    int idx = 0;
#ifdef ENABLE_NEON
    // 8-lane fp16 vector loop; remainder handled by the scalar loop below.
    for (; idx < axis_size - 8; idx += 8) {
      const int pos = base + idx;
      float16x8_t v_in = vld1q_f16(in_data + pos);
      float16x8_t v_scale = vld1q_f16(scale + idx);
      float16x8_t v_offset = vld1q_f16(offset + idx);
      float16x8_t fma = vfmaq_f16(v_offset, v_in, v_scale);
      vst1q_f16(out_data + pos, vminq_f16(vmaxq_f16(fma, zeros), bounds));
    }
#endif
    // Scalar remainder (entire loop when NEON is disabled).
    for (; idx < axis_size; idx++) {
      const int pos = base + idx;
      float val = in_data[pos] * scale[idx] + offset[idx];
      out_data[pos] = MSMIN(MSMAX(val, 0.0f), 6.0f);
    }
  }
}
212 
/* Thread entry for the fp16 scale op with ReLU6 activation. Splits the outer
 * dimension across threads and dispatches to the matching kernel. */
void DoScaleRelu6Fp16(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
                      int task_id, const ScaleParameter *scale_param) {
  NNACL_CHECK_ZERO_RETURN(scale_param->op_parameter_.thread_num_);
  const int step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_);
  const int begin = task_id * step;
  const int end = MSMIN(begin + step, scale_param->outer_size_);

  if (scale_param->inner_size_ != 1) {
    Fp16ScaleInnerRelu6(in_data, out_data, scale, offset, begin, end, scale_param->axis_size_,
                        scale_param->inner_size_);
  } else {
    Fp16ScaleAxisRelu6(in_data, out_data, scale, offset, begin, end, scale_param->axis_size_);
  }
}
227