/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "nnacl/fp16/scale_fp16.h"

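// Scale with broadcast over the inner dimension: for each axis index i,
// out = in * scale[i] + offset[i] over inner_size contiguous elements.
// The NEON path processes 8 fp16 lanes per step; the scalar loop handles the tail.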
void Fp16ScaleInner(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
                    int outer_start, int outer_end, int axis_size, int inner_size) {
  for (int out = outer_start; out < outer_end; out++) {
    int out_offset = out * axis_size * inner_size;
    for (int i = 0; i < axis_size; i++) {
      int axis_offset = out_offset + i * inner_size;
      int in_index = 0;
#ifdef ENABLE_NEON
      for (; in_index < inner_size - 8; in_index += 8) {
        int in_offset = axis_offset + in_index;
        float16x8_t data = vld1q_f16(in_data + in_offset);
        float16x8_t scale_8 = vdupq_n_f16(scale[i]);
        float16x8_t offset_8 = vdupq_n_f16(offset[i]);
        float16x8_t result = vfmaq_f16(offset_8, data, scale_8);
        vst1q_f16(out_data + in_offset, result);
      }
#endif
      for (; in_index < inner_size; in_index++) {
        int in_offset = axis_offset + in_index;
        out_data[in_offset] = in_data[in_offset] * scale[i] + offset[i];
      }
    }
  }
}

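// Scale applied directly along the axis (used when inner_size == 1):
// out = in * scale[index] + offset[index].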
void Fp16ScaleAxis(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
                   int outer_start, int outer_end, int axis_size) {
  for (int out = outer_start; out < outer_end; out++) {
    int out_offset = out * axis_size;
    int index = 0;
#ifdef ENABLE_NEON
    for (; index < axis_size - 8; index += 8) {
      int in_offset = out_offset + index;
      float16x8_t data = vld1q_f16(in_data + in_offset);
      float16x8_t scale_8 = vld1q_f16(scale + index);
      float16x8_t offset_8 = vld1q_f16(offset + index);
      float16x8_t result = vfmaq_f16(offset_8, data, scale_8);
      vst1q_f16(out_data + in_offset, result);
    }
#endif
    for (; index < axis_size; index++) {
      int in_offset = out_offset + index;
      out_data[in_offset] = in_data[in_offset] * scale[index] + offset[index];
    }
  }
}

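// Per-thread entry point: splits outer_size_ into thread_num_ chunks and
// dispatches to the axis or inner variant depending on inner_size_.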
void DoScaleFp16(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
                 int task_id, const ScaleParameter *scale_param) {
  NNACL_CHECK_ZERO_RETURN(scale_param->op_parameter_.thread_num_);
  int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_);
  int outer_start = task_id * outer_step;
  int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);

  if (scale_param->inner_size_ == 1) {
    Fp16ScaleAxis(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_);
  } else {
    Fp16ScaleInner(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_,
                   scale_param->inner_size_);
  }
}

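// Fp16ScaleInner fused with ReLU: out = max(in * scale[i] + offset[i], 0).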
void Fp16ScaleInnerRelu(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
                        int outer_start, int outer_end, int axis_size, int inner_size) {
#ifdef ENABLE_NEON
  float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
#endif
  for (int out = outer_start; out < outer_end; out++) {
    int out_offset = out * axis_size * inner_size;
    for (int i = 0; i < axis_size; i++) {
      int axis_offset = out_offset + i * inner_size;
      int in_index = 0;
#ifdef ENABLE_NEON
      for (; in_index < inner_size - 8; in_index += 8) {
        int in_offset = axis_offset + in_index;
        float16x8_t data = vld1q_f16(in_data + in_offset);
        float16x8_t scale_8 = vdupq_n_f16(scale[i]);
        float16x8_t offset_8 = vdupq_n_f16(offset[i]);
        float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8);
        float16x8_t result = vmaxq_f16(tmp, zeros);
        vst1q_f16(out_data + in_offset, result);
      }
#endif
      for (; in_index < inner_size; in_index++) {
        int in_offset = axis_offset + in_index;
        float tmp = in_data[in_offset] * scale[i] + offset[i];
        out_data[in_offset] = tmp > 0.0f ? tmp : 0.0f;
      }
    }
  }
}

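// Fp16ScaleAxis fused with ReLU.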
void Fp16ScaleAxisRelu(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
                       int outer_start, int outer_end, int axis_size) {
#ifdef ENABLE_NEON
  float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
#endif
  for (int out = outer_start; out < outer_end; out++) {
    int out_offset = out * axis_size;
    int index = 0;
#ifdef ENABLE_NEON
    for (; index < axis_size - 8; index += 8) {
      int in_offset = out_offset + index;
      float16x8_t data = vld1q_f16(in_data + in_offset);
      float16x8_t scale_8 = vld1q_f16(scale + index);
      float16x8_t offset_8 = vld1q_f16(offset + index);
      float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8);
      float16x8_t result = vmaxq_f16(tmp, zeros);
      vst1q_f16(out_data + in_offset, result);
    }
#endif
    for (; index < axis_size; index++) {
      int in_offset = out_offset + index;
      float tmp = in_data[in_offset] * scale[index] + offset[index];
      out_data[in_offset] = tmp > 0.0f ? tmp : 0.0f;
    }
  }
}

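// Per-thread entry point for scale fused with ReLU.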
void Fp16DoScaleRelu(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
                     int task_id, const ScaleParameter *scale_param) {
  NNACL_CHECK_ZERO_RETURN(scale_param->op_parameter_.thread_num_);
  int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_);
  int outer_start = task_id * outer_step;
  int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);

  if (scale_param->inner_size_ == 1) {
    Fp16ScaleAxisRelu(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_);
  } else {
    Fp16ScaleInnerRelu(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_,
                       scale_param->inner_size_);
  }
}

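// Fp16ScaleInner fused with ReLU6: out = min(max(in * scale[i] + offset[i], 0), 6).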
void Fp16ScaleInnerRelu6(const float16_t *in_data, float16_t *out_data, const float16_t *scale,
                         const float16_t *offset, int outer_start, int outer_end, int axis_size, int inner_size) {
#ifdef ENABLE_NEON
  float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
  float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6};
#endif
  for (int out = outer_start; out < outer_end; out++) {
    int out_offset = out * axis_size * inner_size;
    for (int i = 0; i < axis_size; i++) {
      int axis_offset = out_offset + i * inner_size;
      int in_index = 0;
#ifdef ENABLE_NEON
      for (; in_index < inner_size - 8; in_index += 8) {
        int in_offset = axis_offset + in_index;
        float16x8_t data = vld1q_f16(in_data + in_offset);
        float16x8_t scale_8 = vdupq_n_f16(scale[i]);
        float16x8_t offset_8 = vdupq_n_f16(offset[i]);
        float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8);
        float16x8_t result = vminq_f16(vmaxq_f16(tmp, zeros), bounds);
        vst1q_f16(out_data + in_offset, result);
      }
#endif
      for (; in_index < inner_size; in_index++) {
        int in_offset = axis_offset + in_index;
        float tmp = in_data[in_offset] * scale[i] + offset[i];
        out_data[in_offset] = MSMIN(MSMAX(tmp, 0.0f), 6.0f);
      }
    }
  }
}

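// Fp16ScaleAxis fused with ReLU6.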
void Fp16ScaleAxisRelu6(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
                        int outer_start, int outer_end, int axis_size) {
#ifdef ENABLE_NEON
  float16x8_t zeros = {0, 0, 0, 0, 0, 0, 0, 0};
  float16x8_t bounds = {6, 6, 6, 6, 6, 6, 6, 6};
#endif
  for (int out = outer_start; out < outer_end; out++) {
    int out_offset = out * axis_size;
    int index = 0;
#ifdef ENABLE_NEON
    for (; index < axis_size - 8; index += 8) {
      int in_offset = out_offset + index;
      float16x8_t data = vld1q_f16(in_data + in_offset);
      float16x8_t scale_8 = vld1q_f16(scale + index);
      float16x8_t offset_8 = vld1q_f16(offset + index);
      float16x8_t tmp = vfmaq_f16(offset_8, data, scale_8);
      float16x8_t result = vminq_f16(vmaxq_f16(tmp, zeros), bounds);
      vst1q_f16(out_data + in_offset, result);
    }
#endif
    for (; index < axis_size; index++) {
      int in_offset = out_offset + index;
      float tmp = in_data[in_offset] * scale[index] + offset[index];
      out_data[in_offset] = MSMIN(MSMAX(tmp, 0.0f), 6.0f);
    }
  }
}

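// Per-thread entry point for scale fused with ReLU6.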
void DoScaleRelu6Fp16(const float16_t *in_data, float16_t *out_data, const float16_t *scale, const float16_t *offset,
                      int task_id, const ScaleParameter *scale_param) {
  NNACL_CHECK_ZERO_RETURN(scale_param->op_parameter_.thread_num_);
  int outer_step = UP_DIV(scale_param->outer_size_, scale_param->op_parameter_.thread_num_);
  int outer_start = task_id * outer_step;
  int outer_end = MSMIN(outer_start + outer_step, scale_param->outer_size_);

  if (scale_param->inner_size_ == 1) {
    Fp16ScaleAxisRelu6(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_);
  } else {
    Fp16ScaleInnerRelu6(in_data, out_data, scale, offset, outer_start, outer_end, scale_param->axis_size_,
                        scale_param->inner_size_);
  }
}
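// Illustrative usage sketch (an assumption for documentation purposes, not
// part of this file's upstream code; the field values below are made up).
// A caller sizes ScaleParameter so that outer_size_ * axis_size_ * inner_size_
// equals the tensor's element count, then invokes the kernel once per thread:
//
//   ScaleParameter param;
//   param.op_parameter_.thread_num_ = 2;
//   param.outer_size_ = 4;
//   param.axis_size_ = 16;
//   param.inner_size_ = 1;  // inner_size_ == 1 selects the per-axis path
//   for (int t = 0; t < param.op_parameter_.thread_num_; t++) {
//     DoScaleFp16(in_data, out_data, scale, offset, t, &param);
//   }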