1 /**
2 * Copyright 2020-2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "nnacl/fp16/arithmetic_fp16.h"
18 #include <math.h>
19 #include "nnacl/common_func.h"
20 #include "nnacl/nnacl_utils.h"
21
// Broadcast addition. Both operands are first expanded by TileDimensionsFp16 (in0 -> tile_in0,
// in1 -> tile_in1, driven by param), then `size` elements of the expanded buffers are added into out.
// The return value is the one produced by ElementAddFp16.
int BroadcastAddFp16(const float16_t *in0, const float16_t *in1, float16_t *tile_in0, float16_t *tile_in1,
                     float16_t *out, int size, ArithmeticParameter *param) {
  TileDimensionsFp16(in0, in1, tile_in0, tile_in1, param);
  const int status = ElementAddFp16(tile_in0, tile_in1, out, size);
  return status;
}
27
TileOneDimensionFp16(const void * input,void * output,int dim,size_t ndim,const int * inShape,const int * inStrides,const int * outStrides,const int * multiple)28 void TileOneDimensionFp16(const void *input, void *output, int dim, size_t ndim, const int *inShape,
29 const int *inStrides, const int *outStrides, const int *multiple) {
30 const float16_t *inData = (const float16_t *)input;
31 float16_t *outData = (float16_t *)output;
32
33 int srcDimSize = inShape[dim];
34 if (dim == ndim - 1) {
35 for (int i = 0; i < multiple[dim]; i++) {
36 memcpy(outData, inData, srcDimSize * sizeof(float16_t));
37 outData += srcDimSize;
38 }
39 return;
40 }
41 for (size_t i = 0; i < srcDimSize; i++) {
42 for (size_t j = 0; j < multiple[dim]; j++) {
43 TileOneDimensionFp16(inData + inStrides[dim] * i, outData + outStrides[dim] * (i + j * srcDimSize), dim + 1, ndim,
44 inShape, inStrides, outStrides, multiple);
45 }
46 }
47 }
48
// Tiles both operands of a broadcast operation.
// CalcMultiplesAndStrides(param) is called first; afterwards data0 is tiled into tile_data0 using the
// in_shape0_/in_strides0_/multiples0_ fields and data1 into tile_data1 using the in_shape1_/in_strides1_/
// multiples1_ fields. Both share param->ndim_ and param->out_strides_.
void TileDimensionsFp16(const float16_t *data0, const float16_t *data1, float16_t *tile_data0, float16_t *tile_data1,
                        ArithmeticParameter *param) {
  CalcMultiplesAndStrides(param);

  // First operand.
  TileOneDimensionFp16(data0, tile_data0, 0, param->ndim_, param->in_shape0_, param->in_strides0_,
                       param->out_strides_, param->multiples0_);

  // Second operand.
  TileOneDimensionFp16(data1, tile_data1, 0, param->ndim_, param->in_shape1_, param->in_strides1_,
                       param->out_strides_, param->multiples1_);
}
57
// Element-wise product: output[i] = input0[i] * input1[i] for i in [0, element_size).
// Always returns NNACL_OK.
int ElementMulFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  // Vector loop: eight lanes per iteration.
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t a = vld1q_f16(input0 + i);
    float16x8_t b = vld1q_f16(input1 + i);
    vst1q_f16(output + i, vmulq_f16(a, b));
  }
#endif
  // Scalar loop: the remaining tail, or the whole range when NEON is disabled.
  for (; i < element_size; i++) {
    output[i] = input0[i] * input1[i];
  }
  return NNACL_OK;
}
73
// Product of a buffer and a broadcast scalar.
//   first_scalar == true  : output[i] = input0[0] * input1[i]
//   first_scalar == false : output[i] = input0[i] * input1[0]
// Always returns NNACL_OK.
int ElementOptMulFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                      bool first_scalar) {
#ifdef ENABLE_NEON
  // Both candidate scalars are splatted up front; only one of them is used below.
  float16x8_t bcast0 = vdupq_n_f16(input0[0]);
  float16x8_t bcast1 = vdupq_n_f16(input1[0]);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    for (; i <= element_size - 8; i += C8NUM) {
      float16x8_t b = vld1q_f16(input1 + i);
      vst1q_f16(output + i, vmulq_f16(bcast0, b));
    }
#endif
    for (; i < element_size; i++) {
      output[i] = input0[0] * input1[i];
    }
    return NNACL_OK;
  }

#ifdef ENABLE_NEON
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t a = vld1q_f16(input0 + i);
    vst1q_f16(output + i, vmulq_f16(a, bcast1));
  }
#endif
  for (; i < element_size; i++) {
    output[i] = input0[i] * input1[0];
  }
  return NNACL_OK;
}
106
// Element-wise product followed by ReLU: output[i] = max(input0[i] * input1[i], 0).
// Always returns NNACL_OK.
int ElementMulReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  float16x8_t zero_v = vdupq_n_f16(0.0);
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t a = vld1q_f16(input0 + i);
    float16x8_t b = vld1q_f16(input1 + i);
    float16x8_t r = vmaxq_f16(vmulq_f16(a, b), zero_v);
    vst1q_f16(output + i, r);
  }
#endif
  // Scalar tail: anything not strictly positive becomes 0.
  for (; i < element_size; i++) {
    float16_t prod = input0[i] * input1[i];
    if (prod > 0) {
      output[i] = prod;
    } else {
      output[i] = 0;
    }
  }
  return NNACL_OK;
}
127
// Product of a buffer and a broadcast scalar, followed by ReLU.
//   first_scalar == true  : output[i] = max(input0[0] * input1[i], 0)
//   first_scalar == false : output[i] = max(input0[i] * input1[0], 0)
// Always returns NNACL_OK.
int ElementOptMulReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                          bool first_scalar) {
#ifdef ENABLE_NEON
  float16x8_t bcast0 = vdupq_n_f16(input0[0]);
  float16x8_t bcast1 = vdupq_n_f16(input1[0]);
  float16x8_t zero_v = vdupq_n_f16(0.0);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    for (; i <= element_size - 8; i += C8NUM) {
      float16x8_t b = vld1q_f16(input1 + i);
      float16x8_t r = vmaxq_f16(vmulq_f16(bcast0, b), zero_v);
      vst1q_f16(output + i, r);
    }
#endif
    for (; i < element_size; i++) {
      float16_t prod = input0[0] * input1[i];
      if (prod > 0) {
        output[i] = prod;
      } else {
        output[i] = 0;
      }
    }
    return NNACL_OK;
  }

#ifdef ENABLE_NEON
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t a = vld1q_f16(input0 + i);
    float16x8_t r = vmaxq_f16(vmulq_f16(a, bcast1), zero_v);
    vst1q_f16(output + i, r);
  }
#endif
  for (; i < element_size; i++) {
    float16_t prod = input0[i] * input1[0];
    if (prod > 0) {
      output[i] = prod;
    } else {
      output[i] = 0;
    }
  }
  return NNACL_OK;
}
165
// Element-wise product clamped to [0, 6]: output[i] = min(max(input0[i] * input1[i], 0), 6).
// Always returns NNACL_OK.
int ElementMulRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  float16x8_t lower_v = vdupq_n_f16(0.0);
  float16x8_t upper_v = vdupq_n_f16(6.0);
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t a = vld1q_f16(input0 + i);
    float16x8_t b = vld1q_f16(input1 + i);
    float16x8_t r = vmulq_f16(a, b);
    r = vmaxq_f16(r, lower_v);
    r = vminq_f16(r, upper_v);
    vst1q_f16(output + i, r);
  }
#endif
  // Scalar tail, or the whole range when NEON is disabled.
  for (; i < element_size; i++) {
    output[i] = MSMIN(MSMAX(input0[i] * input1[i], 0), 6);
  }
  return NNACL_OK;
}
184
// Product of a buffer and a broadcast scalar, clamped to [0, 6].
//   first_scalar == true  : output[i] = clamp(input0[0] * input1[i])
//   first_scalar == false : output[i] = clamp(input0[i] * input1[0])
// Always returns NNACL_OK.
int ElementOptMulRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                           bool first_scalar) {
#ifdef ENABLE_NEON
  float16x8_t bcast0 = vdupq_n_f16(input0[0]);
  float16x8_t bcast1 = vdupq_n_f16(input1[0]);
  float16x8_t lower_v = vdupq_n_f16(0.0);
  float16x8_t upper_v = vdupq_n_f16(6.0);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    for (; i <= element_size - 8; i += C8NUM) {
      float16x8_t b = vld1q_f16(input1 + i);
      float16x8_t r = vmulq_f16(bcast0, b);
      r = vmaxq_f16(r, lower_v);
      r = vminq_f16(r, upper_v);
      vst1q_f16(output + i, r);
    }
#endif
    for (; i < element_size; i++) {
      output[i] = MSMIN(MSMAX(input0[0] * input1[i], 0), 6);
    }
    return NNACL_OK;
  }

#ifdef ENABLE_NEON
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t a = vld1q_f16(input0 + i);
    float16x8_t r = vmulq_f16(a, bcast1);
    r = vmaxq_f16(r, lower_v);
    r = vminq_f16(r, upper_v);
    vst1q_f16(output + i, r);
  }
#endif
  for (; i < element_size; i++) {
    output[i] = MSMIN(MSMAX(input0[i] * input1[0], 0), 6);
  }
  return NNACL_OK;
}
221
// Element-wise sum: output[i] = input0[i] + input1[i] for i in [0, element_size).
// Always returns NNACL_OK.
int ElementAddFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  // Eight lanes at a time.
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t a = vld1q_f16(input0 + i);
    float16x8_t b = vld1q_f16(input1 + i);
    vst1q_f16(output + i, vaddq_f16(a, b));
  }
  // Then four lanes at a time for what the eight-lane loop left over.
  for (; i <= element_size - 4; i += C4NUM) {
    float16x4_t a = vld1_f16(input0 + i);
    float16x4_t b = vld1_f16(input1 + i);
    vst1_f16(output + i, vadd_f16(a, b));
  }
#endif
  // Scalar tail, or the whole range when NEON is disabled.
  for (; i < element_size; i++) {
    output[i] = input0[i] + input1[i];
  }
  return NNACL_OK;
}
243
// Sum of a buffer and a broadcast scalar.
//   first_scalar == true  : output[i] = input0[0] + input1[i]
//   first_scalar == false : output[i] = input0[i] + input1[0]
// Always returns NNACL_OK.
int ElementOptAddFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                      bool first_scalar) {
#ifdef ENABLE_NEON
  float16x8_t bcast0 = vdupq_n_f16(input0[0]);
  float16x8_t bcast1 = vdupq_n_f16(input1[0]);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    for (; i <= element_size - 8; i += C8NUM) {
      float16x8_t b = vld1q_f16(input1 + i);
      vst1q_f16(output + i, vaddq_f16(bcast0, b));
    }
#endif
    for (; i < element_size; i++) {
      output[i] = input0[0] + input1[i];
    }
    return NNACL_OK;
  }

#ifdef ENABLE_NEON
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t a = vld1q_f16(input0 + i);
    vst1q_f16(output + i, vaddq_f16(a, bcast1));
  }
#endif
  for (; i < element_size; i++) {
    output[i] = input0[i] + input1[0];
  }
  return NNACL_OK;
}
276
// Element-wise sum followed by ReLU: output[i] = max(input0[i] + input1[i], 0).
// Always returns NNACL_OK.
int ElementAddReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  // Eight-lane pass.
  float16x8_t zero_v8 = vdupq_n_f16(0.0);
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t a = vld1q_f16(input0 + i);
    float16x8_t b = vld1q_f16(input1 + i);
    float16x8_t r = vmaxq_f16(vaddq_f16(a, b), zero_v8);
    vst1q_f16(output + i, r);
  }
  // Four-lane pass over what is left.
  float16x4_t zero_v4 = vdup_n_f16(0.0f);
  for (; i <= element_size - 4; i += C4NUM) {
    float16x4_t a = vld1_f16(input0 + i);
    float16x4_t b = vld1_f16(input1 + i);
    float16x4_t r = vmax_f16(vadd_f16(a, b), zero_v4);
    vst1_f16(output + i, r);
  }
#endif
  // Scalar tail: anything not strictly positive becomes 0.
  for (; i < element_size; i++) {
    float16_t sum = input0[i] + input1[i];
    if (sum > 0) {
      output[i] = sum;
    } else {
      output[i] = 0;
    }
  }
  return NNACL_OK;
}
303
// Sum of a buffer and a broadcast scalar, followed by ReLU.
//   first_scalar == true  : output[i] = max(input0[0] + input1[i], 0)
//   first_scalar == false : output[i] = max(input0[i] + input1[0], 0)
// Always returns NNACL_OK.
int ElementOptAddReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                          bool first_scalar) {
#ifdef ENABLE_NEON
  float16x8_t bcast0 = vdupq_n_f16(input0[0]);
  float16x8_t bcast1 = vdupq_n_f16(input1[0]);
  float16x8_t zero_v = vdupq_n_f16(0.0);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    for (; i <= element_size - 8; i += C8NUM) {
      float16x8_t b = vld1q_f16(input1 + i);
      float16x8_t r = vmaxq_f16(vaddq_f16(bcast0, b), zero_v);
      vst1q_f16(output + i, r);
    }
#endif
    for (; i < element_size; i++) {
      float16_t sum = input0[0] + input1[i];
      if (sum > 0) {
        output[i] = sum;
      } else {
        output[i] = 0;
      }
    }
    return NNACL_OK;
  }

#ifdef ENABLE_NEON
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t a = vld1q_f16(input0 + i);
    float16x8_t r = vmaxq_f16(vaddq_f16(a, bcast1), zero_v);
    vst1q_f16(output + i, r);
  }
#endif
  for (; i < element_size; i++) {
    float16_t sum = input0[i] + input1[0];
    if (sum > 0) {
      output[i] = sum;
    } else {
      output[i] = 0;
    }
  }
  return NNACL_OK;
}
341
// Element-wise sum clamped to [0, 6]: output[i] = min(max(input0[i] + input1[i], 0), 6).
// Always returns NNACL_OK.
int ElementAddRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  // Eight-lane pass.
  float16x8_t lower_v8 = vdupq_n_f16(0.0);
  float16x8_t upper_v8 = vdupq_n_f16(6.0);
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t a = vld1q_f16(input0 + i);
    float16x8_t b = vld1q_f16(input1 + i);
    float16x8_t r = vaddq_f16(a, b);
    r = vmaxq_f16(r, lower_v8);
    r = vminq_f16(r, upper_v8);
    vst1q_f16(output + i, r);
  }
  // Four-lane pass over what is left.
  float16x4_t lower_v4 = vdup_n_f16(0.0);
  float16x4_t upper_v4 = vdup_n_f16(6.0);
  for (; i <= element_size - 4; i += C4NUM) {
    float16x4_t a = vld1_f16(input0 + i);
    float16x4_t b = vld1_f16(input1 + i);
    float16x4_t r = vadd_f16(a, b);
    r = vmax_f16(r, lower_v4);
    r = vmin_f16(r, upper_v4);
    vst1_f16(output + i, r);
  }
#endif
  // Scalar tail, or the whole range when NEON is disabled.
  for (; i < element_size; i++) {
    output[i] = MSMIN(MSMAX(input0[i] + input1[i], 0), 6);
  }
  return NNACL_OK;
}
369
// Sum of a buffer and a broadcast scalar, clamped to [0, 6].
//   first_scalar == true  : output[i] = clamp(input0[0] + input1[i])
//   first_scalar == false : output[i] = clamp(input0[i] + input1[0])
// Always returns NNACL_OK.
int ElementOptAddRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                           bool first_scalar) {
#ifdef ENABLE_NEON
  float16x8_t bcast0 = vdupq_n_f16(input0[0]);
  float16x8_t bcast1 = vdupq_n_f16(input1[0]);
  float16x8_t lower_v = vdupq_n_f16(0.0);
  float16x8_t upper_v = vdupq_n_f16(6.0);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    for (; i <= element_size - 8; i += C8NUM) {
      float16x8_t b = vld1q_f16(input1 + i);
      float16x8_t r = vaddq_f16(bcast0, b);
      r = vmaxq_f16(r, lower_v);
      r = vminq_f16(r, upper_v);
      vst1q_f16(output + i, r);
    }
#endif
    for (; i < element_size; i++) {
      output[i] = MSMIN(MSMAX(input0[0] + input1[i], 0), 6);
    }
    return NNACL_OK;
  }

#ifdef ENABLE_NEON
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t a = vld1q_f16(input0 + i);
    float16x8_t r = vaddq_f16(a, bcast1);
    r = vmaxq_f16(r, lower_v);
    r = vminq_f16(r, upper_v);
    vst1q_f16(output + i, r);
  }
#endif
  for (; i < element_size; i++) {
    output[i] = MSMIN(MSMAX(input0[i] + input1[0], 0), 6);
  }
  return NNACL_OK;
}
406
// Element-wise difference: output[i] = input0[i] - input1[i] for i in [0, element_size).
// Always returns NNACL_OK.
int ElementSubFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  // Vector loop: eight lanes per iteration.
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t a = vld1q_f16(input0 + i);
    float16x8_t b = vld1q_f16(input1 + i);
    vst1q_f16(output + i, vsubq_f16(a, b));
  }
#endif
  // Scalar tail, or the whole range when NEON is disabled.
  for (; i < element_size; i++) {
    output[i] = input0[i] - input1[i];
  }
  return NNACL_OK;
}
422
// Difference between a buffer and a broadcast scalar.
//   first_scalar == true  : output[i] = input0[0] - input1[i]
//   first_scalar == false : output[i] = input0[i] - input1[0]
// Always returns NNACL_OK.
int ElementOptSubFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                      bool first_scalar) {
#ifdef ENABLE_NEON
  float16x8_t bcast0 = vdupq_n_f16(input0[0]);
  float16x8_t bcast1 = vdupq_n_f16(input1[0]);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    for (; i <= element_size - 8; i += C8NUM) {
      float16x8_t b = vld1q_f16(input1 + i);
      vst1q_f16(output + i, vsubq_f16(bcast0, b));
    }
#endif
    for (; i < element_size; i++) {
      output[i] = input0[0] - input1[i];
    }
    return NNACL_OK;
  }

#ifdef ENABLE_NEON
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t a = vld1q_f16(input0 + i);
    vst1q_f16(output + i, vsubq_f16(a, bcast1));
  }
#endif
  for (; i < element_size; i++) {
    output[i] = input0[i] - input1[0];
  }
  return NNACL_OK;
}
455
// Element-wise difference followed by ReLU: output[i] = max(input0[i] - input1[i], 0).
// Always returns NNACL_OK.
int ElementSubReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  float16x8_t zero_v = vdupq_n_f16(0.0);
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t a = vld1q_f16(input0 + i);
    float16x8_t b = vld1q_f16(input1 + i);
    float16x8_t r = vmaxq_f16(vsubq_f16(a, b), zero_v);
    vst1q_f16(output + i, r);
  }
#endif
  // Scalar tail: anything not strictly positive becomes 0.
  for (; i < element_size; i++) {
    float16_t diff = input0[i] - input1[i];
    if (diff > 0) {
      output[i] = diff;
    } else {
      output[i] = 0;
    }
  }
  return NNACL_OK;
}
474
// Difference between a buffer and a broadcast scalar, followed by ReLU.
//   first_scalar == true  : output[i] = max(input0[0] - input1[i], 0)
//   first_scalar == false : output[i] = max(input0[i] - input1[0], 0)
// Always returns NNACL_OK.
int ElementOptSubReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                          bool first_scalar) {
#ifdef ENABLE_NEON
  float16x8_t bcast0 = vdupq_n_f16(input0[0]);
  float16x8_t bcast1 = vdupq_n_f16(input1[0]);
  float16x8_t zero_v = vdupq_n_f16(0.0);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    for (; i <= element_size - 8; i += C8NUM) {
      float16x8_t b = vld1q_f16(input1 + i);
      float16x8_t r = vmaxq_f16(vsubq_f16(bcast0, b), zero_v);
      vst1q_f16(output + i, r);
    }
#endif
    for (; i < element_size; i++) {
      float16_t diff = input0[0] - input1[i];
      if (diff > 0) {
        output[i] = diff;
      } else {
        output[i] = 0;
      }
    }
    return NNACL_OK;
  }

#ifdef ENABLE_NEON
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t a = vld1q_f16(input0 + i);
    float16x8_t r = vmaxq_f16(vsubq_f16(a, bcast1), zero_v);
    vst1q_f16(output + i, r);
  }
#endif
  for (; i < element_size; i++) {
    float16_t diff = input0[i] - input1[0];
    if (diff > 0) {
      output[i] = diff;
    } else {
      output[i] = 0;
    }
  }
  return NNACL_OK;
}
512
// Element-wise difference clamped to [0, 6]: output[i] = min(max(input0[i] - input1[i], 0), 6).
// Always returns NNACL_OK.
int ElementSubRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  float16x8_t lower_v = vdupq_n_f16(0.0);
  float16x8_t upper_v = vdupq_n_f16(6.0);
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t a = vld1q_f16(input0 + i);
    float16x8_t b = vld1q_f16(input1 + i);
    float16x8_t r = vsubq_f16(a, b);
    r = vmaxq_f16(r, lower_v);
    r = vminq_f16(r, upper_v);
    vst1q_f16(output + i, r);
  }
#endif
  // Scalar tail, or the whole range when NEON is disabled.
  for (; i < element_size; i++) {
    output[i] = MSMIN(MSMAX(input0[i] - input1[i], 0), 6);
  }
  return NNACL_OK;
}
531
// Difference between a buffer and a broadcast scalar, clamped to [0, 6].
//   first_scalar == true  : output[i] = clamp(input0[0] - input1[i])
//   first_scalar == false : output[i] = clamp(input0[i] - input1[0])
// Always returns NNACL_OK.
int ElementOptSubRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                           bool first_scalar) {
#ifdef ENABLE_NEON
  float16x8_t bcast0 = vdupq_n_f16(input0[0]);
  float16x8_t bcast1 = vdupq_n_f16(input1[0]);
  float16x8_t lower_v = vdupq_n_f16(0.0);
  float16x8_t upper_v = vdupq_n_f16(6.0);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    for (; i <= element_size - 8; i += C8NUM) {
      float16x8_t b = vld1q_f16(input1 + i);
      float16x8_t r = vsubq_f16(bcast0, b);
      r = vmaxq_f16(r, lower_v);
      r = vminq_f16(r, upper_v);
      vst1q_f16(output + i, r);
    }
#endif
    for (; i < element_size; i++) {
      output[i] = MSMIN(MSMAX(input0[0] - input1[i], 0), 6);
    }
    return NNACL_OK;
  }

#ifdef ENABLE_NEON
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t a = vld1q_f16(input0 + i);
    float16x8_t r = vsubq_f16(a, bcast1);
    r = vmaxq_f16(r, lower_v);
    r = vminq_f16(r, upper_v);
    vst1q_f16(output + i, r);
  }
#endif
  for (; i < element_size; i++) {
    output[i] = MSMIN(MSMAX(input0[i] - input1[0], 0), 6);
  }
  return NNACL_OK;
}
568
// Element-wise quotient: output[i] = input0[i] / input1[i] for i in [0, element_size).
// No zero-divisor check is performed here. Always returns NNACL_OK.
int ElementDivFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  // Vector loop: eight lanes per iteration.
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t num = vld1q_f16(input0 + i);
    float16x8_t den = vld1q_f16(input1 + i);
    vst1q_f16(output + i, MS_DIVQ_F16(num, den));
  }
#endif
  // Scalar tail, or the whole range when NEON is disabled.
  for (; i < element_size; i++) {
    output[i] = input0[i] / input1[i];
  }
  return NNACL_OK;
}
584
// Quotient of a buffer and a broadcast scalar.
//   first_scalar == true  : output[i] = input0[0] / input1[i]   (no zero-divisor check)
//   first_scalar == false : output[i] = input0[i] / input1[0]
// Returns NNACL_ERRCODE_DIVISOR_ZERO when first_scalar is false and input1[0] == 0 (nothing is written in
// that case); otherwise NNACL_OK.
int ElementOptDivFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                      bool first_scalar) {
#ifdef ENABLE_NEON
  float16x8_t bcast0 = vdupq_n_f16(input0[0]);
  float16x8_t bcast1 = vdupq_n_f16(input1[0]);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    for (; i <= element_size - 8; i += C8NUM) {
      float16x8_t den = vld1q_f16(input1 + i);
      vst1q_f16(output + i, MS_DIVQ_F16(bcast0, den));
    }
#endif
    for (; i < element_size; i++) {
      output[i] = input0[0] / input1[i];
    }
    return NNACL_OK;
  }

  // Scalar divisor: reject zero before touching the output.
  if (input1[0] == 0) {
    return NNACL_ERRCODE_DIVISOR_ZERO;
  }
#ifdef ENABLE_NEON
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t num = vld1q_f16(input0 + i);
    vst1q_f16(output + i, MS_DIVQ_F16(num, bcast1));
  }
#endif
  for (; i < element_size; i++) {
    output[i] = input0[i] / input1[0];
  }
  return NNACL_OK;
}
620
// Element-wise quotient followed by ReLU: output[i] = max(input0[i] / input1[i], 0).
// The scalar loop returns NNACL_ERRCODE_DIVISOR_ZERO at the first zero divisor it meets (elements before it
// are already written). The NEON loop performs no such check. Returns NNACL_OK otherwise.
int ElementDivReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  float16x8_t zero_v = vdupq_n_f16(0.0);
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t num = vld1q_f16(input0 + i);
    float16x8_t den = vld1q_f16(input1 + i);
    float16x8_t r = vmaxq_f16(MS_DIVQ_F16(num, den), zero_v);
    vst1q_f16(output + i, r);
  }
#endif
  for (; i < element_size; i++) {
    if (input1[i] == 0) {
      return NNACL_ERRCODE_DIVISOR_ZERO;
    }
    float16_t quot = input0[i] / input1[i];
    if (quot > 0) {
      output[i] = quot;
    } else {
      output[i] = 0;
    }
  }
  return NNACL_OK;
}
642
// Quotient of a buffer and a broadcast scalar, followed by ReLU.
//   first_scalar == true  : output[i] = max(input0[0] / input1[i], 0); the scalar loop returns
//                           NNACL_ERRCODE_DIVISOR_ZERO at the first zero divisor, the NEON loop does not check
//   first_scalar == false : output[i] = max(input0[i] / input1[0], 0); returns NNACL_ERRCODE_DIVISOR_ZERO
//                           before writing anything when input1[0] == 0
// Returns NNACL_OK otherwise.
int ElementOptDivReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                          bool first_scalar) {
#ifdef ENABLE_NEON
  float16x8_t bcast0 = vdupq_n_f16(input0[0]);
  float16x8_t bcast1 = vdupq_n_f16(input1[0]);
  float16x8_t zero_v = vdupq_n_f16(0.0);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    for (; i <= element_size - 8; i += C8NUM) {
      float16x8_t den = vld1q_f16(input1 + i);
      float16x8_t r = vmaxq_f16(MS_DIVQ_F16(bcast0, den), zero_v);
      vst1q_f16(output + i, r);
    }
#endif
    for (; i < element_size; i++) {
      if (input1[i] == 0) {
        return NNACL_ERRCODE_DIVISOR_ZERO;
      }
      output[i] = MSMAX(input0[0] / input1[i], 0);
    }
    return NNACL_OK;
  }

  if (input1[0] == 0) {
    return NNACL_ERRCODE_DIVISOR_ZERO;
  }
#ifdef ENABLE_NEON
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t num = vld1q_f16(input0 + i);
    float16x8_t r = vmaxq_f16(MS_DIVQ_F16(num, bcast1), zero_v);
    vst1q_f16(output + i, r);
  }
#endif
  for (; i < element_size; i++) {
    output[i] = MSMAX(input0[i] / input1[0], 0);
  }
  return NNACL_OK;
}
682
// Element-wise quotient clamped to [0, 6]: output[i] = min(max(input0[i] / input1[i], 0), 6).
// The scalar loop returns NNACL_ERRCODE_DIVISOR_ZERO at the first zero divisor it meets; the NEON loop
// performs no such check. Returns NNACL_OK otherwise.
int ElementDivRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  float16x8_t lower_v = vdupq_n_f16(0.0);
  float16x8_t upper_v = vdupq_n_f16(6.0);
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t num = vld1q_f16(input0 + i);
    float16x8_t den = vld1q_f16(input1 + i);
    float16x8_t r = MS_DIVQ_F16(num, den);
    r = vmaxq_f16(r, lower_v);
    r = vminq_f16(r, upper_v);
    vst1q_f16(output + i, r);
  }
#endif
  for (; i < element_size; i++) {
    if (input1[i] == 0) {
      return NNACL_ERRCODE_DIVISOR_ZERO;
    }
    output[i] = MSMIN(MSMAX(input0[i] / input1[i], 0), 6);
  }
  return NNACL_OK;
}
704
// Quotient of a buffer and a broadcast scalar, clamped to [0, 6].
//   first_scalar == true  : output[i] = clamp(input0[0] / input1[i]); the scalar loop returns
//                           NNACL_ERRCODE_DIVISOR_ZERO at the first zero divisor, the NEON loop does not check
//   first_scalar == false : output[i] = clamp(input0[i] / input1[0]); returns NNACL_ERRCODE_DIVISOR_ZERO
//                           before writing anything when input1[0] == 0
// Returns NNACL_OK otherwise.
int ElementOptDivRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                           bool first_scalar) {
#ifdef ENABLE_NEON
  float16x8_t bcast0 = vdupq_n_f16(input0[0]);
  float16x8_t bcast1 = vdupq_n_f16(input1[0]);
  float16x8_t lower_v = vdupq_n_f16(0.0);
  float16x8_t upper_v = vdupq_n_f16(6.0);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    for (; i <= element_size - 8; i += C8NUM) {
      float16x8_t den = vld1q_f16(input1 + i);
      float16x8_t r = MS_DIVQ_F16(bcast0, den);
      r = vmaxq_f16(r, lower_v);
      r = vminq_f16(r, upper_v);
      vst1q_f16(output + i, r);
    }
#endif
    for (; i < element_size; i++) {
      if (input1[i] == 0) {
        return NNACL_ERRCODE_DIVISOR_ZERO;
      }
      output[i] = MSMIN(MSMAX(input0[0] / input1[i], 0), 6);
    }
    return NNACL_OK;
  }

  if (input1[0] == 0) {
    return NNACL_ERRCODE_DIVISOR_ZERO;
  }
#ifdef ENABLE_NEON
  for (; i <= element_size - 8; i += C8NUM) {
    float16x8_t num = vld1q_f16(input0 + i);
    float16x8_t r = MS_DIVQ_F16(num, bcast1);
    r = vmaxq_f16(r, lower_v);
    r = vminq_f16(r, upper_v);
    vst1q_f16(output + i, r);
  }
#endif
  for (; i < element_size; i++) {
    output[i] = MSMIN(MSMAX(input0[i] / input1[0], 0), 6);
  }
  return NNACL_OK;
}
745
// Element-wise floor-modulo: output[i] = input0[i] - floorf(input0[i] / input1[i]) * input1[i].
// Returns NNACL_ERRCODE_DIVISOR_ZERO at the first input1[i] equal to 0 (earlier elements are already
// written); NNACL_OK otherwise.
int ElementFloorModFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int pos = 0;
  while (pos < element_size) {
    if (input1[pos] == 0) {
      return NNACL_ERRCODE_DIVISOR_ZERO;
    }
    output[pos] = input0[pos] - floorf(input0[pos] / input1[pos]) * input1[pos];
    ++pos;
  }
  return NNACL_OK;
}
755
// Floor-modulo with one broadcast scalar operand: result = a - floorf(a / b) * b.
//   first_scalar == true  : a = input0[0] (scalar), b = input1[i]
//   first_scalar == false : a = input0[i],          b = input1[0] (scalar)
// The scalar-dividend branch reads only input0[0]; indexing input0 with i there would read past a
// one-element operand.
// Returns NNACL_ERRCODE_DIVISOR_ZERO when a divisor equals 0, as ElementFloorModFp16 does (elements before
// the offending one are already written); NNACL_OK otherwise.
int ElementOptFloorModFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                           bool first_scalar) {
  if (first_scalar) {
    for (int i = 0; i < element_size; ++i) {
      if (input1[i] == 0) {
        return NNACL_ERRCODE_DIVISOR_ZERO;
      }
      output[i] = input0[0] - floorf(input0[0] / input1[i]) * input1[i];
    }
  } else {
    // The divisor is the same for every element, so it is validated once.
    if (input1[0] == 0) {
      return NNACL_ERRCODE_DIVISOR_ZERO;
    }
    for (int i = 0; i < element_size; ++i) {
      output[i] = input0[i] - floorf(input0[i] / input1[0]) * input1[0];
    }
  }
  return NNACL_OK;
}
769
// Element-wise floor-division: output[i] = floorf(input0[i] / input1[i]) for i in [0, element_size).
// No zero-divisor check is performed. Always returns NNACL_OK.
int ElementFloorDivFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int pos = 0;
  while (pos < element_size) {
    output[pos] = floorf(input0[pos] / input1[pos]);
    ++pos;
  }
  return NNACL_OK;
}
// Floor-division with one broadcast scalar operand.
//   first_scalar == true  : output[i] = floorf(input0[0] / input1[i])
//   first_scalar == false : output[i] = floorf(input0[i] / input1[0])
// The scalar-dividend branch reads only input0[0]; indexing input0 with i there would read past a
// one-element operand.
// No zero-divisor check is performed, matching ElementFloorDivFp16. Always returns NNACL_OK.
int ElementOptFloorDivFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                           bool first_scalar) {
  if (first_scalar) {
    for (int i = 0; i < element_size; ++i) {
      output[i] = floorf(input0[0] / input1[i]);
    }
  } else {
    for (int i = 0; i < element_size; ++i) {
      output[i] = floorf(input0[i] / input1[0]);
    }
  }
  return NNACL_OK;
}
789
// Element-wise logical AND: output[i] is 1 when both input0[i] and input1[i] are non-zero, else 0.
// Always returns NNACL_OK.
int ElementLogicalAndFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int index = 0;
#ifdef ENABLE_NEON
  float16x8_t vtrue = vdupq_n_f16(1);
  float16x8_t vfalse = vdupq_n_f16(0);
  // 0x7FFF keeps every bit except the sign, so -0.0 is treated as zero just like +0.0.
  uint16x8_t mask = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
  uint16x8_t zeros = vdupq_n_u16(0);
  for (; index <= element_size - 8; index += C8NUM) {
    uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input0 + index)), mask);
    uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input1 + index)), mask);
    // Each operand is tested for zero on its own. AND-ing the raw bit patterns instead would report false
    // for two non-zero values whose bits do not overlap (e.g. 1.0 = 0x3C00 and 2.0 = 0x4000), which
    // disagrees with the scalar loop below.
    uint16x8_t any_zero = vorrq_u16(vceqq_u16(vin0, zeros), vceqq_u16(vin1, zeros));
    float16x8_t vout = vbslq_f16(any_zero, vfalse, vtrue);
    vst1q_f16(output + index, vout);
  }
#endif
  // Scalar tail, or the whole range when NEON is disabled.
  for (; index < element_size; index++) {
    output[index] = (float16_t)((bool)(input0[index]) & (bool)(input1[index]));
  }
  return NNACL_OK;
}
809
// Logical AND of a buffer with a broadcast scalar; each output element is 1 or 0.
//   first_scalar == true  : output[i] = (input0[0] != 0) && (input1[i] != 0)
//   first_scalar == false : output[i] = (input0[i] != 0) && (input1[0] != 0)
// Always returns NNACL_OK.
int ElementOptLogicalAndFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                             bool first_scalar) {
#ifdef ENABLE_NEON
  float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
  float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
  float16x8_t vtrue = vdupq_n_f16(1);
  float16x8_t vfalse = vdupq_n_f16(0);
  // 0x7FFF keeps every bit except the sign, so -0.0 is treated as zero just like +0.0.
  uint16x8_t mask = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
  uint16x8_t zeros = vdupq_n_u16(0);
#endif
  int index = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    for (; index <= element_size - 8; index += C8NUM) {
      float16x8_t vin1_ = vld1q_f16(input1 + index);
      uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vin0_opt), mask);
      uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vin1_), mask);
      // Each operand is tested for zero on its own: AND-ing the raw bit patterns would report false for
      // non-zero values with disjoint bits (e.g. 1.0 = 0x3C00 and 2.0 = 0x4000).
      uint16x8_t any_zero = vorrq_u16(vceqq_u16(vin0, zeros), vceqq_u16(vin1, zeros));
      float16x8_t vout = vbslq_f16(any_zero, vfalse, vtrue);
      vst1q_f16(output + index, vout);
    }
#endif
    for (; index < element_size; index++) {
      output[index] = (float16_t)((bool)(input0[0]) & (bool)(input1[index]));
    }
  } else {
#ifdef ENABLE_NEON
    for (; index <= element_size - 8; index += C8NUM) {
      float16x8_t vin0_ = vld1q_f16(input0 + index);
      uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vin0_), mask);
      uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vin1_opt), mask);
      // Same per-operand zero test as in the branch above.
      uint16x8_t any_zero = vorrq_u16(vceqq_u16(vin0, zeros), vceqq_u16(vin1, zeros));
      float16x8_t vout = vbslq_f16(any_zero, vfalse, vtrue);
      vst1q_f16(output + index, vout);
    }
#endif
    for (; index < element_size; index++) {
      output[index] = (float16_t)((bool)(input0[index]) & (bool)(input1[0]));
    }
  }
  return NNACL_OK;
}
850
/*
 * Element-wise logical OR.
 *
 * output[i] is 1 when input0[i] or input1[i] is non-zero, otherwise 0.
 * With ENABLE_NEON, blocks of eight half floats are handled with vector
 * intrinsics; the remaining elements go through the scalar loop.
 * Always returns NNACL_OK.
 */
int ElementLogicalOrFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int pos = 0;
#ifdef ENABLE_NEON
  const float16x8_t one_lanes = vdupq_n_f16(1);
  const float16x8_t zero_lanes = vdupq_n_f16(0);
  // 0x7FFF: clears the sign bit so that -0.0 has the same bits as +0.0.
  const uint16x8_t magnitude_bits = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
  const uint16x8_t zero_bits = vdupq_n_u16(0);
  const int last_block = element_size - 8;
  while (pos <= last_block) {
    const uint16x8_t lhs = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input0 + pos)), magnitude_bits);
    const uint16x8_t rhs = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input1 + pos)), magnitude_bits);
    // The OR of both bit patterns is zero only when both operands are zero.
    const uint16x8_t both_zero = vceqq_u16(vorrq_u16(lhs, rhs), zero_bits);
    vst1q_f16(output + pos, vbslq_f16(both_zero, zero_lanes, one_lanes));
    pos += C8NUM;
  }
#endif
  while (pos < element_size) {
    const bool lhs_set = (bool)(input0[pos]);
    const bool rhs_set = (bool)(input1[pos]);
    output[pos] = (float16_t)(lhs_set | rhs_set);
    pos++;
  }
  return NNACL_OK;
}
870
/*
 * Logical OR where one operand is a single broadcast value.
 *
 *   first_scalar == true : output[i] = (input0[0] is non-zero) OR (input1[i] is non-zero)
 *   first_scalar == false: output[i] = (input0[i] is non-zero) OR (input1[0] is non-zero)
 *
 * The result is stored as 1 or 0. With ENABLE_NEON, blocks of eight half
 * floats are handled with vector intrinsics; the remaining elements go
 * through the scalar loops.
 * Always returns NNACL_OK.
 */
int ElementOptLogicalOrFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                            bool first_scalar) {
#ifdef ENABLE_NEON
  const float16x8_t lhs_splat = vdupq_n_f16(input0[0]);
  const float16x8_t rhs_splat = vdupq_n_f16(input1[0]);
  const float16x8_t one_lanes = vdupq_n_f16(1);
  const float16x8_t zero_lanes = vdupq_n_f16(0);
  // 0x7FFF: clears the sign bit so that -0.0 has the same bits as +0.0.
  const uint16x8_t magnitude_bits = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
  const uint16x8_t zero_bits = vdupq_n_u16(0);
  const int last_block = element_size - 8;
#endif
  int pos = 0;
  if (!first_scalar) {
    // input1 is the broadcast operand.
#ifdef ENABLE_NEON
    while (pos <= last_block) {
      const float16x8_t loaded = vld1q_f16(input0 + pos);
      const uint16x8_t lhs = vandq_u16(vreinterpretq_u16_f16(loaded), magnitude_bits);
      const uint16x8_t rhs = vandq_u16(vreinterpretq_u16_f16(rhs_splat), magnitude_bits);
      const uint16x8_t both_zero = vceqq_u16(vorrq_u16(lhs, rhs), zero_bits);
      vst1q_f16(output + pos, vbslq_f16(both_zero, zero_lanes, one_lanes));
      pos += C8NUM;
    }
#endif
    for (; pos < element_size; ++pos) {
      output[pos] = (float16_t)((bool)(input0[pos]) | (bool)(input1[0]));
    }
    return NNACL_OK;
  }

  // input0 is the broadcast operand.
#ifdef ENABLE_NEON
  while (pos <= last_block) {
    const float16x8_t loaded = vld1q_f16(input1 + pos);
    const uint16x8_t lhs = vandq_u16(vreinterpretq_u16_f16(lhs_splat), magnitude_bits);
    const uint16x8_t rhs = vandq_u16(vreinterpretq_u16_f16(loaded), magnitude_bits);
    const uint16x8_t both_zero = vceqq_u16(vorrq_u16(lhs, rhs), zero_bits);
    vst1q_f16(output + pos, vbslq_f16(both_zero, zero_lanes, one_lanes));
    pos += C8NUM;
  }
#endif
  for (; pos < element_size; ++pos) {
    output[pos] = (float16_t)((bool)(input0[0]) | (bool)(input1[pos]));
  }
  return NNACL_OK;
}
911
/*
 * Squared difference built from two existing kernels: ElementSubFp16 writes
 * its result for (input0, input1) into output, then ElementMulFp16 multiplies
 * output by itself in place.
 *
 * Returns the status of ElementMulFp16; the status of ElementSubFp16 is not
 * inspected.
 */
int ElementSquaredDifferenceFp16(const float16_t *input0, const float16_t *input1, float16_t *output,
                                 int element_size) {
  // Step 1: output <- result of ElementSubFp16.
  (void)ElementSubFp16(input0, input1, output, element_size);
  // Step 2: output <- output * output, element by element.
  const int mul_status = ElementMulFp16(output, output, output, element_size);
  return mul_status;
}
917
/*
 * Squared difference with one broadcast operand: ElementOptSubFp16 writes its
 * result into output (first_scalar is forwarded unchanged), then
 * ElementMulFp16 multiplies output by itself in place.
 *
 * Returns the status of ElementMulFp16; the status of ElementOptSubFp16 is
 * not inspected.
 */
int ElementOptSquaredDifferenceFp16(const float16_t *input0, const float16_t *input1, float16_t *output,
                                    int element_size, bool first_scalar) {
  // Step 1: output <- result of ElementOptSubFp16.
  (void)ElementOptSubFp16(input0, input1, output, element_size, first_scalar);
  // Step 2: output <- output * output, element by element.
  const int mul_status = ElementMulFp16(output, output, output, element_size);
  return mul_status;
}
923
/*
 * Element-wise maximum of input0 and input1 written to output.
 *
 * With ENABLE_NEON, blocks of eight half floats go through vmaxq_f16; the
 * remaining elements use the MSMAX macro.
 * Always returns NNACL_OK.
 */
int ElementMaximumFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int pos = 0;
#ifdef ENABLE_NEON
  const int last_block = element_size - 8;
  while (pos <= last_block) {
    const float16x8_t lhs = vld1q_f16(input0 + pos);
    const float16x8_t rhs = vld1q_f16(input1 + pos);
    vst1q_f16(output + pos, vmaxq_f16(lhs, rhs));
    pos += C8NUM;
  }
#endif
  // Scalar tail (or the whole range when NEON is not enabled).
  while (pos < element_size) {
    output[pos] = MSMAX(input0[pos], input1[pos]);
    ++pos;
  }
  return NNACL_OK;
}
939
/*
 * Maximum where one operand is a single broadcast value.
 *
 *   first_scalar == true : output[i] = max(input0[0], input1[i])
 *   first_scalar == false: output[i] = max(input0[i], input1[0])
 *
 * With ENABLE_NEON, blocks of eight half floats go through vmaxq_f16; the
 * remaining elements use the MSMAX macro.
 * Always returns NNACL_OK.
 */
int ElementOptMaximumFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                          bool first_scalar) {
#ifdef ENABLE_NEON
  const float16x8_t lhs_splat = vdupq_n_f16(input0[0]);
  const float16x8_t rhs_splat = vdupq_n_f16(input1[0]);
  const int last_block = element_size - 8;
#endif
  int pos = 0;
  if (!first_scalar) {
    // input1 is the broadcast operand.
#ifdef ENABLE_NEON
    while (pos <= last_block) {
      const float16x8_t lhs = vld1q_f16(input0 + pos);
      vst1q_f16(output + pos, vmaxq_f16(lhs, rhs_splat));
      pos += C8NUM;
    }
#endif
    for (; pos < element_size; ++pos) {
      output[pos] = MSMAX(input0[pos], input1[0]);
    }
    return NNACL_OK;
  }

  // input0 is the broadcast operand.
#ifdef ENABLE_NEON
  while (pos <= last_block) {
    const float16x8_t rhs = vld1q_f16(input1 + pos);
    vst1q_f16(output + pos, vmaxq_f16(lhs_splat, rhs));
    pos += C8NUM;
  }
#endif
  for (; pos < element_size; ++pos) {
    output[pos] = MSMAX(input0[0], input1[pos]);
  }
  return NNACL_OK;
}
972
/*
 * Element-wise minimum of input0 and input1 written to output.
 *
 * With ENABLE_NEON, blocks of eight half floats go through vminq_f16; the
 * remaining elements use the MSMIN macro.
 * Always returns NNACL_OK.
 */
int ElementMinimumFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int pos = 0;
#ifdef ENABLE_NEON
  const int last_block = element_size - 8;
  while (pos <= last_block) {
    const float16x8_t lhs = vld1q_f16(input0 + pos);
    const float16x8_t rhs = vld1q_f16(input1 + pos);
    vst1q_f16(output + pos, vminq_f16(lhs, rhs));
    pos += C8NUM;
  }
#endif
  // Scalar tail (or the whole range when NEON is not enabled).
  while (pos < element_size) {
    output[pos] = MSMIN(input0[pos], input1[pos]);
    ++pos;
  }
  return NNACL_OK;
}
988
/*
 * Minimum where one operand is a single broadcast value.
 *
 *   first_scalar == true : output[i] = min(input0[0], input1[i])
 *   first_scalar == false: output[i] = min(input0[i], input1[0])
 *
 * With ENABLE_NEON, blocks of eight half floats go through vminq_f16; the
 * remaining elements use the MSMIN macro.
 * Always returns NNACL_OK.
 */
int ElementOptMinimumFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                          bool first_scalar) {
#ifdef ENABLE_NEON
  const float16x8_t lhs_splat = vdupq_n_f16(input0[0]);
  const float16x8_t rhs_splat = vdupq_n_f16(input1[0]);
  const int last_block = element_size - 8;
#endif
  int pos = 0;
  if (!first_scalar) {
    // input1 is the broadcast operand.
#ifdef ENABLE_NEON
    while (pos <= last_block) {
      const float16x8_t lhs = vld1q_f16(input0 + pos);
      vst1q_f16(output + pos, vminq_f16(lhs, rhs_splat));
      pos += C8NUM;
    }
#endif
    for (; pos < element_size; ++pos) {
      output[pos] = MSMIN(input0[pos], input1[0]);
    }
    return NNACL_OK;
  }

  // input0 is the broadcast operand.
#ifdef ENABLE_NEON
  while (pos <= last_block) {
    const float16x8_t rhs = vld1q_f16(input1 + pos);
    vst1q_f16(output + pos, vminq_f16(lhs_splat, rhs));
    pos += C8NUM;
  }
#endif
  for (; pos < element_size; ++pos) {
    output[pos] = MSMIN(input0[0], input1[pos]);
  }
  return NNACL_OK;
}
1021
ElementNotEqualFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size)1022 int ElementNotEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size) {
1023 int index = 0;
1024 #ifdef ENABLE_NEON
1025 for (; index <= element_size - 8; index += C8NUM) {
1026 float16x8_t vin0 = vld1q_f16(input0 + index);
1027 float16x8_t vin1 = vld1q_f16(input1 + index);
1028 uint8x8_t vout = vmovn_u16(vceqq_f16(vin0, vin1));
1029 vst1_u8(output + index, vout);
1030 }
1031 #endif
1032 for (; index < element_size; index++) {
1033 output[index] = input0[index] != input1[index];
1034 }
1035 return NNACL_OK;
1036 }
1037
ElementOptNotEqualFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size,bool first_scalar)1038 int ElementOptNotEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
1039 bool first_scalar) {
1040 #ifdef ENABLE_NEON
1041 float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
1042 float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
1043 #endif
1044 int index = 0;
1045 if (first_scalar) {
1046 #ifdef ENABLE_NEON
1047 for (; index <= element_size - 8; index += C8NUM) {
1048 float16x8_t vin1 = vld1q_f16(input1 + index);
1049 uint8x8_t vout = vmovn_u16(vceqq_f16(vin0_opt, vin1));
1050 vst1_u8(output + index, vout);
1051 }
1052 #endif
1053 for (; index < element_size; index++) {
1054 output[index] = input0[0] != input1[index];
1055 }
1056 } else {
1057 #ifdef ENABLE_NEON
1058 for (; index <= element_size - 8; index += C8NUM) {
1059 float16x8_t vin0 = vld1q_f16(input0 + index);
1060 uint8x8_t vout = vmovn_u16(vceqq_f16(vin0, vin1_opt));
1061 vst1_u8(output + index, vout);
1062 }
1063 #endif
1064 for (; index < element_size; index++) {
1065 output[index] = input0[index] != input1[0];
1066 }
1067 }
1068 return NNACL_OK;
1069 }
1070
ElementEqualFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size)1071 int ElementEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size) {
1072 int index = 0;
1073 #ifdef ENABLE_NEON
1074 for (; index <= element_size - 8; index += C8NUM) {
1075 float16x8_t vin0 = vld1q_f16(input0 + index);
1076 float16x8_t vin1 = vld1q_f16(input1 + index);
1077 uint8x8_t vout = vmovn_u16(vceqq_f16(vin0, vin1));
1078 vst1_u8(output + index, vout);
1079 }
1080 #endif
1081 for (; index < element_size; index++) {
1082 output[index] = input0[index] == input1[index];
1083 }
1084 return NNACL_OK;
1085 }
1086
ElementOptEqualFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size,bool first_scalar)1087 int ElementOptEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
1088 bool first_scalar) {
1089 #ifdef ENABLE_NEON
1090 float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
1091 float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
1092 #endif
1093 int index = 0;
1094 if (first_scalar) {
1095 #ifdef ENABLE_NEON
1096 for (; index <= element_size - 8; index += C8NUM) {
1097 float16x8_t vin1 = vld1q_f16(input1 + index);
1098 uint8x8_t vout = vmovn_u16(vceqq_f16(vin0_opt, vin1));
1099 vst1_u8(output + index, vout);
1100 }
1101 #endif
1102 for (; index < element_size; index++) {
1103 output[index] = input0[0] == input1[index];
1104 }
1105 } else {
1106 #ifdef ENABLE_NEON
1107 for (; index <= element_size - 8; index += C8NUM) {
1108 float16x8_t vin0 = vld1q_f16(input0 + index);
1109 uint8x8_t vout = vmovn_u16(vceqq_f16(vin0, vin1_opt));
1110 vst1_u8(output + index, vout);
1111 }
1112 #endif
1113 for (; index < element_size; index++) {
1114 output[index] = input0[index] == input1[0];
1115 }
1116 }
1117 return NNACL_OK;
1118 }
1119
ElementLessFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size)1120 int ElementLessFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size) {
1121 int index = 0;
1122 #ifdef ENABLE_NEON
1123 for (; index <= element_size - 8; index += C8NUM) {
1124 float16x8_t vin0 = vld1q_f16(input0 + index);
1125 float16x8_t vin1 = vld1q_f16(input1 + index);
1126 uint8x8_t vout = vmovn_u16(vcltq_f16(vin0, vin1));
1127 vst1_u8(output + index, vout);
1128 }
1129 #endif
1130 for (; index < element_size; index++) {
1131 output[index] = input0[index] < input1[index];
1132 }
1133 return NNACL_OK;
1134 }
1135
ElementOptLessFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size,bool first_scalar)1136 int ElementOptLessFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
1137 bool first_scalar) {
1138 #ifdef ENABLE_NEON
1139 float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
1140 float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
1141 #endif
1142 int index = 0;
1143 if (first_scalar) {
1144 #ifdef ENABLE_NEON
1145 for (; index <= element_size - 8; index += C8NUM) {
1146 float16x8_t vin1 = vld1q_f16(input1 + index);
1147 uint8x8_t vout = vmovn_u16(vcltq_f16(vin0_opt, vin1));
1148 vst1_u8(output + index, vout);
1149 }
1150 #endif
1151 for (; index < element_size; index++) {
1152 output[index] = input0[0] < input1[index];
1153 }
1154 } else {
1155 #ifdef ENABLE_NEON
1156 for (; index <= element_size - 8; index += C8NUM) {
1157 float16x8_t vin0 = vld1q_f16(input0 + index);
1158 uint8x8_t vout = vmovn_u16(vcltq_f16(vin0, vin1_opt));
1159 vst1_u8(output + index, vout);
1160 }
1161 #endif
1162 for (; index < element_size; index++) {
1163 output[index] = input0[index] < input1[0];
1164 }
1165 }
1166 return NNACL_OK;
1167 }
1168
ElementLessEqualFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size)1169 int ElementLessEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size) {
1170 int index = 0;
1171 #ifdef ENABLE_NEON
1172 for (; index <= element_size - 8; index += C8NUM) {
1173 float16x8_t vin0 = vld1q_f16(input0 + index);
1174 float16x8_t vin1 = vld1q_f16(input1 + index);
1175 uint8x8_t vout = vmovn_u16(vcleq_f16(vin0, vin1));
1176 vst1_u8(output + index, vout);
1177 }
1178 #endif
1179 for (; index < element_size; index++) {
1180 output[index] = input0[index] <= input1[index];
1181 }
1182 return NNACL_OK;
1183 }
1184
ElementOptLessEqualFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size,bool first_scalar)1185 int ElementOptLessEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
1186 bool first_scalar) {
1187 #ifdef ENABLE_NEON
1188 float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
1189 float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
1190 #endif
1191 int index = 0;
1192 if (first_scalar) {
1193 #ifdef ENABLE_NEON
1194 for (; index <= element_size - 8; index += C8NUM) {
1195 float16x8_t vin1 = vld1q_f16(input1 + index);
1196 uint8x8_t vout = vmovn_u16(vcleq_f16(vin0_opt, vin1));
1197 vst1_u8(output + index, vout);
1198 }
1199 #endif
1200 for (; index < element_size; index++) {
1201 output[index] = input0[0] <= input1[index];
1202 }
1203 } else {
1204 #ifdef ENABLE_NEON
1205 for (; index <= element_size - 8; index += C8NUM) {
1206 float16x8_t vin0 = vld1q_f16(input0 + index);
1207 uint8x8_t vout = vmovn_u16(vcleq_f16(vin0, vin1_opt));
1208 vst1_u8(output + index, vout);
1209 }
1210 #endif
1211 for (; index < element_size; index++) {
1212 output[index] = input0[index] <= input1[0];
1213 }
1214 }
1215 return NNACL_OK;
1216 }
1217
ElementGreaterFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size)1218 int ElementGreaterFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size) {
1219 int index = 0;
1220 #ifdef ENABLE_NEON
1221 for (; index <= element_size - 8; index += C8NUM) {
1222 float16x8_t vin0 = vld1q_f16(input0 + index);
1223 float16x8_t vin1 = vld1q_f16(input1 + index);
1224 uint8x8_t vout = vmovn_u16(vcgtq_f16(vin0, vin1));
1225 vst1_u8(output + index, vout);
1226 }
1227 #endif
1228 for (; index < element_size; index++) {
1229 output[index] = input0[index] > input1[index];
1230 }
1231 return NNACL_OK;
1232 }
1233
ElementOptGreaterFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size,bool first_scalar)1234 int ElementOptGreaterFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
1235 bool first_scalar) {
1236 #ifdef ENABLE_NEON
1237 float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
1238 float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
1239 #endif
1240 int index = 0;
1241 if (first_scalar) {
1242 #ifdef ENABLE_NEON
1243 for (; index <= element_size - 8; index += C8NUM) {
1244 float16x8_t vin1 = vld1q_f16(input1 + index);
1245 uint8x8_t vout = vmovn_u16(vcgtq_f16(vin0_opt, vin1));
1246 vst1_u8(output + index, vout);
1247 }
1248 #endif
1249 for (; index < element_size; index++) {
1250 output[index] = input0[0] > input1[index];
1251 }
1252 } else {
1253 #ifdef ENABLE_NEON
1254 for (; index <= element_size - 8; index += C8NUM) {
1255 float16x8_t vin0 = vld1q_f16(input0 + index);
1256 uint8x8_t vout = vmovn_u16(vcgtq_f16(vin0, vin1_opt));
1257 vst1_u8(output + index, vout);
1258 }
1259 #endif
1260 for (; index < element_size; index++) {
1261 output[index] = input0[index] > input1[0];
1262 }
1263 }
1264 return NNACL_OK;
1265 }
1266
ElementGreaterEqualFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size)1267 int ElementGreaterEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size) {
1268 int index = 0;
1269 #ifdef ENABLE_NEON
1270 for (; index <= element_size - 8; index += C8NUM) {
1271 float16x8_t vin0 = vld1q_f16(input0 + index);
1272 float16x8_t vin1 = vld1q_f16(input1 + index);
1273 uint8x8_t vout = vmovn_u16(vcgeq_f16(vin0, vin1));
1274 vst1_u8(output + index, vout);
1275 }
1276 #endif
1277 for (; index < element_size; index++) {
1278 output[index] = input0[index] >= input1[index];
1279 }
1280 return NNACL_OK;
1281 }
1282
ElementOptGreaterEqualFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size,bool first_scalar)1283 int ElementOptGreaterEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
1284 bool first_scalar) {
1285 #ifdef ENABLE_NEON
1286 float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
1287 float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
1288 #endif
1289 int index = 0;
1290 if (first_scalar) {
1291 #ifdef ENABLE_NEON
1292 for (; index <= element_size - 8; index += C8NUM) {
1293 float16x8_t vin1 = vld1q_f16(input1 + index);
1294 uint8x8_t vout = vmovn_u16(vcgeq_f16(vin0_opt, vin1));
1295 vst1_u8(output + index, vout);
1296 }
1297 #endif
1298 for (; index < element_size; index++) {
1299 output[index] = input0[0] >= input1[index];
1300 }
1301 } else {
1302 #ifdef ENABLE_NEON
1303 for (; index <= element_size - 8; index += C8NUM) {
1304 float16x8_t vin0 = vld1q_f16(input0 + index);
1305 uint8x8_t vout = vmovn_u16(vcgeq_f16(vin0, vin1_opt));
1306 vst1_u8(output + index, vout);
1307 }
1308 #endif
1309 for (; index < element_size; index++) {
1310 output[index] = input0[index] >= input1[0];
1311 }
1312 }
1313 return NNACL_OK;
1314 }
1315