• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020-2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "nnacl/fp16/arithmetic_fp16.h"
18 #include <math.h>
19 #include "nnacl/common_func.h"
20 #include "nnacl/nnacl_utils.h"
21 
// Broadcast add: first expands both operands into tile_in0 / tile_in1 through TileDimensionsFp16,
// then sums the two tiled buffers element-wise into `out` over `size` elements.
// The return value is the one produced by ElementAddFp16.
int BroadcastAddFp16(const float16_t *in0, const float16_t *in1, float16_t *tile_in0, float16_t *tile_in1,
                     float16_t *out, int size, ArithmeticParameter *param) {
  TileDimensionsFp16(in0, in1, tile_in0, tile_in1, param);
  const int status = ElementAddFp16(tile_in0, tile_in1, out, size);
  return status;
}
27 
TileOneDimensionFp16(const void * input,void * output,int dim,size_t ndim,const int * inShape,const int * inStrides,const int * outStrides,const int * multiple)28 void TileOneDimensionFp16(const void *input, void *output, int dim, size_t ndim, const int *inShape,
29                           const int *inStrides, const int *outStrides, const int *multiple) {
30   const float16_t *inData = (const float16_t *)input;
31   float16_t *outData = (float16_t *)output;
32 
33   int srcDimSize = inShape[dim];
34   if (dim == ndim - 1) {
35     for (int i = 0; i < multiple[dim]; i++) {
36       memcpy(outData, inData, srcDimSize * sizeof(float16_t));
37       outData += srcDimSize;
38     }
39     return;
40   }
41   for (size_t i = 0; i < srcDimSize; i++) {
42     for (size_t j = 0; j < multiple[dim]; j++) {
43       TileOneDimensionFp16(inData + inStrides[dim] * i, outData + outStrides[dim] * (i + j * srcDimSize), dim + 1, ndim,
44                            inShape, inStrides, outStrides, multiple);
45     }
46   }
47 }
48 
// Tiles both operands starting at dimension 0.
// CalcMultiplesAndStrides(param) is invoked first; the shapes, strides and multiples stored in `param`
// are then handed to TileOneDimensionFp16 once per operand, both sharing param->out_strides_.
void TileDimensionsFp16(const float16_t *data0, const float16_t *data1, float16_t *tile_data0, float16_t *tile_data1,
                        ArithmeticParameter *param) {
  CalcMultiplesAndStrides(param);

  // First operand.
  TileOneDimensionFp16(data0, tile_data0, 0, param->ndim_, param->in_shape0_, param->in_strides0_,
                       param->out_strides_, param->multiples0_);

  // Second operand.
  TileOneDimensionFp16(data1, tile_data1, 0, param->ndim_, param->in_shape1_, param->in_strides1_,
                       param->out_strides_, param->multiples1_);
}
57 
// Element-wise product: output[i] = input0[i] * input1[i] for every i in [0, element_size).
// With ENABLE_NEON the bulk is handled through float16x8_t vectors; whatever is left over is
// processed by the scalar loop. Always returns NNACL_OK.
int ElementMulFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  const int last_vec_start = element_size - 8;
  while (i <= last_vec_start) {
    vst1q_f16(output + i, vmulq_f16(vld1q_f16(input0 + i), vld1q_f16(input1 + i)));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    output[i] = input0[i] * input1[i];
    ++i;
  }
  return NNACL_OK;
}
73 
// Multiply with one scalar operand.
// first_scalar == true:  output[i] = input0[0] * input1[i]
// first_scalar == false: output[i] = input0[i] * input1[0]
// Always returns NNACL_OK.
int ElementOptMulFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                      bool first_scalar) {
#ifdef ENABLE_NEON
  // Both broadcast vectors are prepared up front; each call uses only one of them.
  const float16x8_t lhs_bcast = vdupq_n_f16(input0[0]);
  const float16x8_t rhs_bcast = vdupq_n_f16(input1[0]);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    while (i <= element_size - 8) {
      vst1q_f16(output + i, vmulq_f16(lhs_bcast, vld1q_f16(input1 + i)));
      i += C8NUM;
    }
#endif
    while (i < element_size) {
      output[i] = input0[0] * input1[i];
      ++i;
    }
    return NNACL_OK;
  }

#ifdef ENABLE_NEON
  while (i <= element_size - 8) {
    vst1q_f16(output + i, vmulq_f16(vld1q_f16(input0 + i), rhs_bcast));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    output[i] = input0[i] * input1[0];
    ++i;
  }
  return NNACL_OK;
}
106 
// Element-wise product followed by ReLU: output[i] = max(input0[i] * input1[i], 0).
// Vectorised in float16x8_t chunks under ENABLE_NEON, with a scalar loop for the remainder.
// Always returns NNACL_OK.
int ElementMulReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  const float16x8_t zero_vec = vdupq_n_f16(0.0);
  while (i <= element_size - 8) {
    const float16x8_t prod = vmulq_f16(vld1q_f16(input0 + i), vld1q_f16(input1 + i));
    vst1q_f16(output + i, vmaxq_f16(prod, zero_vec));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    const float16_t prod = input0[i] * input1[i];
    if (prod > 0) {
      output[i] = prod;
    } else {
      output[i] = 0;
    }
    ++i;
  }
  return NNACL_OK;
}
127 
// Multiply with one scalar operand, then ReLU.
// first_scalar == true:  output[i] = max(input0[0] * input1[i], 0)
// first_scalar == false: output[i] = max(input0[i] * input1[0], 0)
// Always returns NNACL_OK.
int ElementOptMulReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                          bool first_scalar) {
#ifdef ENABLE_NEON
  const float16x8_t lhs_bcast = vdupq_n_f16(input0[0]);
  const float16x8_t rhs_bcast = vdupq_n_f16(input1[0]);
  const float16x8_t zero_vec = vdupq_n_f16(0.0);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    while (i <= element_size - 8) {
      const float16x8_t prod = vmulq_f16(lhs_bcast, vld1q_f16(input1 + i));
      vst1q_f16(output + i, vmaxq_f16(prod, zero_vec));
      i += C8NUM;
    }
#endif
    while (i < element_size) {
      const float16_t prod = input0[0] * input1[i];
      if (prod > 0) {
        output[i] = prod;
      } else {
        output[i] = 0;
      }
      ++i;
    }
    return NNACL_OK;
  }

#ifdef ENABLE_NEON
  while (i <= element_size - 8) {
    const float16x8_t prod = vmulq_f16(vld1q_f16(input0 + i), rhs_bcast);
    vst1q_f16(output + i, vmaxq_f16(prod, zero_vec));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    const float16_t prod = input0[i] * input1[0];
    if (prod > 0) {
      output[i] = prod;
    } else {
      output[i] = 0;
    }
    ++i;
  }
  return NNACL_OK;
}
165 
// Element-wise product clamped to [0, 6]: output[i] = min(max(input0[i] * input1[i], 0), 6).
// Vectorised in float16x8_t chunks under ENABLE_NEON, with a scalar loop for the remainder.
// Always returns NNACL_OK.
int ElementMulRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  const float16x8_t zero_vec = vdupq_n_f16(0.0);
  const float16x8_t six_vec = vdupq_n_f16(6.0);
  while (i <= element_size - 8) {
    const float16x8_t prod = vmulq_f16(vld1q_f16(input0 + i), vld1q_f16(input1 + i));
    vst1q_f16(output + i, vminq_f16(vmaxq_f16(prod, zero_vec), six_vec));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    output[i] = MSMIN(MSMAX(input0[i] * input1[i], 0), 6);
    ++i;
  }
  return NNACL_OK;
}
184 
// Multiply with one scalar operand, then clamp to [0, 6].
// first_scalar == true:  output[i] = min(max(input0[0] * input1[i], 0), 6)
// first_scalar == false: output[i] = min(max(input0[i] * input1[0], 0), 6)
// Always returns NNACL_OK.
int ElementOptMulRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                           bool first_scalar) {
#ifdef ENABLE_NEON
  const float16x8_t lhs_bcast = vdupq_n_f16(input0[0]);
  const float16x8_t rhs_bcast = vdupq_n_f16(input1[0]);
  const float16x8_t zero_vec = vdupq_n_f16(0.0);
  const float16x8_t six_vec = vdupq_n_f16(6.0);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    while (i <= element_size - 8) {
      const float16x8_t prod = vmulq_f16(lhs_bcast, vld1q_f16(input1 + i));
      vst1q_f16(output + i, vminq_f16(vmaxq_f16(prod, zero_vec), six_vec));
      i += C8NUM;
    }
#endif
    while (i < element_size) {
      output[i] = MSMIN(MSMAX(input0[0] * input1[i], 0), 6);
      ++i;
    }
    return NNACL_OK;
  }

#ifdef ENABLE_NEON
  while (i <= element_size - 8) {
    const float16x8_t prod = vmulq_f16(vld1q_f16(input0 + i), rhs_bcast);
    vst1q_f16(output + i, vminq_f16(vmaxq_f16(prod, zero_vec), six_vec));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    output[i] = MSMIN(MSMAX(input0[i] * input1[0], 0), 6);
    ++i;
  }
  return NNACL_OK;
}
221 
// Element-wise sum: output[i] = input0[i] + input1[i] for every i in [0, element_size).
// Under ENABLE_NEON the data is consumed first through float16x8_t vectors, then through
// float16x4_t vectors, and finally one element at a time. Always returns NNACL_OK.
int ElementAddFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  while (i <= element_size - 8) {
    vst1q_f16(output + i, vaddq_f16(vld1q_f16(input0 + i), vld1q_f16(input1 + i)));
    i += C8NUM;
  }
  while (i <= element_size - 4) {
    vst1_f16(output + i, vadd_f16(vld1_f16(input0 + i), vld1_f16(input1 + i)));
    i += C4NUM;
  }
#endif
  while (i < element_size) {
    output[i] = input0[i] + input1[i];
    ++i;
  }
  return NNACL_OK;
}
243 
// Add with one scalar operand.
// first_scalar == true:  output[i] = input0[0] + input1[i]
// first_scalar == false: output[i] = input0[i] + input1[0]
// Always returns NNACL_OK.
int ElementOptAddFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                      bool first_scalar) {
#ifdef ENABLE_NEON
  const float16x8_t lhs_bcast = vdupq_n_f16(input0[0]);
  const float16x8_t rhs_bcast = vdupq_n_f16(input1[0]);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    while (i <= element_size - 8) {
      vst1q_f16(output + i, vaddq_f16(lhs_bcast, vld1q_f16(input1 + i)));
      i += C8NUM;
    }
#endif
    while (i < element_size) {
      output[i] = input0[0] + input1[i];
      ++i;
    }
    return NNACL_OK;
  }

#ifdef ENABLE_NEON
  while (i <= element_size - 8) {
    vst1q_f16(output + i, vaddq_f16(vld1q_f16(input0 + i), rhs_bcast));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    output[i] = input0[i] + input1[0];
    ++i;
  }
  return NNACL_OK;
}
276 
// Element-wise sum followed by ReLU: output[i] = max(input0[i] + input1[i], 0).
// Under ENABLE_NEON the data is consumed through float16x8_t vectors, then float16x4_t vectors,
// with a scalar loop for the remainder. Always returns NNACL_OK.
int ElementAddReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  const float16x8_t zero_vec8 = vdupq_n_f16(0.0);
  while (i <= element_size - 8) {
    const float16x8_t sum = vaddq_f16(vld1q_f16(input0 + i), vld1q_f16(input1 + i));
    vst1q_f16(output + i, vmaxq_f16(sum, zero_vec8));
    i += C8NUM;
  }
  const float16x4_t zero_vec4 = vdup_n_f16(0.0f);
  while (i <= element_size - 4) {
    const float16x4_t sum = vadd_f16(vld1_f16(input0 + i), vld1_f16(input1 + i));
    vst1_f16(output + i, vmax_f16(sum, zero_vec4));
    i += C4NUM;
  }
#endif
  while (i < element_size) {
    const float16_t sum = input0[i] + input1[i];
    if (sum > 0) {
      output[i] = sum;
    } else {
      output[i] = 0;
    }
    ++i;
  }
  return NNACL_OK;
}
303 
// Add with one scalar operand, then ReLU.
// first_scalar == true:  output[i] = max(input0[0] + input1[i], 0)
// first_scalar == false: output[i] = max(input0[i] + input1[0], 0)
// Always returns NNACL_OK.
int ElementOptAddReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                          bool first_scalar) {
#ifdef ENABLE_NEON
  const float16x8_t lhs_bcast = vdupq_n_f16(input0[0]);
  const float16x8_t rhs_bcast = vdupq_n_f16(input1[0]);
  const float16x8_t zero_vec = vdupq_n_f16(0.0);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    while (i <= element_size - 8) {
      const float16x8_t sum = vaddq_f16(lhs_bcast, vld1q_f16(input1 + i));
      vst1q_f16(output + i, vmaxq_f16(sum, zero_vec));
      i += C8NUM;
    }
#endif
    while (i < element_size) {
      const float16_t sum = input0[0] + input1[i];
      if (sum > 0) {
        output[i] = sum;
      } else {
        output[i] = 0;
      }
      ++i;
    }
    return NNACL_OK;
  }

#ifdef ENABLE_NEON
  while (i <= element_size - 8) {
    const float16x8_t sum = vaddq_f16(vld1q_f16(input0 + i), rhs_bcast);
    vst1q_f16(output + i, vmaxq_f16(sum, zero_vec));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    const float16_t sum = input0[i] + input1[0];
    if (sum > 0) {
      output[i] = sum;
    } else {
      output[i] = 0;
    }
    ++i;
  }
  return NNACL_OK;
}
341 
// Element-wise sum clamped to [0, 6]: output[i] = min(max(input0[i] + input1[i], 0), 6).
// Under ENABLE_NEON the data is consumed through float16x8_t vectors, then float16x4_t vectors,
// with a scalar loop for the remainder. Always returns NNACL_OK.
int ElementAddRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  const float16x8_t zero_vec8 = vdupq_n_f16(0.0);
  const float16x8_t six_vec8 = vdupq_n_f16(6.0);
  while (i <= element_size - 8) {
    const float16x8_t sum = vaddq_f16(vld1q_f16(input0 + i), vld1q_f16(input1 + i));
    vst1q_f16(output + i, vminq_f16(vmaxq_f16(sum, zero_vec8), six_vec8));
    i += C8NUM;
  }
  const float16x4_t zero_vec4 = vdup_n_f16(0.0);
  const float16x4_t six_vec4 = vdup_n_f16(6.0);
  while (i <= element_size - 4) {
    const float16x4_t sum = vadd_f16(vld1_f16(input0 + i), vld1_f16(input1 + i));
    vst1_f16(output + i, vmin_f16(vmax_f16(sum, zero_vec4), six_vec4));
    i += C4NUM;
  }
#endif
  while (i < element_size) {
    output[i] = MSMIN(MSMAX(input0[i] + input1[i], 0), 6);
    ++i;
  }
  return NNACL_OK;
}
369 
// Add with one scalar operand, then clamp to [0, 6].
// first_scalar == true:  output[i] = min(max(input0[0] + input1[i], 0), 6)
// first_scalar == false: output[i] = min(max(input0[i] + input1[0], 0), 6)
// Always returns NNACL_OK.
int ElementOptAddRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                           bool first_scalar) {
#ifdef ENABLE_NEON
  const float16x8_t lhs_bcast = vdupq_n_f16(input0[0]);
  const float16x8_t rhs_bcast = vdupq_n_f16(input1[0]);
  const float16x8_t zero_vec = vdupq_n_f16(0.0);
  const float16x8_t six_vec = vdupq_n_f16(6.0);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    while (i <= element_size - 8) {
      const float16x8_t sum = vaddq_f16(lhs_bcast, vld1q_f16(input1 + i));
      vst1q_f16(output + i, vminq_f16(vmaxq_f16(sum, zero_vec), six_vec));
      i += C8NUM;
    }
#endif
    while (i < element_size) {
      output[i] = MSMIN(MSMAX(input0[0] + input1[i], 0), 6);
      ++i;
    }
    return NNACL_OK;
  }

#ifdef ENABLE_NEON
  while (i <= element_size - 8) {
    const float16x8_t sum = vaddq_f16(vld1q_f16(input0 + i), rhs_bcast);
    vst1q_f16(output + i, vminq_f16(vmaxq_f16(sum, zero_vec), six_vec));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    output[i] = MSMIN(MSMAX(input0[i] + input1[0], 0), 6);
    ++i;
  }
  return NNACL_OK;
}
406 
// Element-wise difference: output[i] = input0[i] - input1[i] for every i in [0, element_size).
// Vectorised in float16x8_t chunks under ENABLE_NEON, with a scalar loop for the remainder.
// Always returns NNACL_OK.
int ElementSubFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  while (i <= element_size - 8) {
    vst1q_f16(output + i, vsubq_f16(vld1q_f16(input0 + i), vld1q_f16(input1 + i)));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    output[i] = input0[i] - input1[i];
    ++i;
  }
  return NNACL_OK;
}
422 
// Subtract with one scalar operand.
// first_scalar == true:  output[i] = input0[0] - input1[i]
// first_scalar == false: output[i] = input0[i] - input1[0]
// Always returns NNACL_OK.
int ElementOptSubFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                      bool first_scalar) {
#ifdef ENABLE_NEON
  const float16x8_t lhs_bcast = vdupq_n_f16(input0[0]);
  const float16x8_t rhs_bcast = vdupq_n_f16(input1[0]);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    while (i <= element_size - 8) {
      vst1q_f16(output + i, vsubq_f16(lhs_bcast, vld1q_f16(input1 + i)));
      i += C8NUM;
    }
#endif
    while (i < element_size) {
      output[i] = input0[0] - input1[i];
      ++i;
    }
    return NNACL_OK;
  }

#ifdef ENABLE_NEON
  while (i <= element_size - 8) {
    vst1q_f16(output + i, vsubq_f16(vld1q_f16(input0 + i), rhs_bcast));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    output[i] = input0[i] - input1[0];
    ++i;
  }
  return NNACL_OK;
}
455 
// Element-wise difference followed by ReLU: output[i] = max(input0[i] - input1[i], 0).
// Vectorised in float16x8_t chunks under ENABLE_NEON, with a scalar loop for the remainder.
// Always returns NNACL_OK.
int ElementSubReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  const float16x8_t zero_vec = vdupq_n_f16(0.0);
  while (i <= element_size - 8) {
    const float16x8_t diff = vsubq_f16(vld1q_f16(input0 + i), vld1q_f16(input1 + i));
    vst1q_f16(output + i, vmaxq_f16(diff, zero_vec));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    const float16_t diff = input0[i] - input1[i];
    if (diff > 0) {
      output[i] = diff;
    } else {
      output[i] = 0;
    }
    ++i;
  }
  return NNACL_OK;
}
474 
// Subtract with one scalar operand, then ReLU.
// first_scalar == true:  output[i] = max(input0[0] - input1[i], 0)
// first_scalar == false: output[i] = max(input0[i] - input1[0], 0)
// Always returns NNACL_OK.
int ElementOptSubReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                          bool first_scalar) {
#ifdef ENABLE_NEON
  const float16x8_t lhs_bcast = vdupq_n_f16(input0[0]);
  const float16x8_t rhs_bcast = vdupq_n_f16(input1[0]);
  const float16x8_t zero_vec = vdupq_n_f16(0.0);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    while (i <= element_size - 8) {
      const float16x8_t diff = vsubq_f16(lhs_bcast, vld1q_f16(input1 + i));
      vst1q_f16(output + i, vmaxq_f16(diff, zero_vec));
      i += C8NUM;
    }
#endif
    while (i < element_size) {
      const float16_t diff = input0[0] - input1[i];
      if (diff > 0) {
        output[i] = diff;
      } else {
        output[i] = 0;
      }
      ++i;
    }
    return NNACL_OK;
  }

#ifdef ENABLE_NEON
  while (i <= element_size - 8) {
    const float16x8_t diff = vsubq_f16(vld1q_f16(input0 + i), rhs_bcast);
    vst1q_f16(output + i, vmaxq_f16(diff, zero_vec));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    const float16_t diff = input0[i] - input1[0];
    if (diff > 0) {
      output[i] = diff;
    } else {
      output[i] = 0;
    }
    ++i;
  }
  return NNACL_OK;
}
512 
// Element-wise difference clamped to [0, 6]: output[i] = min(max(input0[i] - input1[i], 0), 6).
// Vectorised in float16x8_t chunks under ENABLE_NEON, with a scalar loop for the remainder.
// Always returns NNACL_OK.
int ElementSubRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  const float16x8_t zero_vec = vdupq_n_f16(0.0);
  const float16x8_t six_vec = vdupq_n_f16(6.0);
  while (i <= element_size - 8) {
    const float16x8_t diff = vsubq_f16(vld1q_f16(input0 + i), vld1q_f16(input1 + i));
    vst1q_f16(output + i, vminq_f16(vmaxq_f16(diff, zero_vec), six_vec));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    output[i] = MSMIN(MSMAX(input0[i] - input1[i], 0), 6);
    ++i;
  }
  return NNACL_OK;
}
531 
// Subtract with one scalar operand, then clamp to [0, 6].
// first_scalar == true:  output[i] = min(max(input0[0] - input1[i], 0), 6)
// first_scalar == false: output[i] = min(max(input0[i] - input1[0], 0), 6)
// Always returns NNACL_OK.
int ElementOptSubRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                           bool first_scalar) {
#ifdef ENABLE_NEON
  const float16x8_t lhs_bcast = vdupq_n_f16(input0[0]);
  const float16x8_t rhs_bcast = vdupq_n_f16(input1[0]);
  const float16x8_t zero_vec = vdupq_n_f16(0.0);
  const float16x8_t six_vec = vdupq_n_f16(6.0);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    while (i <= element_size - 8) {
      const float16x8_t diff = vsubq_f16(lhs_bcast, vld1q_f16(input1 + i));
      vst1q_f16(output + i, vminq_f16(vmaxq_f16(diff, zero_vec), six_vec));
      i += C8NUM;
    }
#endif
    while (i < element_size) {
      output[i] = MSMIN(MSMAX(input0[0] - input1[i], 0), 6);
      ++i;
    }
    return NNACL_OK;
  }

#ifdef ENABLE_NEON
  while (i <= element_size - 8) {
    const float16x8_t diff = vsubq_f16(vld1q_f16(input0 + i), rhs_bcast);
    vst1q_f16(output + i, vminq_f16(vmaxq_f16(diff, zero_vec), six_vec));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    output[i] = MSMIN(MSMAX(input0[i] - input1[0], 0), 6);
    ++i;
  }
  return NNACL_OK;
}
568 
// Element-wise quotient: output[i] = input0[i] / input1[i] for every i in [0, element_size).
// Vectorised through MS_DIVQ_F16 on float16x8_t chunks under ENABLE_NEON, with a scalar loop for
// the remainder. No zero-divisor check is performed. Always returns NNACL_OK.
int ElementDivFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  while (i <= element_size - 8) {
    const float16x8_t numer = vld1q_f16(input0 + i);
    const float16x8_t denom = vld1q_f16(input1 + i);
    const float16x8_t quot = MS_DIVQ_F16(numer, denom);
    vst1q_f16(output + i, quot);
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    output[i] = input0[i] / input1[i];
    ++i;
  }
  return NNACL_OK;
}
584 
// Divide with one scalar operand.
// first_scalar == true:  output[i] = input0[0] / input1[i]   (divisors are not checked)
// first_scalar == false: output[i] = input0[i] / input1[0]   (a zero input1[0] is rejected up front)
// Returns NNACL_ERRCODE_DIVISOR_ZERO when first_scalar is false and input1[0] == 0, NNACL_OK otherwise.
int ElementOptDivFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                      bool first_scalar) {
#ifdef ENABLE_NEON
  const float16x8_t numer_bcast = vdupq_n_f16(input0[0]);
  const float16x8_t denom_bcast = vdupq_n_f16(input1[0]);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    while (i <= element_size - 8) {
      const float16x8_t denom = vld1q_f16(input1 + i);
      const float16x8_t quot = MS_DIVQ_F16(numer_bcast, denom);
      vst1q_f16(output + i, quot);
      i += C8NUM;
    }
#endif
    while (i < element_size) {
      output[i] = input0[0] / input1[i];
      ++i;
    }
    return NNACL_OK;
  }

  if (input1[0] == 0) {
    return NNACL_ERRCODE_DIVISOR_ZERO;
  }
#ifdef ENABLE_NEON
  while (i <= element_size - 8) {
    const float16x8_t numer = vld1q_f16(input0 + i);
    const float16x8_t quot = MS_DIVQ_F16(numer, denom_bcast);
    vst1q_f16(output + i, quot);
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    output[i] = input0[i] / input1[0];
    ++i;
  }
  return NNACL_OK;
}
620 
// Element-wise quotient followed by ReLU: output[i] = max(input0[i] / input1[i], 0).
// The scalar remainder loop stops with NNACL_ERRCODE_DIVISOR_ZERO as soon as it meets a zero divisor;
// the ENABLE_NEON vector loop performs no such check. Returns NNACL_OK otherwise.
int ElementDivReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  const float16x8_t zero_vec = vdupq_n_f16(0.0);
  while (i <= element_size - 8) {
    const float16x8_t numer = vld1q_f16(input0 + i);
    const float16x8_t denom = vld1q_f16(input1 + i);
    const float16x8_t quot = MS_DIVQ_F16(numer, denom);
    vst1q_f16(output + i, vmaxq_f16(quot, zero_vec));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    if (input1[i] == 0) {
      return NNACL_ERRCODE_DIVISOR_ZERO;
    }
    const float16_t quot = input0[i] / input1[i];
    if (quot > 0) {
      output[i] = quot;
    } else {
      output[i] = 0;
    }
    ++i;
  }
  return NNACL_OK;
}
642 
// Divide with one scalar operand, then ReLU.
// first_scalar == true:  output[i] = max(input0[0] / input1[i], 0); the scalar remainder loop returns
//                        NNACL_ERRCODE_DIVISOR_ZERO on a zero input1[i] (the vector loop does not check).
// first_scalar == false: output[i] = max(input0[i] / input1[0], 0); a zero input1[0] is rejected up front
//                        with NNACL_ERRCODE_DIVISOR_ZERO.
// Returns NNACL_OK otherwise.
int ElementOptDivReluFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                          bool first_scalar) {
#ifdef ENABLE_NEON
  const float16x8_t numer_bcast = vdupq_n_f16(input0[0]);
  const float16x8_t denom_bcast = vdupq_n_f16(input1[0]);
  const float16x8_t zero_vec = vdupq_n_f16(0.0);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    while (i <= element_size - 8) {
      const float16x8_t denom = vld1q_f16(input1 + i);
      const float16x8_t clipped = vmaxq_f16(MS_DIVQ_F16(numer_bcast, denom), zero_vec);
      vst1q_f16(output + i, clipped);
      i += C8NUM;
    }
#endif
    while (i < element_size) {
      if (input1[i] == 0) {
        return NNACL_ERRCODE_DIVISOR_ZERO;
      }
      output[i] = MSMAX(input0[0] / input1[i], 0);
      ++i;
    }
    return NNACL_OK;
  }

  if (input1[0] == 0) {
    return NNACL_ERRCODE_DIVISOR_ZERO;
  }
#ifdef ENABLE_NEON
  while (i <= element_size - 8) {
    const float16x8_t numer = vld1q_f16(input0 + i);
    const float16x8_t clipped = vmaxq_f16(MS_DIVQ_F16(numer, denom_bcast), zero_vec);
    vst1q_f16(output + i, clipped);
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    output[i] = MSMAX(input0[i] / input1[0], 0);
    ++i;
  }
  return NNACL_OK;
}
682 
// Element-wise quotient clamped to [0, 6]: output[i] = min(max(input0[i] / input1[i], 0), 6).
// The scalar remainder loop stops with NNACL_ERRCODE_DIVISOR_ZERO as soon as it meets a zero divisor;
// the ENABLE_NEON vector loop performs no such check. Returns NNACL_OK otherwise.
int ElementDivRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  const float16x8_t zero_vec = vdupq_n_f16(0.0);
  const float16x8_t six_vec = vdupq_n_f16(6.0);
  while (i <= element_size - 8) {
    const float16x8_t numer = vld1q_f16(input0 + i);
    const float16x8_t denom = vld1q_f16(input1 + i);
    const float16x8_t quot = MS_DIVQ_F16(numer, denom);
    vst1q_f16(output + i, vminq_f16(vmaxq_f16(quot, zero_vec), six_vec));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    if (input1[i] == 0) {
      return NNACL_ERRCODE_DIVISOR_ZERO;
    }
    output[i] = MSMIN(MSMAX(input0[i] / input1[i], 0), 6);
    ++i;
  }
  return NNACL_OK;
}
704 
// Divide with one scalar operand, then clamp to [0, 6].
// first_scalar == true:  output[i] = min(max(input0[0] / input1[i], 0), 6); the scalar remainder loop
//                        returns NNACL_ERRCODE_DIVISOR_ZERO on a zero input1[i] (the vector loop does not check).
// first_scalar == false: output[i] = min(max(input0[i] / input1[0], 0), 6); a zero input1[0] is rejected
//                        up front with NNACL_ERRCODE_DIVISOR_ZERO.
// Returns NNACL_OK otherwise.
int ElementOptDivRelu6Fp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                           bool first_scalar) {
#ifdef ENABLE_NEON
  const float16x8_t numer_bcast = vdupq_n_f16(input0[0]);
  const float16x8_t denom_bcast = vdupq_n_f16(input1[0]);
  const float16x8_t zero_vec = vdupq_n_f16(0.0);
  const float16x8_t six_vec = vdupq_n_f16(6.0);
#endif
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    while (i <= element_size - 8) {
      const float16x8_t denom = vld1q_f16(input1 + i);
      const float16x8_t clipped = vminq_f16(vmaxq_f16(MS_DIVQ_F16(numer_bcast, denom), zero_vec), six_vec);
      vst1q_f16(output + i, clipped);
      i += C8NUM;
    }
#endif
    while (i < element_size) {
      if (input1[i] == 0) {
        return NNACL_ERRCODE_DIVISOR_ZERO;
      }
      output[i] = MSMIN(MSMAX(input0[0] / input1[i], 0), 6);
      ++i;
    }
    return NNACL_OK;
  }

  if (input1[0] == 0) {
    return NNACL_ERRCODE_DIVISOR_ZERO;
  }
#ifdef ENABLE_NEON
  while (i <= element_size - 8) {
    const float16x8_t numer = vld1q_f16(input0 + i);
    const float16x8_t clipped = vminq_f16(vmaxq_f16(MS_DIVQ_F16(numer, denom_bcast), zero_vec), six_vec);
    vst1q_f16(output + i, clipped);
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    output[i] = MSMIN(MSMAX(input0[i] / input1[0], 0), 6);
    ++i;
  }
  return NNACL_OK;
}
745 
// Element-wise floor modulo: output[i] = input0[i] - floorf(input0[i] / input1[i]) * input1[i].
// Stops and returns NNACL_ERRCODE_DIVISOR_ZERO at the first zero divisor; elements before it have
// already been written. Returns NNACL_OK when every element was processed.
int ElementFloorModFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
  while (i < element_size) {
    const float16_t dividend = input0[i];
    const float16_t divisor = input1[i];
    if (divisor == 0) {
      return NNACL_ERRCODE_DIVISOR_ZERO;
    }
    output[i] = dividend - floorf(dividend / divisor) * divisor;
    ++i;
  }
  return NNACL_OK;
}
755 
// Floor modulo with one scalar operand.
// first_scalar == true:  output[i] = input0[0] - floorf(input0[0] / input1[i]) * input1[i]
// first_scalar == false: output[i] = input0[i] - floorf(input0[i] / input1[0]) * input1[0]
//
// The scalar-first branch reads only input0[0]; indexing input0 with the loop counter there would read
// past a one-element operand and compute the wrong dividend.
// No zero-divisor check is performed in this function. Always returns NNACL_OK.
int ElementOptFloorModFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                           bool first_scalar) {
  if (first_scalar) {
    for (int i = 0; i < element_size; ++i) {
      output[i] = input0[0] - floorf(input0[0] / input1[i]) * input1[i];
    }
  } else {
    for (int i = 0; i < element_size; ++i) {
      output[i] = input0[i] - floorf(input0[i] / input1[0]) * input1[0];
    }
  }
  return NNACL_OK;
}
769 
// Element-wise floor division on fp16 buffers: output[k] = floorf(input0[k] / input1[k])
// for every k in [0, element_size). Divisors are not checked against zero.
// Always returns NNACL_OK.
int ElementFloorDivFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int pos = 0;
  while (pos < element_size) {
    output[pos] = floorf(input0[pos] / input1[pos]);
    pos++;
  }
  return NNACL_OK;
}
// Floor division where one operand is a single broadcast value:
//   first_scalar == false: output[k] = floorf(input0[k] / input1[0])
//   first_scalar == true : output[k] = floorf(input0[0] / input1[k])
// The first_scalar branch reads only input0[0], matching the other ElementOpt*Fp16
// routines in this file, which treat input0 as the broadcast operand when the flag is set.
// Divisors are not checked against zero here. Always returns NNACL_OK.
int ElementOptFloorDivFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                           bool first_scalar) {
  if (!first_scalar) {
    for (int i = 0; i < element_size; ++i) {
      output[i] = floorf(input0[i] / input1[0]);
    }
  } else {
    for (int i = 0; i < element_size; ++i) {
      // input0 is the broadcast operand: index 0 only, never i.
      output[i] = floorf(input0[0] / input1[i]);
    }
  }
  return NNACL_OK;
}
789 
// Element-wise logical AND on fp16 buffers. output[k] is 1 when both input0[k] and
// input1[k] are non-zero and 0 otherwise, stored as float16_t.
// With ENABLE_NEON defined, full groups of eight elements are handled with vector code
// and the remaining elements by the scalar loop. Always returns NNACL_OK.
int ElementLogicalAndFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int index = 0;
#ifdef ENABLE_NEON
  float16x8_t vtrue = vdupq_n_f16(1);
  float16x8_t vfalse = vdupq_n_f16(0);
  // 0x7FFF: clears the top (sign) bit so that -0.0 compares as zero.
  uint16x8_t mask = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
  uint16x8_t zeros = vdupq_n_u16(0);
  for (; index <= element_size - 8; index += C8NUM) {
    uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input0 + index)), mask);
    uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input1 + index)), mask);
    // Each operand is tested for zero on its own. AND-ing the two encodings instead would
    // report false for non-zero values whose bit patterns share no set bit
    // (for example 1.0 = 0x3C00 and 2.0 = 0x4000), unlike the scalar loop below.
    uint16x8_t any_zero = vorrq_u16(vceqq_u16(vin0, zeros), vceqq_u16(vin1, zeros));
    float16x8_t vout = vbslq_f16(any_zero, vfalse, vtrue);
    vst1q_f16(output + index, vout);
  }
#endif
  for (; index < element_size; index++) {
    output[index] = (float16_t)((bool)(input0[index]) & (bool)(input1[index]));
  }
  return NNACL_OK;
}
809 
// Logical AND where one operand is a single broadcast value:
//   first_scalar == true : output[k] = (input0[0] != 0) && (input1[k] != 0)
//   first_scalar == false: output[k] = (input0[k] != 0) && (input1[0] != 0)
// Results are stored as float16_t 1 or 0. With ENABLE_NEON defined, full groups of eight
// elements are handled with vector code and the rest by the scalar loops.
// Always returns NNACL_OK.
int ElementOptLogicalAndFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                             bool first_scalar) {
#ifdef ENABLE_NEON
  float16x8_t vtrue = vdupq_n_f16(1);
  float16x8_t vfalse = vdupq_n_f16(0);
  // 0x7FFF: clears the top (sign) bit so that -0.0 compares as zero.
  uint16x8_t mask = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
  uint16x8_t zeros = vdupq_n_u16(0);
  // Zero tests of the two broadcast candidates, evaluated once outside the loops.
  uint16x8_t vin0_opt_zero = vceqq_u16(vandq_u16(vreinterpretq_u16_f16(vdupq_n_f16(input0[0])), mask), zeros);
  uint16x8_t vin1_opt_zero = vceqq_u16(vandq_u16(vreinterpretq_u16_f16(vdupq_n_f16(input1[0])), mask), zeros);
#endif
  int index = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    for (; index <= element_size - 8; index += C8NUM) {
      uint16x8_t vin1 = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input1 + index)), mask);
      // False when either operand is zero. The operands are tested separately because
      // AND-ing their encodings misreports non-zero values with disjoint bit patterns
      // (for example 1.0 = 0x3C00 and 2.0 = 0x4000).
      uint16x8_t any_zero = vorrq_u16(vin0_opt_zero, vceqq_u16(vin1, zeros));
      float16x8_t vout = vbslq_f16(any_zero, vfalse, vtrue);
      vst1q_f16(output + index, vout);
    }
#endif
    for (; index < element_size; index++) {
      output[index] = (float16_t)((bool)(input0[0]) & (bool)(input1[index]));
    }
  } else {
#ifdef ENABLE_NEON
    for (; index <= element_size - 8; index += C8NUM) {
      uint16x8_t vin0 = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input0 + index)), mask);
      uint16x8_t any_zero = vorrq_u16(vceqq_u16(vin0, zeros), vin1_opt_zero);
      float16x8_t vout = vbslq_f16(any_zero, vfalse, vtrue);
      vst1q_f16(output + index, vout);
    }
#endif
    for (; index < element_size; index++) {
      output[index] = (float16_t)((bool)(input0[index]) & (bool)(input1[0]));
    }
  }
  return NNACL_OK;
}
850 
// Element-wise logical OR on fp16 buffers. output[k] is 1 when input0[k] or input1[k]
// is non-zero and 0 otherwise, stored as float16_t.
// With ENABLE_NEON defined, full groups of eight elements are handled with vector code
// and the remaining elements by the scalar loop. Always returns NNACL_OK.
int ElementLogicalOrFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  const float16x8_t one = vdupq_n_f16(1);
  const float16x8_t zero = vdupq_n_f16(0);
  // 0x7FFF: every bit except the top (sign) bit.
  const uint16x8_t magnitude_bits = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
  const uint16x8_t no_bits = vdupq_n_u16(0);
  const int simd_last = element_size - 8;  // last start offset that still has eight elements
  while (i <= simd_last) {
    const uint16x8_t lhs = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input0 + i)), magnitude_bits);
    const uint16x8_t rhs = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input1 + i)), magnitude_bits);
    // The OR of both magnitudes is zero only when both operands are zero.
    const uint16x8_t both_zero = vceqq_u16(vorrq_u16(lhs, rhs), no_bits);
    vst1q_f16(output + i, vbslq_f16(both_zero, zero, one));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    output[i] = (float16_t)((bool)(input0[i]) | (bool)(input1[i]));
    ++i;
  }
  return NNACL_OK;
}
870 
// Logical OR where one operand is a single broadcast value:
//   first_scalar == true : output[k] = (input0[0] != 0) || (input1[k] != 0)
//   first_scalar == false: output[k] = (input0[k] != 0) || (input1[0] != 0)
// Results are stored as float16_t 1 or 0. With ENABLE_NEON defined, full groups of eight
// elements are handled with vector code and the rest by the scalar loops.
// Always returns NNACL_OK.
int ElementOptLogicalOrFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                            bool first_scalar) {
  int i = 0;
#ifdef ENABLE_NEON
  const float16x8_t one = vdupq_n_f16(1);
  const float16x8_t zero = vdupq_n_f16(0);
  // 0x7FFF: every bit except the top (sign) bit.
  const uint16x8_t magnitude_bits = vmovq_n_u16(((uint16_t)(1u << 15) - 1));
  const uint16x8_t no_bits = vdupq_n_u16(0);
  const int simd_last = element_size - 8;  // last start offset that still has eight elements
#endif
  if (first_scalar) {
#ifdef ENABLE_NEON
    const uint16x8_t lhs = vandq_u16(vreinterpretq_u16_f16(vdupq_n_f16(input0[0])), magnitude_bits);
    for (; i <= simd_last; i += C8NUM) {
      const uint16x8_t rhs = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input1 + i)), magnitude_bits);
      const uint16x8_t both_zero = vceqq_u16(vorrq_u16(lhs, rhs), no_bits);
      vst1q_f16(output + i, vbslq_f16(both_zero, zero, one));
    }
#endif
    for (; i < element_size; ++i) {
      output[i] = (float16_t)((bool)(input0[0]) | (bool)(input1[i]));
    }
    return NNACL_OK;
  }
#ifdef ENABLE_NEON
  const uint16x8_t rhs = vandq_u16(vreinterpretq_u16_f16(vdupq_n_f16(input1[0])), magnitude_bits);
  for (; i <= simd_last; i += C8NUM) {
    const uint16x8_t lhs = vandq_u16(vreinterpretq_u16_f16(vld1q_f16(input0 + i)), magnitude_bits);
    const uint16x8_t both_zero = vceqq_u16(vorrq_u16(lhs, rhs), no_bits);
    vst1q_f16(output + i, vbslq_f16(both_zero, zero, one));
  }
#endif
  for (; i < element_size; ++i) {
    output[i] = (float16_t)((bool)(input0[i]) | (bool)(input1[0]));
  }
  return NNACL_OK;
}
911 
// Squared difference built from two existing kernels: ElementSubFp16 fills output from
// input0 and input1, then ElementMulFp16 multiplies output by itself in place.
// Only the status of the multiplication is returned; the subtraction's return value
// is ignored.
int ElementSquaredDifferenceFp16(const float16_t *input0, const float16_t *input1, float16_t *output,
                                 int element_size) {
  (void)ElementSubFp16(input0, input1, output, element_size);
  const int square_status = ElementMulFp16(output, output, output, element_size);
  return square_status;
}
917 
// Broadcast variant of the squared difference: ElementOptSubFp16 fills output (with
// first_scalar forwarded unchanged), then ElementMulFp16 multiplies output by itself
// in place. Only the status of the multiplication is returned; the subtraction's
// return value is ignored.
int ElementOptSquaredDifferenceFp16(const float16_t *input0, const float16_t *input1, float16_t *output,
                                    int element_size, bool first_scalar) {
  (void)ElementOptSubFp16(input0, input1, output, element_size, first_scalar);
  const int square_status = ElementMulFp16(output, output, output, element_size);
  return square_status;
}
923 
// Element-wise maximum of two fp16 buffers: output[k] receives the larger of input0[k]
// and input1[k] for every k in [0, element_size).
// With ENABLE_NEON defined, full groups of eight elements go through vmaxq_f16 and the
// remaining elements through MSMAX. Always returns NNACL_OK.
int ElementMaximumFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  const int simd_last = element_size - 8;  // last start offset that still has eight elements
  while (i <= simd_last) {
    const float16x8_t lhs = vld1q_f16(input0 + i);
    const float16x8_t rhs = vld1q_f16(input1 + i);
    vst1q_f16(output + i, vmaxq_f16(lhs, rhs));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    output[i] = MSMAX(input0[i], input1[i]);
    ++i;
  }
  return NNACL_OK;
}
939 
// Maximum where one operand is a single broadcast value:
//   first_scalar == true : output[k] = max(input0[0], input1[k])
//   first_scalar == false: output[k] = max(input0[k], input1[0])
// With ENABLE_NEON defined, full groups of eight elements go through vmaxq_f16 and the
// remaining elements through MSMAX. Always returns NNACL_OK.
int ElementOptMaximumFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                          bool first_scalar) {
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    const float16x8_t lhs = vdupq_n_f16(input0[0]);
    const int simd_last = element_size - 8;  // last start offset that still has eight elements
    for (; i <= simd_last; i += C8NUM) {
      vst1q_f16(output + i, vmaxq_f16(lhs, vld1q_f16(input1 + i)));
    }
#endif
    for (; i < element_size; ++i) {
      output[i] = MSMAX(input0[0], input1[i]);
    }
    return NNACL_OK;
  }
#ifdef ENABLE_NEON
  const float16x8_t rhs = vdupq_n_f16(input1[0]);
  const int simd_last = element_size - 8;  // last start offset that still has eight elements
  for (; i <= simd_last; i += C8NUM) {
    vst1q_f16(output + i, vmaxq_f16(vld1q_f16(input0 + i), rhs));
  }
#endif
  for (; i < element_size; ++i) {
    output[i] = MSMAX(input0[i], input1[0]);
  }
  return NNACL_OK;
}
972 
// Element-wise minimum of two fp16 buffers: output[k] receives the smaller of input0[k]
// and input1[k] for every k in [0, element_size).
// With ENABLE_NEON defined, full groups of eight elements go through vminq_f16 and the
// remaining elements through MSMIN. Always returns NNACL_OK.
int ElementMinimumFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size) {
  int i = 0;
#ifdef ENABLE_NEON
  const int simd_last = element_size - 8;  // last start offset that still has eight elements
  while (i <= simd_last) {
    const float16x8_t lhs = vld1q_f16(input0 + i);
    const float16x8_t rhs = vld1q_f16(input1 + i);
    vst1q_f16(output + i, vminq_f16(lhs, rhs));
    i += C8NUM;
  }
#endif
  while (i < element_size) {
    output[i] = MSMIN(input0[i], input1[i]);
    ++i;
  }
  return NNACL_OK;
}
988 
// Minimum where one operand is a single broadcast value:
//   first_scalar == true : output[k] = min(input0[0], input1[k])
//   first_scalar == false: output[k] = min(input0[k], input1[0])
// With ENABLE_NEON defined, full groups of eight elements go through vminq_f16 and the
// remaining elements through MSMIN. Always returns NNACL_OK.
int ElementOptMinimumFp16(const float16_t *input0, const float16_t *input1, float16_t *output, int element_size,
                          bool first_scalar) {
  int i = 0;
  if (first_scalar) {
#ifdef ENABLE_NEON
    const float16x8_t lhs = vdupq_n_f16(input0[0]);
    const int simd_last = element_size - 8;  // last start offset that still has eight elements
    for (; i <= simd_last; i += C8NUM) {
      vst1q_f16(output + i, vminq_f16(lhs, vld1q_f16(input1 + i)));
    }
#endif
    for (; i < element_size; ++i) {
      output[i] = MSMIN(input0[0], input1[i]);
    }
    return NNACL_OK;
  }
#ifdef ENABLE_NEON
  const float16x8_t rhs = vdupq_n_f16(input1[0]);
  const int simd_last = element_size - 8;  // last start offset that still has eight elements
  for (; i <= simd_last; i += C8NUM) {
    vst1q_f16(output + i, vminq_f16(vld1q_f16(input0 + i), rhs));
  }
#endif
  for (; i < element_size; ++i) {
    output[i] = MSMIN(input0[i], input1[0]);
  }
  return NNACL_OK;
}
1021 
ElementNotEqualFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size)1022 int ElementNotEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size) {
1023   int index = 0;
1024 #ifdef ENABLE_NEON
1025   for (; index <= element_size - 8; index += C8NUM) {
1026     float16x8_t vin0 = vld1q_f16(input0 + index);
1027     float16x8_t vin1 = vld1q_f16(input1 + index);
1028     uint8x8_t vout = vmovn_u16(vceqq_f16(vin0, vin1));
1029     vst1_u8(output + index, vout);
1030   }
1031 #endif
1032   for (; index < element_size; index++) {
1033     output[index] = input0[index] != input1[index];
1034   }
1035   return NNACL_OK;
1036 }
1037 
ElementOptNotEqualFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size,bool first_scalar)1038 int ElementOptNotEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
1039                            bool first_scalar) {
1040 #ifdef ENABLE_NEON
1041   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
1042   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
1043 #endif
1044   int index = 0;
1045   if (first_scalar) {
1046 #ifdef ENABLE_NEON
1047     for (; index <= element_size - 8; index += C8NUM) {
1048       float16x8_t vin1 = vld1q_f16(input1 + index);
1049       uint8x8_t vout = vmovn_u16(vceqq_f16(vin0_opt, vin1));
1050       vst1_u8(output + index, vout);
1051     }
1052 #endif
1053     for (; index < element_size; index++) {
1054       output[index] = input0[0] != input1[index];
1055     }
1056   } else {
1057 #ifdef ENABLE_NEON
1058     for (; index <= element_size - 8; index += C8NUM) {
1059       float16x8_t vin0 = vld1q_f16(input0 + index);
1060       uint8x8_t vout = vmovn_u16(vceqq_f16(vin0, vin1_opt));
1061       vst1_u8(output + index, vout);
1062     }
1063 #endif
1064     for (; index < element_size; index++) {
1065       output[index] = input0[index] != input1[0];
1066     }
1067   }
1068   return NNACL_OK;
1069 }
1070 
ElementEqualFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size)1071 int ElementEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size) {
1072   int index = 0;
1073 #ifdef ENABLE_NEON
1074   for (; index <= element_size - 8; index += C8NUM) {
1075     float16x8_t vin0 = vld1q_f16(input0 + index);
1076     float16x8_t vin1 = vld1q_f16(input1 + index);
1077     uint8x8_t vout = vmovn_u16(vceqq_f16(vin0, vin1));
1078     vst1_u8(output + index, vout);
1079   }
1080 #endif
1081   for (; index < element_size; index++) {
1082     output[index] = input0[index] == input1[index];
1083   }
1084   return NNACL_OK;
1085 }
1086 
ElementOptEqualFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size,bool first_scalar)1087 int ElementOptEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
1088                         bool first_scalar) {
1089 #ifdef ENABLE_NEON
1090   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
1091   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
1092 #endif
1093   int index = 0;
1094   if (first_scalar) {
1095 #ifdef ENABLE_NEON
1096     for (; index <= element_size - 8; index += C8NUM) {
1097       float16x8_t vin1 = vld1q_f16(input1 + index);
1098       uint8x8_t vout = vmovn_u16(vceqq_f16(vin0_opt, vin1));
1099       vst1_u8(output + index, vout);
1100     }
1101 #endif
1102     for (; index < element_size; index++) {
1103       output[index] = input0[0] == input1[index];
1104     }
1105   } else {
1106 #ifdef ENABLE_NEON
1107     for (; index <= element_size - 8; index += C8NUM) {
1108       float16x8_t vin0 = vld1q_f16(input0 + index);
1109       uint8x8_t vout = vmovn_u16(vceqq_f16(vin0, vin1_opt));
1110       vst1_u8(output + index, vout);
1111     }
1112 #endif
1113     for (; index < element_size; index++) {
1114       output[index] = input0[index] == input1[0];
1115     }
1116   }
1117   return NNACL_OK;
1118 }
1119 
ElementLessFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size)1120 int ElementLessFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size) {
1121   int index = 0;
1122 #ifdef ENABLE_NEON
1123   for (; index <= element_size - 8; index += C8NUM) {
1124     float16x8_t vin0 = vld1q_f16(input0 + index);
1125     float16x8_t vin1 = vld1q_f16(input1 + index);
1126     uint8x8_t vout = vmovn_u16(vcltq_f16(vin0, vin1));
1127     vst1_u8(output + index, vout);
1128   }
1129 #endif
1130   for (; index < element_size; index++) {
1131     output[index] = input0[index] < input1[index];
1132   }
1133   return NNACL_OK;
1134 }
1135 
ElementOptLessFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size,bool first_scalar)1136 int ElementOptLessFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
1137                        bool first_scalar) {
1138 #ifdef ENABLE_NEON
1139   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
1140   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
1141 #endif
1142   int index = 0;
1143   if (first_scalar) {
1144 #ifdef ENABLE_NEON
1145     for (; index <= element_size - 8; index += C8NUM) {
1146       float16x8_t vin1 = vld1q_f16(input1 + index);
1147       uint8x8_t vout = vmovn_u16(vcltq_f16(vin0_opt, vin1));
1148       vst1_u8(output + index, vout);
1149     }
1150 #endif
1151     for (; index < element_size; index++) {
1152       output[index] = input0[0] < input1[index];
1153     }
1154   } else {
1155 #ifdef ENABLE_NEON
1156     for (; index <= element_size - 8; index += C8NUM) {
1157       float16x8_t vin0 = vld1q_f16(input0 + index);
1158       uint8x8_t vout = vmovn_u16(vcltq_f16(vin0, vin1_opt));
1159       vst1_u8(output + index, vout);
1160     }
1161 #endif
1162     for (; index < element_size; index++) {
1163       output[index] = input0[index] < input1[0];
1164     }
1165   }
1166   return NNACL_OK;
1167 }
1168 
ElementLessEqualFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size)1169 int ElementLessEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size) {
1170   int index = 0;
1171 #ifdef ENABLE_NEON
1172   for (; index <= element_size - 8; index += C8NUM) {
1173     float16x8_t vin0 = vld1q_f16(input0 + index);
1174     float16x8_t vin1 = vld1q_f16(input1 + index);
1175     uint8x8_t vout = vmovn_u16(vcleq_f16(vin0, vin1));
1176     vst1_u8(output + index, vout);
1177   }
1178 #endif
1179   for (; index < element_size; index++) {
1180     output[index] = input0[index] <= input1[index];
1181   }
1182   return NNACL_OK;
1183 }
1184 
ElementOptLessEqualFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size,bool first_scalar)1185 int ElementOptLessEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
1186                             bool first_scalar) {
1187 #ifdef ENABLE_NEON
1188   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
1189   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
1190 #endif
1191   int index = 0;
1192   if (first_scalar) {
1193 #ifdef ENABLE_NEON
1194     for (; index <= element_size - 8; index += C8NUM) {
1195       float16x8_t vin1 = vld1q_f16(input1 + index);
1196       uint8x8_t vout = vmovn_u16(vcleq_f16(vin0_opt, vin1));
1197       vst1_u8(output + index, vout);
1198     }
1199 #endif
1200     for (; index < element_size; index++) {
1201       output[index] = input0[0] <= input1[index];
1202     }
1203   } else {
1204 #ifdef ENABLE_NEON
1205     for (; index <= element_size - 8; index += C8NUM) {
1206       float16x8_t vin0 = vld1q_f16(input0 + index);
1207       uint8x8_t vout = vmovn_u16(vcleq_f16(vin0, vin1_opt));
1208       vst1_u8(output + index, vout);
1209     }
1210 #endif
1211     for (; index < element_size; index++) {
1212       output[index] = input0[index] <= input1[0];
1213     }
1214   }
1215   return NNACL_OK;
1216 }
1217 
ElementGreaterFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size)1218 int ElementGreaterFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size) {
1219   int index = 0;
1220 #ifdef ENABLE_NEON
1221   for (; index <= element_size - 8; index += C8NUM) {
1222     float16x8_t vin0 = vld1q_f16(input0 + index);
1223     float16x8_t vin1 = vld1q_f16(input1 + index);
1224     uint8x8_t vout = vmovn_u16(vcgtq_f16(vin0, vin1));
1225     vst1_u8(output + index, vout);
1226   }
1227 #endif
1228   for (; index < element_size; index++) {
1229     output[index] = input0[index] > input1[index];
1230   }
1231   return NNACL_OK;
1232 }
1233 
ElementOptGreaterFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size,bool first_scalar)1234 int ElementOptGreaterFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
1235                           bool first_scalar) {
1236 #ifdef ENABLE_NEON
1237   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
1238   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
1239 #endif
1240   int index = 0;
1241   if (first_scalar) {
1242 #ifdef ENABLE_NEON
1243     for (; index <= element_size - 8; index += C8NUM) {
1244       float16x8_t vin1 = vld1q_f16(input1 + index);
1245       uint8x8_t vout = vmovn_u16(vcgtq_f16(vin0_opt, vin1));
1246       vst1_u8(output + index, vout);
1247     }
1248 #endif
1249     for (; index < element_size; index++) {
1250       output[index] = input0[0] > input1[index];
1251     }
1252   } else {
1253 #ifdef ENABLE_NEON
1254     for (; index <= element_size - 8; index += C8NUM) {
1255       float16x8_t vin0 = vld1q_f16(input0 + index);
1256       uint8x8_t vout = vmovn_u16(vcgtq_f16(vin0, vin1_opt));
1257       vst1_u8(output + index, vout);
1258     }
1259 #endif
1260     for (; index < element_size; index++) {
1261       output[index] = input0[index] > input1[0];
1262     }
1263   }
1264   return NNACL_OK;
1265 }
1266 
ElementGreaterEqualFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size)1267 int ElementGreaterEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size) {
1268   int index = 0;
1269 #ifdef ENABLE_NEON
1270   for (; index <= element_size - 8; index += C8NUM) {
1271     float16x8_t vin0 = vld1q_f16(input0 + index);
1272     float16x8_t vin1 = vld1q_f16(input1 + index);
1273     uint8x8_t vout = vmovn_u16(vcgeq_f16(vin0, vin1));
1274     vst1_u8(output + index, vout);
1275   }
1276 #endif
1277   for (; index < element_size; index++) {
1278     output[index] = input0[index] >= input1[index];
1279   }
1280   return NNACL_OK;
1281 }
1282 
ElementOptGreaterEqualFp16(const float16_t * input0,const float16_t * input1,uint8_t * output,int element_size,bool first_scalar)1283 int ElementOptGreaterEqualFp16(const float16_t *input0, const float16_t *input1, uint8_t *output, int element_size,
1284                                bool first_scalar) {
1285 #ifdef ENABLE_NEON
1286   float16x8_t vin0_opt = vdupq_n_f16(input0[0]);
1287   float16x8_t vin1_opt = vdupq_n_f16(input1[0]);
1288 #endif
1289   int index = 0;
1290   if (first_scalar) {
1291 #ifdef ENABLE_NEON
1292     for (; index <= element_size - 8; index += C8NUM) {
1293       float16x8_t vin1 = vld1q_f16(input1 + index);
1294       uint8x8_t vout = vmovn_u16(vcgeq_f16(vin0_opt, vin1));
1295       vst1_u8(output + index, vout);
1296     }
1297 #endif
1298     for (; index < element_size; index++) {
1299       output[index] = input0[0] >= input1[index];
1300     }
1301   } else {
1302 #ifdef ENABLE_NEON
1303     for (; index <= element_size - 8; index += C8NUM) {
1304       float16x8_t vin0 = vld1q_f16(input0 + index);
1305       uint8x8_t vout = vmovn_u16(vcgeq_f16(vin0, vin1_opt));
1306       vst1_u8(output + index, vout);
1307     }
1308 #endif
1309     for (; index < element_size; index++) {
1310       output[index] = input0[index] >= input1[0];
1311     }
1312   }
1313   return NNACL_OK;
1314 }
1315