/external/ComputeLibrary/src/core/NEON/kernels/convolution/winograd/output_transforms/
D | a64_fp16_4x4_3x3.cpp | all references in a64_fp16_4x4_3x3()
    152  … FZ[i][0] = vadd_f16(vadd_f16(vadd_f16(F[i][0], F[i][1]), vadd_f16(F[i][2], F[i][3])), F[i][4]);
    155  … FZ[i][1] = vadd_f16(vsub_f16(F[i][1], F[i][2]), vmul_f16(vsub_f16(F[i][3], F[i][4]), vdup_n_f16(2.…
    158  … FZ[i][2] = vadd_f16(vadd_f16(F[i][1], F[i][2]), vmul_f16(vadd_f16(F[i][3], F[i][4]), vdup_n_f16(4.…
    161  … FZ[i][3] = vadd_f16(vadd_f16(vsub_f16(F[i][1], F[i][2]), vmul_f16(vsub_f16(F[i][3], F[i][4]), vdup…
    168  … f[0][j] = vadd_f16(vadd_f16(vadd_f16(FZ[0][j], FZ[1][j]), vadd_f16(FZ[2][j], FZ[3][j])), FZ[4][j]);
    171  … f[1][j] = vadd_f16(vsub_f16(FZ[1][j], FZ[2][j]), vmul_f16(vsub_f16(FZ[3][j], FZ[4][j]), vdup_n_f16…
    174  … f[2][j] = vadd_f16(vadd_f16(FZ[1][j], FZ[2][j]), vmul_f16(vadd_f16(FZ[3][j], FZ[4][j]), vdup_n_f16…
    177  … f[3][j] = vadd_f16(vadd_f16(vsub_f16(FZ[1][j], FZ[2][j]), vmul_f16(vsub_f16(FZ[3][j], FZ[4][j]), v…
    195  vmax_f16(vmin_f16(vadd_f16(f[i][j], b), vdup_n_f16(output_max)),
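These references are the F(4x4, 3x3) Winograd output transform: rows of A^T F are formed with vadd_f16/vsub_f16 plus multiplies by 2, 4, and 8, first along one axis (FZ, lines 152-161), then the other (f, lines 168-177); line 195 adds the bias and clamps. Below is a minimal sketch of one 1-D column pass under the standard A^T matrix; the 8.0f coefficient, the trailing F[5] term, and the output_min operand complete the truncated lines and are assumptions consistent with that standard transform, not the library's exact code.

#include <arm_neon.h>  // requires __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                       // (e.g. -march=armv8.2-a+fp16)

// 1-D F(4x4, 3x3) output transform over one column of six fp16 vectors.
static inline void winograd_f16_output_col(const float16x4_t F[6], float16x4_t f[4])
{
  f[0] = vadd_f16(vadd_f16(vadd_f16(F[0], F[1]), vadd_f16(F[2], F[3])), F[4]);
  f[1] = vadd_f16(vsub_f16(F[1], F[2]), vmul_f16(vsub_f16(F[3], F[4]), vdup_n_f16(2.0f)));
  f[2] = vadd_f16(vadd_f16(F[1], F[2]), vmul_f16(vadd_f16(F[3], F[4]), vdup_n_f16(4.0f)));
  f[3] = vadd_f16(vadd_f16(vsub_f16(F[1], F[2]),
                           vmul_f16(vsub_f16(F[3], F[4]), vdup_n_f16(8.0f))), F[5]);
}

// Bias + clamp as on line 195: add the bias vector, then saturate the
// result into [output_min, output_max].
static inline float16x4_t bias_clamp(float16x4_t v, float16x4_t b,
                                     float16_t output_min, float16_t output_max)
{
  return vmax_f16(vmin_f16(vadd_f16(v, b), vdup_n_f16(output_max)),
                  vdup_n_f16(output_min));
}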
/external/XNNPACK/src/f16-gavgpool-cw/
D | neonfp16arith-x4.c | all references in xnn_f16_gavgpool_cw_ukernel__neonfp16arith_x4()
    48   vsum0 = vadd_f16(vsum0, vi0);
    49   vsum1 = vadd_f16(vsum1, vi1);
    50   vsum2 = vadd_f16(vsum2, vi2);
    51   vsum3 = vadd_f16(vsum3, vi3);
    66   vsum0 = vadd_f16(vsum0, vi0);
    67   vsum1 = vadd_f16(vsum1, vi1);
    68   vsum2 = vadd_f16(vsum2, vi2);
    69   vsum3 = vadd_f16(vsum3, vi3);
    98   vsum0 = vadd_f16(vsum0, vi0);
    107  vsum0 = vadd_f16(vsum0, vi0);
D | neonfp16arith-x8.c | all references in xnn_f16_gavgpool_cw_ukernel__neonfp16arith_x8()
    81   const float16x4_t vsum0_lo = vadd_f16(vget_low_f16(vsum0), vget_high_f16(vsum0));
    82   const float16x4_t vsum1_lo = vadd_f16(vget_low_f16(vsum1), vget_high_f16(vsum1));
    83   const float16x4_t vsum2_lo = vadd_f16(vget_low_f16(vsum2), vget_high_f16(vsum2));
    84   const float16x4_t vsum3_lo = vadd_f16(vget_low_f16(vsum3), vget_high_f16(vsum3));
    122  float16x4_t vsum = vadd_f16(vget_low_f16(vsum0), vget_high_f16(vsum0));
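Both channel-wise global-average-pooling micro-kernels keep one running sum per channel and reduce only at the end: the x4 variant accumulates a float16x4_t per channel, while the x8 variant accumulates a float16x8_t and folds it to four lanes with vget_low_f16/vget_high_f16 (lines 81-84, 122). A sketch of the remaining reduction, assuming pairwise vpadd_f16 folds and a multiply by 1/elements; the fold steps, names, and scaling here are illustrative, not XNNPACK's exact tail.

#include <arm_neon.h>
#include <stddef.h>

// Fold an 8-lane fp16 running sum down to one mean value. The low/high
// vadd_f16 matches the listing; the vpadd_f16 steps and 1/elements scale
// are assumptions for the final reduction.
static inline float16_t gavgpool_reduce(float16x8_t vsum, size_t elements)
{
  float16x4_t v = vadd_f16(vget_low_f16(vsum), vget_high_f16(vsum)); // 8 -> 4 lanes
  v = vpadd_f16(v, v);                                               // 4 -> 2
  v = vpadd_f16(v, v);                                               // 2 -> 1 (broadcast)
  return vget_lane_f16(vmul_n_f16(v, (float16_t) (1.0f / elements)), 0);
}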
/external/ComputeLibrary/src/core/NEON/kernels/convolution/winograd/input_transforms/
D | a64_fp16_6x6.cpp | all references in a64_fp16_6x6()
    181  … XTx[0][j] = vsub_f16(vadd_f16(x[4][j], vmul_f16(x[0][j], vdup_n_f16(4.0f))), vmul_f16(x[2][j], vdu…
    184  … XTx[1][j] = vsub_f16(vadd_f16(x[3][j], x[4][j]), vmul_f16(vadd_f16(x[1][j], x[2][j]), vdup_n_f16(…
    187  … XTx[2][j] = vadd_f16(vsub_f16(x[4][j], x[3][j]), vmul_f16(vsub_f16(x[1][j], x[2][j]), vdup_n_f16(4…
    190  … XTx[3][j] = vadd_f16(vsub_f16(x[4][j], x[2][j]), vmul_f16(vsub_f16(x[3][j], x[1][j]), vdup_n_f16(2…
    193  … XTx[4][j] = vadd_f16(vsub_f16(x[4][j], x[2][j]), vmul_f16(vsub_f16(x[1][j], x[3][j]), vdup_n_f16(2…
    196  … XTx[5][j] = vsub_f16(vadd_f16(x[5][j], vmul_f16(x[1][j], vdup_n_f16(4.0f))), vmul_f16(x[3][j], vdu…
    203  … U[i][0] = vsub_f16(vadd_f16(XTx[i][4], vmul_f16(XTx[i][0], vdup_n_f16(4.0f))), vmul_f16(XTx[i][2],…
    206  … U[i][1] = vsub_f16(vadd_f16(XTx[i][3], XTx[i][4]), vmul_f16(vadd_f16(XTx[i][1], XTx[i][2]), vdup_n…
    209  … U[i][2] = vadd_f16(vsub_f16(XTx[i][4], XTx[i][3]), vmul_f16(vsub_f16(XTx[i][1], XTx[i][2]), vdup_n…
    212  … U[i][3] = vadd_f16(vsub_f16(XTx[i][4], XTx[i][2]), vmul_f16(vsub_f16(XTx[i][3], XTx[i][1]), vdup_n…
    [all …]
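a64_fp16_6x6() applies the F(4x4, 3x3) input transform B^T x B to 6x6 tiles as two 1-D passes: rows first (XTx, lines 181-196), then columns (U, lines 203-212), again using only vadd_f16/vsub_f16 and multiplies by small constants. A sketch of one 1-D pass follows; the 5.0f coefficients complete the standard Winograd B^T matrix where the listing is truncated, which is an assumption.

#include <arm_neon.h>

// One 1-D pass of the F(4x4, 3x3) input transform (rows of B^T). The 4.0f
// and 2.0f factors appear in the listing; 5.0f is the standard B^T value
// behind the truncation.
static inline void winograd_f16_input_pass(const float16x4_t x[6], float16x4_t y[6])
{
  y[0] = vsub_f16(vadd_f16(x[4], vmul_f16(x[0], vdup_n_f16(4.0f))),
                  vmul_f16(x[2], vdup_n_f16(5.0f)));
  y[1] = vsub_f16(vadd_f16(x[3], x[4]),
                  vmul_f16(vadd_f16(x[1], x[2]), vdup_n_f16(4.0f)));
  y[2] = vadd_f16(vsub_f16(x[4], x[3]),
                  vmul_f16(vsub_f16(x[1], x[2]), vdup_n_f16(4.0f)));
  y[3] = vadd_f16(vsub_f16(x[4], x[2]),
                  vmul_f16(vsub_f16(x[3], x[1]), vdup_n_f16(2.0f)));
  y[4] = vadd_f16(vsub_f16(x[4], x[2]),
                  vmul_f16(vsub_f16(x[1], x[3]), vdup_n_f16(2.0f)));
  y[5] = vsub_f16(vadd_f16(x[5], vmul_f16(x[1], vdup_n_f16(4.0f))),
                  vmul_f16(x[3], vdup_n_f16(5.0f)));
}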
/external/ComputeLibrary/src/core/NEON/kernels/convolution/winograd/weight_transforms/
D | a64_fp16_4x4_3x3.cpp | all references in a64_fp16_4x4_3x3()
    137  Ww[1][j] = vmul_n_f16(vadd_f16(vadd_f16(w[0][j], w[1][j]), w[2][j]), -4.0);
    143  … Ww[3][j] = vadd_f16(vadd_f16(w[0][j], vmul_f16(w[1][j], vdup_n_f16(2.0f))), vmul_f16(w[2][j], vdup…
    146  … Ww[4][j] = vadd_f16(vsub_f16(w[0][j], vmul_f16(w[1][j], vdup_n_f16(2.0f))), vmul_f16(w[2][j], vdup…
    161  … V[i][1] = vmul_n_f16(vmul_n_f16(vadd_f16(vadd_f16(Ww[i][0], Ww[i][1]), Ww[i][2]), -4.0), recip576);
    167  … V[i][3] = vmul_n_f16(vadd_f16(vadd_f16(Ww[i][0], vmul_f16(Ww[i][1], vdup_n_f16(2.0f))), vmul_f16(W…
    170  … V[i][4] = vmul_n_f16(vadd_f16(vsub_f16(Ww[i][0], vmul_f16(Ww[i][1], vdup_n_f16(2.0f))), vmul_f16(W…
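The weight transform computes G w G^T with each 1-D stage scaled to 24*G, so every coefficient stays a small integer (-4, 2, 4, ...); the combined 24*24 factor is then divided out once as recip576 = 1/576 in the second stage (lines 161-170), which appears intended to keep fp16 intermediates well scaled. A sketch of two representative second-stage rows; the 4.0f completing the truncated vdup_n_f16 argument follows the standard G row (1, 2, 4) and is an assumption, as is the array layout.

#include <arm_neon.h>

// Two rows of the second stage of the 4x4_3x3 weight transform. Each 1-D
// stage applies 24*G; the single 1/576 multiply undoes both stages.
static inline void winograd_f16_weight_rows(const float16x4_t Ww[6], float16x4_t V[6])
{
  const float16_t recip576 = (float16_t) (1.0f / 576.0f);
  V[1] = vmul_n_f16(vmul_n_f16(vadd_f16(vadd_f16(Ww[0], Ww[1]), Ww[2]), -4.0f),
                    recip576);
  V[3] = vmul_n_f16(vadd_f16(vadd_f16(Ww[0], vmul_f16(Ww[1], vdup_n_f16(2.0f))),
                             vmul_f16(Ww[2], vdup_n_f16(4.0f))),
                    recip576);
}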
/external/XNNPACK/src/f16-dwconv2d-chw/gen/
D | 5x5p2-minmax-neonfp16arith-1x4-acc5.c | all references in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc5()
    168  vo0p0 = vadd_f16(vo0p0, vo0p1);
    169  vo0p2 = vadd_f16(vo0p2, vo0p3);
    170  vo0p0 = vadd_f16(vo0p0, vo0p2);
    171  vo0p0 = vadd_f16(vo0p0, vo0p4);
    279  vo0p0 = vadd_f16(vo0p0, vo0p1);
    280  vo0p2 = vadd_f16(vo0p2, vo0p3);
    281  vo0p0 = vadd_f16(vo0p0, vo0p2);
    282  vo0p0 = vadd_f16(vo0p0, vo0p4);
    378  vo0p0 = vadd_f16(vo0p0, vo0p1);
    379  vo0p2 = vadd_f16(vo0p2, vo0p3);
    [all …]
D | 3x3s2p1-minmax-neonfp16arith-1x4-acc4.c | all references in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc4()
    98   vo0p0 = vadd_f16(vo0p0, vo0p1);
    99   vo0p2 = vadd_f16(vo0p2, vo0p3);
    100  vo0p0 = vadd_f16(vo0p0, vo0p2);
    146  vo0p0 = vadd_f16(vo0p0, vo0p1);
    147  vo0p2 = vadd_f16(vo0p2, vo0p3);
    148  vo0p0 = vadd_f16(vo0p0, vo0p2);
D | 5x5s2p2-minmax-neonfp16arith-1x4-acc5.c | all references in xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x4_acc5()
    181  vo0p0 = vadd_f16(vo0p0, vo0p1);
    182  vo0p2 = vadd_f16(vo0p2, vo0p3);
    183  vo0p0 = vadd_f16(vo0p0, vo0p2);
    184  vo0p0 = vadd_f16(vo0p0, vo0p4);
    279  vo0p0 = vadd_f16(vo0p0, vo0p1);
    280  vo0p2 = vadd_f16(vo0p2, vo0p3);
    281  vo0p0 = vadd_f16(vo0p0, vo0p2);
    282  vo0p0 = vadd_f16(vo0p0, vo0p4);
D | 5x5p2-minmax-neonfp16arith-1x4-acc4.c | all references in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc4()
    168  vo0p0 = vadd_f16(vo0p0, vo0p1);
    169  vo0p2 = vadd_f16(vo0p2, vo0p3);
    170  vo0p0 = vadd_f16(vo0p0, vo0p2);
    278  vo0p0 = vadd_f16(vo0p0, vo0p1);
    279  vo0p2 = vadd_f16(vo0p2, vo0p3);
    280  vo0p0 = vadd_f16(vo0p0, vo0p2);
    376  vo0p0 = vadd_f16(vo0p0, vo0p1);
    377  vo0p2 = vadd_f16(vo0p2, vo0p3);
    378  vo0p0 = vadd_f16(vo0p0, vo0p2);
D | 5x5p2-minmax-neonfp16arith-2x4-acc3.c | all references in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_2x4_acc3()
    209  vo0p0 = vadd_f16(vo0p0, vo0p1);
    210  vo1p0 = vadd_f16(vo1p0, vo1p1);
    211  vo0p0 = vadd_f16(vo0p0, vo0p2);
    212  vo1p0 = vadd_f16(vo1p0, vo1p2);
    357  vo0p0 = vadd_f16(vo0p0, vo0p1);
    358  vo1p0 = vadd_f16(vo1p0, vo1p1);
    359  vo0p0 = vadd_f16(vo0p0, vo0p2);
    360  vo1p0 = vadd_f16(vo1p0, vo1p2);
    490  vo0p0 = vadd_f16(vo0p0, vo0p1);
    491  vo1p0 = vadd_f16(vo1p0, vo1p1);
    [all …]
D | 3x3s2p1-minmax-neonfp16arith-1x4-acc3.c | all references in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4_acc3()
    98   vo0p0 = vadd_f16(vo0p0, vo0p1);
    99   vo0p0 = vadd_f16(vo0p0, vo0p2);
    145  vo0p0 = vadd_f16(vo0p0, vo0p1);
    146  vo0p0 = vadd_f16(vo0p0, vo0p2);
D | 5x5s2p2-minmax-neonfp16arith-1x4-acc4.c | all references in xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x4_acc4()
    181  vo0p0 = vadd_f16(vo0p0, vo0p1);
    182  vo0p2 = vadd_f16(vo0p2, vo0p3);
    183  vo0p0 = vadd_f16(vo0p0, vo0p2);
    278  vo0p0 = vadd_f16(vo0p0, vo0p1);
    279  vo0p2 = vadd_f16(vo0p2, vo0p3);
    280  vo0p0 = vadd_f16(vo0p0, vo0p2);
D | 5x5s2p2-minmax-neonfp16arith-2x4-acc3.c | all references in xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_2x4_acc3()
    238  vo0p0 = vadd_f16(vo0p0, vo0p1);
    239  vo1p0 = vadd_f16(vo1p0, vo1p1);
    240  vo0p0 = vadd_f16(vo0p0, vo0p2);
    241  vo1p0 = vadd_f16(vo1p0, vo1p2);
    375  vo0p0 = vadd_f16(vo0p0, vo0p1);
    376  vo1p0 = vadd_f16(vo1p0, vo1p1);
    377  vo0p0 = vadd_f16(vo0p0, vo0p2);
    378  vo1p0 = vadd_f16(vo1p0, vo1p2);
D | 5x5p2-minmax-neonfp16arith-1x4-acc3.c | all references in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4_acc3()
    168  vo0p0 = vadd_f16(vo0p0, vo0p1);
    169  vo0p0 = vadd_f16(vo0p0, vo0p2);
    277  vo0p0 = vadd_f16(vo0p0, vo0p1);
    278  vo0p0 = vadd_f16(vo0p0, vo0p2);
    374  vo0p0 = vadd_f16(vo0p0, vo0p1);
    375  vo0p0 = vadd_f16(vo0p0, vo0p2);
D | 3x3s2p1-minmax-neonfp16arith-2x4-acc2.c | all references in xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_2x4_acc2()
    127  vo0p0 = vadd_f16(vo0p0, vo0p1);
    128  vo1p0 = vadd_f16(vo1p0, vo1p1);
    195  vo0p0 = vadd_f16(vo0p0, vo0p1);
    196  vo1p0 = vadd_f16(vo1p0, vo1p1);
D | 5x5p2-minmax-neonfp16arith-4x4-acc2.c | all references in xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_4x4_acc2()
    291  vo0p0 = vadd_f16(vo0p0, vo0p1);
    292  vo1p0 = vadd_f16(vo1p0, vo1p1);
    293  vo2p0 = vadd_f16(vo2p0, vo2p1);
    294  vo3p0 = vadd_f16(vo3p0, vo3p1);
    513  vo0p0 = vadd_f16(vo0p0, vo0p1);
    514  vo1p0 = vadd_f16(vo1p0, vo1p1);
    515  vo2p0 = vadd_f16(vo2p0, vo2p1);
    516  vo3p0 = vadd_f16(vo3p0, vo3p1);
    714  vo0p0 = vadd_f16(vo0p0, vo0p1);
    715  vo1p0 = vadd_f16(vo1p0, vo1p1);
    [all …]
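All of these accN depthwise-convolution variants share one pattern: the taps of the 3x3 or 5x5 window are multiply-accumulated into N independent vo*p* registers so consecutive FMAs do not serialize on a single dependency chain, and the partial sums are folded with a short vadd_f16 tree just before the min/max clamp. A minimal sketch of the acc4 fold, with illustrative names:

#include <arm_neon.h>

// Fold four independent depthwise-conv partial sums. The two pairwise adds
// are independent, giving a depth-2 tree instead of a serial three-add
// chain; this matches the vo0p* sequences in the listing.
static inline float16x4_t dwconv_fold_acc4(float16x4_t vo0p0, float16x4_t vo0p1,
                                           float16x4_t vo0p2, float16x4_t vo0p3)
{
  vo0p0 = vadd_f16(vo0p0, vo0p1);
  vo0p2 = vadd_f16(vo0p2, vo0p3);
  return vadd_f16(vo0p0, vo0p2);
}

The acc5 variants append one more vadd_f16 with vo0p4, and the 2x4 and 4x4 variants repeat the same fold once per output row (vo1p*, vo2p*, vo3p*), as the listing shows.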
/external/XNNPACK/src/f16-raddstoreexpminusmax/gen/
D | neonfp16arith-rr2-p2-x32.c | all references in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x32()
    135  float16x4_t vacc_lo = vadd_f16(vget_low_f16(vacc), vget_high_f16(vacc));
    160  vacc_lo = vadd_f16(vacc_lo, vf_lo);
    165  … vacc_lo = vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(vreinterpret_u64_f16(vf_lo), 32)));
    170  … vacc_lo = vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(vreinterpret_u64_f16(vf_lo), 48)));
D | neonfp16arith-rr2-p2-x32-acc2.c | all references in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x32_acc2()
    137  float16x4_t vacc_lo = vadd_f16(vget_low_f16(vacc), vget_high_f16(vacc));
    162  vacc_lo = vadd_f16(vacc_lo, vf_lo);
    167  … vacc_lo = vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(vreinterpret_u64_f16(vf_lo), 32)));
    172  … vacc_lo = vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(vreinterpret_u64_f16(vf_lo), 48)));
D | neonfp16arith-rr2-p2-x40.c | all references in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x40()
    149  float16x4_t vacc_lo = vadd_f16(vget_low_f16(vacc), vget_high_f16(vacc));
    174  vacc_lo = vadd_f16(vacc_lo, vf_lo);
    179  … vacc_lo = vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(vreinterpret_u64_f16(vf_lo), 32)));
    184  … vacc_lo = vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(vreinterpret_u64_f16(vf_lo), 48)));
D | neonfp16arith-rr2-p2-x40-acc2.c | all references in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x40_acc2()
    151  float16x4_t vacc_lo = vadd_f16(vget_low_f16(vacc), vget_high_f16(vacc));
    176  vacc_lo = vadd_f16(vacc_lo, vf_lo);
    181  … vacc_lo = vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(vreinterpret_u64_f16(vf_lo), 32)));
    186  … vacc_lo = vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(vreinterpret_u64_f16(vf_lo), 48)));
D | neonfp16arith-rr2-p2-x32-acc4.c | all references in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x32_acc4()
    141  float16x4_t vacc_lo = vadd_f16(vget_low_f16(vacc), vget_high_f16(vacc));
    166  vacc_lo = vadd_f16(vacc_lo, vf_lo);
    171  … vacc_lo = vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(vreinterpret_u64_f16(vf_lo), 32)));
    176  … vacc_lo = vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(vreinterpret_u64_f16(vf_lo), 48)));
D | neonfp16arith-rr2-p2-x40-acc5.c | all references in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x40_acc5()
    157  float16x4_t vacc_lo = vadd_f16(vget_low_f16(vacc), vget_high_f16(vacc));
    182  vacc_lo = vadd_f16(vacc_lo, vf_lo);
    187  … vacc_lo = vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(vreinterpret_u64_f16(vf_lo), 32)));
    192  … vacc_lo = vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(vreinterpret_u64_f16(vf_lo), 48)));
D | neonfp16arith-rr2-p2-x48.c | all references in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x48()
    163  float16x4_t vacc_lo = vadd_f16(vget_low_f16(vacc), vget_high_f16(vacc));
    188  vacc_lo = vadd_f16(vacc_lo, vf_lo);
    193  … vacc_lo = vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(vreinterpret_u64_f16(vf_lo), 32)));
    198  … vacc_lo = vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(vreinterpret_u64_f16(vf_lo), 48)));
D | neonfp16arith-rr2-p2-x48-acc3.c | all references in xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x48_acc3()
    167  float16x4_t vacc_lo = vadd_f16(vget_low_f16(vacc), vget_high_f16(vacc));
    192  vacc_lo = vadd_f16(vacc_lo, vf_lo);
    197  … vacc_lo = vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(vreinterpret_u64_f16(vf_lo), 32)));
    202  … vacc_lo = vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(vreinterpret_u64_f16(vf_lo), 48)));
/external/XNNPACK/src/f16-raddstoreexpminusmax/
D | neonfp16arith-rr2-p2.c.in
    117  float16x4_t vacc_lo = vadd_f16(vget_low_f16(vacc), vget_high_f16(vacc));
    142  vacc_lo = vadd_f16(vacc_lo, vf_lo);
    147  … vacc_lo = vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(vreinterpret_u64_f16(vf_lo), 32)));
    152  … vacc_lo = vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(vreinterpret_u64_f16(vf_lo), 48)));
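The generated x32/x40/x48 kernels above all come from this neonfp16arith-rr2-p2.c.in template and end the same way: the float16x8_t sum is folded to four lanes, whole four-lane groups of exp values are added directly (vf_lo), and a sub-vector tail is handled without a lane mask by reinterpreting the four fp16 lanes as one uint64 and shifting left so that only the lanes actually consumed survive, with zeros elsewhere. Lane position is irrelevant because vacc_lo is horizontally reduced afterwards. A sketch of the shift trick on little-endian AArch64; the helper and its name are illustrative, not the template's exact tail loop.

#include <arm_neon.h>

// Add only the first `lanes` fp16 values of vf_lo into the accumulator.
// Viewed as a uint64, lane 0 sits in the low 16 bits, so a left shift by 32
// zero-fills lanes {0,1} and moves the old lanes {0,1} into positions {2,3},
// while a shift by 48 keeps lane 0 only.
static inline float16x4_t radd_tail(float16x4_t vacc_lo, float16x4_t vf_lo,
                                    int lanes /* 1 or 2 */)
{
  const uint64x1_t bits = vreinterpret_u64_f16(vf_lo);
  if (lanes == 2) {
    // [f0 f1 f2 f3] -> [0 0 f0 f1]: adds f0 and f1 exactly once.
    return vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(bits, 32)));
  }
  // lanes == 1: [f0 f1 f2 f3] -> [0 0 0 f0]: adds only f0.
  return vadd_f16(vacc_lo, vreinterpret_f16_u64(vshl_n_u64(bits, 48)));
}

In the full kernels these adds sit next to partial stores of the same lanes, consistent with the "radd + store" behavior the kernel name describes.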