/external/XNNPACK/src/f16-gemm/gen/
D | 6x8-minmax-neonfp16arith-ld64.c | all in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64():
     80  float16x8_t vacc5x01234567 = vacc0x01234567;  (local)
     99  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0);
    113  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c0, vb01234567c0);
    123  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c1, va5, 1);
    137  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c1, vb01234567c1);
    147  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    161  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c2, vb01234567c2);
    171  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    185  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c3, vb01234567c3);
    206  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5, vb01234567);
    [all …]

D | 8x8-minmax-neonfp16arith-ld64.c | all in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64():
     92  float16x8_t vacc5x01234567 = vacc0x01234567;  (local)
    115  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0);
    133  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c0, vb01234567c0);
    145  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c1, va5, 1);
    163  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c1, vb01234567c1);
    175  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    193  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c2, vb01234567c2);
    205  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    223  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c3, vb01234567c3);
    248  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5, vb01234567);
    [all …]

D | 6x16-minmax-neonfp16arith-ld64.c | all in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64():
     85  float16x8_t vacc5x01234567 = vacc0x01234567;  (local)
    106  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0);
    126  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c0, vb01234567c0);
    143  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c1, va5, 1);
    163  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c1, vb01234567c1);
    180  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    200  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c2, vb01234567c2);
    217  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    237  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c3, vb01234567c3);
    265  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5, vb01234567);
    [all …]

D | 8x16-minmax-neonfp16arith-ld64.c | all in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64():
     97  float16x8_t vacc5x01234567 = vacc0x01234567;  (local)
    124  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0);
    150  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c0, vb01234567c0);
    171  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c1, va5, 1);
    197  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c1, vb01234567c1);
    218  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    244  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c2, vb01234567c2);
    265  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    291  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c3, vb01234567c3);
    325  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5, vb01234567);
    [all …]

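All four f16 GEMM kernels above follow one accumulation pattern: row 5's accumulator starts as a copy of row 0's bias-initialized vector, the unrolled K loop issues one lane-broadcast FMA per step (the vfmaq_lane_f16 sites; the paired vfmaq_f16 sites with pre-duplicated va5cN operands appear to be the alternate codepath for targets without lane-indexed FMA), and the K remainder falls back to a plain vfmaq_f16 against a broadcast scalar. Below is a minimal single-row sketch of that pattern; the function and variable names are illustrative, not XNNPACK's, and it assumes a target with __ARM_FEATURE_FP16_VECTOR_ARITHMETIC.

    #include <arm_neon.h>
    #include <stddef.h>

    #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    // One row of A against an 8-wide tile of B, K unrolled by 4 ("ld64":
    // each main-loop step loads 64 bits, i.e. four f16 elements, of A).
    float16x8_t row_tile_f16(const float16_t* a, const float16_t* b, size_t k,
                             float16x8_t vbias) {
      float16x8_t vacc = vbias;  // rows 1..5 start as copies of row 0's vector
      for (; k >= 4; k -= 4) {
        const float16x4_t va = vld1_f16(a);  a += 4;
        // One lane-broadcast FMA per unrolled step, as at the
        // vfmaq_lane_f16 sites above.
        vacc = vfmaq_lane_f16(vacc, vld1q_f16(b +  0), va, 0);
        vacc = vfmaq_lane_f16(vacc, vld1q_f16(b +  8), va, 1);
        vacc = vfmaq_lane_f16(vacc, vld1q_f16(b + 16), va, 2);
        vacc = vfmaq_lane_f16(vacc, vld1q_f16(b + 24), va, 3);
        b += 32;
      }
      for (; k != 0; k--) {
        // Remainder: full-vector FMA against a broadcast A scalar, as at
        // the final vfmaq_f16 sites above.
        vacc = vfmaq_f16(vacc, vdupq_n_f16(*a), vld1q_f16(b));
        a += 1;  b += 8;
      }
      return vacc;
    }
    #endif
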
/external/XNNPACK/src/f16-igemm/gen/
D | 6x8-minmax-neonfp16arith-ld64.c | all in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64():
     72  float16x8_t vacc5x01234567 = vacc0x01234567;  (local)
    125  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0);
    139  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c0, vb01234567c0);
    149  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c1, va5, 1);
    163  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c1, vb01234567c1);
    173  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    187  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c2, vb01234567c2);
    197  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    211  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c3, vb01234567c3);
    230  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5, vb01234567);
    [all …]

D | 8x8-minmax-neonfp16arith-ld64.c | all in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64():
     80  float16x8_t vacc5x01234567 = vacc0x01234567;  (local)
    147  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0);
    165  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c0, vb01234567c0);
    177  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c1, va5, 1);
    195  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c1, vb01234567c1);
    207  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    225  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c2, vb01234567c2);
    237  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    255  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c3, vb01234567c3);
    278  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5, vb01234567);
    [all …]

D | 6x16-minmax-neonfp16arith-ld64.c | all in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64():
     77  float16x8_t vacc5x01234567 = vacc0x01234567;  (local)
    132  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0);
    152  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c0, vb01234567c0);
    169  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c1, va5, 1);
    189  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c1, vb01234567c1);
    206  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    226  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c2, vb01234567c2);
    243  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    263  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c3, vb01234567c3);
    289  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5, vb01234567);
    [all …]

D | 8x16-minmax-neonfp16arith-ld64.c | all in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64():
     85  float16x8_t vacc5x01234567 = vacc0x01234567;  (local)
    156  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0);
    182  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c0, vb01234567c0);
    203  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c1, va5, 1);
    229  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c1, vb01234567c1);
    250  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    276  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c2, vb01234567c2);
    297  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    323  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c3, vb01234567c3);
    355  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5, vb01234567);
    [all …]

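The igemm variants touch vacc5x01234567 at the same kinds of sites; what changes is how row 5 of A is fetched: through an indirection buffer of per-row pointers, one MR-sized pointer group per outer step, instead of a strided matrix. A rough sketch of that outer structure follows, with the buffer bookkeeping simplified and all names illustrative.

    #include <arm_neon.h>
    #include <stddef.h>

    #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    // Indirect GEMM for one row (row 5 of a 6-row tile): each outer step
    // picks up a fresh pointer for the row, then runs the same FMA core
    // as the direct GEMM sketch earlier.
    float16x8_t igemm_row5_sketch(const float16_t** indirect_a, size_t steps,
                                  size_t kc, const float16_t* w,
                                  float16x8_t vacc) {
      const size_t mr = 6;                    // rows per pointer group
      for (size_t s = 0; s < steps; s++) {
        const float16_t* a5 = indirect_a[5];  // row 5's pointer for this step
        indirect_a += mr;
        for (size_t k = 0; k < kc; k++) {
          vacc = vfmaq_f16(vacc, vdupq_n_f16(a5[k]), vld1q_f16(w));
          w += 8;
        }
      }
      return vacc;
    }
    #endif
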
/external/XNNPACK/src/f16-gemm/gen-inc/
D | 6x8inc-minmax-neonfp16arith-ld64.c | all in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64():
     82  float16x8_t vacc5x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16x8_t));  (local)
    101  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0);
    115  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c0, vb01234567c0);
    125  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c1, va5, 1);
    139  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c1, vb01234567c1);
    149  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    163  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c2, vb01234567c2);
    173  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    187  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c3, vb01234567c3);
    208  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5, vb01234567);
    [all …]

D | 8x8inc-minmax-neonfp16arith-ld64.c | all in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64():
     94  float16x8_t vacc5x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16x8_t));  (local)
    117  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0);
    135  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c0, vb01234567c0);
    147  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c1, va5, 1);
    165  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c1, vb01234567c1);
    177  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    195  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c2, vb01234567c2);
    207  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    225  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c3, vb01234567c3);
    250  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5, vb01234567);
    [all …]

D | 6x16inc-minmax-neonfp16arith-ld64.c | all in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64():
     87  float16x8_t vacc5x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16x8_t));  (local)
    108  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0);
    128  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c0, vb01234567c0);
    145  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c1, va5, 1);
    165  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c1, vb01234567c1);
    182  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    202  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c2, vb01234567c2);
    219  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    239  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c3, vb01234567c3);
    267  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5, vb01234567);
    [all …]

D | 8x16inc-minmax-neonfp16arith-ld64.c | all in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64():
     99  float16x8_t vacc5x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16x8_t));  (local)
    126  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0);
    152  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c0, vb01234567c0);
    173  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c1, va5, 1);
    199  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c1, vb01234567c1);
    220  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    246  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c2, vb01234567c2);
    267  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    293  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5c3, vb01234567c3);
    327  vacc5x01234567 = vfmaq_f16(vacc5x01234567, va5, vb01234567);
    [all …]

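The gen-inc ("gemminc") kernels differ from the plain GEMM kernels only at the declaration sites: each accumulator is loaded from a caller-provided partial-sum buffer acc, which is then advanced by one vector, instead of being copied from row 0. A minimal helper equivalent to the pattern at the line-82 site above, assuming acc holds densely packed float16x8_t values:

    #include <arm_neon.h>
    #include <stdint.h>

    #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    // Load one accumulator vector from the partial-sum buffer and bump
    // the cursor, mirroring the vld1q_f16(acc)/pointer-advance idiom.
    static float16x8_t load_acc_f16(const void** acc) {
      const float16x8_t v = vld1q_f16((const float16_t*) *acc);
      *acc = (const void*) ((uintptr_t) *acc + sizeof(float16x8_t));
      return v;
    }
    #endif
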
/external/XNNPACK/src/f32-gemm/gen/
D | 6x8-minmax-avx-broadcast.c | all in xnn_f32_gemm_minmax_ukernel_6x8__avx_broadcast():
     77  __m256 vacc5x01234567 = vacc0x01234567;  (local)
    103  vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
    114  vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
    122  vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
    125  _mm256_storeu_ps(c5, vacc5x01234567);
    147  __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
    161  vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);

D | 6x8-minmax-fma3-broadcast.c | all in xnn_f32_gemm_minmax_ukernel_6x8__fma3_broadcast():
     77  __m256 vacc5x01234567 = vacc0x01234567;  (local)
    103  vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
    114  vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
    122  vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
    125  _mm256_storeu_ps(c5, vacc5x01234567);
    147  __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
    161  vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);

D | 7x8-minmax-fma3-broadcast.c | all in xnn_f32_gemm_minmax_ukernel_7x8__fma3_broadcast():
     83  __m256 vacc5x01234567 = vacc0x01234567;  (local)
    112  vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
    124  vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
    133  vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
    139  _mm256_storeu_ps(c5, vacc5x01234567);
    163  __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
    179  vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);

D | 7x8-minmax-avx-broadcast.c | all in xnn_f32_gemm_minmax_ukernel_7x8__avx_broadcast():
     83  __m256 vacc5x01234567 = vacc0x01234567;  (local)
    112  vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
    124  vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
    133  vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
    139  _mm256_storeu_ps(c5, vacc5x01234567);
    163  __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
    179  vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);

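The avx-broadcast and fma3-broadcast kernels are identical except for the multiply-add: the AVX form spends two instructions and two roundings (_mm256_mul_ps then _mm256_add_ps), while the FMA3 form fuses them into a single-rounding _mm256_fmadd_ps. Both then clamp with the vmin/vmax parameters and store. A side-by-side sketch (illustrative names; the FMA variant needs FMA3 support, e.g. -mfma):

    #include <immintrin.h>

    // The one line that differs between the two kernel families:
    __m256 row_update_avx(__m256 vacc, __m256 va, __m256 vb) {
      return _mm256_add_ps(vacc, _mm256_mul_ps(va, vb));  // mul + add
    }

    __m256 row_update_fma3(__m256 vacc, __m256 va, __m256 vb) {
      return _mm256_fmadd_ps(va, vb, vacc);               // fused multiply-add
    }

    // Shared epilogue: clamp to [vmin, vmax], then unaligned store.
    void clamp_store(float* c, __m256 vacc, __m256 vmin, __m256 vmax) {
      vacc = _mm256_min_ps(vacc, vmax);
      vacc = _mm256_max_ps(vacc, vmin);
      _mm256_storeu_ps(c, vacc);
    }
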
/external/XNNPACK/src/f32-gemm/gen-inc/
D | 6x8inc-minmax-avx-broadcast.c | all in xnn_f32_gemminc_minmax_ukernel_6x8__avx_broadcast():
     79  __m256 vacc5x01234567 = _mm256_load_ps(acc + 40);  (local)
    105  vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
    116  vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
    124  vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
    127  _mm256_storeu_ps(c5, vacc5x01234567);
    149  __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
    163  vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);

D | 6x8inc-minmax-fma3-broadcast.c | all in xnn_f32_gemminc_minmax_ukernel_6x8__fma3_broadcast():
     79  __m256 vacc5x01234567 = _mm256_load_ps(acc + 40);  (local)
    105  vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
    116  vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
    124  vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
    127  _mm256_storeu_ps(c5, vacc5x01234567);
    149  __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
    163  vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);

D | 7x8inc-minmax-avx-broadcast.c | all in xnn_f32_gemminc_minmax_ukernel_7x8__avx_broadcast():
     85  __m256 vacc5x01234567 = _mm256_load_ps(acc + 40);  (local)
    114  vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
    126  vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
    135  vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
    141  _mm256_storeu_ps(c5, vacc5x01234567);
    165  __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
    181  vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);

D | 7x8inc-minmax-fma3-broadcast.c | all in xnn_f32_gemminc_minmax_ukernel_7x8__fma3_broadcast():
     85  __m256 vacc5x01234567 = _mm256_load_ps(acc + 40);  (local)
    114  vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
    126  vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
    135  vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
    141  _mm256_storeu_ps(c5, vacc5x01234567);
    165  __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
    181  vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);

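In these f32 gen-inc kernels the partial sums form a dense MR x 8 float array, so row r's accumulator sits at acc + r*8; the _mm256_load_ps(acc + 40) at the declaration sites is simply row 5 of a 6- or 7-row tile. A one-line sketch (the aligned-load form assumes acc is 32-byte aligned):

    #include <immintrin.h>

    // Row r's 8 partial sums start r*8 floats into the acc buffer.
    __m256 load_row_acc(const float* acc, int row) {
      return _mm256_load_ps(acc + row * 8);
    }
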
/external/XNNPACK/src/f32-igemm/gen/
D | 6x8-minmax-avx-broadcast.c | all in xnn_f32_igemm_minmax_ukernel_6x8__avx_broadcast():
     71  __m256 vacc5x01234567 = vacc0x01234567;  (local)
    131  vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
    143  vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
    151  vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
    154  _mm256_storeu_ps(c5, vacc5x01234567);
    170  __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
    184  vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);

D | 6x8-minmax-fma3-broadcast.c | all in xnn_f32_igemm_minmax_ukernel_6x8__fma3_broadcast():
     71  __m256 vacc5x01234567 = vacc0x01234567;  (local)
    131  vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
    143  vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
    151  vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
    154  _mm256_storeu_ps(c5, vacc5x01234567);
    170  __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
    184  vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);

D | 7x8-minmax-fma3-broadcast.c | all in xnn_f32_igemm_minmax_ukernel_7x8__fma3_broadcast():
     75  __m256 vacc5x01234567 = vacc0x01234567;  (local)
    143  vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
    156  vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
    165  vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
    171  _mm256_storeu_ps(c5, vacc5x01234567);
    188  __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
    204  vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);

D | 7x8-minmax-avx-broadcast.c | all in xnn_f32_igemm_minmax_ukernel_7x8__avx_broadcast():
     75  __m256 vacc5x01234567 = vacc0x01234567;  (local)
    143  vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
    156  vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
    165  vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
    171  _mm256_storeu_ps(c5, vacc5x01234567);
    188  __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
    204  vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);

D | 8x8-minmax-fma3-broadcast.c | all in xnn_f32_igemm_minmax_ukernel_8x8__fma3_broadcast():
     79  __m256 vacc5x01234567 = vacc0x01234567;  (local)
    155  vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
    169  vacc5x01234567 = _mm256_min_ps(vacc5x01234567, vmax);
    179  vacc5x01234567 = _mm256_max_ps(vacc5x01234567, vmin);
    188  _mm256_storeu_ps(c5, vacc5x01234567);
    206  __m128 vacc5x0123 = _mm256_castps256_ps128(vacc5x01234567);
    224  vacc5x0123 = _mm256_extractf128_ps(vacc5x01234567, 1);

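The _mm256_castps256_ps128/_mm256_extractf128_ps pairs near the end of every f32 kernel handle the column remainder: when fewer than 8 outputs remain, the 256-bit accumulator is split into 128-bit halves and stored piecewise. A sketch of that tail, simplified and with illustrative names (the cast costs no instruction; the extract pulls the high half):

    #include <immintrin.h>
    #include <stddef.h>

    void store_partial(float* c, __m256 vacc, size_t nc /* 0 < nc < 8 */) {
      __m128 v = _mm256_castps256_ps128(vacc);   // low 4 lanes, free
      if (nc & 4) {
        _mm_storeu_ps(c, v);  c += 4;
        v = _mm256_extractf128_ps(vacc, 1);      // move on to the high 4 lanes
      }
      if (nc & 2) {
        _mm_storel_pi((__m64*) c, v);  c += 2;   // low 2 lanes of v
        v = _mm_movehl_ps(v, v);                 // shift upper pair down
      }
      if (nc & 1) {
        _mm_store_ss(c, v);                      // final lane
      }
    }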