/external/XNNPACK/src/f16-gemm/gen/
D | 6x16-minmax-neonfp16arith-ld64.c | in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    209  …const float16x8_t vb89ABCDEFc3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…   (local)
    218  vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3);
    219  vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc3, va1, 3);
    220  vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc3, va2, 3);
    221  vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3);
    222  vacc4x89ABCDEF = vfmaq_lane_f16(vacc4x89ABCDEF, vb89ABCDEFc3, va4, 3);
    223  vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc3, va5, 3);
    238  vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3);
    239  vacc1x89ABCDEF = vfmaq_f16(vacc1x89ABCDEF, va1c3, vb89ABCDEFc3);
    240  vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c3, vb89ABCDEFc3);
    [all …]
D | 8x16-minmax-neonfp16arith-ld64.c | in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    257  …const float16x8_t vb89ABCDEFc3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…   (local)
    268  vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3);
    269  vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc3, va1, 3);
    270  vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc3, va2, 3);
    271  vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3);
    272  vacc4x89ABCDEF = vfmaq_lane_f16(vacc4x89ABCDEF, vb89ABCDEFc3, va4, 3);
    273  vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc3, va5, 3);
    274  vacc6x89ABCDEF = vfmaq_lane_f16(vacc6x89ABCDEF, vb89ABCDEFc3, va6, 3);
    275  vacc7x89ABCDEF = vfmaq_lane_f16(vacc7x89ABCDEF, vb89ABCDEFc3, va7, 3);
    294  vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3);
    [all …]
D | 4x16-minmax-neonfp16arith-ld64.c | in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64()
    161  …const float16x8_t vb89ABCDEFc3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…   (local)
    168  vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3);
    169  vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc3, va1, 3);
    170  vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc3, va2, 3);
    171  vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3);
    182  vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3);
    183  vacc1x89ABCDEF = vfmaq_f16(vacc1x89ABCDEF, va1c3, vb89ABCDEFc3);
    184  vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c3, vb89ABCDEFc3);
    185  vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c3, vb89ABCDEFc3);
D | 1x16-minmax-neonfp16arith-ld64.c | in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64()
    89   …const float16x8_t vb89ABCDEFc3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…   (local)
    93   vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3);
    98   vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3);
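For context on these f16 hits: in the ld64 kernels each row of activations is loaded four halfwords at a time into a float16x4_t, and the c3 step folds lane 3 of that vector into the 16 packed weights of the k+3 slice. Below is a minimal single-row sketch of that step, assuming w already points at the packed c3 weights; the helper name and argument layout are illustrative rather than the exact generated code, and only the intrinsics themselves match the hits.

#include <arm_neon.h>  /* requires fp16 vector arithmetic, e.g. -march=armv8.2-a+fp16 */

/* One c3 step for a single row (M = 1) and 16 output columns. */
static inline void f16_gemm_1x16_c3_step(
    const float16_t* w,           /* assumed to point at the packed weights of the k+3 slice */
    float16x4_t va0,              /* 4 consecutive fp16 activations of row 0 (the "ld64" load) */
    float16x8_t* vacc0x01234567,
    float16x8_t* vacc0x89ABCDEF)
{
  const float16x8_t vb01234567c3 = vld1q_f16(w);
  const float16x8_t vb89ABCDEFc3 = vld1q_f16(w + 8);

  /* Main path: the broadcast of lane 3 is fused into the FMA itself. */
  *vacc0x01234567 = vfmaq_lane_f16(*vacc0x01234567, vb01234567c3, va0, 3);
  *vacc0x89ABCDEF = vfmaq_lane_f16(*vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3);

  /* The vfmaq_f16 hits are the same multiply-accumulate with an explicit splat:
       const float16x8_t va0c3 = vdupq_lane_f16(va0, 3);
       *vacc0x89ABCDEF = vfmaq_f16(*vacc0x89ABCDEF, va0c3, vb89ABCDEFc3);       */
}

The vfmaq_lane_f16 form (lines 218–223 in the 6x16 entry above) fuses the lane broadcast into the FMA; the vfmaq_f16 form (lines 238–240) is the same accumulation with the lane splatted beforehand.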
/external/XNNPACK/src/f16-gemm/gen-inc/
D | 8x16inc-minmax-neonfp16arith-ld64.c | in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64()
    259  …const float16x8_t vb89ABCDEFc3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…   (local)
    270  vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3);
    271  vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc3, va1, 3);
    272  vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc3, va2, 3);
    273  vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3);
    274  vacc4x89ABCDEF = vfmaq_lane_f16(vacc4x89ABCDEF, vb89ABCDEFc3, va4, 3);
    275  vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc3, va5, 3);
    276  vacc6x89ABCDEF = vfmaq_lane_f16(vacc6x89ABCDEF, vb89ABCDEFc3, va6, 3);
    277  vacc7x89ABCDEF = vfmaq_lane_f16(vacc7x89ABCDEF, vb89ABCDEFc3, va7, 3);
    296  vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3);
    [all …]
D | 6x16inc-minmax-neonfp16arith-ld64.c | in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
    211  …const float16x8_t vb89ABCDEFc3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…   (local)
    220  vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3);
    221  vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc3, va1, 3);
    222  vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc3, va2, 3);
    223  vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3);
    224  vacc4x89ABCDEF = vfmaq_lane_f16(vacc4x89ABCDEF, vb89ABCDEFc3, va4, 3);
    225  vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc3, va5, 3);
    240  vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3);
    241  vacc1x89ABCDEF = vfmaq_f16(vacc1x89ABCDEF, va1c3, vb89ABCDEFc3);
    242  vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c3, vb89ABCDEFc3);
    [all …]
D | 4x16inc-minmax-neonfp16arith-ld64.c | in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64()
    163  …const float16x8_t vb89ABCDEFc3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…   (local)
    170  vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3);
    171  vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc3, va1, 3);
    172  vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc3, va2, 3);
    173  vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3);
    184  vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3);
    185  vacc1x89ABCDEF = vfmaq_f16(vacc1x89ABCDEF, va1c3, vb89ABCDEFc3);
    186  vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c3, vb89ABCDEFc3);
    187  vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c3, vb89ABCDEFc3);
D | 1x16inc-minmax-neonfp16arith-ld64.c | in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64()
    91   …const float16x8_t vb89ABCDEFc3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…   (local)
    95   vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3);
    100  vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3);
/external/XNNPACK/src/f16-igemm/gen/
D | 8x16-minmax-neonfp16arith-ld64.c | in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    289  …const float16x8_t vb89ABCDEFc3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof( float16x…   (local)
    300  vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3);
    301  vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc3, va1, 3);
    302  vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc3, va2, 3);
    303  vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3);
    304  vacc4x89ABCDEF = vfmaq_lane_f16(vacc4x89ABCDEF, vb89ABCDEFc3, va4, 3);
    305  vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc3, va5, 3);
    306  vacc6x89ABCDEF = vfmaq_lane_f16(vacc6x89ABCDEF, vb89ABCDEFc3, va6, 3);
    307  vacc7x89ABCDEF = vfmaq_lane_f16(vacc7x89ABCDEF, vb89ABCDEFc3, va7, 3);
    326  vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3);
    [all …]
D | 6x16-minmax-neonfp16arith-ld64.c | in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    235  …const float16x8_t vb89ABCDEFc3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof( float16x…   (local)
    244  vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3);
    245  vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc3, va1, 3);
    246  vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc3, va2, 3);
    247  vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3);
    248  vacc4x89ABCDEF = vfmaq_lane_f16(vacc4x89ABCDEF, vb89ABCDEFc3, va4, 3);
    249  vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc3, va5, 3);
    264  vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3);
    265  vacc1x89ABCDEF = vfmaq_f16(vacc1x89ABCDEF, va1c3, vb89ABCDEFc3);
    266  vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c3, vb89ABCDEFc3);
    [all …]
D | 4x16-minmax-neonfp16arith-ld64.c | in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64()
    181  …const float16x8_t vb89ABCDEFc3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof( float16x…   (local)
    188  vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3);
    189  vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc3, va1, 3);
    190  vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc3, va2, 3);
    191  vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3);
    202  vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3);
    203  vacc1x89ABCDEF = vfmaq_f16(vacc1x89ABCDEF, va1c3, vb89ABCDEFc3);
    204  vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c3, vb89ABCDEFc3);
    205  vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c3, vb89ABCDEFc3);
D | 1x16-minmax-neonfp16arith-ld64.c | in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64()
    100  …const float16x8_t vb89ABCDEFc3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof( float16x…   (local)
    104  vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc3, va0, 3);
    109  vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c3, vb89ABCDEFc3);
/external/XNNPACK/src/f32-gemm/gen-inc/
D | 4x16s4inc-minmax-fma3-broadcast.c | in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast()
    136  const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);   (local)
    142  vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF);
    143  vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc3, vacc1x89ABCDEF);
    144  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF);
    145  vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc3, vacc3x89ABCDEF);
D | 5x16s4inc-minmax-fma3-broadcast.c | in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
    155  const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);   (local)
    162  vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF);
    163  vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc3, vacc1x89ABCDEF);
    164  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF);
    165  vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc3, vacc3x89ABCDEF);
    166  vacc4x89ABCDEF = _mm256_fmadd_ps(va4, vb89ABCDEFc3, vacc4x89ABCDEF);
D | 3x16s4inc-minmax-fma3-broadcast.c | in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast()
    117  const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);   (local)
    122  vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF);
    123  vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc3, vacc1x89ABCDEF);
    124  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF);
D | 1x16s4inc-minmax-fma3-broadcast.c | in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast()
    79   const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);   (local)
    82   vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF);
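The f32 *16s4 hits in this and the neighbouring f32-gemm/gen and f32-igemm/gen directories use AVX2/FMA3 instead: the c3 slice of the packed weight block is loaded with _mm256_load_ps and folded into the accumulators with _mm256_fmadd_ps. A hedged single-row sketch follows; the w + 48 load for columns 0..7, the lane rotation at the end, and the helper name are assumptions about the surrounding s4 loop, while the w + 56 load matches the definitions shown in these hits.

#include <immintrin.h>  /* compile with -mavx2 -mfma */

/* One c3 sub-step for a single row and 16 output columns (FMA3 "s4" scheme). */
static inline void f32_gemm_1x16s4_c3_step(
    const float* w,        /* 32-byte-aligned packed weights; c3 slice assumed at w + 48 / w + 56 */
    __m256* va0,           /* row-0 activations, rotated between the four sub-steps */
    __m256* vacc0x01234567,
    __m256* vacc0x89ABCDEF)
{
  const __m256 vb01234567c3 = _mm256_load_ps(w + 48);  /* assumed offset for columns 0..7 */
  const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);  /* matches the definitions above */

  *vacc0x01234567 = _mm256_fmadd_ps(*va0, vb01234567c3, *vacc0x01234567);
  *vacc0x89ABCDEF = _mm256_fmadd_ps(*va0, vb89ABCDEFc3, *vacc0x89ABCDEF);

  /* Assumption: the s4 scheme rotates the activation lanes after each sub-step
     so every element eventually meets its weight column group. */
  *va0 = _mm256_permute_ps(*va0, _MM_SHUFFLE(0, 3, 2, 1));
}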
/external/XNNPACK/src/f32-gemm/gen/
D | 4x16s4-minmax-fma3-broadcast.c | in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast()
    134  const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);   (local)
    140  vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF);
    141  vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc3, vacc1x89ABCDEF);
    142  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF);
    143  vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc3, vacc3x89ABCDEF);
D | 5x16s4-minmax-fma3-broadcast.c | in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
    153  const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);   (local)
    160  vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF);
    161  vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc3, vacc1x89ABCDEF);
    162  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF);
    163  vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc3, vacc3x89ABCDEF);
    164  vacc4x89ABCDEF = _mm256_fmadd_ps(va4, vb89ABCDEFc3, vacc4x89ABCDEF);
D | 3x16s4-minmax-fma3-broadcast.c | in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast()
    115  const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);   (local)
    120  vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF);
    121  vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc3, vacc1x89ABCDEF);
    122  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF);
D | 1x16s4-minmax-fma3-broadcast.c | in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast()
    77   const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);   (local)
    80   vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF);
/external/XNNPACK/src/f32-igemm/gen/
D | 5x16s4-minmax-fma3-broadcast.c | in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
    178  const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);   (local)
    185  vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF);
    186  vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc3, vacc1x89ABCDEF);
    187  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF);
    188  vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc3, vacc3x89ABCDEF);
    189  vacc4x89ABCDEF = _mm256_fmadd_ps(va4, vb89ABCDEFc3, vacc4x89ABCDEF);
D | 3x16s4-minmax-fma3-broadcast.c | in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast()
    134  const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);   (local)
    139  vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF);
    140  vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc3, vacc1x89ABCDEF);
    141  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF);
D | 4x16s4-minmax-fma3-broadcast.c | in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast()
    156  const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);   (local)
    162  vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF);
    163  vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc3, vacc1x89ABCDEF);
    164  vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF);
    165  vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc3, vacc3x89ABCDEF);
D | 1x16s4-minmax-fma3-broadcast.c | in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast()
    90   const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);   (local)
    93   vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, vacc0x89ABCDEF);
/external/XNNPACK/src/qs8-gemm/gen/
D | 1x16-minmax-neon-mull-addw-dup.c | in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup()
    87   … const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));   (local)
    89   const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
    172  … const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));   (local)
    177  const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));
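The qs8 kernel accumulates differently (the mull-addw-dup scheme): int8 weights are multiplied by a broadcast int8 activation lane into int16 products with vmull_s8, which are then widened into int32 accumulators. A minimal sketch of one such step follows; only the vld1_s8 and vmull_s8 lines correspond to the hits above, while the vaddw_s16 widening adds, the helper name, and its arguments are assumptions based on the kernel name.

#include <arm_neon.h>

/* One c3 step for row 0, output columns 8..F, of an int8 GEMM micro-kernel. */
static inline void qs8_gemm_1x16_c3_step(
    const int8_t* w,        /* 8 packed int8 weights for columns 8..F of the k+3 slice */
    int8x8_t va0,           /* 8 consecutive int8 activations of row 0 */
    int32x4_t* vacc0x89AB,
    int32x4_t* vacc0xCDEF)
{
  const int8x8_t vb89ABCDEFc3 = vld1_s8(w);

  /* Multiply the weights by activation k+3, broadcast via vdup_lane_s8, into int16. */
  const int16x8_t vprod0x89ABCDEFc3 = vmull_s8(vb89ABCDEFc3, vdup_lane_s8(va0, 3));

  /* Assumed "addw" stage: widen the int16 products into the int32 accumulators. */
  *vacc0x89AB = vaddw_s16(*vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc3));
  *vacc0xCDEF = vaddw_s16(*vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3));
}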