/external/XNNPACK/src/f16-igemm/gen/ |
D | 4x16-minmax-neonfp16arith-ld64.c | 66 float16x8_t vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() local 110 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 124 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 137 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc1, va3, 1); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 151 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c1, vb89ABCDEFc1); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 164 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc2, va3, 2); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 178 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c2, vb89ABCDEFc2); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 191 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 205 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c3, vb89ABCDEFc3); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 225 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3, vb89ABCDEF); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() [all …]
|
D | 6x16-minmax-neonfp16arith-ld64.c | 74 float16x8_t vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() local 136 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 156 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 173 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc1, va3, 1); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 193 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c1, vb89ABCDEFc1); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 210 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc2, va3, 2); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 230 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c2, vb89ABCDEFc2); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 247 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 267 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c3, vb89ABCDEFc3); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 293 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3, vb89ABCDEF); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() [all …]
|
D | 8x16-minmax-neonfp16arith-ld64.c | 82 float16x8_t vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() local 162 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() 188 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() 209 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc1, va3, 1); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() 235 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c1, vb89ABCDEFc1); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() 256 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc2, va3, 2); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() 282 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c2, vb89ABCDEFc2); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() 303 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() 329 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c3, vb89ABCDEFc3); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() 361 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3, vb89ABCDEF); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() [all …]
|
/external/XNNPACK/src/f16-gemm/gen-inc/ |
D | 4x16inc-minmax-neonfp16arith-ld64.c | 72 …float16x8_t vacc3x89ABCDEF = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16… in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() local 92 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 106 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c0, vb89ABCDEFc0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 119 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc1, va3, 1); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 133 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c1, vb89ABCDEFc1); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 146 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc2, va3, 2); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 160 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c2, vb89ABCDEFc2); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 173 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 187 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c3, vb89ABCDEFc3); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 209 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3, vb89ABCDEF); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() [all …]
|
D | 6x16inc-minmax-neonfp16arith-ld64.c | 84 …float16x8_t vacc3x89ABCDEF = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16… in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() local 112 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 132 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c0, vb89ABCDEFc0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 149 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc1, va3, 1); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 169 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c1, vb89ABCDEFc1); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 186 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc2, va3, 2); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 206 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c2, vb89ABCDEFc2); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 223 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 243 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c3, vb89ABCDEFc3); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 271 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3, vb89ABCDEF); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() [all …]
|
D | 8x16inc-minmax-neonfp16arith-ld64.c | 96 …float16x8_t vacc3x89ABCDEF = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16… in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() local 132 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() 158 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c0, vb89ABCDEFc0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() 179 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc1, va3, 1); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() 205 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c1, vb89ABCDEFc1); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() 226 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc2, va3, 2); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() 252 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c2, vb89ABCDEFc2); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() 273 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() 299 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c3, vb89ABCDEFc3); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() 333 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3, vb89ABCDEF); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() [all …]
|
/external/XNNPACK/src/f16-gemm/gen/ |
D | 4x16-minmax-neonfp16arith-ld64.c | 70 float16x8_t vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() local 90 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 104 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 117 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc1, va3, 1); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 131 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c1, vb89ABCDEFc1); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 144 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc2, va3, 2); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 158 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c2, vb89ABCDEFc2); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 171 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 185 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c3, vb89ABCDEFc3); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 207 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3, vb89ABCDEF); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() [all …]
|
D | 6x16-minmax-neonfp16arith-ld64.c | 82 float16x8_t vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() local 110 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 130 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 147 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc1, va3, 1); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 167 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c1, vb89ABCDEFc1); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 184 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc2, va3, 2); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 204 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c2, vb89ABCDEFc2); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 221 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 241 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c3, vb89ABCDEFc3); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 269 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3, vb89ABCDEF); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() [all …]
|
D | 8x16-minmax-neonfp16arith-ld64.c | 94 float16x8_t vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() local 130 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() 156 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() 177 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc1, va3, 1); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() 203 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c1, vb89ABCDEFc1); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() 224 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc2, va3, 2); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() 250 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c2, vb89ABCDEFc2); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() 271 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc3, va3, 3); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() 297 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c3, vb89ABCDEFc3); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() 331 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3, vb89ABCDEF); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() [all …]
|
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 4x16s4inc-minmax-fma3-broadcast.c | 69 __m256 vacc3x89ABCDEF = _mm256_load_ps(acc + 56); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() local 94 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc0, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 111 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc1, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 128 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc2, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 145 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc3, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 173 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 187 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 197 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 201 _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 226 vacc3x01234567 = vacc3x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast()
|
D | 5x16s4inc-minmax-fma3-broadcast.c | 75 __m256 vacc3x89ABCDEF = _mm256_load_ps(acc + 56); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() local 105 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc0, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 125 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc1, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 145 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc2, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 165 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc3, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 197 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 213 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 225 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 233 _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 261 vacc3x01234567 = vacc3x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
|
D | 4x16inc-minmax-fma3-broadcast.c | 69 __m256 vacc3x89ABCDEF = _mm256_load_ps(acc + 56); in xnn_f32_gemminc_minmax_ukernel_4x16__fma3_broadcast() local 94 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16__fma3_broadcast() 107 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_4x16__fma3_broadcast() 117 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_4x16__fma3_broadcast() 121 _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16__fma3_broadcast() 146 vacc3x01234567 = vacc3x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_4x16__fma3_broadcast()
|
D | 4x16inc-minmax-avx-broadcast.c | 69 __m256 vacc3x89ABCDEF = _mm256_load_ps(acc + 56); in xnn_f32_gemminc_minmax_ukernel_4x16__avx_broadcast() local 94 vacc3x89ABCDEF = _mm256_add_ps(vacc3x89ABCDEF, _mm256_mul_ps(va3, vb89ABCDEF)); in xnn_f32_gemminc_minmax_ukernel_4x16__avx_broadcast() 107 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_4x16__avx_broadcast() 117 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_4x16__avx_broadcast() 121 _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16__avx_broadcast() 146 vacc3x01234567 = vacc3x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_4x16__avx_broadcast()
|
D | 5x16inc-minmax-avx-broadcast.c | 75 __m256 vacc3x89ABCDEF = _mm256_load_ps(acc + 56); in xnn_f32_gemminc_minmax_ukernel_5x16__avx_broadcast() local 105 vacc3x89ABCDEF = _mm256_add_ps(vacc3x89ABCDEF, _mm256_mul_ps(va3, vb89ABCDEF)); in xnn_f32_gemminc_minmax_ukernel_5x16__avx_broadcast() 120 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_5x16__avx_broadcast() 132 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_5x16__avx_broadcast() 140 _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16__avx_broadcast() 168 vacc3x01234567 = vacc3x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_5x16__avx_broadcast()
|
D | 5x16inc-minmax-fma3-broadcast.c | 75 __m256 vacc3x89ABCDEF = _mm256_load_ps(acc + 56); in xnn_f32_gemminc_minmax_ukernel_5x16__fma3_broadcast() local 105 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16__fma3_broadcast() 120 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_5x16__fma3_broadcast() 132 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_5x16__fma3_broadcast() 140 _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16__fma3_broadcast() 168 vacc3x01234567 = vacc3x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_5x16__fma3_broadcast()
|
/external/XNNPACK/src/f32-gemm/gen/ |
D | 4x16s4-minmax-fma3-broadcast.c | 67 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() local 92 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc0, vacc3x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 109 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc1, vacc3x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 126 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc2, vacc3x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 143 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc3, vacc3x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 171 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 185 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 195 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 199 _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 224 vacc3x01234567 = vacc3x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast()
|
D | 5x16s4-minmax-fma3-broadcast.c | 73 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() local 103 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc0, vacc3x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 123 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc1, vacc3x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 143 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc2, vacc3x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 163 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc3, vacc3x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 195 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 211 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 223 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 231 _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 259 vacc3x01234567 = vacc3x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
|
D | 4x16-minmax-avx-broadcast.c | 67 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast() local 92 vacc3x89ABCDEF = _mm256_add_ps(vacc3x89ABCDEF, _mm256_mul_ps(va3, vb89ABCDEF)); in xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast() 105 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast() 115 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast() 119 _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast() 144 vacc3x01234567 = vacc3x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast()
|
D | 4x16-minmax-fma3-broadcast.c | 67 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast() local 92 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast() 105 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast() 115 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast() 119 _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast() 144 vacc3x01234567 = vacc3x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast()
|
/external/XNNPACK/src/f32-igemm/gen/ |
D | 4x16s4-minmax-fma3-broadcast.c | 65 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() local 114 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc0, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 131 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc1, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 148 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc2, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 165 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc3, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 193 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 209 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 219 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 223 _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 244 vacc3x01234567 = vacc3x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast()
|
D | 5x16s4-minmax-fma3-broadcast.c | 69 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() local 128 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc0, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 148 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc1, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 168 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc2, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 188 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc3, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 220 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 238 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 250 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 258 _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 281 vacc3x01234567 = vacc3x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
|
D | 4x16-minmax-avx-broadcast.c | 65 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_4x16__avx_broadcast() local 114 vacc3x89ABCDEF = _mm256_add_ps(vacc3x89ABCDEF, _mm256_mul_ps(va3, vb89ABCDEF)); in xnn_f32_igemm_minmax_ukernel_4x16__avx_broadcast() 128 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax); in xnn_f32_igemm_minmax_ukernel_4x16__avx_broadcast() 138 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin); in xnn_f32_igemm_minmax_ukernel_4x16__avx_broadcast() 142 _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16__avx_broadcast() 163 vacc3x01234567 = vacc3x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_4x16__avx_broadcast()
|
D | 4x16-minmax-fma3-broadcast.c | 65 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_4x16__fma3_broadcast() local 114 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16__fma3_broadcast() 128 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax); in xnn_f32_igemm_minmax_ukernel_4x16__fma3_broadcast() 138 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin); in xnn_f32_igemm_minmax_ukernel_4x16__fma3_broadcast() 142 _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16__fma3_broadcast() 163 vacc3x01234567 = vacc3x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_4x16__fma3_broadcast()
|
D | 5x16-minmax-avx-broadcast.c | 69 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast() local 127 vacc3x89ABCDEF = _mm256_add_ps(vacc3x89ABCDEF, _mm256_mul_ps(va3, vb89ABCDEF)); in xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast() 144 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax); in xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast() 156 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin); in xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast() 164 _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast() 187 vacc3x01234567 = vacc3x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast()
|
D | 5x16-minmax-fma3-broadcast.c | 69 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast() local 127 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEF, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast() 144 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax); in xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast() 156 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin); in xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast() 164 _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast() 187 vacc3x01234567 = vacc3x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast()
|