/external/XNNPACK/src/f32-dwconv/gen/ |
D | up16x4-fma3.c | 56 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x4__fma3() local 58 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up16x4__fma3() 106 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x4__fma3() local 107 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up16x4__fma3() 144 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x4__fma3() local 145 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up16x4__fma3()
|
D | up16x4-avx-acc2.c | 56 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x4__avx_acc2() local 58 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up16x4__avx_acc2() 109 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x4__avx_acc2() local 110 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up16x4__avx_acc2() 149 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x4__avx_acc2() local 150 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up16x4__avx_acc2()
|
D | up16x4-avx.c | 56 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x4__avx() local 58 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up16x4__avx() 106 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x4__avx() local 107 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up16x4__avx() 144 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x4__avx() local 145 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up16x4__avx()
|
D | up16x4-fma3-acc2.c | 56 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x4__fma3_acc2() local 58 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up16x4__fma3_acc2() 109 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x4__fma3_acc2() local 110 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up16x4__fma3_acc2() 149 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x4__fma3_acc2() local 150 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up16x4__fma3_acc2()
|
D | up8x4-fma3.c | 54 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x4__fma3() local 55 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up8x4__fma3() 92 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x4__fma3() local 93 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up8x4__fma3()
|
D | up8x4-avx.c | 54 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x4__avx() local 55 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up8x4__avx() 92 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x4__avx() local 93 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up8x4__avx()
|
D | up8x4-fma3-acc2.c | 54 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x4__fma3_acc2() local 55 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up8x4__fma3_acc2() 94 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x4__fma3_acc2() local 95 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up8x4__fma3_acc2()
|
D | up8x4-avx-acc2.c | 54 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x4__avx_acc2() local 55 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up8x4__avx_acc2() 94 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x4__avx_acc2() local 95 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up8x4__avx_acc2()
|
D | up16x9-fma3-acc2.c | 66 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x9__fma3_acc2() local 68 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up16x9__fma3_acc2() 164 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x9__fma3_acc2() local 165 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up16x9__fma3_acc2() 234 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x9__fma3_acc2() local 235 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up16x9__fma3_acc2()
|
D | up16x9-fma3.c | 66 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x9__fma3() local 68 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up16x9__fma3() 161 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x9__fma3() local 162 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up16x9__fma3() 229 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x9__fma3() local 230 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up16x9__fma3()
|
D | up16x9-avx-acc2.c | 66 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x9__avx_acc2() local 68 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up16x9__avx_acc2() 164 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x9__avx_acc2() local 165 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up16x9__avx_acc2() 234 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x9__avx_acc2() local 235 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up16x9__avx_acc2()
|
D | up16x9-avx.c | 66 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x9__avx() local 68 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up16x9__avx() 161 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x9__avx() local 162 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up16x9__avx() 229 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x9__avx() local 230 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up16x9__avx()
|
D | up8x9-fma3-acc2.c | 64 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x9__fma3_acc2() local 65 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up8x9__fma3_acc2() 134 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x9__fma3_acc2() local 135 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up8x9__fma3_acc2()
|
D | up8x9-fma3.c | 64 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x9__fma3() local 65 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up8x9__fma3() 132 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x9__fma3() local 133 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up8x9__fma3()
|
D | up8x9-avx.c | 64 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x9__avx() local 65 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up8x9__avx() 132 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x9__avx() local 133 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up8x9__avx()
|
D | up8x9-avx-acc2.c | 64 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x9__avx_acc2() local 65 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up8x9__avx_acc2() 134 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x9__avx_acc2() local 135 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up8x9__avx_acc2()
|
D | up8x25-fma3-acc2.c | 96 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x25__fma3_acc2() local 97 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up8x25__fma3_acc2() 262 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x25__fma3_acc2() local 263 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up8x25__fma3_acc2()
|
D | up8x25-fma3.c | 96 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x25__fma3() local 97 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up8x25__fma3() 260 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x25__fma3() local 261 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up8x25__fma3()
|
D | up16x25-fma3.c | 98 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x25__fma3() local 100 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up16x25__fma3() 337 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x25__fma3() local 338 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up16x25__fma3() 501 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x25__fma3() local 502 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up16x25__fma3()
|
D | up16x25-fma3-acc2.c | 98 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x25__fma3_acc2() local 100 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up16x25__fma3_acc2() 340 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x25__fma3_acc2() local 341 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up16x25__fma3_acc2() 506 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x25__fma3_acc2() local 507 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_ukernel_up16x25__fma3_acc2()
|
D | up8x25-avx.c | 96 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x25__avx() local 97 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up8x25__avx() 260 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x25__avx() local 261 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up8x25__avx()
|
D | up8x25-avx-acc2.c | 96 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x25__avx_acc2() local 97 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up8x25__avx_acc2() 262 const __m256 vk0x01234567 = _mm256_load_ps(w + 8); in xnn_f32_dwconv_ukernel_up8x25__avx_acc2() local 263 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up8x25__avx_acc2()
|
D | up16x25-avx-acc2.c | 98 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x25__avx_acc2() local 100 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up16x25__avx_acc2() 340 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x25__avx_acc2() local 341 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up16x25__avx_acc2() 506 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x25__avx_acc2() local 507 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up16x25__avx_acc2()
|
D | up16x25-avx.c | 98 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x25__avx() local 100 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up16x25__avx() 337 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x25__avx() local 338 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up16x25__avx() 501 const __m256 vk0x01234567 = _mm256_load_ps(w + 16); in xnn_f32_dwconv_ukernel_up16x25__avx() local 502 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_ukernel_up16x25__avx()
|