/external/XNNPACK/src/f32-raddstoreexpminusmax/gen/
D | avx512f-rr1-p5-scalef-x192-acc2.c | in xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x192_acc2():
      30  const __m512 vminus_ln2 = _mm512_set1_ps(params->avx512_rr1_p5.minus_ln2);  (local)
      81  const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0);
      82  const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1);
      83  const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2);
      84  const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3);
      85  const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4);
      86  const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5);
      87  const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6);
      88  const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7);
      89  const __m512 vt8 = _mm512_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

D | avx512f-rr1-p5-scalef-x192.c | in xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x192():
      30  const __m512 vminus_ln2 = _mm512_set1_ps(params->avx512_rr1_p5.minus_ln2);  (local)
      80  const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0);
      81  const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1);
      82  const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2);
      83  const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3);
      84  const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4);
      85  const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5);
      86  const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6);
      87  const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7);
      88  const __m512 vt8 = _mm512_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

D | avx512f-rr1-p5-scalef-x192-acc3.c | in xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x192_acc3():
      30  const __m512 vminus_ln2 = _mm512_set1_ps(params->avx512_rr1_p5.minus_ln2);  (local)
      82  const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0);
      83  const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1);
      84  const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2);
      85  const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3);
      86  const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4);
      87  const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5);
      88  const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6);
      89  const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7);
      90  const __m512 vt8 = _mm512_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

D | avx512f-rr1-p5-scalef-x192-acc6.c | in xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x192_acc6():
      30  const __m512 vminus_ln2 = _mm512_set1_ps(params->avx512_rr1_p5.minus_ln2);  (local)
      85  const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0);
      86  const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1);
      87  const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2);
      88  const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3);
      89  const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4);
      90  const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5);
      91  const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6);
      92  const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7);
      93  const __m512 vt8 = _mm512_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

D | avx512f-rr1-p5-scalef-x160-acc2.c | in xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x160_acc2():
      30  const __m512 vminus_ln2 = _mm512_set1_ps(params->avx512_rr1_p5.minus_ln2);  (local)
      75  const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0);
      76  const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1);
      77  const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2);
      78  const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3);
      79  const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4);
      80  const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5);
      81  const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6);
      82  const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7);
      83  const __m512 vt8 = _mm512_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

D | avx512f-rr1-p5-scalef-x160.c | in xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x160():
      30  const __m512 vminus_ln2 = _mm512_set1_ps(params->avx512_rr1_p5.minus_ln2);  (local)
      74  const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0);
      75  const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1);
      76  const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2);
      77  const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3);
      78  const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4);
      79  const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5);
      80  const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6);
      81  const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7);
      82  const __m512 vt8 = _mm512_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

D | avx512f-rr1-p5-scalef-x160-acc5.c | in xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x160_acc5():
      30  const __m512 vminus_ln2 = _mm512_set1_ps(params->avx512_rr1_p5.minus_ln2);  (local)
      78  const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0);
      79  const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1);
      80  const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2);
      81  const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3);
      82  const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4);
      83  const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5);
      84  const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6);
      85  const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7);
      86  const __m512 vt8 = _mm512_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

D | avx512f-rr1-p5-scalef-x144.c | in xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x144():
      30  const __m512 vminus_ln2 = _mm512_set1_ps(params->avx512_rr1_p5.minus_ln2);  (local)
      71  const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0);
      72  const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1);
      73  const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2);
      74  const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3);
      75  const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4);
      76  const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5);
      77  const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6);
      78  const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7);
      79  const __m512 vt8 = _mm512_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

D | avx512f-rr1-p5-scalef-x144-acc3.c | in xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x144_acc3():
      30  const __m512 vminus_ln2 = _mm512_set1_ps(params->avx512_rr1_p5.minus_ln2);  (local)
      73  const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0);
      74  const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1);
      75  const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2);
      76  const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3);
      77  const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4);
      78  const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5);
      79  const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6);
      80  const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7);
      81  const __m512 vt8 = _mm512_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

D | avx512f-rr1-p5-scalef-x128-acc4.c | in xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x128_acc4():
      30  const __m512 vminus_ln2 = _mm512_set1_ps(params->avx512_rr1_p5.minus_ln2);  (local)
      71  const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0);
      72  const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1);
      73  const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2);
      74  const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3);
      75  const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4);
      76  const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5);
      77  const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6);
      78  const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7);
     166  const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx);
      [all …]

D | avx512f-rr1-p5-scalef-x128-acc2.c | in xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x128_acc2():
      30  const __m512 vminus_ln2 = _mm512_set1_ps(params->avx512_rr1_p5.minus_ln2);  (local)
      69  const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0);
      70  const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1);
      71  const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2);
      72  const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3);
      73  const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4);
      74  const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5);
      75  const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6);
      76  const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7);
     162  const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx);
      [all …]

D | avx512f-rr1-p5-scalef-x128.c | in xnn_f32_raddstoreexpminusmax_ukernel__avx512f_rr1_p5_scalef_x128():
      30  const __m512 vminus_ln2 = _mm512_set1_ps(params->avx512_rr1_p5.minus_ln2);  (local)
      68  const __m512 vt0 = _mm512_fmadd_ps(vn0, vminus_ln2, vx0);
      69  const __m512 vt1 = _mm512_fmadd_ps(vn1, vminus_ln2, vx1);
      70  const __m512 vt2 = _mm512_fmadd_ps(vn2, vminus_ln2, vx2);
      71  const __m512 vt3 = _mm512_fmadd_ps(vn3, vminus_ln2, vx3);
      72  const __m512 vt4 = _mm512_fmadd_ps(vn4, vminus_ln2, vx4);
      73  const __m512 vt5 = _mm512_fmadd_ps(vn5, vminus_ln2, vx5);
      74  const __m512 vt6 = _mm512_fmadd_ps(vn6, vminus_ln2, vx6);
      75  const __m512 vt7 = _mm512_fmadd_ps(vn7, vminus_ln2, vx7);
     160  const __m512 vt = _mm512_fmadd_ps(vn, vminus_ln2, vx);
      [all …]

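Every AVX-512 match above is the same step of one algorithm: the kernels compute exp(x - max) by writing the argument as x = n*ln2 + t, and the matched _mm512_fmadd_ps(vn, vminus_ln2, vx) is the single fused multiply-add that recovers t (the "rr1" in the file names: range reduction with one constant). A degree-5 polynomial ("p5") then approximates exp(t), and _mm512_scalef_ps applies the 2^n factor ("scalef"). Below is a minimal scalar sketch of one lane; it is a model under stated assumptions, not the XNNPACK source, and the plain Taylor coefficients stand in for the kernels' tuned minimax values.

#include <math.h>

/* Scalar model of one lane of the avx512f rr1_p5 "scalef" kernels listed above. */
static float exp_minus_max_lane(float x, float max) {
  const float vx = x - max;                       /* argument is never positive */
  const float vn = rintf(vx * 0x1.715476p+0f);    /* n = round(vx * log2(e))   */
  const float vt = fmaf(vn, -0x1.62E43p-1f, vx);  /* t = n*(-ln2) + vx: the matched line */
  float vp = 1.0f / 120.0f;                       /* Horner, degree-5 approximation of exp(t) */
  vp = fmaf(vp, vt, 1.0f / 24.0f);
  vp = fmaf(vp, vt, 1.0f / 6.0f);
  vp = fmaf(vp, vt, 0.5f);
  vp = fmaf(vp, vt, 1.0f);
  vp = fmaf(vp, vt, 1.0f);                        /* p(t) ~= exp(t) for |t| <= ln2/2 */
  return ldexpf(vp, (int) vn);                    /* _mm512_scalef_ps(p, n) == p * 2^n */
}

The xN in the file names is the number of floats handled per main-loop iteration (x128 = eight __m512 vectors, hence vt0..vt7 above, plus the single-vector remainder match near line 160), and accN is the number of independent vector accumulators the running sum of exponentials is split across.
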
D | avx2-rr1-p5-x96-acc2.c | in xnn_f32_raddstoreexpminusmax_ukernel__avx2_rr1_p5_x96_acc2():
      30  const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p5.minus_ln2);  (local)
     107  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0);
     108  __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1);
     109  __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2);
     110  __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3);
     111  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4);
     112  __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5);
     113  __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6);
     114  __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7);
     115  __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

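The lone AVX2 file in this group implements the same rr1_p5 scheme but has no scalef instruction, and its constants arrive pre-broadcast from params via _mm256_load_ps rather than _mm512_set1_ps. A common AVX2 reconstruction of the 2^n factor is the magic-bias trick sketched below. This is a hedged illustration, not the shipped kernel: real kernels typically fold the exponent bias into the magic constant and evaluate the full degree-5 polynomial, where this sketch uses a degree-2 stand-in to stay short.

#include <immintrin.h>

/* 2^n reconstruction without AVX-512 scalef; assumes AVX2 + FMA and a
   range-limited input. */
static __m256 exp_approx_avx2(__m256 vx) {
  const __m256 vlog2e      = _mm256_set1_ps(0x1.715476p+0f);
  const __m256 vmagic_bias = _mm256_set1_ps(0x1.8p23f);
  const __m256 vminus_ln2  = _mm256_set1_ps(-0x1.62E43p-1f);

  /* n = round(vx * log2(e)); the magic bias parks n in vn's low mantissa bits */
  __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias);
  /* bits(2^n) = bits(1.0f) + (n << 23), valid for -126 <= n <= 127 */
  const __m256 vs = _mm256_castsi256_ps(_mm256_add_epi32(
      _mm256_slli_epi32(_mm256_castps_si256(vn), 23),
      _mm256_set1_epi32(0x3F800000)));
  vn = _mm256_sub_ps(vn, vmagic_bias);
  /* t = n*(-ln2) + vx: the line matched throughout this listing */
  const __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx);

  /* Degree-2 stand-in: exp(t) ~= 1 + t*q(t) with q(t) = 1 + t/2;
     the listed kernels evaluate their degree-5 polynomial here instead. */
  const __m256 vq  = _mm256_fmadd_ps(vt, _mm256_set1_ps(0.5f), _mm256_set1_ps(1.0f));
  const __m256 vts = _mm256_mul_ps(vt, vs);
  return _mm256_fmadd_ps(vts, vq, vs);  /* vs*(1 + t*q(t)) == 2^n * exp(t) */
}
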
/external/XNNPACK/src/f16-raddstoreexpminusmax/gen/
D | avx2-rr1-p2-x96-acc6.c | in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc6():
      31  const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p2.minus_ln2);  (local)
     111  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0);
     112  __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1);
     113  __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2);
     114  __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3);
     115  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4);
     116  __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5);
     117  __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6);
     118  __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7);
     119  __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

D | avx2-rr1-p2-x96-acc3.c | in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc3():
      31  const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p2.minus_ln2);  (local)
     108  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0);
     109  __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1);
     110  __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2);
     111  __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3);
     112  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4);
     113  __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5);
     114  __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6);
     115  __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7);
     116  __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

D | avx2-rr1-p2-x96-acc2.c | in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96_acc2():
      31  const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p2.minus_ln2);  (local)
     107  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0);
     108  __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1);
     109  __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2);
     110  __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3);
     111  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4);
     112  __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5);
     113  __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6);
     114  __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7);
     115  __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

D | avx2-rr1-p2-x96.c | in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x96():
      31  const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p2.minus_ln2);  (local)
     106  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0);
     107  __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1);
     108  __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2);
     109  __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3);
     110  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4);
     111  __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5);
     112  __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6);
     113  __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7);
     114  __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

D | avx2-rr1-p2-x80.c | in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x80():
      31  const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p2.minus_ln2);  (local)
      96  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0);
      97  __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1);
      98  __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2);
      99  __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3);
     100  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4);
     101  __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5);
     102  __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6);
     103  __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7);
     104  __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

D | avx2-rr1-p2-x80-acc2.c | in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x80_acc2():
      31  const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p2.minus_ln2);  (local)
      97  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0);
      98  __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1);
      99  __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2);
     100  __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3);
     101  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4);
     102  __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5);
     103  __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6);
     104  __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7);
     105  __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

D | avx2-rr1-p2-x80-acc5.c | in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x80_acc5():
      31  const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p2.minus_ln2);  (local)
     100  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0);
     101  __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1);
     102  __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2);
     103  __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3);
     104  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4);
     105  __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5);
     106  __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6);
     107  __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7);
     108  __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

D | avx2-rr1-p2-x72.c | in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x72():
      31  const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p2.minus_ln2);  (local)
      91  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0);
      92  __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1);
      93  __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2);
      94  __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3);
      95  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4);
      96  __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5);
      97  __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6);
      98  __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7);
      99  __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

D | avx2-rr1-p2-x72-acc3.c | in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x72_acc3():
      31  const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p2.minus_ln2);  (local)
      93  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0);
      94  __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1);
      95  __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2);
      96  __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3);
      97  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4);
      98  __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5);
      99  __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6);
     100  __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7);
     101  __m256 vt8 = _mm256_fmadd_ps(vn8, vminus_ln2, vx8);
      [all …]

D | avx2-rr1-p2-x64-acc2.c | in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x64_acc2():
      31  const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p2.minus_ln2);  (local)
      87  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0);
      88  __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1);
      89  __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2);
      90  __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3);
      91  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4);
      92  __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5);
      93  __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6);
      94  __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7);
     166  __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx);
      [all …]

D | avx2-rr1-p2-x64-acc4.c | in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x64_acc4():
      31  const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p2.minus_ln2);  (local)
      89  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0);
      90  __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1);
      91  __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2);
      92  __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3);
      93  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4);
      94  __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5);
      95  __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6);
      96  __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7);
     170  __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx);
      [all …]

D | avx2-rr1-p2-x64.c | in xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x64():
      31  const __m256 vminus_ln2 = _mm256_load_ps(params->avx2_rr1_p2.minus_ln2);  (local)
      86  __m256 vt0 = _mm256_fmadd_ps(vn0, vminus_ln2, vx0);
      87  __m256 vt1 = _mm256_fmadd_ps(vn1, vminus_ln2, vx1);
      88  __m256 vt2 = _mm256_fmadd_ps(vn2, vminus_ln2, vx2);
      89  __m256 vt3 = _mm256_fmadd_ps(vn3, vminus_ln2, vx3);
      90  __m256 vt4 = _mm256_fmadd_ps(vn4, vminus_ln2, vx4);
      91  __m256 vt5 = _mm256_fmadd_ps(vn5, vminus_ln2, vx5);
      92  __m256 vt6 = _mm256_fmadd_ps(vn6, vminus_ln2, vx6);
      93  __m256 vt7 = _mm256_fmadd_ps(vn7, vminus_ln2, vx7);
     164  __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx);
      [all …]

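The f16 kernels mirror the f32 AVX2 structure with two differences visible in the listing: the constants live in params->avx2_rr1_p2, and the polynomial is only degree 2 ("p2"), which suffices because half precision carries about 11 significand bits. The arithmetic still runs in f32, with inputs and outputs crossing through F16C conversions. Below is a hedged sketch of one 8-element block; the coefficients are plain Taylor values standing in for the kernels' minimax constants, and the store/accumulate bookkeeping of the real raddstoreexpminusmax kernels is omitted.

#include <immintrin.h>
#include <stdint.h>

/* One 8-element f16 exp(x - max) block: a sketch assuming F16C + FMA. */
static __m128i f16_exp_block(const uint16_t* input, __m256 vmax) {
  const __m256 vlog2e      = _mm256_set1_ps(0x1.715476p+0f);
  const __m256 vmagic_bias = _mm256_set1_ps(0x1.8p23f);
  const __m256 vminus_ln2  = _mm256_set1_ps(-0x1.62E43p-1f);

  /* Upconvert 8 halves to f32 and subtract the running maximum */
  const __m256 vi = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) input));
  const __m256 vx = _mm256_sub_ps(vi, vmax);

  __m256 vn = _mm256_fmadd_ps(vx, vlog2e, vmagic_bias);
  const __m256 vs = _mm256_castsi256_ps(_mm256_add_epi32(
      _mm256_slli_epi32(_mm256_castps_si256(vn), 23),
      _mm256_set1_epi32(0x3F800000)));              /* vs = 2^n */
  vn = _mm256_sub_ps(vn, vmagic_bias);
  __m256 vt = _mm256_fmadd_ps(vn, vminus_ln2, vx);  /* the matched line */

  /* p2: exp(t) ~= 1 + t*(1 + t/2), reconstructed as vs + (t*vs)*q(t) */
  const __m256 vp = _mm256_fmadd_ps(_mm256_set1_ps(0.5f), vt, _mm256_set1_ps(1.0f));
  vt = _mm256_mul_ps(vt, vs);
  const __m256 vf = _mm256_fmadd_ps(vt, vp, vs);

  /* Downconvert back to f16, round to nearest even */
  return _mm256_cvtps_ph(vf, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

In the full kernels each such vf block is both stored to the output and added into one of the accN running sums, which are combined into a single scalar total at the end of the ukernel.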