/external/XNNPACK/src/f32-igemm/gen/ |
D | 5x16s4-minmax-fma3-broadcast.c | 118 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() local 125 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 126 vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc0, vacc1x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 127 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 128 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc0, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 129 vacc4x89ABCDEF = _mm256_fmadd_ps(va4, vb89ABCDEFc0, vacc4x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 210 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() local 217 … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 218 … = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 219 … = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() [all …]
|
D | 4x16s4-minmax-fma3-broadcast.c | 105 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() local 111 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 112 vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc0, vacc1x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 113 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 114 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc0, vacc3x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 184 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() local 190 … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 191 … = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 192 … = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 193 … = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast()
|
D | 3x16s4-minmax-fma3-broadcast.c | 92 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() local 97 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 98 vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc0, vacc1x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 99 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 158 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() local 163 … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 164 … = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 165 … = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast()
|
D | 1x16s4-minmax-fma3-broadcast.c | 66 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() local 69 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() 106 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() local 109 … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast()
|
/external/XNNPACK/src/f32-gemm/gen/ |
D | 5x16s4-minmax-fma3-broadcast.c | 93 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() local 100 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 101 vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc0, vacc1x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 102 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 103 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc0, vacc3x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 104 vacc4x89ABCDEF = _mm256_fmadd_ps(va4, vb89ABCDEFc0, vacc4x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 185 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() local 192 … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 193 … = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 194 … = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() [all …]
|
D | 4x16s4-minmax-fma3-broadcast.c | 83 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() local 89 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 90 vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc0, vacc1x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 91 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 92 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc0, vacc3x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 162 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() local 168 … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 169 … = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 170 … = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 171 … = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast()
|
D | 3x16s4-minmax-fma3-broadcast.c | 73 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() local 78 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 79 vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc0, vacc1x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 80 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 139 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() local 144 … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 145 … = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 146 … = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast()
|
D | 1x16s4-minmax-fma3-broadcast.c | 53 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() local 56 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 93 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() local 96 … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast()
|
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 5x16s4inc-minmax-fma3-broadcast.c | 95 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() local 102 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 103 vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc0, vacc1x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 104 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 105 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc0, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 106 vacc4x89ABCDEF = _mm256_fmadd_ps(va4, vb89ABCDEFc0, vacc4x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 187 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() local 194 … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 195 … = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 196 … = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() [all …]
|
D | 4x16s4inc-minmax-fma3-broadcast.c | 85 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() local 91 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 92 vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc0, vacc1x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 93 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 94 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc0, vacc3x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 164 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() local 170 … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 171 … = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 172 … = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 173 … = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast()
|
D | 3x16s4inc-minmax-fma3-broadcast.c | 75 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() local 80 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 81 vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc0, vacc1x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 82 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 141 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() local 146 … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 147 … = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 148 … = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast()
|
D | 1x16s4inc-minmax-fma3-broadcast.c | 55 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() local 58 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() 95 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() local 98 … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast()
|
/external/XNNPACK/src/f16-gemm/gen/ |
D | 6x16-minmax-neonfp16arith-ld64.c | 98 …const float16x8_t vb89ABCDEFc0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() local 107 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 108 vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc0, va1, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 109 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc0, va2, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 110 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 111 vacc4x89ABCDEF = vfmaq_lane_f16(vacc4x89ABCDEF, vb89ABCDEFc0, va4, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 112 vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc0, va5, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 127 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 128 vacc1x89ABCDEF = vfmaq_f16(vacc1x89ABCDEF, va1c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 129 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() [all …]
|
D | 8x16-minmax-neonfp16arith-ld64.c | 116 …const float16x8_t vb89ABCDEFc0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() local 127 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() 128 vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc0, va1, 0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() 129 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc0, va2, 0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() 130 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() 131 vacc4x89ABCDEF = vfmaq_lane_f16(vacc4x89ABCDEF, vb89ABCDEFc0, va4, 0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() 132 vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc0, va5, 0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() 133 vacc6x89ABCDEF = vfmaq_lane_f16(vacc6x89ABCDEF, vb89ABCDEFc0, va6, 0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() 134 vacc7x89ABCDEF = vfmaq_lane_f16(vacc7x89ABCDEF, vb89ABCDEFc0, va7, 0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() 153 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() [all …]
|
D | 4x16-minmax-neonfp16arith-ld64.c | 80 …const float16x8_t vb89ABCDEFc0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() local 87 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 88 vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc0, va1, 0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 89 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc0, va2, 0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 90 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 101 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 102 vacc1x89ABCDEF = vfmaq_f16(vacc1x89ABCDEF, va1c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 103 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 104 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64()
|
D | 1x16-minmax-neonfp16arith-ld64.c | 53 …const float16x8_t vb89ABCDEFc0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() local 57 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() 62 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64()
|
/external/XNNPACK/src/f16-gemm/gen-inc/ |
D | 8x16inc-minmax-neonfp16arith-ld64.c | 118 …const float16x8_t vb89ABCDEFc0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() local 129 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() 130 vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc0, va1, 0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() 131 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc0, va2, 0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() 132 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() 133 vacc4x89ABCDEF = vfmaq_lane_f16(vacc4x89ABCDEF, vb89ABCDEFc0, va4, 0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() 134 vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc0, va5, 0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() 135 vacc6x89ABCDEF = vfmaq_lane_f16(vacc6x89ABCDEF, vb89ABCDEFc0, va6, 0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() 136 vacc7x89ABCDEF = vfmaq_lane_f16(vacc7x89ABCDEF, vb89ABCDEFc0, va7, 0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() 155 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64() [all …]
|
D | 6x16inc-minmax-neonfp16arith-ld64.c | 100 …const float16x8_t vb89ABCDEFc0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() local 109 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 110 vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc0, va1, 0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 111 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc0, va2, 0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 112 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 113 vacc4x89ABCDEF = vfmaq_lane_f16(vacc4x89ABCDEF, vb89ABCDEFc0, va4, 0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 114 vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc0, va5, 0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 129 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 130 vacc1x89ABCDEF = vfmaq_f16(vacc1x89ABCDEF, va1c0, vb89ABCDEFc0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 131 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c0, vb89ABCDEFc0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() [all …]
|
D | 4x16inc-minmax-neonfp16arith-ld64.c | 82 …const float16x8_t vb89ABCDEFc0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() local 89 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 90 vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc0, va1, 0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 91 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc0, va2, 0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 92 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 103 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 104 vacc1x89ABCDEF = vfmaq_f16(vacc1x89ABCDEF, va1c0, vb89ABCDEFc0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 105 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c0, vb89ABCDEFc0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 106 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c0, vb89ABCDEFc0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64()
|
D | 1x16inc-minmax-neonfp16arith-ld64.c | 55 …const float16x8_t vb89ABCDEFc0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() local 59 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64() 64 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_gemminc_minmax_ukernel_1x16__neonfp16arith_ld64()
|
/external/XNNPACK/src/f16-igemm/gen/ |
D | 8x16-minmax-neonfp16arith-ld64.c | 148 …const float16x8_t vb89ABCDEFc0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() local 159 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() 160 vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc0, va1, 0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() 161 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc0, va2, 0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() 162 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() 163 vacc4x89ABCDEF = vfmaq_lane_f16(vacc4x89ABCDEF, vb89ABCDEFc0, va4, 0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() 164 vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc0, va5, 0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() 165 vacc6x89ABCDEF = vfmaq_lane_f16(vacc6x89ABCDEF, vb89ABCDEFc0, va6, 0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() 166 vacc7x89ABCDEF = vfmaq_lane_f16(vacc7x89ABCDEF, vb89ABCDEFc0, va7, 0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() 185 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() [all …]
|
D | 6x16-minmax-neonfp16arith-ld64.c | 124 …const float16x8_t vb89ABCDEFc0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() local 133 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 134 vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc0, va1, 0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 135 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc0, va2, 0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 136 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 137 vacc4x89ABCDEF = vfmaq_lane_f16(vacc4x89ABCDEF, vb89ABCDEFc0, va4, 0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 138 vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc0, va5, 0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 153 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 154 vacc1x89ABCDEF = vfmaq_f16(vacc1x89ABCDEF, va1c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 155 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() [all …]
|
D | 4x16-minmax-neonfp16arith-ld64.c | 100 …const float16x8_t vb89ABCDEFc0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() local 107 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 108 vacc1x89ABCDEF = vfmaq_lane_f16(vacc1x89ABCDEF, vb89ABCDEFc0, va1, 0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 109 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc0, va2, 0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 110 vacc3x89ABCDEF = vfmaq_lane_f16(vacc3x89ABCDEF, vb89ABCDEFc0, va3, 0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 121 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 122 vacc1x89ABCDEF = vfmaq_f16(vacc1x89ABCDEF, va1c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 123 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 124 vacc3x89ABCDEF = vfmaq_f16(vacc3x89ABCDEF, va3c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64()
|
D | 1x16-minmax-neonfp16arith-ld64.c | 64 …const float16x8_t vb89ABCDEFc0 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8… in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() local 68 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64() 73 vacc0x89ABCDEF = vfmaq_f16(vacc0x89ABCDEF, va0c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64()
|
/external/XNNPACK/src/amalgam/ |
D | fma3.c | 2302 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() local 2305 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 2342 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() local 2345 … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb89ABCDEFc0, vzero, _CMP_NEQ_OQ)), vb89ABCDEF… in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() 2489 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() local 2495 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 2496 vacc1x89ABCDEF = _mm256_fmadd_ps(va1, vb89ABCDEFc0, vacc1x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 2497 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 2498 vacc3x89ABCDEF = _mm256_fmadd_ps(va3, vb89ABCDEFc0, vacc3x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 2568 const __m256 vb89ABCDEFc0 = _mm256_load_ps(w + 8); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() local [all …]
|