/external/XNNPACK/src/f16-igemm/gen/ |
D | 4x16-minmax-neonfp16arith-ld64.c | 64 float16x8_t vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() local 109 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc0, va2, 0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 123 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 136 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc1, va2, 1); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 150 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c1, vb89ABCDEFc1); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 163 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc2, va2, 2); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 177 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c2, vb89ABCDEFc2); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 190 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc3, va2, 3); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 204 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c3, vb89ABCDEFc3); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() 224 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2, vb89ABCDEF); in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64() [all …]
|
D | 6x16-minmax-neonfp16arith-ld64.c | 72 float16x8_t vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() local 135 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc0, va2, 0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 155 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c0, vb89ABCDEFc0); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 172 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc1, va2, 1); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 192 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c1, vb89ABCDEFc1); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 209 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc2, va2, 2); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 229 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c2, vb89ABCDEFc2); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 246 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc3, va2, 3); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 266 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c3, vb89ABCDEFc3); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() 292 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2, vb89ABCDEF); in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() [all …]
|
/external/XNNPACK/src/f16-gemm/gen-inc/ |
D | 4x16inc-minmax-neonfp16arith-ld64.c | 70 …float16x8_t vacc2x89ABCDEF = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16… in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() local 91 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc0, va2, 0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 105 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c0, vb89ABCDEFc0); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 118 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc1, va2, 1); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 132 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c1, vb89ABCDEFc1); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 145 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc2, va2, 2); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 159 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c2, vb89ABCDEFc2); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 172 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc3, va2, 3); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 186 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c3, vb89ABCDEFc3); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() 208 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2, vb89ABCDEF); in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64() [all …]
|
D | 6x16inc-minmax-neonfp16arith-ld64.c | 82 …float16x8_t vacc2x89ABCDEF = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16… in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() local 111 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc0, va2, 0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 131 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c0, vb89ABCDEFc0); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 148 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc1, va2, 1); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 168 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c1, vb89ABCDEFc1); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 185 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc2, va2, 2); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 205 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c2, vb89ABCDEFc2); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 222 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc3, va2, 3); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 242 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c3, vb89ABCDEFc3); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() 270 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2, vb89ABCDEF); in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() [all …]
|
/external/XNNPACK/src/f16-gemm/gen/ |
D | 4x16-minmax-neonfp16arith-ld64.c | 68 float16x8_t vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() local 89 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc0, va2, 0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 103 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 116 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc1, va2, 1); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 130 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c1, vb89ABCDEFc1); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 143 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc2, va2, 2); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 157 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c2, vb89ABCDEFc2); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 170 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc3, va2, 3); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 184 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c3, vb89ABCDEFc3); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() 206 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2, vb89ABCDEF); in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64() [all …]
|
D | 6x16-minmax-neonfp16arith-ld64.c | 80 float16x8_t vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() local 109 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc0, va2, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 129 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c0, vb89ABCDEFc0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 146 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc1, va2, 1); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 166 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c1, vb89ABCDEFc1); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 183 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc2, va2, 2); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 203 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c2, vb89ABCDEFc2); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 220 vacc2x89ABCDEF = vfmaq_lane_f16(vacc2x89ABCDEF, vb89ABCDEFc3, va2, 3); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 240 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2c3, vb89ABCDEFc3); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 268 vacc2x89ABCDEF = vfmaq_f16(vacc2x89ABCDEF, va2, vb89ABCDEF); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() [all …]
|
/external/XNNPACK/src/f32-igemm/gen/ |
D | 3x16s4-minmax-fma3-broadcast.c | 59 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() local 99 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 113 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 127 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 141 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 165 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 179 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 187 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 191 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() 208 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast()
|
D | 4x16s4-minmax-fma3-broadcast.c | 63 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() local 113 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 130 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 147 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 164 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 192 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 208 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 218 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 226 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() 245 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast()
|
D | 5x16s4-minmax-fma3-broadcast.c | 67 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() local 127 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 147 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 167 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 187 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 219 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 237 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 249 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 261 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() 282 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
|
D | 3x16-minmax-fma3-broadcast.c | 59 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_3x16__fma3_broadcast() local 99 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16__fma3_broadcast() 111 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_igemm_minmax_ukernel_3x16__fma3_broadcast() 119 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_igemm_minmax_ukernel_3x16__fma3_broadcast() 123 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16__fma3_broadcast() 140 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_3x16__fma3_broadcast()
|
D | 3x16-minmax-avx-broadcast.c | 59 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_3x16__avx_broadcast() local 99 vacc2x89ABCDEF = _mm256_add_ps(vacc2x89ABCDEF, _mm256_mul_ps(va2, vb89ABCDEF)); in xnn_f32_igemm_minmax_ukernel_3x16__avx_broadcast() 111 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_igemm_minmax_ukernel_3x16__avx_broadcast() 119 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_igemm_minmax_ukernel_3x16__avx_broadcast() 123 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_3x16__avx_broadcast() 140 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_igemm_minmax_ukernel_3x16__avx_broadcast()
|
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 3x16s4inc-minmax-fma3-broadcast.c | 61 __m256 vacc2x89ABCDEF = _mm256_load_ps(acc + 40); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() local 82 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 96 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 110 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 124 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 148 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 160 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 168 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 172 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() 192 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast()
|
D | 4x16s4inc-minmax-fma3-broadcast.c | 67 __m256 vacc2x89ABCDEF = _mm256_load_ps(acc + 40); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() local 93 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 110 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 127 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 144 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 172 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 186 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 196 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 204 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() 227 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast()
|
D | 3x16inc-minmax-fma3-broadcast.c | 61 __m256 vacc2x89ABCDEF = _mm256_load_ps(acc + 40); in xnn_f32_gemminc_minmax_ukernel_3x16__fma3_broadcast() local 82 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16__fma3_broadcast() 93 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_3x16__fma3_broadcast() 101 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_3x16__fma3_broadcast() 105 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16__fma3_broadcast() 125 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_3x16__fma3_broadcast()
|
D | 3x16inc-minmax-avx-broadcast.c | 61 __m256 vacc2x89ABCDEF = _mm256_load_ps(acc + 40); in xnn_f32_gemminc_minmax_ukernel_3x16__avx_broadcast() local 82 vacc2x89ABCDEF = _mm256_add_ps(vacc2x89ABCDEF, _mm256_mul_ps(va2, vb89ABCDEF)); in xnn_f32_gemminc_minmax_ukernel_3x16__avx_broadcast() 93 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_3x16__avx_broadcast() 101 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_3x16__avx_broadcast() 105 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_3x16__avx_broadcast() 125 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_3x16__avx_broadcast()
|
D | 5x16s4inc-minmax-fma3-broadcast.c | 73 __m256 vacc2x89ABCDEF = _mm256_load_ps(acc + 40); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() local 104 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 124 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 144 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 164 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 196 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 212 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 224 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 236 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() 262 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
|
D | 4x16inc-minmax-fma3-broadcast.c | 67 __m256 vacc2x89ABCDEF = _mm256_load_ps(acc + 40); in xnn_f32_gemminc_minmax_ukernel_4x16__fma3_broadcast() local 93 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16__fma3_broadcast() 106 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_4x16__fma3_broadcast() 116 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_4x16__fma3_broadcast() 124 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16__fma3_broadcast() 147 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_4x16__fma3_broadcast()
|
D | 4x16inc-minmax-avx-broadcast.c | 67 __m256 vacc2x89ABCDEF = _mm256_load_ps(acc + 40); in xnn_f32_gemminc_minmax_ukernel_4x16__avx_broadcast() local 93 vacc2x89ABCDEF = _mm256_add_ps(vacc2x89ABCDEF, _mm256_mul_ps(va2, vb89ABCDEF)); in xnn_f32_gemminc_minmax_ukernel_4x16__avx_broadcast() 106 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_gemminc_minmax_ukernel_4x16__avx_broadcast() 116 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_gemminc_minmax_ukernel_4x16__avx_broadcast() 124 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_4x16__avx_broadcast() 147 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_gemminc_minmax_ukernel_4x16__avx_broadcast()
|
/external/XNNPACK/src/f32-gemm/gen/ |
D | 3x16s4-minmax-fma3-broadcast.c | 59 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() local 80 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 94 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 108 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 122 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 146 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 158 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 166 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 170 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() 190 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast()
|
D | 4x16s4-minmax-fma3-broadcast.c | 65 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() local 91 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 108 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 125 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 142 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 170 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 184 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 194 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 202 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() 225 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast()
|
D | 3x16-minmax-avx-broadcast.c | 59 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast() local 80 vacc2x89ABCDEF = _mm256_add_ps(vacc2x89ABCDEF, _mm256_mul_ps(va2, vb89ABCDEF)); in xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast() 91 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast() 99 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast() 103 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast() 123 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_3x16__avx_broadcast()
|
D | 3x16-minmax-fma3-broadcast.c | 59 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast() local 80 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast() 91 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast() 99 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast() 103 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast() 123 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_3x16__fma3_broadcast()
|
D | 5x16s4-minmax-fma3-broadcast.c | 71 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() local 102 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc0, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 122 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc1, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 142 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc2, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 162 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEFc3, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 194 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 210 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 222 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 234 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() 260 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
|
D | 4x16-minmax-avx-broadcast.c | 65 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast() local 91 vacc2x89ABCDEF = _mm256_add_ps(vacc2x89ABCDEF, _mm256_mul_ps(va2, vb89ABCDEF)); in xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast() 104 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast() 114 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast() 122 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast() 145 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_4x16__avx_broadcast()
|
D | 4x16-minmax-fma3-broadcast.c | 65 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast() local 91 vacc2x89ABCDEF = _mm256_fmadd_ps(va2, vb89ABCDEF, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast() 104 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax); in xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast() 114 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin); in xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast() 122 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast() 145 vacc2x01234567 = vacc2x89ABCDEF; in xnn_f32_gemm_minmax_ukernel_4x16__fma3_broadcast()
|