/external/XNNPACK/src/f16-gemm/gen/
6x8-minmax-neonfp16arith-ld64.c | all hits in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64():
     79  float16x8_t vacc4x01234567 = vacc0x01234567;  (local declaration)
     98  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0);
    112  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0);
    122  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1);
    136  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1);
    146  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    160  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2);
    170  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    184  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3);
    205  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567);
    [all …]
8x8-minmax-neonfp16arith-ld64.c | all hits in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64():
     91  float16x8_t vacc4x01234567 = vacc0x01234567;  (local declaration)
    114  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0);
    132  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0);
    144  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1);
    162  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1);
    174  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    192  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2);
    204  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    222  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3);
    247  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567);
    [all …]
6x16-minmax-neonfp16arith-ld64.c | all hits in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64():
     83  float16x8_t vacc4x01234567 = vacc0x01234567;  (local declaration)
    105  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0);
    125  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0);
    142  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1);
    162  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1);
    179  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    199  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2);
    216  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    236  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3);
    264  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567);
    [all …]
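Across these f16 GEMM kernels the row-4 pattern is uniform: vacc4x01234567 starts as a copy of row 0's bias-initialized accumulator, the main loop retires four K steps per iteration from a 4-element A load (the ld64 suffix), and each step is either a by-lane FMA (vfmaq_lane_f16) or, on the alternate compile path, a lane duplicate followed by a plain vfmaq_f16; the K remainder uses a scalar broadcast. A minimal sketch of the row-4 update, assuming ARMv8.2-A FP16 arithmetic; the names a4, w, kc and the w stride mirror XNNPACK conventions but are illustrative, not the exact kernel body:

    #include <arm_neon.h>
    #include <stddef.h>

    // Accumulate row 4, columns 0..7 over K, in the style shown above.
    // Requires a compiler targeting armv8.2-a+fp16.
    static void row4_accumulate(const float16_t* a4, const float16_t* w,
                                size_t kc, float16x8_t* vacc4x01234567) {
      size_t k = kc;
      for (; k >= 4; k -= 4) {               // main loop: 4 K steps at a time
        const float16x4_t va4 = vld1_f16(a4); a4 += 4;
        *vacc4x01234567 = vfmaq_lane_f16(*vacc4x01234567, vld1q_f16(w +  0), va4, 0);
        *vacc4x01234567 = vfmaq_lane_f16(*vacc4x01234567, vld1q_f16(w +  8), va4, 1);
        *vacc4x01234567 = vfmaq_lane_f16(*vacc4x01234567, vld1q_f16(w + 16), va4, 2);
        *vacc4x01234567 = vfmaq_lane_f16(*vacc4x01234567, vld1q_f16(w + 24), va4, 3);
        w += 32;                             // 4 K steps of 8 B columns each
      }
      for (; k != 0; k -= 1) {               // K remainder: scalar broadcast
        const float16x8_t va4 = vld1q_dup_f16(a4); a4 += 1;
        *vacc4x01234567 = vfmaq_f16(*vacc4x01234567, va4, vld1q_f16(w)); w += 8;
      }
    }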
/external/XNNPACK/src/f16-igemm/gen/
6x8-minmax-neonfp16arith-ld64.c | all hits in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64():
     71  float16x8_t vacc4x01234567 = vacc0x01234567;  (local declaration)
    124  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0);
    138  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0);
    148  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1);
    162  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1);
    172  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    186  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2);
    196  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    210  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3);
    229  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567);
    [all …]
8x8-minmax-neonfp16arith-ld64.c | all hits in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64():
     79  float16x8_t vacc4x01234567 = vacc0x01234567;  (local declaration)
    146  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0);
    164  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0);
    176  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1);
    194  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1);
    206  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    224  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2);
    236  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    254  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3);
    277  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567);
    [all …]
6x16-minmax-neonfp16arith-ld64.c | all hits in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64():
     75  float16x8_t vacc4x01234567 = vacc0x01234567;  (local declaration)
    131  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0);
    151  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0);
    168  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1);
    188  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1);
    205  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    225  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2);
    242  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    262  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3);
    288  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567);
    [all …]
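The f16 IGEMM variants repeat the same row-4 arithmetic; the higher line numbers come from the indirection prologue, where each A row is fetched through a pointer array (one entry per row per convolution tap) rather than from a dense matrix. A sketch of that pointer resolution, assuming XNNPACK's usual zero-buffer convention for padding taps; the helper name and signature are illustrative:

    #include <arm_neon.h>
    #include <stdint.h>

    // Resolve row 4's input pointer for one tap. Taps that read padding
    // point at a shared zero buffer, which must not be offset.
    static const float16_t* resolve_a4(const float16_t* const* a,
                                       uintptr_t a_offset,
                                       const float16_t* zero) {
      const float16_t* a4 = a[4];
      if (a4 != zero) {
        a4 = (const float16_t*) ((uintptr_t) a4 + a_offset);
      }
      return a4;
    }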
/external/XNNPACK/src/f16-gemm/gen-inc/
6x8inc-minmax-neonfp16arith-ld64.c | all hits in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64():
     81  …float16x8_t vacc4x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16…  (local declaration)
    100  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0);
    114  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0);
    124  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1);
    138  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1);
    148  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    162  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2);
    172  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    186  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3);
    207  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567);
    [all …]
8x8inc-minmax-neonfp16arith-ld64.c | all hits in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64():
     93  …float16x8_t vacc4x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16…  (local declaration)
    116  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0);
    134  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0);
    146  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1);
    164  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1);
    176  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    194  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2);
    206  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    224  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3);
    249  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567);
    [all …]
6x16inc-minmax-neonfp16arith-ld64.c | all hits in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64():
     85  …float16x8_t vacc4x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16…  (local declaration)
    107  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0);
    127  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0);
    144  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1);
    164  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1);
    181  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    201  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2);
    218  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    238  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3);
    266  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567);
    [all …]
8x16inc-minmax-neonfp16arith-ld64.c | all hits in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64():
     97  …float16x8_t vacc4x01234567 = vld1q_f16(acc); acc = (const void*) ((uintptr_t) acc + sizeof(float16…  (local declaration)
    125  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c0, va4, 0);
    151  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c0, vb01234567c0);
    172  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c1, va4, 1);
    198  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c1, vb01234567c1);
    219  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    245  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c2, vb01234567c2);
    266  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    292  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4c3, vb01234567c3);
    326  vacc4x01234567 = vfmaq_f16(vacc4x01234567, va4, vb01234567);
    [all …]
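The gen-inc (gemminc) kernels change only the prologue: as the truncated declarations above show, each accumulator row is seeded from a caller-supplied acc buffer of partial sums (so a long K dimension can be processed in several passes) instead of from the bias, and the acc pointer then steps forward by one vector. A sketch of that seeding idiom, with the 16-byte advance assumed from the sizeof(float16x8_t) visible before the truncation:

    #include <arm_neon.h>
    #include <stdint.h>

    // Seed one accumulator row from the partial-sum buffer and advance it.
    static float16x8_t load_acc_row(const void** acc) {
      const float16x8_t vacc = vld1q_f16((const float16_t*) *acc);
      *acc = (const void*) ((uintptr_t) *acc + sizeof(float16x8_t));
      return vacc;
    }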
/external/XNNPACK/src/f32-gemm/gen/
5x16s4-minmax-fma3-broadcast.c | all hits in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast():
     74  __m256 vacc4x01234567 = vacc0x01234567;  (local declaration)
     99  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c0, vacc4x01234567);
    119  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c1, vacc4x01234567);
    139  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c2, vacc4x01234567);
    159  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c3, vacc4x01234567);
    191  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
    207  vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
    219  vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
    227  _mm256_storeu_ps(c4, vacc4x01234567);
    252  _mm256_storeu_ps(c4, vacc4x01234567);
    [all …]
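The s4 suffix marks the shifted variant: each main-loop iteration issues four FMAs (c0..c3) against pre-shuffled B panels, rotating the A register one lane between steps instead of re-broadcasting from memory. A sketch of one such K group for row 4, assuming the usual broadcast-and-rotate scheme (the igemm and gen-inc s4 entries below follow the same shape); the pointer names and w layout are illustrative:

    #include <immintrin.h>

    // One group of 4 K steps for row 4 of a 5x16s4-style kernel.
    static __m256 s4_group_row4(const float* a4, const float* w, __m256 vacc) {
      // Broadcast 4 consecutive A values into both 128-bit halves.
      __m256 va4 = _mm256_broadcast_ps((const __m128*) a4);
      for (int c = 0; c < 4; c++) {
        const __m256 vb01234567 = _mm256_load_ps(w + 16 * c);  // pre-shuffled B
        vacc = _mm256_fmadd_ps(va4, vb01234567, vacc);
        // Rotate A down one lane within each half for the next column set.
        va4 = _mm256_permute_ps(va4, _MM_SHUFFLE(0, 3, 2, 1));
      }
      return vacc;
    }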
5x8-minmax-fma3-broadcast.c | all hits in xnn_f32_gemm_minmax_ukernel_5x8__fma3_broadcast():
     70  __m256 vacc4x01234567 = vacc0x01234567;  (local declaration)
     93  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
    103  vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
    110  vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
    113  _mm256_storeu_ps(c4, vacc4x01234567);
    132  __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
    144  vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
5x8-minmax-avx-broadcast.c | all hits in xnn_f32_gemm_minmax_ukernel_5x8__avx_broadcast():
     70  __m256 vacc4x01234567 = vacc0x01234567;  (local declaration)
     93  vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
    103  vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
    110  vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
    113  _mm256_storeu_ps(c4, vacc4x01234567);
    132  __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
    144  vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
5x16-minmax-avx-broadcast.c | all hits in xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast():
     74  __m256 vacc4x01234567 = vacc0x01234567;  (local declaration)
     99  vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
    114  vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
    126  vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
    134  _mm256_storeu_ps(c4, vacc4x01234567);
    159  _mm256_storeu_ps(c4, vacc4x01234567);
    165  vacc4x01234567 = vacc4x89ABCDEF;
    177  __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
    189  vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
5x16-minmax-fma3-broadcast.c | all hits in xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast():
     74  __m256 vacc4x01234567 = vacc0x01234567;  (local declaration)
     99  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
    114  vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
    126  vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
    134  _mm256_storeu_ps(c4, vacc4x01234567);
    159  _mm256_storeu_ps(c4, vacc4x01234567);
    165  vacc4x01234567 = vacc4x89ABCDEF;
    177  __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
    189  vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
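Within each geometry the avx and fma3 files are otherwise identical twins; the one-line difference is visible at line 99 of the two 5x16 kernels above. AVX composes _mm256_mul_ps with _mm256_add_ps (two instructions, two roundings), while FMA3 fuses them into _mm256_fmadd_ps (one instruction, one rounding); both then run the same min/max clamp. A side-by-side sketch, with va4 standing for one A element broadcast across the vector:

    #include <immintrin.h>

    static __m256 accumulate_avx(__m256 vacc, __m256 va4, __m256 vb) {
      return _mm256_add_ps(vacc, _mm256_mul_ps(va4, vb));  // mul, then add
    }
    static __m256 accumulate_fma3(__m256 vacc, __m256 va4, __m256 vb) {
      return _mm256_fmadd_ps(va4, vb, vacc);               // fused, one rounding
    }
    static __m256 clamp(__m256 vacc, __m256 vmin, __m256 vmax) {
      vacc = _mm256_min_ps(vacc, vmax);   // upper bound first, as in the kernels
      return _mm256_max_ps(vacc, vmin);   // then the lower bound
    }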
/external/XNNPACK/src/f32-igemm/gen/
5x16s4-minmax-fma3-broadcast.c | all hits in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast():
     70  __m256 vacc4x01234567 = vacc0x01234567;  (local declaration)
    124  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c0, vacc4x01234567);
    144  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c1, vacc4x01234567);
    164  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c2, vacc4x01234567);
    184  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c3, vacc4x01234567);
    216  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
    234  vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
    246  vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
    254  _mm256_storeu_ps(c4, vacc4x01234567);
    274  _mm256_storeu_ps(c4, vacc4x01234567);
    [all …]
5x16-minmax-avx-broadcast.c | all hits in xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast():
     70  __m256 vacc4x01234567 = vacc0x01234567;  (local declaration)
    128  vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
    140  vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
    152  vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
    160  _mm256_storeu_ps(c4, vacc4x01234567);
    180  _mm256_storeu_ps(c4, vacc4x01234567);
    186  vacc4x01234567 = vacc4x89ABCDEF;
    198  __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
    210  vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
5x16-minmax-fma3-broadcast.c | all hits in xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast():
     70  __m256 vacc4x01234567 = vacc0x01234567;  (local declaration)
    128  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
    140  vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
    152  vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
    160  _mm256_storeu_ps(c4, vacc4x01234567);
    180  _mm256_storeu_ps(c4, vacc4x01234567);
    186  vacc4x01234567 = vacc4x89ABCDEF;
    198  __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
    210  vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
5x8-minmax-fma3-broadcast.c | all hits in xnn_f32_igemm_minmax_ukernel_5x8__fma3_broadcast():
     66  __m256 vacc4x01234567 = vacc0x01234567;  (local declaration)
    118  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
    129  vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
    136  vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
    139  _mm256_storeu_ps(c4, vacc4x01234567);
    153  __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
    165  vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
5x8-minmax-avx-broadcast.c | all hits in xnn_f32_igemm_minmax_ukernel_5x8__avx_broadcast():
     66  __m256 vacc4x01234567 = vacc0x01234567;  (local declaration)
    118  vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
    129  vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
    136  vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
    139  _mm256_storeu_ps(c4, vacc4x01234567);
    153  __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
    165  vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
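The last two hits in every 5x8/5x16 file (e.g. lines 153 and 165 just above) belong to the nc remainder: the 256-bit accumulator is narrowed to 128-bit halves and written out in 4-, 2-, and 1-float pieces. A sketch of that store tail, reconstructed from the cast/extract pair on the assumption that it is the standard AVX narrowing sequence:

    #include <immintrin.h>
    #include <stddef.h>

    // Store the low 'nc' (< 8) lanes of vacc4x01234567 to c4.
    static void store_tail(float* c4, size_t nc, __m256 vacc4x01234567) {
      __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);   // low half
      if (nc & 4) {
        _mm_storeu_ps(c4, vacc4x0123);
        vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);      // high half
        c4 += 4;
      }
      if (nc & 2) {
        _mm_storel_pi((__m64*) c4, vacc4x0123);              // low 2 floats
        vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);  // shift down by 2
        c4 += 2;
      }
      if (nc & 1) {
        _mm_store_ss(c4, vacc4x0123);
      }
    }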
/external/XNNPACK/src/f32-gemm/gen-inc/
5x16s4inc-minmax-fma3-broadcast.c | all hits in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast():
     76  __m256 vacc4x01234567 = _mm256_load_ps(acc + 64);  (local declaration)
    101  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c0, vacc4x01234567);
    121  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c1, vacc4x01234567);
    141  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c2, vacc4x01234567);
    161  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c3, vacc4x01234567);
    193  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
    209  vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
    221  vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
    229  _mm256_storeu_ps(c4, vacc4x01234567);
    254  _mm256_storeu_ps(c4, vacc4x01234567);
    [all …]
5x8inc-minmax-fma3-broadcast.c | all hits in xnn_f32_gemminc_minmax_ukernel_5x8__fma3_broadcast():
     72  __m256 vacc4x01234567 = _mm256_load_ps(acc + 32);  (local declaration)
     95  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
    105  vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
    112  vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
    115  _mm256_storeu_ps(c4, vacc4x01234567);
    134  __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
    146  vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
5x8inc-minmax-avx-broadcast.c | all hits in xnn_f32_gemminc_minmax_ukernel_5x8__avx_broadcast():
     72  __m256 vacc4x01234567 = _mm256_load_ps(acc + 32);  (local declaration)
     95  vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
    105  vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
    112  vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
    115  _mm256_storeu_ps(c4, vacc4x01234567);
    134  __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
    146  vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
5x16inc-minmax-avx-broadcast.c | all hits in xnn_f32_gemminc_minmax_ukernel_5x16__avx_broadcast():
     76  __m256 vacc4x01234567 = _mm256_load_ps(acc + 64);  (local declaration)
    101  vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
    116  vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
    128  vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
    136  _mm256_storeu_ps(c4, vacc4x01234567);
    161  _mm256_storeu_ps(c4, vacc4x01234567);
    167  vacc4x01234567 = vacc4x89ABCDEF;
    179  __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
    191  vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
5x16inc-minmax-fma3-broadcast.c | all hits in xnn_f32_gemminc_minmax_ukernel_5x16__fma3_broadcast():
     76  __m256 vacc4x01234567 = _mm256_load_ps(acc + 64);  (local declaration)
    101  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
    116  vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
    128  vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
    136  _mm256_storeu_ps(c4, vacc4x01234567);
    161  _mm256_storeu_ps(c4, vacc4x01234567);
    167  vacc4x01234567 = vacc4x89ABCDEF;
    179  __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
    191  vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
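For the f32 gen-inc kernels the seed load's offset is just row 4's position in a row-major acc buffer: 4 rows × 8 columns = 32 floats for the 5x8 kernels, 4 × 16 = 64 for the 5x16 kernels. The aligned _mm256_load_ps also implies the caller hands over a 32-byte-aligned acc buffer. A one-line sketch of that addressing; row and nr are illustrative parameters:

    #include <immintrin.h>
    #include <stddef.h>

    // Row's first 8 columns of an (mr x nr) row-major partial-sum buffer.
    static __m256 load_acc(const float* acc, size_t row, size_t nr) {
      return _mm256_load_ps(acc + row * nr);  // load_acc(acc, 4, 16) reads acc + 64
    }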