/external/XNNPACK/src/f16-gemm/gen/
D | 8x8-minmax-neonfp16arith-ld64.c | in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64():
    197  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    200  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    201  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
    202  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    203  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    204  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    205  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    206  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c3, va6, 3);
    207  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c3, va7, 3);
    218  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    [all …]
D | 6x8-minmax-neonfp16arith-ld64.c | in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64():
    163  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    166  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    167  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
    168  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    169  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    170  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    171  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    180  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    181  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c3, vb01234567c3);
    182  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    [all …]
D | 4x8-minmax-neonfp16arith-ld64.c | in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64():
    129  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    132  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    133  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
    134  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    135  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    142  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    143  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c3, vb01234567c3);
    144  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    145  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
D | 6x16-minmax-neonfp16arith-ld64.c | in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64():
    208  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    212  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    213  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
    214  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    215  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    216  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    217  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    232  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    233  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c3, vb01234567c3);
    234  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    [all …]
D | 8x16-minmax-neonfp16arith-ld64.c | in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64():
    256  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    260  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    261  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
    262  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    263  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    264  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    265  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    266  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c3, va6, 3);
    267  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c3, va7, 3);
    286  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    [all …]
D | 4x16-minmax-neonfp16arith-ld64.c | in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64():
    160  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    164  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    165  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
    166  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    167  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    178  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    179  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c3, vb01234567c3);
    180  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    181  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
D | 1x8-minmax-neonfp16arith-ld64.c | in xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64():
    78  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    81  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    85  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
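Every f16 file above (and the f16-igemm and gen-inc variants below) lists the same two formulations of the k-step-3 multiply-accumulate: a fused multiply-add that broadcasts lane 3 of the 64-bit activation load directly (vfmaq_lane_f16), and an equivalent form that first duplicates the lane across a full vector (vdupq_lane_f16 followed by vfmaq_f16). A minimal one-row sketch of both, with hypothetical pointer names and weight offsets (the real kernels unroll this over up to 8 rows and over lanes 0..3):

    #include <arm_neon.h>  // needs __ARM_FEATURE_FP16_VECTOR_ARITHMETIC ("neonfp16arith")

    // One k-step (lane 3) for a single accumulator row; `a0` and `w` are
    // illustrative names, not XNNPACK's calling convention.
    static float16x8_t c3_step(float16x8_t vacc0x01234567,
                               const float16_t* a0, const float16_t* w) {
      const float16x4_t va0 = vld1_f16(a0);           // "ld64": four fp16 activations
      const float16x8_t vb01234567c3 = vld1q_f16(w);  // eight packed fp16 weights
      // Form 1: broadcast lane 3 of va0 inside the FMA itself.
      return vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    }

    // Form 2, matching the vfmaq_f16 lines in the listing: duplicate the
    // lane into a full vector first, then use the plain vector FMA.
    static float16x8_t c3_step_dup(float16x8_t vacc0x01234567,
                                   float16x4_t va0, float16x8_t vb01234567c3) {
      const float16x8_t va0c3 = vdupq_lane_f16(va0, 3);
      return vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    }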
/external/XNNPACK/src/f16-igemm/gen/
D | 8x8-minmax-neonfp16arith-ld64.c | in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64():
    229  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof( float16x…  (local)
    232  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    233  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
    234  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    235  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    236  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    237  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    238  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c3, va6, 3);
    239  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c3, va7, 3);
    250  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    [all …]
D | 6x8-minmax-neonfp16arith-ld64.c | in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64():
    189  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof( float16x…  (local)
    192  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    193  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
    194  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    195  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    196  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    197  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    206  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    207  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c3, vb01234567c3);
    208  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    [all …]
D | 4x8-minmax-neonfp16arith-ld64.c | in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64():
    149  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof( float16x…  (local)
    152  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    153  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
    154  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    155  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    162  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    163  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c3, vb01234567c3);
    164  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    165  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
D | 8x16-minmax-neonfp16arith-ld64.c | in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64():
    288  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof( float16x…  (local)
    292  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    293  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
    294  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    295  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    296  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    297  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    298  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c3, va6, 3);
    299  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c3, va7, 3);
    318  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    [all …]
D | 6x16-minmax-neonfp16arith-ld64.c | in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64():
    234  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof( float16x…  (local)
    238  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    239  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
    240  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    241  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    242  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    243  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    258  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    259  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c3, vb01234567c3);
    260  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    [all …]
D | 4x16-minmax-neonfp16arith-ld64.c | in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64():
    180  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof( float16x…  (local)
    184  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    185  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
    186  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    187  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    198  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    199  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c3, vb01234567c3);
    200  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    201  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
D | 1x8-minmax-neonfp16arith-ld64.c | in xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64():
    89  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof( float16x…  (local)
    92  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    96  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
/external/XNNPACK/src/f16-gemm/gen-inc/
D | 8x8inc-minmax-neonfp16arith-ld64.c | in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64():
    199  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    202  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    203  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
    204  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    205  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    206  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    207  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    208  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c3, va6, 3);
    209  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c3, va7, 3);
    220  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    [all …]
D | 6x8inc-minmax-neonfp16arith-ld64.c | in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64():
    165  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    168  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    169  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
    170  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    171  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    172  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    173  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    182  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    183  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c3, vb01234567c3);
    184  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    [all …]
D | 4x8inc-minmax-neonfp16arith-ld64.c | in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64():
    131  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    134  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    135  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
    136  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    137  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    144  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    145  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c3, vb01234567c3);
    146  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    147  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
D | 8x16inc-minmax-neonfp16arith-ld64.c | in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64():
    258  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    262  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    263  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
    264  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    265  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    266  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    267  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    268  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c3, va6, 3);
    269  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c3, va7, 3);
    288  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    [all …]
D | 6x16inc-minmax-neonfp16arith-ld64.c | in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64():
    210  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    214  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    215  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
    216  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    217  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    218  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c3, va4, 3);
    219  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c3, va5, 3);
    234  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    235  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c3, vb01234567c3);
    236  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    [all …]
D | 4x16inc-minmax-neonfp16arith-ld64.c | in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64():
    162  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    166  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    167  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c3, va1, 3);
    168  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c3, va2, 3);
    169  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c3, va3, 3);
    180  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
    181  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c3, vb01234567c3);
    182  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c3, vb01234567c3);
    183  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c3, vb01234567c3);
D | 1x8inc-minmax-neonfp16arith-ld64.c | in xnn_f16_gemminc_minmax_ukernel_1x8__neonfp16arith_ld64():
    80  …const float16x8_t vb01234567c3 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    83  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c3, va0, 3);
    87  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c3, vb01234567c3);
/external/XNNPACK/src/f32-gemm/gen-inc/
D | 4x16s4inc-minmax-fma3-broadcast.c | in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast():
    135  const __m256 vb01234567c3 = _mm256_load_ps(w + 48);  (local)
    138  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567);
    139  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c3, vacc1x01234567);
    140  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567);
    141  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c3, vacc3x01234567);
/external/XNNPACK/src/f32-gemm/gen/
D | 4x16s4-minmax-fma3-broadcast.c | in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast():
    133  const __m256 vb01234567c3 = _mm256_load_ps(w + 48);  (local)
    136  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567);
    137  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c3, vacc1x01234567);
    138  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567);
    139  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c3, vacc3x01234567);
D | 5x16s4-minmax-fma3-broadcast.c | in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast():
    152  const __m256 vb01234567c3 = _mm256_load_ps(w + 48);  (local)
    155  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567);
    156  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c3, vacc1x01234567);
    157  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567);
    158  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c3, vacc3x01234567);
    159  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c3, vacc4x01234567);
/external/XNNPACK/src/f32-igemm/gen/
D | 5x16s4-minmax-fma3-broadcast.c | in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast():
    177  const __m256 vb01234567c3 = _mm256_load_ps(w + 48);  (local)
    180  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, vacc0x01234567);
    181  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c3, vacc1x01234567);
    182  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c3, vacc2x01234567);
    183  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c3, vacc3x01234567);
    184  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c3, vacc4x01234567);
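The f32 s4 kernels above express the same k-step with FMA3 intrinsics, eight float32 lanes at a time: the step-3 weights for the first 8 of 16 columns sit 48 floats into the packed block (the w + 48 load shown in every entry), and each row costs one _mm256_fmadd_ps. A simplified one-row sketch; the w + 56 offset for the second column half and the rotation comment are assumptions from the s4 layout, not lines from the listing:

    #include <immintrin.h>

    // k-step 3 for one row of a 16-column s4 microkernel. `va0` is assumed
    // to hold row 0's packed activations; in the s4 scheme it is rotated
    // between the four k-steps and the packed weights are pre-shuffled to
    // match, so each step is a plain element-wise FMA.
    static void c3_step(const float* w, __m256 va0,
                        __m256* vacc0x01234567, __m256* vacc0x89ABCDEF) {
      const __m256 vb01234567c3 = _mm256_load_ps(w + 48);  // columns 0-7
      const __m256 vb89ABCDEFc3 = _mm256_load_ps(w + 56);  // columns 8-15 (assumed offset)
      *vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c3, *vacc0x01234567);
      *vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc3, *vacc0x89ABCDEF);
      // Between s4 steps the kernels rotate the activation vector, e.g.
      // va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
    }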