/external/XNNPACK/src/f16-gemm/gen-inc/ |
D | 8x8inc-minmax-neonfp16arith-ld64.c |
    169  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64() local
    172  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);  in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
    173  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);  in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
    174  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);  in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
    175  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);  in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
    176  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);  in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
    177  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);  in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
    178  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c2, va6, 2);  in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
    179  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c2, va7, 2);  in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
    190  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);  in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64()
    [all …]
|
D | 6x8inc-minmax-neonfp16arith-ld64.c |
    141  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64() local
    144  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);  in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
    145  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);  in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
    146  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);  in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
    147  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);  in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
    148  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);  in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
    149  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);  in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
    158  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);  in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
    159  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);  in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
    160  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);  in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64()
    [all …]
|
D | 4x8inc-minmax-neonfp16arith-ld64.c |
    113  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64() local
    116  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);  in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64()
    117  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);  in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64()
    118  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);  in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64()
    119  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);  in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64()
    126  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);  in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64()
    127  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);  in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64()
    128  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);  in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64()
    129  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);  in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64()
|
D | 6x16inc-minmax-neonfp16arith-ld64.c |
    173  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64() local
    177  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);  in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
    178  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);  in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
    179  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);  in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
    180  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);  in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
    181  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);  in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
    182  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);  in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
    197  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);  in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
    198  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);  in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
    199  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);  in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64()
    [all …]
|
/external/XNNPACK/src/f16-igemm/gen/ |
D | 8x8-minmax-neonfp16arith-ld64.c |
    199  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64() local
    202  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);  in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
    203  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);  in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
    204  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);  in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
    205  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);  in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
    206  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);  in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
    207  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);  in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
    208  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c2, va6, 2);  in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
    209  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c2, va7, 2);  in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
    220  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);  in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64()
    [all …]
|
D | 6x8-minmax-neonfp16arith-ld64.c |
    165  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64() local
    168  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);  in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
    169  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);  in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
    170  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);  in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
    171  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);  in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
    172  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);  in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
    173  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);  in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
    182  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);  in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
    183  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);  in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
    184  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);  in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64()
    [all …]
|
D | 4x8-minmax-neonfp16arith-ld64.c |
    131  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64() local
    134  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);  in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64()
    135  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);  in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64()
    136  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);  in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64()
    137  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);  in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64()
    144  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);  in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64()
    145  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);  in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64()
    146  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);  in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64()
    147  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);  in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64()
|
D | 8x16-minmax-neonfp16arith-ld64.c |
    241  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64() local
    245  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);  in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    246  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);  in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    247  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);  in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    248  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);  in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    249  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);  in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    250  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);  in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    251  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c2, va6, 2);  in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    252  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c2, va7, 2);  in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    271  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);  in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    [all …]
|
D | 6x16-minmax-neonfp16arith-ld64.c |
    197  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64() local
    201  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);  in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    202  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);  in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    203  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);  in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    204  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);  in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    205  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);  in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    206  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);  in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    221  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);  in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    222  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);  in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    223  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);  in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    [all …]
|
/external/XNNPACK/src/f16-gemm/gen/ |
D | 8x8-minmax-neonfp16arith-ld64.c |
    167  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64() local
    170  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);  in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
    171  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);  in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
    172  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);  in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
    173  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);  in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
    174  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);  in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
    175  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);  in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
    176  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c2, va6, 2);  in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
    177  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c2, va7, 2);  in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
    188  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);  in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64()
    [all …]
|
D | 6x8-minmax-neonfp16arith-ld64.c |
    139  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64() local
    142  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);  in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
    143  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);  in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
    144  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);  in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
    145  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);  in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
    146  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);  in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
    147  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);  in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
    156  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);  in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
    157  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);  in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
    158  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);  in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64()
    [all …]
|
D | 4x8-minmax-neonfp16arith-ld64.c |
    111  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64() local
    114  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);  in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64()
    115  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);  in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64()
    116  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);  in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64()
    117  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);  in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64()
    124  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);  in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64()
    125  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);  in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64()
    126  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);  in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64()
    127  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);  in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64()
|
D | 8x16-minmax-neonfp16arith-ld64.c |
    209  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64() local
    213  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);  in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    214  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);  in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    215  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);  in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    216  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);  in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    217  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);  in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    218  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);  in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    219  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c2, va6, 2);  in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    220  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c2, va7, 2);  in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    239  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);  in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64()
    [all …]
|
D | 6x16-minmax-neonfp16arith-ld64.c |
    171  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() local
    175  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);  in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    176  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);  in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    177  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);  in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    178  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);  in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    179  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);  in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    180  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);  in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    195  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);  in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    196  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);  in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    197  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);  in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
    [all …]
|
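All of the f16 matches above are the same inner-loop pattern: the kernel loads one 8-wide block of packed weights (vb01234567c2 is the block that pairs with the third A element of the current group of four) and folds it into one accumulator register per output row, either with a lane-indexed FMA (vfmaq_lane_f16) or, in the va0c2-style lines, with a plain vfmaq_f16 after the A lane has been broadcast to a full vector. Below is a minimal single-row sketch of that loop, assuming ARMv8.2-A FP16 arithmetic (e.g. -march=armv8.2-a+fp16) and K a multiple of 4; the 1x8 shape and all names are illustrative, not copied from the XNNPACK sources.

#include <arm_neon.h>
#include <stddef.h>

// Sketch only: c[0..7] += sum over k of a[k] * b[k][0..7],
// with B packed as 8 half-precision weights per k step.
static void f16_gemm_1x8_sketch(size_t kc, const float16_t* a, const float16_t* b, float16_t* c) {
  float16x8_t vacc = vld1q_f16(c);
  for (size_t k = 0; k < kc; k += 4) {
    const float16x4_t va = vld1_f16(a); a += 4;    // four A scalars for this row
    const float16x8_t vb_c0 = vld1q_f16(b);        // 8 weights for k+0
    const float16x8_t vb_c1 = vld1q_f16(b + 8);    // 8 weights for k+1
    const float16x8_t vb_c2 = vld1q_f16(b + 16);   // 8 weights for k+2 ("c2" above)
    const float16x8_t vb_c3 = vld1q_f16(b + 24);   // 8 weights for k+3
    b += 32;
    vacc = vfmaq_lane_f16(vacc, vb_c0, va, 0);     // vacc += vb_c0 * va[0]
    vacc = vfmaq_lane_f16(vacc, vb_c1, va, 1);
    vacc = vfmaq_lane_f16(vacc, vb_c2, va, 2);     // the "c2" step seen in the matches
    vacc = vfmaq_lane_f16(vacc, vb_c3, va, 3);
    // Equivalent broadcast form, as in the vfmaq_f16/va0c2 lines:
    //   const float16x8_t va_c2 = vdupq_lane_f16(va, 2);
    //   vacc = vfmaq_f16(vacc, va_c2, vb_c2);
  }
  vst1q_f16(c, vacc);
}

The listed 4x8/6x8/8x8/6x16/8x16 kernels replicate this update across 4, 6, or 8 accumulator rows and one or two 8-wide column blocks; the gemminc variants seed the accumulators from a caller-supplied partial-sum buffer rather than the packed bias, and the igemm variants fetch the A rows through an indirection buffer.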
/external/XNNPACK/src/f32-gemm/gen/ |
D | 5x16s4-minmax-fma3-broadcast.c |
    132  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() local
    135  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);  in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
    136  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c2, vacc1x01234567);  in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
    137  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);  in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
    138  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c2, vacc3x01234567);  in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
    139  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c2, vacc4x01234567);  in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
    224  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast() local
    227  … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
    228  … = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
    229  … = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast()
    [all …]
|
D | 4x16s4-minmax-fma3-broadcast.c |
    116  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() local
    119  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);  in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast()
    120  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c2, vacc1x01234567);  in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast()
    121  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);  in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast()
    122  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c2, vacc3x01234567);  in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast()
    195  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast() local
    198  … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast()
    199  … = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast()
    200  … = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast()
    201  … = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast()
|
D | 3x16s4-minmax-fma3-broadcast.c |
    100  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() local
    103  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);  in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast()
    104  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c2, vacc1x01234567);  in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast()
    105  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);  in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast()
    166  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast() local
    169  … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast()
    170  … = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast()
    171  … = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemm_minmax_ukernel_3x16s4__fma3_broadcast()
|
D | 1x16s4-minmax-fma3-broadcast.c |
    68  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() local
    71  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);  in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast()
    108  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() local
    111  … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast()
|
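In these f32 "s4" kernels, vb01234567c2 is loaded with _mm256_load_ps(w + 32): the packed weights interleave the 8-wide blocks for columns 0-7 and 8-15 across four shifted positions c0..c3, so the c2 block for columns 0-7 of a 16-column kernel sits 32 floats into the group. Below is a single-row, 8-column sketch of the main s4 loop, assuming AVX2+FMA3, 32-byte-aligned packed weights, and kc a multiple of 4 elements; the lane rotation between steps is how an s4-packed weight layout is consumed and is my reconstruction, not text from the listing, and all names are illustrative.

#include <immintrin.h>
#include <stddef.h>

// Sketch only: c[0..7] += sum over k of a[k] * b[k][0..7], with w packed in a
// shifted ("s4") order so that block r, lane j holds B[(j + r) % 4][column j].
static void f32_gemm_1x8s4_sketch(size_t kc, const float* a, const float* w, float* c) {
  __m256 vacc = _mm256_load_ps(w);                        // packed bias leads the weights
  w += 8;
  for (size_t k = 0; k < kc; k += 4) {
    __m256 va = _mm256_broadcast_ps((const __m128*) a);   // 4 A values, copied to both 128-bit halves
    a += 4;
    const __m256 vb_c0 = _mm256_load_ps(w);
    const __m256 vb_c1 = _mm256_load_ps(w + 8);
    const __m256 vb_c2 = _mm256_load_ps(w + 16);          // w + 32 in the 16-column kernels above
    const __m256 vb_c3 = _mm256_load_ps(w + 24);
    w += 32;
    vacc = _mm256_fmadd_ps(va, vb_c0, vacc);
    va = _mm256_permute_ps(va, _MM_SHUFFLE(0, 3, 2, 1));  // rotate A by one lane for the next block
    vacc = _mm256_fmadd_ps(va, vb_c1, vacc);
    va = _mm256_permute_ps(va, _MM_SHUFFLE(0, 3, 2, 1));
    vacc = _mm256_fmadd_ps(va, vb_c2, vacc);              // the "c2" step seen in the matches
    va = _mm256_permute_ps(va, _MM_SHUFFLE(0, 3, 2, 1));
    vacc = _mm256_fmadd_ps(va, vb_c3, vacc);
  }
  _mm256_storeu_ps(c, vacc);
}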
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 5x16s4inc-minmax-fma3-broadcast.c |
    134  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() local
    137  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);  in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
    138  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c2, vacc1x01234567);  in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
    139  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);  in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
    140  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c2, vacc3x01234567);  in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
    141  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c2, vacc4x01234567);  in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
    226  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast() local
    229  … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
    230  … = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
    231  … = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemminc_minmax_ukernel_5x16s4__fma3_broadcast()
    [all …]
|
D | 4x16s4inc-minmax-fma3-broadcast.c |
    118  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() local
    121  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);  in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast()
    122  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c2, vacc1x01234567);  in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast()
    123  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);  in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast()
    124  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c2, vacc3x01234567);  in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast()
    197  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast() local
    200  … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast()
    201  … = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast()
    202  … = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast()
    203  … = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast()
|
D | 3x16s4inc-minmax-fma3-broadcast.c |
    102  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() local
    105  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);  in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast()
    106  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c2, vacc1x01234567);  in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast()
    107  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);  in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast()
    168  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast() local
    171  … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast()
    172  … = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast()
    173  … = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemminc_minmax_ukernel_3x16s4__fma3_broadcast()
|
D | 1x16s4inc-minmax-fma3-broadcast.c |
    70  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() local
    73  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);  in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast()
    110  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() local
    113  … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast()
|
/external/XNNPACK/src/f32-igemm/gen/ |
D | 5x16s4-minmax-fma3-broadcast.c |
    157  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() local
    160  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);  in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
    161  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c2, vacc1x01234567);  in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
    162  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);  in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
    163  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c2, vacc3x01234567);  in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
    164  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c2, vacc4x01234567);  in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
    249  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast() local
    252  … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
    253  … = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
    254  … = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast()
    [all …]
|
D | 4x16s4-minmax-fma3-broadcast.c |
    138  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() local
    141  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);  in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast()
    142  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c2, vacc1x01234567);  in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast()
    143  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);  in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast()
    144  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c2, vacc3x01234567);  in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast()
    217  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast() local
    220  … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast()
    221  … = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast()
    222  … = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast()
    223  … = _mm256_fmadd_ps(_mm256_and_ps(va3, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast()
|
D | 3x16s4-minmax-fma3-broadcast.c |
    119  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() local
    122  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);  in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast()
    123  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c2, vacc1x01234567);  in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast()
    124  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);  in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast()
    185  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast() local
    188  … = _mm256_fmadd_ps(_mm256_and_ps(va0, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast()
    189  … = _mm256_fmadd_ps(_mm256_and_ps(va1, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast()
    190  … = _mm256_fmadd_ps(_mm256_and_ps(va2, _mm256_cmp_ps(vb01234567c2, vzero, _CMP_NEQ_OQ)), vb01234567…  in xnn_f32_igemm_minmax_ukernel_3x16s4__fma3_broadcast()
|
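The second declaration of vb01234567c2 in each f32 file, together with the _mm256_and_ps/_mm256_cmp_ps lines that follow it, is the kc-remainder path of these s4 kernels: the trailing weight blocks are zero-padded by the packer, and the kernel masks each A register wherever the corresponding weight is zero before the FMA, the usual reason being that whatever sits in the out-of-range A lanes (possibly Inf or NaN) must not turn 0 * Inf into NaN in the accumulator. A small helper in the same spirit, with an illustrative name:

#include <immintrin.h>

// Sketch only: accumulate va * vb into vacc, but zero out A lanes whose
// packed weight is exactly zero (the zero-padding in the remainder blocks).
static inline __m256 masked_fmadd_sketch(__m256 va, __m256 vb, __m256 vacc) {
  const __m256 vzero = _mm256_setzero_ps();
  const __m256 vmask = _mm256_cmp_ps(vb, vzero, _CMP_NEQ_OQ);  // all-ones where vb != 0
  return _mm256_fmadd_ps(_mm256_and_ps(va, vmask), vb, vacc);
}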