/external/XNNPACK/src/f16-gemm/gen/ |
D | 8x8-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64():
    167  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    170  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    171  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
    172  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    173  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    174  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    175  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    176  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c2, va6, 2);
    177  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c2, va7, 2);
    188  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    [all …]
|
D | 6x8-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64():
    139  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    142  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    143  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
    144  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    145  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    146  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    147  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    156  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    157  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);
    158  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    [all …]
|
D | 4x8-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64():
    111  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    114  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    115  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
    116  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    117  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    124  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    125  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);
    126  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    127  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
|
D | 6x16-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64():
    171  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    175  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    176  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
    177  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    178  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    179  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    180  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    195  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    196  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);
    197  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    [all …]
|
D | 8x16-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64():
    209  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    213  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    214  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
    215  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    216  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    217  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    218  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    219  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c2, va6, 2);
    220  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c2, va7, 2);
    239  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    [all …]
|
D | 4x16-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64():
    133  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    137  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    138  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
    139  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    140  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    151  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    152  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);
    153  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    154  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
|
D | 1x8-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64():
    69   …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    72   vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    76   vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
|
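All of the f16-gemm matches above follow one pattern: each k-step loads an 8-wide block of packed fp16 weights (vb01234567c2 is the third of four such blocks) and multiplies it by one broadcast lane of a 4-element activation vector, one FMA per output row. Below is a minimal single-row sketch of that step, assuming an AArch64 toolchain with the fp16 arithmetic extension (e.g. -march=armv8.2-a+fp16); the function name and pointer bookkeeping are simplified from the kernels' void*/uintptr_t form and are illustrative, not copied from the files.

    #include <arm_neon.h>

    // One "ld64" k-step for a single output row (MR = 1): four fp16
    // activations (64 bits) feed four 8-wide packed weight blocks.
    static void f16_gemm_kstep(const float16_t** a0, const float16_t** w,
                               float16x8_t* vacc0x01234567) {
      const float16x4_t va0 = vld1_f16(*a0); *a0 += 4;  // 4 x fp16 = 64 bits

      const float16x8_t vb01234567c0 = vld1q_f16(*w); *w += 8;
      const float16x8_t vb01234567c1 = vld1q_f16(*w); *w += 8;
      const float16x8_t vb01234567c2 = vld1q_f16(*w); *w += 8;
      const float16x8_t vb01234567c3 = vld1q_f16(*w); *w += 8;

      // Scale each weight block by one lane of va0 and accumulate.
      *vacc0x01234567 = vfmaq_lane_f16(*vacc0x01234567, vb01234567c0, va0, 0);
      *vacc0x01234567 = vfmaq_lane_f16(*vacc0x01234567, vb01234567c1, va0, 1);
      *vacc0x01234567 = vfmaq_lane_f16(*vacc0x01234567, vb01234567c2, va0, 2);
      *vacc0x01234567 = vfmaq_lane_f16(*vacc0x01234567, vb01234567c3, va0, 3);
    }

The wider kernels (4x8 through 8x16) repeat the same four vfmaq_lane_f16 calls once per output row, which is exactly the run of vacc0..vacc7 lines in each match list.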
/external/XNNPACK/src/f16-igemm/gen/ |
D | 8x8-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64():
    199  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x…  (local)
    202  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    203  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
    204  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    205  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    206  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    207  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    208  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c2, va6, 2);
    209  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c2, va7, 2);
    220  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    [all …]
|
D | 6x8-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64():
    165  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x…  (local)
    168  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    169  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
    170  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    171  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    172  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    173  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    182  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    183  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);
    184  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    [all …]
|
D | 4x8-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64():
    131  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x…  (local)
    134  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    135  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
    136  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    137  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    144  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    145  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);
    146  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    147  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
|
D | 8x16-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64():
    241  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x…  (local)
    245  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    246  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
    247  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    248  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    249  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    250  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    251  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c2, va6, 2);
    252  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c2, va7, 2);
    271  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    [all …]
|
D | 6x16-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64():
    197  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x…  (local)
    201  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    202  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
    203  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    204  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    205  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    206  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    221  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    222  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);
    223  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    [all …]
|
D | 4x16-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64():
    153  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x…  (local)
    157  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    158  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
    159  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    160  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    171  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    172  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);
    173  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    174  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
|
D | 1x8-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64():
    80   …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x…  (local)
    83   vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    87   vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
|
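The f16-igemm matches repeat the gemm pattern over indirectly addressed input rows. The second cluster in each file (the vfmaq_f16 lines with va0c2-style operands) is the same c2 step written without the lane form: lane 2 of va0 is first splatted across a full q-register, then a plain vector FMA is issued. A minimal sketch of that dup-then-FMA form; the vdupq_lane_f16 broadcast is my reading of the surrounding generated code, not text shown in the excerpts.

    #include <arm_neon.h>

    // c2 step with an explicit broadcast instead of vfmaq_lane_f16.
    static float16x8_t f16_c2_step_dup(float16x8_t vacc0x01234567,
                                       float16x4_t va0,
                                       float16x8_t vb01234567c2) {
      const float16x8_t va0c2 = vdupq_lane_f16(va0, 2);  // splat lane 2
      return vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    }

Both spellings compute the same result; which one the generator emits typically depends on whether the target's lane-indexed FMA form is available.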
/external/XNNPACK/src/f16-gemm/gen-inc/ |
D | 8x8inc-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemminc_minmax_ukernel_8x8__neonfp16arith_ld64():
    169  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    172  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    173  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
    174  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    175  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    176  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    177  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    178  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c2, va6, 2);
    179  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c2, va7, 2);
    190  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    [all …]
|
D | 6x8inc-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemminc_minmax_ukernel_6x8__neonfp16arith_ld64():
    141  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    144  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    145  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
    146  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    147  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    148  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    149  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    158  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    159  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);
    160  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    [all …]
|
D | 4x8inc-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemminc_minmax_ukernel_4x8__neonfp16arith_ld64():
    113  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    116  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    117  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
    118  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    119  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    126  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    127  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);
    128  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    129  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
|
D | 8x16inc-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemminc_minmax_ukernel_8x16__neonfp16arith_ld64():
    211  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    215  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    216  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
    217  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    218  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    219  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    220  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    221  vacc6x01234567 = vfmaq_lane_f16(vacc6x01234567, vb01234567c2, va6, 2);
    222  vacc7x01234567 = vfmaq_lane_f16(vacc7x01234567, vb01234567c2, va7, 2);
    241  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    [all …]
|
D | 6x16inc-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemminc_minmax_ukernel_6x16__neonfp16arith_ld64():
    173  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    177  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    178  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
    179  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    180  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    181  vacc4x01234567 = vfmaq_lane_f16(vacc4x01234567, vb01234567c2, va4, 2);
    182  vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2);
    197  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    198  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);
    199  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    [all …]
|
D | 4x16inc-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemminc_minmax_ukernel_4x16__neonfp16arith_ld64():
    135  …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    139  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    140  vacc1x01234567 = vfmaq_lane_f16(vacc1x01234567, vb01234567c2, va1, 2);
    141  vacc2x01234567 = vfmaq_lane_f16(vacc2x01234567, vb01234567c2, va2, 2);
    142  vacc3x01234567 = vfmaq_lane_f16(vacc3x01234567, vb01234567c2, va3, 2);
    153  vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
    154  vacc1x01234567 = vfmaq_f16(vacc1x01234567, va1c2, vb01234567c2);
    155  vacc2x01234567 = vfmaq_f16(vacc2x01234567, va2c2, vb01234567c2);
    156  vacc3x01234567 = vfmaq_f16(vacc3x01234567, va3c2, vb01234567c2);
|
D | 1x8inc-minmax-neonfp16arith-ld64.c | all matches in xnn_f16_gemminc_minmax_ukernel_1x8__neonfp16arith_ld64():
    71   …const float16x8_t vb01234567c2 = vld1q_f16(w); w = (const void*) ((uintptr_t) w + sizeof(float16x8…  (local)
    74   vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    78   vacc0x01234567 = vfmaq_f16(vacc0x01234567, va0c2, vb01234567c2);
|
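The gen-inc ("gemminc") variants share this inner loop verbatim with their gen counterparts; as far as I can tell the only difference is accumulator setup: instead of starting from the bias packed at the head of w, they resume from a caller-provided partial-sum buffer (the acc parameter of the xnn_f16_gemminc_* signature). A hedged sketch of that difference, with simplified pointer types:

    #include <arm_neon.h>

    // gemm: accumulators start from the bias packed ahead of the weights.
    static float16x8_t init_gemm_acc(const float16_t** w) {
      const float16x8_t vacc0x01234567 = vld1q_f16(*w); *w += 8;
      return vacc0x01234567;
    }

    // gemminc: accumulators resume from a separate partial-sum buffer,
    // letting the K dimension be split across multiple ukernel calls.
    static float16x8_t init_gemminc_acc(const float16_t** acc) {
      const float16x8_t vacc0x01234567 = vld1q_f16(*acc); *acc += 8;
      return vacc0x01234567;
    }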
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 4x16s4inc-minmax-fma3-broadcast.c | all matches in xnn_f32_gemminc_minmax_ukernel_4x16s4__fma3_broadcast():
    118  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  (local)
    121  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);
    122  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c2, vacc1x01234567);
    123  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);
    124  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c2, vacc3x01234567);
|
/external/XNNPACK/src/f32-gemm/gen/ |
D | 4x16s4-minmax-fma3-broadcast.c | all matches in xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast():
    116  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  (local)
    119  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);
    120  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c2, vacc1x01234567);
    121  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);
    122  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c2, vacc3x01234567);
|
D | 5x16s4-minmax-fma3-broadcast.c | all matches in xnn_f32_gemm_minmax_ukernel_5x16s4__fma3_broadcast():
    132  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  (local)
    135  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);
    136  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c2, vacc1x01234567);
    137  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);
    138  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c2, vacc3x01234567);
    139  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c2, vacc4x01234567);
|
/external/XNNPACK/src/f32-igemm/gen/ |
D | 5x16s4-minmax-fma3-broadcast.c | all matches in xnn_f32_igemm_minmax_ukernel_5x16s4__fma3_broadcast():
    157  const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  (local)
    160  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);
    161  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567c2, vacc1x01234567);
    162  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567c2, vacc2x01234567);
    163  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567c2, vacc3x01234567);
    164  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567c2, vacc4x01234567);
|
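The f32 matches are the same c2 step on x86: vb01234567c2 is the third 8-float block of the current packed-weight panel (loaded at w + 32) and is FMA3-accumulated into every row's accumulator. In the s4 ("shuffle") variants, my understanding is that each va register holds four activations broadcast to both 128-bit halves and is rotated one lane between the c0..c3 steps; the permute below encodes that assumption and is not text from the listed files.

    #include <immintrin.h>

    // One c-step of the s4 scheme for a single row; compile with -mavx -mfma.
    static __m256 f32_s4_c2_step(__m256 vacc0x01234567, __m256* va0,
                                 const float* w) {
      const __m256 vb01234567c2 = _mm256_load_ps(w + 32);  // third 8-float block
      vacc0x01234567 = _mm256_fmadd_ps(*va0, vb01234567c2, vacc0x01234567);
      *va0 = _mm256_permute_ps(*va0, _MM_SHUFFLE(0, 3, 2, 1));  // rotate lanes
      return vacc0x01234567;
    }

Rotating va instead of re-broadcasting lets the four c-steps reuse one load of the activations, which is the point of the s4 layout.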