/external/XNNPACK/src/qs8-igemm/gen/

D | 3x8c2s4-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal():
     85  int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    102  vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, va2x1);
    112  vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, va2x1);
    121  va2x1 = vext_s8(va2x1, va2x1, 2);
    128  vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, va2x1);
    138  vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, va2x1);
    147  va2x1 = vext_s8(va2x1, va2x1, 2);
    154  vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, va2x1);
    164  vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, va2x1);
    173  va2x1 = vext_s8(va2x1, va2x1, 2);
    [all …]

D | 3x16c2s4-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal():
     91  int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    116  vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, va2x1);
    126  vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, va2x1);
    136  vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, va2x1);
    146  vprod2xCDEFc0 = vmlal_s8(vprod2xCDEFc0, vbCDEFc0x1, va2x1);
    155  va2x1 = vext_s8(va2x1, va2x1, 2);
    162  vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, va2x1);
    172  vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, va2x1);
    182  vprod2x89ABc1 = vmlal_s8(vprod2x89ABc1, vb89ABc1x1, va2x1);
    192  vprod2xCDEFc1 = vmlal_s8(vprod2xCDEFc1, vbCDEFc1x1, va2x1);
    [all …]

D | 4x8c2s4-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal():
     95  int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    115  vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, va2x1);
    128  vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, va2x1);
    139  va2x1 = vext_s8(va2x1, va2x1, 2);
    149  vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, va2x1);
    162  vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, va2x1);
    173  va2x1 = vext_s8(va2x1, va2x1, 2);
    183  vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, va2x1);
    196  vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, va2x1);
    207  va2x1 = vext_s8(va2x1, va2x1, 2);
    [all …]

D | 4x16c2s4-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal():
    103  int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    131  vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, va2x1);
    144  vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, va2x1);
    157  vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, va2x1);
    170  vprod2xCDEFc0 = vmlal_s8(vprod2xCDEFc0, vbCDEFc0x1, va2x1);
    181  va2x1 = vext_s8(va2x1, va2x1, 2);
    191  vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, va2x1);
    204  vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, va2x1);
    217  vprod2x89ABc1 = vmlal_s8(vprod2x89ABc1, vb89ABc1x1, va2x1);
    230  vprod2xCDEFc1 = vmlal_s8(vprod2xCDEFc1, vbCDEFc1x1, va2x1);
    [all …]

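All of the c2s4 hits above come from the same doubled inner loop: the kernel loads two 8-byte activation slices per row (va2x0 and va2x1), multiplies each against the matching weight slice with a widening vmull_s8/vmlal_s8 pair, and pairwise-accumulates the 16-bit products into 32-bit lanes. A minimal sketch of that multiply-accumulate for one row and one 4-column group follows; the helper name and signature are hypothetical, not the XNNPACK source.

    #include <arm_neon.h>

    /* Widening multiply-accumulate for one column group of one shuffle step
       (e.g. row 2, columns 0..3, step c0). The generated kernels fully inline
       this for every column group; this standalone helper is hypothetical. */
    static inline int32x4_t qs8_mlal_group(
        int32x4_t vacc,                  /* e.g. vacc2x0123 (int32 partial sums) */
        int8x8_t va_x0, int8x8_t va_x1,  /* e.g. va2x0 / va2x1, the two unrolls  */
        int8x8_t vb_x0, int8x8_t vb_x1)  /* e.g. vb0123c0x0 / vb0123c0x1         */
    {
      int16x8_t vprod = vmull_s8(vb_x0, va_x0);  /* first 8-byte unroll           */
      vprod = vmlal_s8(vprod, vb_x1, va_x1);     /* second unroll: the hits above */
      return vpadalq_s16(vacc, vprod);           /* widen pairs into int32 lanes  */
    }

The vext_s8 hits are the shuffle between steps: after each step the kernel rotates the activation slices in place (va2x1 = vext_s8(va2x1, va2x1, 2); and likewise for va2x0) so the next pair of input channels lines up with the next weight block.
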
D | 3x16c4s2-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal():
    103  int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    128  vprod2x01c0 = vmlal_s8(vprod2x01c0, vb01c0x1, va2x1);
    138  vprod2x23c0 = vmlal_s8(vprod2x23c0, vb23c0x1, va2x1);
    148  vprod2x45c0 = vmlal_s8(vprod2x45c0, vb45c0x1, va2x1);
    158  vprod2x67c0 = vmlal_s8(vprod2x67c0, vb67c0x1, va2x1);
    168  vprod2x89c0 = vmlal_s8(vprod2x89c0, vb89c0x1, va2x1);
    178  vprod2xABc0 = vmlal_s8(vprod2xABc0, vbABc0x1, va2x1);
    188  vprod2xCDc0 = vmlal_s8(vprod2xCDc0, vbCDc0x1, va2x1);
    198  vprod2xEFc0 = vmlal_s8(vprod2xEFc0, vbEFc0x1, va2x1);
    207  va2x1 = vext_s8(va2x1, va2x1, 4);
    [all …]

D | 3x8c4s2-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal():
     91  int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    108  vprod2x01c0 = vmlal_s8(vprod2x01c0, vb01c0x1, va2x1);
    118  vprod2x23c0 = vmlal_s8(vprod2x23c0, vb23c0x1, va2x1);
    128  vprod2x45c0 = vmlal_s8(vprod2x45c0, vb45c0x1, va2x1);
    138  vprod2x67c0 = vmlal_s8(vprod2x67c0, vb67c0x1, va2x1);
    147  va2x1 = vext_s8(va2x1, va2x1, 4);
    154  vprod2x01c1 = vmlal_s8(vprod2x01c1, vb01c1x1, va2x1);
    164  vprod2x23c1 = vmlal_s8(vprod2x23c1, vb23c1x1, va2x1);
    174  vprod2x45c1 = vmlal_s8(vprod2x45c1, vb45c1x1, va2x1);
    184  vprod2x67c1 = vmlal_s8(vprod2x67c1, vb67c1x1, va2x1);

D | 4x8c4s2-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal():
    103  int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    123  vprod2x01c0 = vmlal_s8(vprod2x01c0, vb01c0x1, va2x1);
    136  vprod2x23c0 = vmlal_s8(vprod2x23c0, vb23c0x1, va2x1);
    149  vprod2x45c0 = vmlal_s8(vprod2x45c0, vb45c0x1, va2x1);
    162  vprod2x67c0 = vmlal_s8(vprod2x67c0, vb67c0x1, va2x1);
    173  va2x1 = vext_s8(va2x1, va2x1, 4);
    183  vprod2x01c1 = vmlal_s8(vprod2x01c1, vb01c1x1, va2x1);
    196  vprod2x23c1 = vmlal_s8(vprod2x23c1, vb23c1x1, va2x1);
    209  vprod2x45c1 = vmlal_s8(vprod2x45c1, vb45c1x1, va2x1);
    222  vprod2x67c1 = vmlal_s8(vprod2x67c1, vb67c1x1, va2x1);

D | 4x16c4s2-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal():
    119  int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    147  vprod2x01c0 = vmlal_s8(vprod2x01c0, vb01c0x1, va2x1);
    160  vprod2x23c0 = vmlal_s8(vprod2x23c0, vb23c0x1, va2x1);
    173  vprod2x45c0 = vmlal_s8(vprod2x45c0, vb45c0x1, va2x1);
    186  vprod2x67c0 = vmlal_s8(vprod2x67c0, vb67c0x1, va2x1);
    199  vprod2x89c0 = vmlal_s8(vprod2x89c0, vb89c0x1, va2x1);
    212  vprod2xABc0 = vmlal_s8(vprod2xABc0, vbABc0x1, va2x1);
    225  vprod2xCDc0 = vmlal_s8(vprod2xCDc0, vbCDc0x1, va2x1);
    238  vprod2xEFc0 = vmlal_s8(vprod2xEFc0, vbEFc0x1, va2x1);
    249  va2x1 = vext_s8(va2x1, va2x1, 4);
    [all …]

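The c4s2 kernels above follow the same scheme with a coarser shuffle: weights come in 2-column groups (vb01…, vb23…, …) and the activation slices rotate by 4 bytes, one 4-channel group, between steps. A hedged sketch of one full shuffle step for row 2 of an 8-wide tile; the standalone function is hypothetical, the generated kernels inline and unroll all of it.

    #include <arm_neon.h>

    static inline void qs8_c4s2_step(
        int32x4_t vacc[4],                 /* vacc2x01, vacc2x23, vacc2x45, vacc2x67 */
        int8x8_t *va2x0, int8x8_t *va2x1,  /* activation slices, rotated in place    */
        const int8x8_t vb_x0[4],           /* vb01c0x0 .. vb67c0x0                   */
        const int8x8_t vb_x1[4])           /* vb01c0x1 .. vb67c0x1                   */
    {
      for (int g = 0; g < 4; g++) {
        int16x8_t vprod = vmull_s8(vb_x0[g], *va2x0);
        vprod = vmlal_s8(vprod, vb_x1[g], *va2x1);
        vacc[g] = vpadalq_s16(vacc[g], vprod);
      }
      /* Rotate one 4-channel group for step c1: the vext_s8(..., 4) hits above. */
      *va2x0 = vext_s8(*va2x0, *va2x0, 4);
      *va2x1 = vext_s8(*va2x1, *va2x1, 4);
    }
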
D | 3x16c8-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal():
    128  const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    153  vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1);
    163  vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1);
    173  vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
    183  vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
    193  vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1);
    203  vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1);
    213  vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1);
    223  vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1);
    233  vprod2x8 = vmlal_s8(vprod2x8, vb8x1, va2x1);
    [all …]

D | 3x8c8-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c8__neon_mlal():
    104  const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    121  vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1);
    131  vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1);
    141  vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
    151  vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
    161  vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1);
    171  vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1);
    181  vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1);
    191  vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1);

D | 4x16c8-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal():
    152  const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    180  vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1);
    193  vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1);
    206  vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
    219  vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
    232  vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1);
    245  vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1);
    258  vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1);
    271  vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1);
    284  vprod2x8 = vmlal_s8(vprod2x8, vb8x1, va2x1);
    [all …]

D | 4x8c8-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal():
    120  const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    140  vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1);
    153  vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1);
    166  vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
    179  vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
    192  vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1);
    205  vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1);
    218  vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1);
    231  vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1);

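The c8 kernels need no rotation at all: each output column N owns a full 8-channel slice of the weights (vbNx0/vbNx1), so one vmull_s8/vmlal_s8 pair per unroll covers the whole column, and va2x1 is never shuffled (note the const declarations and the absence of vext_s8 hits). A minimal per-column sketch; the helper is hypothetical and the kernels unroll it over columns 0..7 or 0..15.

    #include <arm_neon.h>

    static inline int32x4_t qs8_c8_column(
        int32x4_t vacc2xN,               /* int32 partial sums for column N */
        int8x8_t va2x0, int8x8_t va2x1,  /* the two 8-byte unrolls of row 2 */
        int8x8_t vbNx0, int8x8_t vbNx1)  /* column N weights, both unrolls  */
    {
      int16x8_t vprod2xN = vmull_s8(vbNx0, va2x0);
      vprod2xN = vmlal_s8(vprod2xN, vbNx1, va2x1);  /* the vprod2xN hits above */
      return vpadalq_s16(vacc2xN, vprod2xN);
    }

Each column's int32x4 partial sums are reduced horizontally after the k-loop; that reduction lies outside the hits shown here.
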
/external/XNNPACK/src/qs8-gemm/gen/

D | 3x8c2s4-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal():
     70  int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
     87  vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, va2x1);
     97  vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, va2x1);
    106  va2x1 = vext_s8(va2x1, va2x1, 2);
    113  vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, va2x1);
    123  vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, va2x1);
    132  va2x1 = vext_s8(va2x1, va2x1, 2);
    139  vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, va2x1);
    149  vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, va2x1);
    158  va2x1 = vext_s8(va2x1, va2x1, 2);
    [all …]

D | 3x16c2s4-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal():
     76  int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    101  vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, va2x1);
    111  vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, va2x1);
    121  vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, va2x1);
    131  vprod2xCDEFc0 = vmlal_s8(vprod2xCDEFc0, vbCDEFc0x1, va2x1);
    140  va2x1 = vext_s8(va2x1, va2x1, 2);
    147  vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, va2x1);
    157  vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, va2x1);
    167  vprod2x89ABc1 = vmlal_s8(vprod2x89ABc1, vb89ABc1x1, va2x1);
    177  vprod2xCDEFc1 = vmlal_s8(vprod2xCDEFc1, vbCDEFc1x1, va2x1);
    [all …]

D | 4x8c2s4-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal():
     78  int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
     98  vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, va2x1);
    111  vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, va2x1);
    122  va2x1 = vext_s8(va2x1, va2x1, 2);
    132  vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, va2x1);
    145  vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, va2x1);
    156  va2x1 = vext_s8(va2x1, va2x1, 2);
    166  vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, va2x1);
    179  vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, va2x1);
    190  va2x1 = vext_s8(va2x1, va2x1, 2);
    [all …]

D | 4x16c2s4-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal():
     86  int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    114  vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, va2x1);
    127  vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, va2x1);
    140  vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, va2x1);
    153  vprod2xCDEFc0 = vmlal_s8(vprod2xCDEFc0, vbCDEFc0x1, va2x1);
    164  va2x1 = vext_s8(va2x1, va2x1, 2);
    174  vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, va2x1);
    187  vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, va2x1);
    200  vprod2x89ABc1 = vmlal_s8(vprod2x89ABc1, vb89ABc1x1, va2x1);
    213  vprod2xCDEFc1 = vmlal_s8(vprod2xCDEFc1, vbCDEFc1x1, va2x1);
    [all …]

D | 3x16c4s2-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4s2__neon_mlal():
     88  int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    113  vprod2x01c0 = vmlal_s8(vprod2x01c0, vb01c0x1, va2x1);
    123  vprod2x23c0 = vmlal_s8(vprod2x23c0, vb23c0x1, va2x1);
    133  vprod2x45c0 = vmlal_s8(vprod2x45c0, vb45c0x1, va2x1);
    143  vprod2x67c0 = vmlal_s8(vprod2x67c0, vb67c0x1, va2x1);
    153  vprod2x89c0 = vmlal_s8(vprod2x89c0, vb89c0x1, va2x1);
    163  vprod2xABc0 = vmlal_s8(vprod2xABc0, vbABc0x1, va2x1);
    173  vprod2xCDc0 = vmlal_s8(vprod2xCDc0, vbCDc0x1, va2x1);
    183  vprod2xEFc0 = vmlal_s8(vprod2xEFc0, vbEFc0x1, va2x1);
    192  va2x1 = vext_s8(va2x1, va2x1, 4);
    [all …]

D | 3x8c4s2-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4s2__neon_mlal():
     76  int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
     93  vprod2x01c0 = vmlal_s8(vprod2x01c0, vb01c0x1, va2x1);
    103  vprod2x23c0 = vmlal_s8(vprod2x23c0, vb23c0x1, va2x1);
    113  vprod2x45c0 = vmlal_s8(vprod2x45c0, vb45c0x1, va2x1);
    123  vprod2x67c0 = vmlal_s8(vprod2x67c0, vb67c0x1, va2x1);
    132  va2x1 = vext_s8(va2x1, va2x1, 4);
    139  vprod2x01c1 = vmlal_s8(vprod2x01c1, vb01c1x1, va2x1);
    149  vprod2x23c1 = vmlal_s8(vprod2x23c1, vb23c1x1, va2x1);
    159  vprod2x45c1 = vmlal_s8(vprod2x45c1, vb45c1x1, va2x1);
    169  vprod2x67c1 = vmlal_s8(vprod2x67c1, vb67c1x1, va2x1);

D | 4x8c4s2-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal():
     86  int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    106  vprod2x01c0 = vmlal_s8(vprod2x01c0, vb01c0x1, va2x1);
    119  vprod2x23c0 = vmlal_s8(vprod2x23c0, vb23c0x1, va2x1);
    132  vprod2x45c0 = vmlal_s8(vprod2x45c0, vb45c0x1, va2x1);
    145  vprod2x67c0 = vmlal_s8(vprod2x67c0, vb67c0x1, va2x1);
    156  va2x1 = vext_s8(va2x1, va2x1, 4);
    166  vprod2x01c1 = vmlal_s8(vprod2x01c1, vb01c1x1, va2x1);
    179  vprod2x23c1 = vmlal_s8(vprod2x23c1, vb23c1x1, va2x1);
    192  vprod2x45c1 = vmlal_s8(vprod2x45c1, vb45c1x1, va2x1);
    205  vprod2x67c1 = vmlal_s8(vprod2x67c1, vb67c1x1, va2x1);

D | 4x16c4s2-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal():
    102  int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    130  vprod2x01c0 = vmlal_s8(vprod2x01c0, vb01c0x1, va2x1);
    143  vprod2x23c0 = vmlal_s8(vprod2x23c0, vb23c0x1, va2x1);
    156  vprod2x45c0 = vmlal_s8(vprod2x45c0, vb45c0x1, va2x1);
    169  vprod2x67c0 = vmlal_s8(vprod2x67c0, vb67c0x1, va2x1);
    182  vprod2x89c0 = vmlal_s8(vprod2x89c0, vb89c0x1, va2x1);
    195  vprod2xABc0 = vmlal_s8(vprod2xABc0, vbABc0x1, va2x1);
    208  vprod2xCDc0 = vmlal_s8(vprod2xCDc0, vbCDc0x1, va2x1);
    221  vprod2xEFc0 = vmlal_s8(vprod2xEFc0, vbEFc0x1, va2x1);
    232  va2x1 = vext_s8(va2x1, va2x1, 4);
    [all …]

D | 3x16c8-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal():
    113  const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    138  vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1);
    148  vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1);
    158  vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
    168  vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
    178  vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1);
    188  vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1);
    198  vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1);
    208  vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1);
    218  vprod2x8 = vmlal_s8(vprod2x8, vb8x1, va2x1);
    [all …]

D | 3x8c8-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mlal():
     89  const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    106  vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1);
    116  vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1);
    126  vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
    136  vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
    146  vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1);
    156  vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1);
    166  vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1);
    176  vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1);

D | 4x16c8-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal():
    135  const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    163  vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1);
    176  vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1);
    189  vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
    202  vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
    215  vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1);
    228  vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1);
    241  vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1);
    254  vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1);
    267  vprod2x8 = vmlal_s8(vprod2x8, vb8x1, va2x1);
    [all …]

D | 4x8c8-minmax-rndnu-neon-mlal.c | all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal():
    103  const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;   (local)
    123  vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1);
    136  vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1);
    149  vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
    162  vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
    175  vprod2x4 = vmlal_s8(vprod2x4, vb4x1, va2x1);
    188  vprod2x5 = vmlal_s8(vprod2x5, vb5x1, va2x1);
    201  vprod2x6 = vmlal_s8(vprod2x6, vb6x1, va2x1);
    214  vprod2x7 = vmlal_s8(vprod2x7, vb7x1, va2x1);

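The qs8-gemm hits mirror the qs8-igemm ones one-for-one; the inner loops are identical, and the two families differ mainly in where the row-2 activation pointer that feeds the va2x1 loads comes from. A hedged sketch of the two conventions; the names (a, a1, a_stride, a_offset, zero) follow the generated kernels, but the wrapper functions are hypothetical.

    #include <stddef.h>
    #include <stdint.h>

    /* GEMM: rows are strided slices of one dense A matrix. */
    static const int8_t* row2_pointer_gemm(const int8_t* a1, size_t a_stride) {
      return (const int8_t*) ((uintptr_t) a1 + a_stride);
    }

    /* IGEMM: row pointers come from an indirection buffer; a shared zero row
       stands in for padding taps and must not be offset. */
    static const int8_t* row2_pointer_igemm(
        const int8_t** a, size_t a_offset, const int8_t* zero) {
      const int8_t* a2 = a[2];
      if (a2 != zero) {
        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
      }
      return a2;
    }
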
/external/XNNPACK/src/bf16-gemm/gen/

D | 3x4c8-minmax-neonbf16-bfmlal.c | all hits in xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal():
    136  … const bfloat16x8_t va2x1 = vreinterpretq_bf16_u16(vbicq_u16(vreinterpretq_u16_bf16(va2), vm1));   (local)
    137  vacc2x1 = vbfmlalbq_f32(vacc2x1, va2x1, vb1);
    138  vacc2x1 = vbfmlaltq_f32(vacc2x1, va2x1, vb1);

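The single bf16 hit is the k-remainder path of the bfloat16 kernel: lanes past the end of the row are cleared with a bit mask, and the BFMLALB/BFMLALT pair then accumulates the even and odd bf16 lanes into float32. A minimal sketch, assuming the Arm BF16 extension (e.g. compile with -march=armv8.2-a+bf16); the helper is hypothetical, not the XNNPACK source.

    #include <arm_neon.h>

    static inline float32x4_t bf16_masked_mlal(
        float32x4_t vacc, bfloat16x8_t va, bfloat16x8_t vb, uint16x8_t vmask)
    {
      /* Clearing all 16 bits of a lane yields bf16 +0.0, which contributes
         nothing to the accumulator. */
      const bfloat16x8_t va_masked =
          vreinterpretq_bf16_u16(vbicq_u16(vreinterpretq_u16_bf16(va), vmask));
      vacc = vbfmlalbq_f32(vacc, va_masked, vb);  /* bottom (even) lanes */
      vacc = vbfmlaltq_f32(vacc, va_masked, vb);  /* top (odd) lanes     */
      return vacc;
    }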