/external/XNNPACK/src/qs8-igemm/gen/ |
D | 4x8c2s4-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal():
     96  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
    111  int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
    124  int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
    140  va3x0 = vext_s8(va3x0, va3x0, 2);
    145  int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
    158  int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
    174  va3x0 = vext_s8(va3x0, va3x0, 2);
    179  int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, va3x0);
    192  int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, va3x0);
    208  va3x0 = vext_s8(va3x0, va3x0, 2);
    [all …]
|
D | 4x16c2s4-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal():
    104  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
    127  int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
    140  int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
    153  int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0x0, va3x0);
    166  int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0x0, va3x0);
    182  va3x0 = vext_s8(va3x0, va3x0, 2);
    187  int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
    200  int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
    213  int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1x0, va3x0);
    226  int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3x0);
    [all …]
|
D | 4x8c2s4-minmax-rndnu-neon-mull.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull():
     93  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
    107  int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
    115  int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
    123  va3x0 = vext_s8(va3x0, va3x0, 2);
    127  int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
    135  int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
    143  va3x0 = vext_s8(va3x0, va3x0, 2);
    147  int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, va3x0);
    155  int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, va3x0);
    163  va3x0 = vext_s8(va3x0, va3x0, 2);
    [all …]
|
D | 4x16c2s4-minmax-rndnu-neon-mull.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull():
    101  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
    123  int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
    131  int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
    139  int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0x0, va3x0);
    147  int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0x0, va3x0);
    155  va3x0 = vext_s8(va3x0, va3x0, 2);
    159  int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
    167  int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
    175  int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1x0, va3x0);
    183  int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3x0);
    [all …]
|
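The c2s4 matches above all share the same inner-loop shape: load eight signed-byte activations for row 3, widen-multiply them against each group of packed weights with vmull_s8, and rotate the activation vector by two bytes with vext_s8 before the next group. Below is a minimal C sketch of that per-row step; the helper name, weight layout, and accumulator arguments are assumptions for illustration, not the generated XNNPACK code, and the 16-bit products are folded into 32-bit accumulators with vpadalq_s16 the way the surrounding kernels do.

  #include <arm_neon.h>

  /* Sketch only -- assumed helper and layout, not XNNPACK source.  One k-step
     for row 3 of a 4x8 c2s4 kernel: four 2-column groups reuse the same
     8-byte activation vector, rotated 2 bytes between groups.  */
  static void qs8_c2s4_row_step(const int8_t* a3, const int8_t* w,
                                int32x4_t* vacc3x0123, int32x4_t* vacc3x4567) {
    int8x8_t va3x0 = vld1_s8(a3);                  /* 8 int8 activations, row 3 */
    for (int c = 0; c < 4; c++) {
      const int8x8_t vb0123 = vld1_s8(w); w += 8;  /* weights for outputs 0-3 */
      const int8x8_t vb4567 = vld1_s8(w); w += 8;  /* weights for outputs 4-7 */
      /* Widening int8 x int8 -> int16 products (the vprod3x...c<c> lines). */
      const int16x8_t vprod3x0123 = vmull_s8(vb0123, va3x0);
      const int16x8_t vprod3x4567 = vmull_s8(vb4567, va3x0);
      /* Pairwise-accumulate the int16 products into the int32 accumulators. */
      *vacc3x0123 = vpadalq_s16(*vacc3x0123, vprod3x0123);
      *vacc3x4567 = vpadalq_s16(*vacc3x4567, vprod3x4567);
      /* Rotate the activations by 2 bytes for the next column group. */
      va3x0 = vext_s8(va3x0, va3x0, 2);
    }
  }

The c4s2 entries that follow are the same pattern with 2-column weight groups and a four-byte rotation, vext_s8(va3x0, va3x0, 4).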
D | 4x8c4s2-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal():
    104  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
    119  int16x8_t vprod3x01c0 = vmull_s8(vb01c0x0, va3x0);
    132  int16x8_t vprod3x23c0 = vmull_s8(vb23c0x0, va3x0);
    145  int16x8_t vprod3x45c0 = vmull_s8(vb45c0x0, va3x0);
    158  int16x8_t vprod3x67c0 = vmull_s8(vb67c0x0, va3x0);
    174  va3x0 = vext_s8(va3x0, va3x0, 4);
    179  int16x8_t vprod3x01c1 = vmull_s8(vb01c1x0, va3x0);
    192  int16x8_t vprod3x23c1 = vmull_s8(vb23c1x0, va3x0);
    205  int16x8_t vprod3x45c1 = vmull_s8(vb45c1x0, va3x0);
    218  int16x8_t vprod3x67c1 = vmull_s8(vb67c1x0, va3x0);
    [all …]
|
D | 4x16c4s2-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal():
    120  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
    143  int16x8_t vprod3x01c0 = vmull_s8(vb01c0x0, va3x0);
    156  int16x8_t vprod3x23c0 = vmull_s8(vb23c0x0, va3x0);
    169  int16x8_t vprod3x45c0 = vmull_s8(vb45c0x0, va3x0);
    182  int16x8_t vprod3x67c0 = vmull_s8(vb67c0x0, va3x0);
    195  int16x8_t vprod3x89c0 = vmull_s8(vb89c0x0, va3x0);
    208  int16x8_t vprod3xABc0 = vmull_s8(vbABc0x0, va3x0);
    221  int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0x0, va3x0);
    234  int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0x0, va3x0);
    250  va3x0 = vext_s8(va3x0, va3x0, 4);
    [all …]
|
D | 4x8c4s2-minmax-rndnu-neon-mull.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull():
    101  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
    115  int16x8_t vprod3x01c0 = vmull_s8(vb01c0x0, va3x0);
    123  int16x8_t vprod3x23c0 = vmull_s8(vb23c0x0, va3x0);
    131  int16x8_t vprod3x45c0 = vmull_s8(vb45c0x0, va3x0);
    139  int16x8_t vprod3x67c0 = vmull_s8(vb67c0x0, va3x0);
    147  va3x0 = vext_s8(va3x0, va3x0, 4);
    151  int16x8_t vprod3x01c1 = vmull_s8(vb01c1x0, va3x0);
    159  int16x8_t vprod3x23c1 = vmull_s8(vb23c1x0, va3x0);
    167  int16x8_t vprod3x45c1 = vmull_s8(vb45c1x0, va3x0);
    175  int16x8_t vprod3x67c1 = vmull_s8(vb67c1x0, va3x0);
|
D | 4x16c4s2-minmax-rndnu-neon-mull.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull():
    117  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
    139  int16x8_t vprod3x01c0 = vmull_s8(vb01c0x0, va3x0);
    147  int16x8_t vprod3x23c0 = vmull_s8(vb23c0x0, va3x0);
    155  int16x8_t vprod3x45c0 = vmull_s8(vb45c0x0, va3x0);
    163  int16x8_t vprod3x67c0 = vmull_s8(vb67c0x0, va3x0);
    171  int16x8_t vprod3x89c0 = vmull_s8(vb89c0x0, va3x0);
    179  int16x8_t vprod3xABc0 = vmull_s8(vbABc0x0, va3x0);
    187  int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0x0, va3x0);
    195  int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0x0, va3x0);
    203  va3x0 = vext_s8(va3x0, va3x0, 4);
    [all …]
|
D | 4x16c8-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal():
    153  const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
    177  int16x8_t vprod3x0 = vmull_s8(vb0x0, va3x0);
    190  int16x8_t vprod3x1 = vmull_s8(vb1x0, va3x0);
    203  int16x8_t vprod3x2 = vmull_s8(vb2x0, va3x0);
    216  int16x8_t vprod3x3 = vmull_s8(vb3x0, va3x0);
    229  int16x8_t vprod3x4 = vmull_s8(vb4x0, va3x0);
    242  int16x8_t vprod3x5 = vmull_s8(vb5x0, va3x0);
    255  int16x8_t vprod3x6 = vmull_s8(vb6x0, va3x0);
    268  int16x8_t vprod3x7 = vmull_s8(vb7x0, va3x0);
    281  int16x8_t vprod3x8 = vmull_s8(vb8x0, va3x0);
    [all …]
|
D | 4x8c8-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c8__neon_mlal():
    121  const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
    137  int16x8_t vprod3x0 = vmull_s8(vb0x0, va3x0);
    150  int16x8_t vprod3x1 = vmull_s8(vb1x0, va3x0);
    163  int16x8_t vprod3x2 = vmull_s8(vb2x0, va3x0);
    176  int16x8_t vprod3x3 = vmull_s8(vb3x0, va3x0);
    189  int16x8_t vprod3x4 = vmull_s8(vb4x0, va3x0);
    202  int16x8_t vprod3x5 = vmull_s8(vb5x0, va3x0);
    215  int16x8_t vprod3x6 = vmull_s8(vb6x0, va3x0);
    228  int16x8_t vprod3x7 = vmull_s8(vb7x0, va3x0);
|
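The c8 mlal kernels above consume sixteen input channels per step: a second activation slice (va3x1) follows va3x0 in the generated code, and each output channel's products are formed with a vmull_s8/vmlal_s8 pair before being folded into a 32-bit accumulator. A small sketch of one such channel update follows; the helper name and pointer arguments are assumptions for illustration, not XNNPACK's API.

  #include <arm_neon.h>

  /* Sketch only -- assumed helper, not XNNPACK source.  One output channel
     of a c8 "mlal" kernel: 16 input channels split into two 8-byte slices. */
  static int32x4_t qs8_c8_mlal_channel(int32x4_t vacc3x0,
                                       const int8_t* a3, const int8_t* w0) {
    const int8x8_t va3x0 = vld1_s8(a3);      /* activations, channels 0-7   */
    const int8x8_t va3x1 = vld1_s8(a3 + 8);  /* activations, channels 8-15  */
    const int8x8_t vb0x0 = vld1_s8(w0);      /* weights, channels 0-7       */
    const int8x8_t vb0x1 = vld1_s8(w0 + 8);  /* weights, channels 8-15      */
    int16x8_t vprod3x0 = vmull_s8(vb0x0, va3x0);  /* first 8 int16 products */
    vprod3x0 = vmlal_s8(vprod3x0, vb0x1, va3x1);  /* accumulate the next 8  */
    return vpadalq_s16(vacc3x0, vprod3x0);        /* fold into int32 lanes  */
  }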
D | 4x8c2-minmax-rndnu-neon-mlal-dup.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup():
     96  const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
    114  const int8x8_t va3c0x0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0));
    149  const int8x8_t va3c1x0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1));
    184  const int8x8_t va3c2x0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2));
    219  const int8x8_t va3c3x0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3));
|
D | 4x8c2-minmax-rndnu-neon-mlal-ld4r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld4r():
     96  const int16x4x4_t va3x0 = vld4_dup_s16((const void*)a3); a3 += 8;  (local)
    114  const int8x8_t va3c0x0 = vreinterpret_s8_s16(va3x0.val[0]);
    149  const int8x8_t va3c1x0 = vreinterpret_s8_s16(va3x0.val[1]);
    184  const int8x8_t va3c2x0 = vreinterpret_s8_s16(va3x0.val[2]);
    219  const int8x8_t va3c3x0 = vreinterpret_s8_s16(va3x0.val[3]);
|
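The two c2 variants above differ only in how the 2-byte activation pairs of va3x0 are broadcast across a vector: the -dup kernel loads with vld1_s8 and splats each 16-bit lane with vdup_lane_s16, while the -ld4r kernel performs the load and broadcast in a single vld4_dup_s16. A hedged sketch of both forms (the helper names are hypothetical, the intrinsic sequences mirror the matched lines):

  #include <arm_neon.h>

  /* Sketch only -- hypothetical helpers, not XNNPACK source.  Both fill the
     same four broadcast vectors va3c0x0..va3c3x0; only the load differs. */
  static void qs8_c2_broadcast_dup(const int8_t* a3, int8x8_t out[4]) {
    const int8x8_t va3x0 = vld1_s8(a3);  /* plain 8-byte load */
    /* Splat each 16-bit activation pair across its own vector. */
    out[0] = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0));
    out[1] = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1));
    out[2] = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2));
    out[3] = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3));
  }

  static void qs8_c2_broadcast_ld4r(const int8_t* a3, int8x8_t out[4]) {
    /* Structure load that broadcasts each of the four 16-bit pairs (LD4R). */
    const int16x4x4_t va3x0 = vld4_dup_s16((const void*) a3);
    out[0] = vreinterpret_s8_s16(va3x0.val[0]);
    out[1] = vreinterpret_s8_s16(va3x0.val[1]);
    out[2] = vreinterpret_s8_s16(va3x0.val[2]);
    out[3] = vreinterpret_s8_s16(va3x0.val[3]);
  }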
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 4x8c2s4-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal():
     79  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
     94  int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
    107  int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
    123  va3x0 = vext_s8(va3x0, va3x0, 2);
    128  int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
    141  int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
    157  va3x0 = vext_s8(va3x0, va3x0, 2);
    162  int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, va3x0);
    175  int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, va3x0);
    191  va3x0 = vext_s8(va3x0, va3x0, 2);
    [all …]
|
D | 4x16c2s4-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal():
     87  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
    110  int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
    123  int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
    136  int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0x0, va3x0);
    149  int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0x0, va3x0);
    165  va3x0 = vext_s8(va3x0, va3x0, 2);
    170  int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
    183  int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
    196  int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1x0, va3x0);
    209  int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3x0);
    [all …]
|
D | 4x8c2s4-minmax-rndnu-neon-mull.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull():
     76  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
     90  int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
     98  int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
    106  va3x0 = vext_s8(va3x0, va3x0, 2);
    110  int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
    118  int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
    126  va3x0 = vext_s8(va3x0, va3x0, 2);
    130  int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, va3x0);
    138  int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, va3x0);
    146  va3x0 = vext_s8(va3x0, va3x0, 2);
    [all …]
|
D | 4x16c2s4-minmax-rndnu-neon-mull.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull():
     84  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
    106  int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
    114  int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
    122  int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0x0, va3x0);
    130  int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0x0, va3x0);
    138  va3x0 = vext_s8(va3x0, va3x0, 2);
    142  int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
    150  int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
    158  int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1x0, va3x0);
    166  int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3x0);
    [all …]
|
D | 4x8c4s2-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mlal():
     87  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
    102  int16x8_t vprod3x01c0 = vmull_s8(vb01c0x0, va3x0);
    115  int16x8_t vprod3x23c0 = vmull_s8(vb23c0x0, va3x0);
    128  int16x8_t vprod3x45c0 = vmull_s8(vb45c0x0, va3x0);
    141  int16x8_t vprod3x67c0 = vmull_s8(vb67c0x0, va3x0);
    157  va3x0 = vext_s8(va3x0, va3x0, 4);
    162  int16x8_t vprod3x01c1 = vmull_s8(vb01c1x0, va3x0);
    175  int16x8_t vprod3x23c1 = vmull_s8(vb23c1x0, va3x0);
    188  int16x8_t vprod3x45c1 = vmull_s8(vb45c1x0, va3x0);
    201  int16x8_t vprod3x67c1 = vmull_s8(vb67c1x0, va3x0);
    [all …]
|
D | 4x16c4s2-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mlal():
    103  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
    126  int16x8_t vprod3x01c0 = vmull_s8(vb01c0x0, va3x0);
    139  int16x8_t vprod3x23c0 = vmull_s8(vb23c0x0, va3x0);
    152  int16x8_t vprod3x45c0 = vmull_s8(vb45c0x0, va3x0);
    165  int16x8_t vprod3x67c0 = vmull_s8(vb67c0x0, va3x0);
    178  int16x8_t vprod3x89c0 = vmull_s8(vb89c0x0, va3x0);
    191  int16x8_t vprod3xABc0 = vmull_s8(vbABc0x0, va3x0);
    204  int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0x0, va3x0);
    217  int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0x0, va3x0);
    233  va3x0 = vext_s8(va3x0, va3x0, 4);
    [all …]
|
D | 4x8c4s2-minmax-rndnu-neon-mull.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4s2__neon_mull():
     84  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
     98  int16x8_t vprod3x01c0 = vmull_s8(vb01c0x0, va3x0);
    106  int16x8_t vprod3x23c0 = vmull_s8(vb23c0x0, va3x0);
    114  int16x8_t vprod3x45c0 = vmull_s8(vb45c0x0, va3x0);
    122  int16x8_t vprod3x67c0 = vmull_s8(vb67c0x0, va3x0);
    130  va3x0 = vext_s8(va3x0, va3x0, 4);
    134  int16x8_t vprod3x01c1 = vmull_s8(vb01c1x0, va3x0);
    142  int16x8_t vprod3x23c1 = vmull_s8(vb23c1x0, va3x0);
    150  int16x8_t vprod3x45c1 = vmull_s8(vb45c1x0, va3x0);
    158  int16x8_t vprod3x67c1 = vmull_s8(vb67c1x0, va3x0);
|
D | 4x16c4s2-minmax-rndnu-neon-mull.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4s2__neon_mull():
    100  int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
    122  int16x8_t vprod3x01c0 = vmull_s8(vb01c0x0, va3x0);
    130  int16x8_t vprod3x23c0 = vmull_s8(vb23c0x0, va3x0);
    138  int16x8_t vprod3x45c0 = vmull_s8(vb45c0x0, va3x0);
    146  int16x8_t vprod3x67c0 = vmull_s8(vb67c0x0, va3x0);
    154  int16x8_t vprod3x89c0 = vmull_s8(vb89c0x0, va3x0);
    162  int16x8_t vprod3xABc0 = vmull_s8(vbABc0x0, va3x0);
    170  int16x8_t vprod3xCDc0 = vmull_s8(vbCDc0x0, va3x0);
    178  int16x8_t vprod3xEFc0 = vmull_s8(vbEFc0x0, va3x0);
    186  va3x0 = vext_s8(va3x0, va3x0, 4);
    [all …]
|
D | 4x16c8-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal():
    136  const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
    160  int16x8_t vprod3x0 = vmull_s8(vb0x0, va3x0);
    173  int16x8_t vprod3x1 = vmull_s8(vb1x0, va3x0);
    186  int16x8_t vprod3x2 = vmull_s8(vb2x0, va3x0);
    199  int16x8_t vprod3x3 = vmull_s8(vb3x0, va3x0);
    212  int16x8_t vprod3x4 = vmull_s8(vb4x0, va3x0);
    225  int16x8_t vprod3x5 = vmull_s8(vb5x0, va3x0);
    238  int16x8_t vprod3x6 = vmull_s8(vb6x0, va3x0);
    251  int16x8_t vprod3x7 = vmull_s8(vb7x0, va3x0);
    264  int16x8_t vprod3x8 = vmull_s8(vb8x0, va3x0);
    [all …]
|
D | 4x8c8-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal():
    104  const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
    120  int16x8_t vprod3x0 = vmull_s8(vb0x0, va3x0);
    133  int16x8_t vprod3x1 = vmull_s8(vb1x0, va3x0);
    146  int16x8_t vprod3x2 = vmull_s8(vb2x0, va3x0);
    159  int16x8_t vprod3x3 = vmull_s8(vb3x0, va3x0);
    172  int16x8_t vprod3x4 = vmull_s8(vb4x0, va3x0);
    185  int16x8_t vprod3x5 = vmull_s8(vb5x0, va3x0);
    198  int16x8_t vprod3x6 = vmull_s8(vb6x0, va3x0);
    211  int16x8_t vprod3x7 = vmull_s8(vb7x0, va3x0);
|
D | 4x8c2-minmax-rndnu-neon-mlal-ld4r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld4r():
     79  const int16x4x4_t va3x0 = vld4_dup_s16((const void*)a3); a3 += 8;  (local)
     96  const int8x8_t va3c0x0 = vreinterpret_s8_s16(va3x0.val[0]);
    131  const int8x8_t va3c1x0 = vreinterpret_s8_s16(va3x0.val[1]);
    166  const int8x8_t va3c2x0 = vreinterpret_s8_s16(va3x0.val[2]);
    201  const int8x8_t va3c3x0 = vreinterpret_s8_s16(va3x0.val[3]);
|
D | 4x8c2-minmax-rndnu-neon-mlal-dup.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup():
     79  const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;  (local)
     96  const int8x8_t va3c0x0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0));
    131  const int8x8_t va3c1x0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1));
    166  const int8x8_t va3c2x0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2));
    201  const int8x8_t va3c3x0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3));
|
/external/XNNPACK/src/bf16-gemm/gen/ |
D | 4x4c8-minmax-neonbf16-bfmlal.c | in xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal():
    150  … const bfloat16x8_t va3x0 = vreinterpretq_bf16_u16(vbicq_u16(vreinterpretq_u16_bf16(va3), vm0));  (local)
    151  vacc3x0 = vbfmlalbq_f32(vacc3x0, va3x0, vb0);
    152  vacc3x0 = vbfmlaltq_f32(vacc3x0, va3x0, vb0);
|
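For reference, the bf16 match above uses the ARMv8.2 BFMLALB/BFMLALT intrinsics: va3x0 is the activation vector with the lanes selected by vm0 cleared via vbicq_u16, and vbfmlalbq_f32/vbfmlaltq_f32 accumulate the products of its even and odd bf16 lanes into a float32 accumulator. A minimal sketch of that step follows; the variable names mirror the matched lines, while the helper itself is hypothetical.

  #include <arm_neon.h>  /* requires a target with the bf16 extension (+bf16) */

  /* Sketch only -- hypothetical helper, not XNNPACK source. */
  static float32x4_t bf16_c8_bfmlal_step(float32x4_t vacc3x0, bfloat16x8_t va3,
                                         uint16x8_t vm0, bfloat16x8_t vb0) {
    /* Clear the activation lanes selected by the mask vm0 (as on line 150). */
    const bfloat16x8_t va3x0 =
        vreinterpretq_bf16_u16(vbicq_u16(vreinterpretq_u16_bf16(va3), vm0));
    vacc3x0 = vbfmlalbq_f32(vacc3x0, va3x0, vb0);  /* even (bottom) bf16 lanes */
    vacc3x0 = vbfmlaltq_f32(vacc3x0, va3x0, vb0);  /* odd (top) bf16 lanes     */
    return vacc3x0;
  }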