/external/XNNPACK/src/qs8-gemm/gen/ |
D | 2x16c2-minmax-rndnu-neon-mull-dup.c | 121 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() local 124 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() 128 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() 132 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() 136 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() 230 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() local 231 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() 233 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() 235 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() 237 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup()
|
D | 2x16c2-minmax-rndnu-neon-mull-ld1r.c | 127 const int8x8_t va1c2 = vreinterpret_s8_s16(va12); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() local 130 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() 134 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() 138 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() 142 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() 236 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() local 237 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() 239 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() 241 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() 243 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r()
|
D | 2x16c2-minmax-rndnu-neon-mull-ld4r.c | 121 const int8x8_t va1c2 = vreinterpret_s8_s16(va1.val[2]); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() local 124 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() 128 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() 132 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() 136 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() 230 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() local 231 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() 233 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() 235 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() 237 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r()
|
D | 2x16c2-minmax-rndnu-neon-mull-ld2r.c | 123 const int8x8_t va1c2 = vreinterpret_s8_s16(va11.val[0]); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r() local 126 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r() 130 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r() 134 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r() 138 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r() 232 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r() local 233 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r() 235 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r() 237 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r() 239 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r()
|
D | 2x8c2-minmax-rndnu-neon-mull-ld4r.c | 93 const int8x8_t va1c2 = vreinterpret_s8_s16(va1.val[2]); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r() local 96 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r() 100 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r() 160 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r() local 161 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r() 163 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r()
|
D | 2x8c2-minmax-rndnu-neon-mull-dup.c | 93 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup() local 96 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup() 100 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup() 160 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup() local 161 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup() 163 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup()
|
D | 2x8c2-minmax-rndnu-neon-mull-ld1r.c | 99 const int8x8_t va1c2 = vreinterpret_s8_s16(va12); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r() local 102 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r() 106 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r() 166 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r() local 167 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r() 169 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r()
|
D | 2x8c2-minmax-rndnu-neon-mull-ld2r.c | 95 const int8x8_t va1c2 = vreinterpret_s8_s16(va11.val[0]); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r() local 98 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r() 102 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r() 162 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r() local 163 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r() 165 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r()
|
D | 3x16c2-minmax-rndnu-neon-mull-dup.c | 150 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() local 154 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 160 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 166 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 172 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 296 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() local 297 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 299 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 301 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 303 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup()
|
D | 3x16c2-minmax-rndnu-neon-mull-ld2r.c | 153 const int8x8_t va1c2 = vreinterpret_s8_s16(va11.val[0]); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() local 157 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 163 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 169 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 175 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 299 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() local 300 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 302 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 304 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 306 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r()
|
D | 3x16c2-minmax-rndnu-neon-mull-ld4r.c | 150 const int8x8_t va1c2 = vreinterpret_s8_s16(va1.val[2]); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() local 154 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 160 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 166 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 172 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 296 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() local 297 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 299 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 301 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 303 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r()
|
D | 3x16c2-minmax-rndnu-neon-mull-ld1r.c | 159 const int8x8_t va1c2 = vreinterpret_s8_s16(va12); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() local 163 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 169 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 175 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 181 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 305 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() local 306 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 308 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 310 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 312 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r()
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 2x16c2-minmax-rndnu-neon-mull-ld1r.c | 140 const int8x8_t va1c2 = vreinterpret_s8_s16(va12); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() local 143 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() 147 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() 151 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() 155 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() 249 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() local 250 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() 252 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() 254 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r() 256 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r()
|
D | 2x16c2-minmax-rndnu-neon-mull-dup.c | 134 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() local 137 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() 141 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() 145 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() 149 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() 243 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() local 244 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() 246 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() 248 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup() 250 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup()
|
D | 2x16c2-minmax-rndnu-neon-mull-ld4r.c | 134 const int8x8_t va1c2 = vreinterpret_s8_s16(va1.val[2]); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() local 137 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() 141 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() 145 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() 149 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() 243 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() local 244 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() 246 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() 248 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r() 250 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r()
|
D | 2x16c2-minmax-rndnu-neon-mull-ld2r.c | 136 const int8x8_t va1c2 = vreinterpret_s8_s16(va11.val[0]); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r() local 139 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r() 143 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r() 147 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r() 151 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r() 245 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r() local 246 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r() 248 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r() 250 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r() 252 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r()
|
D | 2x8c2-minmax-rndnu-neon-mull-ld4r.c | 106 const int8x8_t va1c2 = vreinterpret_s8_s16(va1.val[2]); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r() local 109 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r() 113 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r() 173 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r() local 174 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r() 176 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r()
|
D | 2x8c2-minmax-rndnu-neon-mull-ld2r.c | 108 const int8x8_t va1c2 = vreinterpret_s8_s16(va11.val[0]); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r() local 111 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r() 115 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r() 175 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r() local 176 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r() 178 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r()
|
D | 2x8c2-minmax-rndnu-neon-mull-ld1r.c | 112 const int8x8_t va1c2 = vreinterpret_s8_s16(va12); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r() local 115 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r() 119 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r() 179 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r() local 180 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r() 182 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r()
|
D | 2x8c2-minmax-rndnu-neon-mull-dup.c | 106 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup() local 109 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup() 113 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup() 173 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup() local 174 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup() 176 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup()
|
D | 3x16c2-minmax-rndnu-neon-mull-ld1r.c | 174 const int8x8_t va1c2 = vreinterpret_s8_s16(va12); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() local 178 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 184 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 190 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 196 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 320 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() local 321 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 323 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 325 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 327 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r()
|
D | 3x16c2-minmax-rndnu-neon-mull-dup.c | 165 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() local 169 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 175 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 181 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 187 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 311 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() local 312 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 314 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 316 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 318 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup()
|
D | 3x16c2-minmax-rndnu-neon-mull-ld4r.c | 165 const int8x8_t va1c2 = vreinterpret_s8_s16(va1.val[2]); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() local 169 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 175 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 181 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 187 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 311 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() local 312 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 314 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 316 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 318 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r()
|
D | 3x16c2-minmax-rndnu-neon-mull-ld2r.c | 168 const int8x8_t va1c2 = vreinterpret_s8_s16(va11.val[0]); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() local 172 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 178 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 184 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 190 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 314 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() local 315 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 317 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 319 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 321 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r()
|
D | 3x8c2-minmax-rndnu-neon-mull-dup.c | 127 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup() local 131 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup() 137 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup() 215 const int8x8_t va1c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup() local 216 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup() 218 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, va1c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup()
|