/external/XNNPACK/src/qs8-gemm/gen/ |
D | 3x16c2-minmax-rndnu-neon-mull-dup.c | 151 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() local 155 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 161 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 167 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 173 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 305 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() local 306 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 308 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 310 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 312 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup()
|
D | 3x16c2-minmax-rndnu-neon-mull-ld2r.c | 154 const int8x8_t va2c2 = vreinterpret_s8_s16(va21.val[0]); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() local 158 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 164 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 170 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 176 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 308 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() local 309 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 311 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 313 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 315 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r()
|
D | 3x16c2-minmax-rndnu-neon-mull-ld4r.c | 151 const int8x8_t va2c2 = vreinterpret_s8_s16(va2.val[2]); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() local 155 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 161 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 167 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 173 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 305 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() local 306 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 308 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 310 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 312 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r()
|
D | 3x16c2-minmax-rndnu-neon-mull-ld1r.c | 160 const int8x8_t va2c2 = vreinterpret_s8_s16(va22); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() local 164 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 170 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 176 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 182 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 314 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() local 315 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 317 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 319 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 321 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r()
|
D | 3x8c2-minmax-rndnu-neon-mull-ld1r.c | 122 const int8x8_t va2c2 = vreinterpret_s8_s16(va22); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r() local 126 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r() 132 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r() 214 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r() local 215 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r() 217 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r()
|
D | 3x8c2-minmax-rndnu-neon-mull-ld4r.c | 113 const int8x8_t va2c2 = vreinterpret_s8_s16(va2.val[2]); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r() local 117 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r() 123 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r() 205 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r() local 206 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r() 208 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r()
|
D | 3x8c2-minmax-rndnu-neon-mull-dup.c | 113 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup() local 117 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup() 123 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup() 205 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup() local 206 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup() 208 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup()
|
D | 3x8c2-minmax-rndnu-neon-mull-ld2r.c | 116 const int8x8_t va2c2 = vreinterpret_s8_s16(va21.val[0]); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r() local 120 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r() 126 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r() 208 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r() local 209 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r() 211 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r()
|
D | 4x16c2-minmax-rndnu-neon-mull-ld2r.c | 184 const int8x8_t va2c2 = vreinterpret_s8_s16(va21.val[0]); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() local 189 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 197 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 205 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 213 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 375 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() local 376 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 378 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 380 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 382 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r()
|
D | 4x16c2-minmax-rndnu-neon-mull-ld1r.c | 192 const int8x8_t va2c2 = vreinterpret_s8_s16(va22); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r() local 197 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r() 205 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r() 213 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r() 221 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r() 383 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r() local 384 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r() 386 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r() 388 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r() 390 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r()
|
D | 4x16c2-minmax-rndnu-neon-mull-dup.c | 180 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() local 185 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 193 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 201 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 209 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 371 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() local 372 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 374 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 376 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 378 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup()
|
D | 4x16c2-minmax-rndnu-neon-mull-ld4r.c | 180 const int8x8_t va2c2 = vreinterpret_s8_s16(va2.val[2]); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r() local 185 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r() 193 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r() 201 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r() 209 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r() 371 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r() local 372 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r() 374 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r() 376 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r() 378 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r()
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 3x16c2-minmax-rndnu-neon-mull-ld1r.c | 175 const int8x8_t va2c2 = vreinterpret_s8_s16(va22); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() local 179 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 185 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 191 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 197 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 329 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() local 330 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 332 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 334 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r() 336 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r()
|
D | 3x16c2-minmax-rndnu-neon-mull-dup.c | 166 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() local 170 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 176 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 182 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 188 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 320 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() local 321 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 323 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 325 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup() 327 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup()
|
D | 3x16c2-minmax-rndnu-neon-mull-ld4r.c | 166 const int8x8_t va2c2 = vreinterpret_s8_s16(va2.val[2]); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() local 170 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 176 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 182 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 188 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 320 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() local 321 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 323 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 325 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r() 327 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r()
|
D | 3x16c2-minmax-rndnu-neon-mull-ld2r.c | 169 const int8x8_t va2c2 = vreinterpret_s8_s16(va21.val[0]); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() local 173 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 179 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 185 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 191 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 323 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() local 324 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 326 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 328 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r() 330 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r()
|
D | 3x8c2-minmax-rndnu-neon-mull-dup.c | 128 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup() local 132 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup() 138 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup() 220 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup() local 221 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup() 223 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup()
|
D | 3x8c2-minmax-rndnu-neon-mull-ld2r.c | 131 const int8x8_t va2c2 = vreinterpret_s8_s16(va21.val[0]); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r() local 135 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r() 141 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r() 223 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r() local 224 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r() 226 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r()
|
D | 3x8c2-minmax-rndnu-neon-mull-ld4r.c | 128 const int8x8_t va2c2 = vreinterpret_s8_s16(va2.val[2]); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r() local 132 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r() 138 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r() 220 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r() local 221 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r() 223 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r()
|
D | 3x8c2-minmax-rndnu-neon-mull-ld1r.c | 137 const int8x8_t va2c2 = vreinterpret_s8_s16(va22); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r() local 141 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r() 147 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r() 229 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r() local 230 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r() 232 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r()
|
D | 4x16c2-minmax-rndnu-neon-mull-dup.c | 197 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() local 202 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 210 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 218 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 226 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 388 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() local 389 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 391 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 393 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup() 395 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup()
|
D | 4x16c2-minmax-rndnu-neon-mull-ld4r.c | 197 const int8x8_t va2c2 = vreinterpret_s8_s16(va2.val[2]); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r() local 202 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r() 210 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r() 218 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r() 226 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r() 388 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r() local 389 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r() 391 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r() 393 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r() 395 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r()
|
D | 4x16c2-minmax-rndnu-neon-mull-ld2r.c | 201 const int8x8_t va2c2 = vreinterpret_s8_s16(va21.val[0]); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() local 206 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 214 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 222 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 230 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 392 const int8x8_t va2c2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() local 393 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 395 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 397 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r() 399 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, va2c2); in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r()
|
/external/XNNPACK/src/f32-gemm/gen/ |
D | 3x8-wasmsimd-splat.c | 99 const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); in xnn_f32_gemm_ukernel_3x8__wasmsimd_splat() local 106 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c2, vb0123c2)); in xnn_f32_gemm_ukernel_3x8__wasmsimd_splat() 109 vacc2x4567 = wasm_f32x4_add(vacc2x4567, wasm_f32x4_mul(va2c2, vb4567c2)); in xnn_f32_gemm_ukernel_3x8__wasmsimd_splat()
|
D | 3x8-relu-wasmrelaxedsimd-fma-splat.c | 99 const v128_t va2c2 = wasm_v32x4_shuffle(va2, va2, 2, 2, 2, 2); in xnn_f32_gemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_splat() local 106 vacc2x0123 = wasm_f32x4_add(vacc2x0123, wasm_f32x4_mul(va2c2, vb0123c2)); in xnn_f32_gemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_splat() 109 vacc2x4567 = wasm_f32x4_add(vacc2x4567, wasm_f32x4_mul(va2c2, vb4567c2)); in xnn_f32_gemm_relu_ukernel_3x8__wasmrelaxedsimd_fma_splat()
|