/external/XNNPACK/src/qs8-igemm/gen/
D | 1x16c4-minmax-rndnu-neon-mull-ld2r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r():
     87  const int8x8_t va0c0 = vreinterpret_s8_s32(va0.val[0]);  (local)
     89  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
     91  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
     93  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
     95  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
     97  const int16x8_t vprod0x89c0 = vmull_s8(vb89c0, va0c0);
     99  const int16x8_t vprod0xABc0 = vmull_s8(vbABc0, va0c0);
    101  const int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0, va0c0);
    103  const int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0, va0c0);
    139  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    [all …]

D | 1x16c4-minmax-rndnu-neon-mull-ld1r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r():
     88  const int8x8_t va0c0 = vreinterpret_s8_s32(va00);  (local)
     90  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
     92  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
     94  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
     96  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
     98  const int16x8_t vprod0x89c0 = vmull_s8(vb89c0, va0c0);
    100  const int16x8_t vprod0xABc0 = vmull_s8(vbABc0, va0c0);
    102  const int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0, va0c0);
    104  const int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0, va0c0);
    140  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    [all …]

D | 1x16c4-minmax-rndnu-neon-mull-dup.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup():
     87  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
     89  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
     91  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
     93  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
     95  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
     97  const int16x8_t vprod0x89c0 = vmull_s8(vb89c0, va0c0);
     99  const int16x8_t vprod0xABc0 = vmull_s8(vbABc0, va0c0);
    101  const int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0, va0c0);
    103  const int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0, va0c0);
    139  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    [all …]

D | 1x8c4-minmax-rndnu-neon-mull-ld2r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r():
     75  const int8x8_t va0c0 = vreinterpret_s8_s32(va0.val[0]);  (local)
     77  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
     79  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
     81  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
     83  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
    107  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    108  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
    110  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    112  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    114  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);

D | 1x8c4-minmax-rndnu-neon-mull-ld1r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r():
     76  const int8x8_t va0c0 = vreinterpret_s8_s32(va00);  (local)
     78  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
     80  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
     82  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
     84  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
    108  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    109  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
    111  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    113  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    115  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);

D | 1x8c4-minmax-rndnu-neon-mull-dup.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup():
     75  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
     77  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
     79  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
     81  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
     83  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
    107  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    108  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
    110  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    112  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    114  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);

D | 1x16c4-minmax-rndnu-neon-mlal-dup.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup():
    181  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    183  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
    185  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    187  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    189  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
    191  const int16x8_t vprod0x89c0 = vmull_s8(vb89c0, va0c0);
    193  const int16x8_t vprod0xABc0 = vmull_s8(vbABc0, va0c0);
    195  const int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0, va0c0);
    197  const int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0, va0c0);
    233  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    [all …]

D | 1x16c4-minmax-rndnu-neon-mlal-ld2r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r():
    181  const int8x8_t va0c0 = vreinterpret_s8_s32(va0.val[0]);  (local)
    183  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
    185  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    187  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    189  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
    191  const int16x8_t vprod0x89c0 = vmull_s8(vb89c0, va0c0);
    193  const int16x8_t vprod0xABc0 = vmull_s8(vbABc0, va0c0);
    195  const int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0, va0c0);
    197  const int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0, va0c0);
    233  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    [all …]

D | 2x16c4-minmax-rndnu-neon-mull-ld2r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r():
    104  const int8x8_t va0c0 = vreinterpret_s8_s32(va0.val[0]);  (local)
    107  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
    111  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    115  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    119  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
    123  const int16x8_t vprod0x89c0 = vmull_s8(vb89c0, va0c0);
    127  const int16x8_t vprod0xABc0 = vmull_s8(vbABc0, va0c0);
    131  const int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0, va0c0);
    135  const int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0, va0c0);
    191  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    [all …]

D | 1x16c4-minmax-rndnu-neon-mlal-ld1r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r():
    184  const int8x8_t va0c0 = vreinterpret_s8_s32(va00);  (local)
    186  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
    188  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    190  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    192  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
    194  const int16x8_t vprod0x89c0 = vmull_s8(vb89c0, va0c0);
    196  const int16x8_t vprod0xABc0 = vmull_s8(vbABc0, va0c0);
    198  const int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0, va0c0);
    200  const int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0, va0c0);
    236  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    [all …]

D | 2x16c4-minmax-rndnu-neon-mull-dup.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup():
    104  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    107  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
    111  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    115  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    119  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
    123  const int16x8_t vprod0x89c0 = vmull_s8(vb89c0, va0c0);
    127  const int16x8_t vprod0xABc0 = vmull_s8(vbABc0, va0c0);
    131  const int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0, va0c0);
    135  const int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0, va0c0);
    191  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    [all …]

D | 2x16c4-minmax-rndnu-neon-mull-ld1r.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld1r():
    106  const int8x8_t va0c0 = vreinterpret_s8_s32(va00);  (local)
    109  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
    113  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    117  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    121  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
    125  const int16x8_t vprod0x89c0 = vmull_s8(vb89c0, va0c0);
    129  const int16x8_t vprod0xABc0 = vmull_s8(vbABc0, va0c0);
    133  const int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0, va0c0);
    137  const int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0, va0c0);
    193  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    [all …]

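Every match above is the same inner-loop step: the first four-byte group of A ("c0") is broadcast across a 64-bit vector and widen-multiplied against packed B columns. The C sketch below is an illustrative reconstruction of that step, not XNNPACK source; the helper name accumulate_c0 and its signature are hypothetical, but the intrinsics mirror the listed lines.

#include <arm_neon.h>

// Hypothetical helper (not in XNNPACK): one "c0" step of a qs8 c4 microkernel.
// Broadcast the first 4 A bytes, widen-multiply against two packed B columns,
// and pairwise-accumulate the 16-bit products into 32-bit sums.
static void accumulate_c0(const int8x8_t va0,     // 8 bytes of A: groups c0, c1
                          const int8x8_t vb01c0,  // B bytes, output cols 0-1, group c0
                          const int8x8_t vb23c0,  // B bytes, output cols 2-3, group c0
                          int32x4_t* vacc0x01, int32x4_t* vacc0x23) {
  // "dup" flavor: view the 8 bytes as two 32-bit lanes and broadcast lane 0,
  // so a0..a3 repeat twice across the 64-bit register.
  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));
  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);  // widening s8 x s8 -> s16 multiply
  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
  *vacc0x01 = vpadalq_s16(*vacc0x01, vprod0x01c0);  // pairwise add-accumulate into s32
  *vacc0x23 = vpadalq_s16(*vacc0x23, vprod0x23c0);
}
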
/external/XNNPACK/src/qs8-gemm/gen/
D | 1x16c4-minmax-rndnu-neon-mull-ld1r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r():
     77  const int8x8_t va0c0 = vreinterpret_s8_s32(va00);  (local)
     79  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
     81  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
     83  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
     85  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
     87  const int16x8_t vprod0x89c0 = vmull_s8(vb89c0, va0c0);
     89  const int16x8_t vprod0xABc0 = vmull_s8(vbABc0, va0c0);
     91  const int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0, va0c0);
     93  const int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0, va0c0);
    129  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    [all …]

D | 1x16c4-minmax-rndnu-neon-mull-ld2r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r():
     76  const int8x8_t va0c0 = vreinterpret_s8_s32(va0.val[0]);  (local)
     78  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
     80  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
     82  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
     84  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
     86  const int16x8_t vprod0x89c0 = vmull_s8(vb89c0, va0c0);
     88  const int16x8_t vprod0xABc0 = vmull_s8(vbABc0, va0c0);
     90  const int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0, va0c0);
     92  const int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0, va0c0);
    128  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    [all …]

D | 1x16c4-minmax-rndnu-neon-mull-dup.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup():
     76  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
     78  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
     80  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
     82  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
     84  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
     86  const int16x8_t vprod0x89c0 = vmull_s8(vb89c0, va0c0);
     88  const int16x8_t vprod0xABc0 = vmull_s8(vbABc0, va0c0);
     90  const int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0, va0c0);
     92  const int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0, va0c0);
    128  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    [all …]

D | 1x8c4-minmax-rndnu-neon-mull-ld1r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r():
     65  const int8x8_t va0c0 = vreinterpret_s8_s32(va00);  (local)
     67  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
     69  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
     71  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
     73  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
     97  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
     98  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
    100  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    102  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    104  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);

D | 1x8c4-minmax-rndnu-neon-mull-ld2r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r():
     64  const int8x8_t va0c0 = vreinterpret_s8_s32(va0.val[0]);  (local)
     66  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
     68  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
     70  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
     72  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
     96  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
     97  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
     99  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    101  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    103  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);

D | 1x8c4-minmax-rndnu-neon-mull-dup.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup():
     64  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
     66  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
     68  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
     70  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
     72  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
     96  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
     97  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
     99  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    101  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    103  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);

D | 2x16c4-minmax-rndnu-neon-mull-dup.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup():
     91  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
     94  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
     98  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    102  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    106  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
    110  const int16x8_t vprod0x89c0 = vmull_s8(vb89c0, va0c0);
    114  const int16x8_t vprod0xABc0 = vmull_s8(vbABc0, va0c0);
    118  const int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0, va0c0);
    122  const int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0, va0c0);
    178  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    [all …]

D | 1x16c4-minmax-rndnu-neon-mlal-ld2r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r():
    170  const int8x8_t va0c0 = vreinterpret_s8_s32(va0.val[0]);  (local)
    172  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
    174  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    176  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    178  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
    180  const int16x8_t vprod0x89c0 = vmull_s8(vb89c0, va0c0);
    182  const int16x8_t vprod0xABc0 = vmull_s8(vbABc0, va0c0);
    184  const int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0, va0c0);
    186  const int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0, va0c0);
    222  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    [all …]

D | 2x16c4-minmax-rndnu-neon-mull-ld1r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld1r():
     93  const int8x8_t va0c0 = vreinterpret_s8_s32(va00);  (local)
     96  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
    100  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    104  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    108  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
    112  const int16x8_t vprod0x89c0 = vmull_s8(vb89c0, va0c0);
    116  const int16x8_t vprod0xABc0 = vmull_s8(vbABc0, va0c0);
    120  const int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0, va0c0);
    124  const int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0, va0c0);
    180  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    [all …]

D | 2x16c4-minmax-rndnu-neon-mull-ld2r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r():
     91  const int8x8_t va0c0 = vreinterpret_s8_s32(va0.val[0]);  (local)
     94  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
     98  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    102  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    106  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
    110  const int16x8_t vprod0x89c0 = vmull_s8(vb89c0, va0c0);
    114  const int16x8_t vprod0xABc0 = vmull_s8(vbABc0, va0c0);
    118  const int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0, va0c0);
    122  const int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0, va0c0);
    178  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    [all …]

D | 1x16c4-minmax-rndnu-neon-mlal-ld1r.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r():
    173  const int8x8_t va0c0 = vreinterpret_s8_s32(va00);  (local)
    175  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
    177  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    179  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    181  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
    183  const int16x8_t vprod0x89c0 = vmull_s8(vb89c0, va0c0);
    185  const int16x8_t vprod0xABc0 = vmull_s8(vbABc0, va0c0);
    187  const int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0, va0c0);
    189  const int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0, va0c0);
    225  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    [all …]

D | 1x16c4-minmax-rndnu-neon-mlal-dup.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup():
    170  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    172  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
    174  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    176  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    178  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
    180  const int16x8_t vprod0x89c0 = vmull_s8(vb89c0, va0c0);
    182  const int16x8_t vprod0xABc0 = vmull_s8(vbABc0, va0c0);
    184  const int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0, va0c0);
    186  const int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0, va0c0);
    222  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    [all …]

D | 1x8c4-minmax-fp32-neonv8-mlal-ld2r.c | in xnn_qs8_gemm_minmax_fp32_ukernel_1x8c4__neonv8_mlal_ld2r():
    119  const int8x8_t va0c0 = vreinterpret_s8_s32(va0.val[0]);  (local)
    121  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
    123  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    125  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    127  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);
    151  const int8x8_t va0c0 = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));  (local)
    152  const int16x8_t vprod0x01c0 = vmull_s8(vb01c0, va0c0);
    154  const int16x8_t vprod0x23c0 = vmull_s8(vb23c0, va0c0);
    156  const int16x8_t vprod0x45c0 = vmull_s8(vb45c0, va0c0);
    158  const int16x8_t vprod0x67c0 = vmull_s8(vb67c0, va0c0);

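The ld1r / ld2r / dup suffixes in these file names name three equivalent ways of materializing va0c0. A minimal sketch follows, assuming a0 points at at least eight signed A bytes; the broadcast_c0_* function names are illustrative, not XNNPACK APIs, and NEON load intrinsics tolerate the unaligned pointer casts.

#include <arm_neon.h>

// ld1r: load one 32-bit group and replicate it into both lanes at load time.
static int8x8_t broadcast_c0_ld1r(const int8_t* a0) {
  const int32x2_t va00 = vld1_dup_s32((const int32_t*) a0);
  return vreinterpret_s8_s32(va00);
}

// ld2r: load two 32-bit groups, each replicated; .val[0] is group c0, .val[1] is c1.
static int8x8_t broadcast_c0_ld2r(const int8_t* a0) {
  const int32x2x2_t va0 = vld2_dup_s32((const int32_t*) a0);
  return vreinterpret_s8_s32(va0.val[0]);
}

// dup: plain 64-bit load, then broadcast lane 0 with a register-to-register dup.
static int8x8_t broadcast_c0_dup(const int8_t* a0) {
  const int8x8_t va0 = vld1_s8(a0);
  return vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(va0), 0));
}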