/external/XNNPACK/src/qs8-gemm/gen/
D | 3x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal():
    112  const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;  (local)
    135  int16x8_t vprod2x0 = vmull_s8(vb0x0, va2x0);
    145  int16x8_t vprod2x1 = vmull_s8(vb1x0, va2x0);
    155  int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);
    165  int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);
    175  int16x8_t vprod2x4 = vmull_s8(vb4x0, va2x0);
    185  int16x8_t vprod2x5 = vmull_s8(vb5x0, va2x0);
    195  int16x8_t vprod2x6 = vmull_s8(vb6x0, va2x0);
    205  int16x8_t vprod2x7 = vmull_s8(vb7x0, va2x0);
    215  int16x8_t vprod2x8 = vmull_s8(vb8x0, va2x0);
    [all …]

D | 3x16c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup():
    76   const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;  (local)
    98   …2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    108  …2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    118  …2x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    128  …2xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    138  …2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    148  …2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    158  …2x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    168  …2xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    178  …2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
    [all …]

D | 3x8c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal():
    88   const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;  (local)
    103  int16x8_t vprod2x0 = vmull_s8(vb0x0, va2x0);
    113  int16x8_t vprod2x1 = vmull_s8(vb1x0, va2x0);
    123  int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);
    133  int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);
    143  int16x8_t vprod2x4 = vmull_s8(vb4x0, va2x0);
    153  int16x8_t vprod2x5 = vmull_s8(vb5x0, va2x0);
    163  int16x8_t vprod2x6 = vmull_s8(vb6x0, va2x0);
    173  int16x8_t vprod2x7 = vmull_s8(vb7x0, va2x0);

D | 3x8c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup():
    70   const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;  (local)
    84   …2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    94   …2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    104  …2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    114  …2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    124  …2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
    134  …2x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
    144  …2x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
    154  …2x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));

D | 4x16c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup():
    86   const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;  (local)
    110  …2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    123  …2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    136  …2x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    149  …2xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    162  …2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    175  …2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    188  …2x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    201  …2xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    214  …2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
    [all …]

D | 4x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal():
    134  const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;  (local)
    159  int16x8_t vprod2x0 = vmull_s8(vb0x0, va2x0);
    172  int16x8_t vprod2x1 = vmull_s8(vb1x0, va2x0);
    185  int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);
    198  int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);
    211  int16x8_t vprod2x4 = vmull_s8(vb4x0, va2x0);
    224  int16x8_t vprod2x5 = vmull_s8(vb5x0, va2x0);
    237  int16x8_t vprod2x6 = vmull_s8(vb6x0, va2x0);
    250  int16x8_t vprod2x7 = vmull_s8(vb7x0, va2x0);
    263  int16x8_t vprod2x8 = vmull_s8(vb8x0, va2x0);
    [all …]

D | 4x8c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal():
    102  const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;  (local)
    119  int16x8_t vprod2x0 = vmull_s8(vb0x0, va2x0);
    132  int16x8_t vprod2x1 = vmull_s8(vb1x0, va2x0);
    145  int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);
    158  int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);
    171  int16x8_t vprod2x4 = vmull_s8(vb4x0, va2x0);
    184  int16x8_t vprod2x5 = vmull_s8(vb5x0, va2x0);
    197  int16x8_t vprod2x6 = vmull_s8(vb6x0, va2x0);
    210  int16x8_t vprod2x7 = vmull_s8(vb7x0, va2x0);

D | 4x8c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup():
    78   const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;  (local)
    94   …2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    107  …2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    120  …2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    133  …2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    146  …2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
    159  …2x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
    172  …2x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
    185  …2x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));

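Every hit above follows the same widening multiply-accumulate shape: vmull_s8 multiplies the 8-byte activation slice va2x0 by one 8-byte weight slice into eight int16 products, and the "padal" step of these kernels then pairwise-accumulates the products into int32 lanes (the "mlal" in the kernel names refers to a second activation slice, va2x1, folded in with vmlal_s8, which does not appear in this va2x0 listing). A minimal sketch of one such step in plain NEON C, assuming a hypothetical wrapper name (qs8_c8_step is not an XNNPACK symbol):

#include <arm_neon.h>

/* Illustration only: one c8 accumulation step as seen in the hits above.
   vb0x0 is an 8-byte slice of packed weights, va2x0 the activation slice. */
static int32x4_t qs8_c8_step(int32x4_t vacc2x0, int8x8_t vb0x0, int8x8_t va2x0) {
  int16x8_t vprod2x0 = vmull_s8(vb0x0, va2x0);  /* widening int8*int8 -> int16 */
  return vpadalq_s16(vacc2x0, vprod2x0);        /* pairwise add into 4 int32 lanes */
}
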
/external/XNNPACK/src/qs8-igemm/gen/
D | 3x16c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup():
    91   const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;  (local)
    113  …2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    123  …2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    133  …2x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    143  …2xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    153  …2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    163  …2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    173  …2x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    183  …2xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    193  …2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
    [all …]

D | 3x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal():
    127  const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;  (local)
    150  int16x8_t vprod2x0 = vmull_s8(vb0x0, va2x0);
    160  int16x8_t vprod2x1 = vmull_s8(vb1x0, va2x0);
    170  int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);
    180  int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);
    190  int16x8_t vprod2x4 = vmull_s8(vb4x0, va2x0);
    200  int16x8_t vprod2x5 = vmull_s8(vb5x0, va2x0);
    210  int16x8_t vprod2x6 = vmull_s8(vb6x0, va2x0);
    220  int16x8_t vprod2x7 = vmull_s8(vb7x0, va2x0);
    230  int16x8_t vprod2x8 = vmull_s8(vb8x0, va2x0);
    [all …]

D | 3x8c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal():
    103  const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;  (local)
    118  int16x8_t vprod2x0 = vmull_s8(vb0x0, va2x0);
    128  int16x8_t vprod2x1 = vmull_s8(vb1x0, va2x0);
    138  int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);
    148  int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);
    158  int16x8_t vprod2x4 = vmull_s8(vb4x0, va2x0);
    168  int16x8_t vprod2x5 = vmull_s8(vb5x0, va2x0);
    178  int16x8_t vprod2x6 = vmull_s8(vb6x0, va2x0);
    188  int16x8_t vprod2x7 = vmull_s8(vb7x0, va2x0);

D | 3x8c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup():
    85   const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;  (local)
    99   …2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    109  …2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    119  …2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    129  …2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    139  …2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
    149  …2x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
    159  …2x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
    169  …2x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));

D | 4x16c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup():
    103  const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;  (local)
    127  …2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    140  …2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    153  …2x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    166  …2xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    179  …2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    192  …2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    205  …2x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    218  …2xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    231  …2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
    [all …]

D | 4x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal():
    151  const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;  (local)
    176  int16x8_t vprod2x0 = vmull_s8(vb0x0, va2x0);
    189  int16x8_t vprod2x1 = vmull_s8(vb1x0, va2x0);
    202  int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);
    215  int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);
    228  int16x8_t vprod2x4 = vmull_s8(vb4x0, va2x0);
    241  int16x8_t vprod2x5 = vmull_s8(vb5x0, va2x0);
    254  int16x8_t vprod2x6 = vmull_s8(vb6x0, va2x0);
    267  int16x8_t vprod2x7 = vmull_s8(vb7x0, va2x0);
    280  int16x8_t vprod2x8 = vmull_s8(vb8x0, va2x0);
    [all …]

D | 4x8c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup():
    95   const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;  (local)
    111  …2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    124  …2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
    137  …2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    150  …2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
    163  …2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
    176  …2x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
    189  …2x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
    202  …2x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));

D | 4x8c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal():
    119  const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;  (local)
    136  int16x8_t vprod2x0 = vmull_s8(vb0x0, va2x0);
    149  int16x8_t vprod2x1 = vmull_s8(vb1x0, va2x0);
    162  int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);
    175  int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);
    188  int16x8_t vprod2x4 = vmull_s8(vb4x0, va2x0);
    201  int16x8_t vprod2x5 = vmull_s8(vb5x0, va2x0);
    214  int16x8_t vprod2x6 = vmull_s8(vb6x0, va2x0);
    227  int16x8_t vprod2x7 = vmull_s8(vb7x0, va2x0);

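The c2 "dup" kernels listed in both directories read va2x0 differently: the slice is reinterpreted as four int16 lanes, one lane (one pair of int8 activations for column step c) is broadcast across all eight bytes with vdup_lane_s16, and only then fed to the widening multiply. A minimal sketch of that addressing trick, again with a hypothetical name (qs8_c2_dup_mul is not an XNNPACK symbol); the lane index must be a compile-time constant, so lane 0 is shown:

#include <arm_neon.h>

/* Illustration only: the "dup" broadcast seen in the *c2 hits above.
   Lane 0 of va2x0 (two int8 values) is replicated four times before the
   widening multiply against the weight slice for output columns 0..3. */
static int16x8_t qs8_c2_dup_mul(int8x8_t vb0123c0x0, int8x8_t va2x0) {
  int8x8_t va2c0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0));
  return vmull_s8(vb0123c0x0, va2c0);
}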