/external/XNNPACK/src/qs8-igemm/gen/

2x16c8-minmax-neon-mlal-padal.c (all matches in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()):
  102  const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  125  vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
  132  vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
  139  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
  146  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
  153  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
  160  vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
  167  vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
  174  vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
  181  vprod1x8 = vmlal_s8(vprod1x8, vb8x1, va1x1);
  [all …]

2x16c2-minmax-neon-mlal-padal-dup.c (all matches in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()):
  78   const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  101  …l_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  108  …l_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  115  …l_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  122  …l_s8(vprod1xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  129  …l_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  136  …l_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  143  …l_s8(vprod1x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  150  …l_s8(vprod1xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  157  …l_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
  [all …]

2x8c8-minmax-neon-mlal-padal.c (all matches in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal()):
  86   const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  101  vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
  108  vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
  115  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
  122  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
  129  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
  136  vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
  143  vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
  150  vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);

2x8c2-minmax-neon-mlal-padal-dup.c (all matches in xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup()):
  74   const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  89   …l_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  96   …l_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  103  …l_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  110  …l_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  117  …l_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
  124  …l_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
  131  …l_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
  138  …l_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));

3x16c2-minmax-neon-mlal-padal-dup.c (all matches in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()):
  90   const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  116  …l_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  126  …l_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  136  …l_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  146  …l_s8(vprod1xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  156  …l_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  166  …l_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  176  …l_s8(vprod1x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  186  …l_s8(vprod1xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  196  …l_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
  [all …]

3x16c8-minmax-neon-mlal-padal.c (all matches in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()):
  126  const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  152  vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
  162  vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
  172  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
  182  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
  192  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
  202  vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
  212  vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
  222  vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
  232  vprod1x8 = vmlal_s8(vprod1x8, vb8x1, va1x1);
  [all …]

3x8c8-minmax-neon-mlal-padal.c (all matches in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()):
  102  const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  120  vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
  130  vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
  140  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
  150  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
  160  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
  170  vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
  180  vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
  190  vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);

3x8c2-minmax-neon-mlal-padal-dup.c (all matches in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()):
  84   const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  102  …l_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  112  …l_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  122  …l_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  132  …l_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  142  …l_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
  152  …l_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
  162  …l_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
  172  …l_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));

4x16c2-minmax-neon-mlal-padal-dup.c (all matches in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()):
  102  const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  131  …l_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  144  …l_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  157  …l_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  170  …l_s8(vprod1xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  183  …l_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  196  …l_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  209  …l_s8(vprod1x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  222  …l_s8(vprod1xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  235  …l_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
  [all …]

4x16c8-minmax-neon-mlal-padal.c (all matches in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()):
  150  const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  179  vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
  192  vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
  205  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
  218  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
  231  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
  244  vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
  257  vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
  270  vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
  283  vprod1x8 = vmlal_s8(vprod1x8, vb8x1, va1x1);
  [all …]

4x8c2-minmax-neon-mlal-padal-dup.c (all matches in xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup()):
  94   const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  115  …l_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  128  …l_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  141  …l_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  154  …l_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  167  …l_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
  180  …l_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
  193  …l_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
  206  …l_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));

4x8c8-minmax-neon-mlal-padal.c (all matches in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()):
  118  const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  139  vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
  152  vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
  165  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
  178  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
  191  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
  204  vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
  217  vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
  230  vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);

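All of the c8 entries above repeat one inner step: va1x1 holds the second 8-byte block of row 1's activations, and each vmlal_s8 line widens its int8 product with the matching weight block and adds it to the vmull_s8 of the first block, before the 16-bit sums are pairwise-accumulated into 32-bit lanes (the "padal" in the kernel names). The following is a minimal sketch of that step under those assumptions; it reuses the register names from the listing, but the qs8_c8_step helper and its pointer handling are illustrative, not the kernels' actual code.

    #include <arm_neon.h>
    #include <stdint.h>

    // Sketch of the mull/mlal/padal step seen in the c8 kernels above.
    // a1 walks one row of int8 activations, w walks the packed weights for
    // one output channel; vacc1x0 is the int32 accumulator for row 1, channel 0.
    static inline int32x4_t qs8_c8_step(const int8_t** a1, const int8_t** w,
                                        int32x4_t vacc1x0) {
      const int8x8_t va1x0 = vld1_s8(*a1); *a1 += 8;  // first half of the K block
      const int8x8_t va1x1 = vld1_s8(*a1); *a1 += 8;  // the va1x1 in the matches

      const int8x8_t vb0x0 = vld1_s8(*w); *w += 8;
      const int8x8_t vb0x1 = vld1_s8(*w); *w += 8;

      int16x8_t vprod1x0 = vmull_s8(vb0x0, va1x0);    // widening 8x8 -> 16-bit multiply
      vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);    // the vmlal_s8 lines listed above
      return vpadalq_s16(vacc1x0, vprod1x0);          // pairwise-add into int32 accumulators
    }
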
/external/XNNPACK/src/qs8-gemm/gen/

2x16c8-minmax-neon-mlal-padal.c (all matches in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()):
  89   const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  112  vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
  119  vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
  126  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
  133  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
  140  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
  147  vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
  154  vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
  161  vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
  168  vprod1x8 = vmlal_s8(vprod1x8, vb8x1, va1x1);
  [all …]

2x16c2-minmax-neon-mlal-padal-dup.c (all matches in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup()):
  65   const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  88   …l_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  95   …l_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  102  …l_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  109  …l_s8(vprod1xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  116  …l_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  123  …l_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  130  …l_s8(vprod1x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  137  …l_s8(vprod1xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  144  …l_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
  [all …]

2x8c8-minmax-neon-mlal-padal.c (all matches in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal()):
  73   const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  88   vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
  95   vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
  102  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
  109  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
  116  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
  123  vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
  130  vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
  137  vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);

2x8c2-minmax-neon-mlal-padal-dup.c (all matches in xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup()):
  61   const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  76   …l_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  83   …l_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  90   …l_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  97   …l_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  104  …l_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
  111  …l_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
  118  …l_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
  125  …l_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));

3x16c8-minmax-neon-mlal-padal.c (all matches in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()):
  111  const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  137  vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
  147  vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
  157  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
  167  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
  177  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
  187  vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
  197  vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
  207  vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
  217  vprod1x8 = vmlal_s8(vprod1x8, vb8x1, va1x1);
  [all …]

3x16c2-minmax-neon-mlal-padal-dup.c (all matches in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup()):
  75   const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  101  …l_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  111  …l_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  121  …l_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  131  …l_s8(vprod1xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  141  …l_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  151  …l_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  161  …l_s8(vprod1x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  171  …l_s8(vprod1xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  181  …l_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
  [all …]

3x8c8-minmax-neon-mlal-padal.c (all matches in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()):
  87   const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  105  vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
  115  vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
  125  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
  135  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
  145  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
  155  vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
  165  vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
  175  vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);

3x8c2-minmax-neon-mlal-padal-dup.c (all matches in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()):
  69   const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  87   …l_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  97   …l_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  107  …l_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  117  …l_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  127  …l_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
  137  …l_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
  147  …l_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
  157  …l_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));

4x16c2-minmax-neon-mlal-padal-dup.c (all matches in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup()):
  85   const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  114  …l_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  127  …l_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  140  …l_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  153  …l_s8(vprod1xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  166  …l_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  179  …l_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  192  …l_s8(vprod1x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  205  …l_s8(vprod1xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  218  …l_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
  [all …]

4x16c8-minmax-neon-mlal-padal.c (all matches in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()):
  133  const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  162  vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
  175  vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
  188  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
  201  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
  214  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
  227  vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
  240  vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
  253  vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
  266  vprod1x8 = vmlal_s8(vprod1x8, vb8x1, va1x1);
  [all …]

4x8c8-minmax-neon-mlal-padal.c (all matches in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()):
  101  const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  122  vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
  135  vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
  148  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
  161  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
  174  vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
  187  vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
  200  vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
  213  vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);

4x8c2-minmax-neon-mlal-padal-dup.c (all matches in xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup()):
  77   const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;  (local)
  98   …l_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  111  …l_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
  124  …l_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  137  …l_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
  150  …l_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
  163  …l_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
  176  …l_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
  189  …l_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));

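The c2 "dup" entries in both directories use va1x1 differently: the eight int8 activation values are treated as four 16-bit lanes, each holding one pair of K values (c0..c3), and vdup_lane_s16 behind the vreinterpret casts broadcasts one pair so a single vmlal_s8 can multiply it against a four-channel weight block. The sketch below mirrors the intrinsic sequence visible in the truncated lines above; the qs8_c2dup_step helper and its argument list are assumptions for illustration, since the kernels inline this inside their K loop.

    #include <arm_neon.h>

    // Sketch of the broadcast ("dup") step from the c2 kernels above, for
    // K pair c0 (lane 0) of row 1 against output channels 0..3.
    static inline int32x4_t qs8_c2dup_step(int8x8_t va1x0, int8x8_t va1x1,
                                           int8x8_t vb0123c0x0, int8x8_t vb0123c0x1,
                                           int32x4_t vacc1x0123) {
      // Reinterpret the 8 int8 values as 4 int16 lanes and broadcast lane 0,
      // i.e. the first pair of activation bytes, across the whole vector.
      const int8x8_t va1c0x0 =
          vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0));
      const int8x8_t va1c0x1 =
          vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0));

      int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1c0x0);
      vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, va1c0x1);  // as in the …l_s8 lines
      return vpadalq_s16(vacc1x0123, vprod1x0123c0);                 // accumulate into int32
    }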