/external/XNNPACK/src/qs8-igemm/gen/ |
D | 1x16c8-minmax-neon-mlal-padal.c | 76 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() local 97 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() 101 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() 105 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() 109 vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() 113 vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() 117 vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() 121 vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() 125 vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() 129 vprod0x8 = vmlal_s8(vprod0x8, vb8x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() [all …]
|
D | 1x16c2-minmax-neon-mlal-padal-dup.c | 64 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() local 85 …l_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 89 …l_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 93 …l_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 97 …l_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 101 …l_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 105 …l_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 109 …l_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 113 …l_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 117 …l_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() [all …]
|
D | 1x8c2-minmax-neon-mlal-padal-dup.c | 62 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup() local 75 …l_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup() 79 …l_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup() 83 …l_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup() 87 …l_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup() 91 …l_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup() 95 …l_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup() 99 …l_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3))); in xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup() 103 …l_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3))); in xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup()
|
D | 1x8c8-minmax-neon-mlal-padal.c | 68 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal() local 81 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal() 85 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal() 89 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal() 93 vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal() 97 vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal() 101 vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal() 105 vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal() 109 vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c | 100 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() local 124 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 131 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 138 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 145 vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 152 vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 159 vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 166 vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 173 vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 180 vprod0x8 = vmlal_s8(vprod0x8, vb8x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() [all …]
|
D | 2x16c2-minmax-neon-mlal-padal-dup.c | 76 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() local 100 …l_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 107 …l_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 114 …l_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 121 …l_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 128 …l_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 135 …l_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 142 …l_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 149 …l_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 156 …l_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() [all …]
|
D | 2x8c8-minmax-neon-mlal-padal.c | 84 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal() local 100 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal() 107 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal() 114 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal() 121 vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal() 128 vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal() 135 vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal() 142 vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal() 149 vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal()
|
D | 2x8c2-minmax-neon-mlal-padal-dup.c | 72 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup() local 88 …l_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup() 95 …l_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup() 102 …l_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup() 109 …l_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup() 116 …l_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup() 123 …l_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup() 130 …l_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3))); in xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup() 137 …l_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3))); in xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup()
|
D | 3x16c2-minmax-neon-mlal-padal-dup.c | 88 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() local 115 …l_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 125 …l_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 135 …l_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 145 …l_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 155 …l_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 165 …l_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 175 …l_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 185 …l_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 195 …l_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() [all …]
|
D | 3x16c8-minmax-neon-mlal-padal.c | 124 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() local 151 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 161 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 171 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 181 vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 191 vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 201 vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 211 vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 221 vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 231 vprod0x8 = vmlal_s8(vprod0x8, vb8x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() [all …]
|
D | 3x8c8-minmax-neon-mlal-padal.c | 100 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() local 119 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() 129 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() 139 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() 149 vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() 159 vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() 169 vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() 179 vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() 189 vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
|
D | 3x8c2-minmax-neon-mlal-padal-dup.c | 82 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() local 101 …l_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() 111 …l_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() 121 …l_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() 131 …l_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() 141 …l_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() 151 …l_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() 161 …l_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3))); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() 171 …l_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3))); in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()
|
D | 4x16c2-minmax-neon-mlal-padal-dup.c | 100 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() local 130 …l_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 143 …l_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 156 …l_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 169 …l_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 182 …l_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 195 …l_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 208 …l_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 221 …l_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 234 …l_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() [all …]
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 1x16c8-minmax-neon-mlal-padal.c | 65 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() local 86 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() 90 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() 94 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() 98 vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() 102 vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() 106 vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() 110 vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() 114 vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() 118 vprod0x8 = vmlal_s8(vprod0x8, vb8x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() [all …]
|
D | 1x16c2-minmax-neon-mlal-padal-dup.c | 53 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() local 74 …l_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 78 …l_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 82 …l_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 86 …l_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 90 …l_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 94 …l_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 98 …l_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 102 …l_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 106 …l_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() [all …]
|
D | 1x8c8-minmax-neon-mlal-padal.c | 57 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal() local 70 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal() 74 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal() 78 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal() 82 vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal() 86 vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal() 90 vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal() 94 vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal() 98 vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal()
|
D | 1x8c2-minmax-neon-mlal-padal-dup.c | 51 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup() local 64 …l_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup() 68 …l_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup() 72 …l_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup() 76 …l_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup() 80 …l_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup() 84 …l_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup() 88 …l_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3))); in xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup() 92 …l_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3))); in xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup()
|
D | 2x16c8-minmax-neon-mlal-padal.c | 87 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() local 111 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 118 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 125 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 132 vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 139 vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 146 vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 153 vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 160 vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 167 vprod0x8 = vmlal_s8(vprod0x8, vb8x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() [all …]
|
D | 2x16c2-minmax-neon-mlal-padal-dup.c | 63 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() local 87 …l_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 94 …l_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 101 …l_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 108 …l_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 115 …l_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 122 …l_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 129 …l_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 136 …l_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 143 …l_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() [all …]
|
D | 2x8c8-minmax-neon-mlal-padal.c | 71 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal() local 87 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal() 94 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal() 101 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal() 108 vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal() 115 vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal() 122 vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal() 129 vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal() 136 vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal()
|
D | 2x8c2-minmax-neon-mlal-padal-dup.c | 59 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup() local 75 …l_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup() 82 …l_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup() 89 …l_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup() 96 …l_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup() 103 …l_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup() 110 …l_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup() 117 …l_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3))); in xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup() 124 …l_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3))); in xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup()
|
D | 3x16c8-minmax-neon-mlal-padal.c | 109 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() local 136 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 146 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 156 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 166 vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 176 vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 186 vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 196 vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 206 vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 216 vprod0x8 = vmlal_s8(vprod0x8, vb8x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() [all …]
|
D | 3x16c2-minmax-neon-mlal-padal-dup.c | 73 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() local 100 …l_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 110 …l_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 120 …l_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 130 …l_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 140 …l_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 150 …l_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 160 …l_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 170 …l_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 180 …l_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() [all …]
|
D | 3x8c8-minmax-neon-mlal-padal.c | 85 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() local 104 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() 114 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() 124 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() 134 vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() 144 vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() 154 vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() 164 vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() 174 vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
|
D | 3x8c2-minmax-neon-mlal-padal-dup.c | 67 const int8x8_t va0x1 = vld1_s8(a0); a0 += 8; in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() local 86 …l_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() 96 …l_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0))); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() 106 …l_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() 116 …l_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1))); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() 126 …l_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() 136 …l_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2))); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() 146 …l_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3))); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup() 156 …l_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3))); in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup()
|