/external/XNNPACK/src/qs8-igemm/gen/ |
D | 1x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal():
    75  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
    96  int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
   100  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
   104  int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
   108  int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
   112  int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
   116  int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
   120  int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
   124  int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
   128  int16x8_t vprod0x8 = vmull_s8(vb8x0, va0x0);
   [all …]
|
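For context, the c8 "mlal-padal" rows above all share one accumulation shape: va0x0 is an 8-byte slice of the A row, each vmull_s8 widens int8*int8 products to int16, and the products are folded into int32 accumulators. Below is a minimal stand-alone sketch of that pattern; the helper name qs8_c8_accumulate and its parameter names are illustrative only and do not appear in the generated kernels.

#include <arm_neon.h>

// Illustrative sketch of the c8 mlal-padal step (not the generated XNNPACK
// kernel).  Two consecutive 8-element int8 slices of the A row are multiplied
// against the matching packed-B slices, merged with a widening
// multiply-accumulate, and pairwise-added into the int32 accumulators.
static inline int32x4_t qs8_c8_accumulate(
    int32x4_t vacc,                   // running int32 partial sums
    int8x8_t va0x0, int8x8_t va0x1,   // A slices (the "x0"/"x1" halves)
    int8x8_t vb0x0, int8x8_t vb0x1)   // packed-B slices for one output column
{
  int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);   // int8*int8 -> int16
  vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);   // fold in the second slice
  return vpadalq_s16(vacc, vprod0x0);            // pairwise add into int32
}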
D | 1x16c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup():
    63  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
    83  …0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    87  …0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    91  …0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    95  …0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    99  …0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   103  …0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   107  …0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   111  …0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   115  …0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
   [all …]
|
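The c2 "dup" rows repeat a different idiom: va0x0 is reinterpreted as four int16 lanes (four adjacent int8 pairs), one pair is broadcast with vdup_lane_s16, and a single vmull_s8 then multiplies that pair against four packed-B pairs at once. A minimal sketch follows; the helper name qs8_c2_dup_mul is illustrative and not part of the generated code.

#include <arm_neon.h>

// Illustrative sketch of the c2 "dup" broadcast-and-multiply step (not the
// generated XNNPACK kernel).  Lane 0 of va0x0, viewed as int16x4_t, holds the
// first two int8 values of the A row; duplicating that lane broadcasts the
// pair so one vmull_s8 yields int16 products against four packed-B pairs.
static inline int16x8_t qs8_c2_dup_mul(int8x8_t vb0123c0, int8x8_t va0x0)
{
  // vdup_lane_s16 needs a compile-time lane index; the kernels unroll this
  // for lanes 0..3 (the c0..c3 suffixes in the listing above).
  const int8x8_t va0c0 =
      vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0));
  return vmull_s8(vb0123c0, va0c0);  // widening int8*int8 -> int16 products
}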
D | 1x8c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup():
    61  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
    73  …0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    77  …0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    81  …0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
    85  …0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
    89  …0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
    93  …0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
    97  …0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
   101  …0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
|
D | 1x8c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal():
    67  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
    80  int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
    84  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
    88  int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
    92  int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
    96  int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
   100  int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
   104  int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
   108  int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
|
D | 2x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal():
    99  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
   122  int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
   129  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
   136  int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
   143  int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
   150  int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
   157  int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
   164  int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
   171  int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
   178  int16x8_t vprod0x8 = vmull_s8(vb8x0, va0x0);
   [all …]
|
D | 2x16c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup():
    75  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
    97  …0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   104  …0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   111  …0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   118  …0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   125  …0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   132  …0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   139  …0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   146  …0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   153  …0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
   [all …]
|
D | 2x8c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal():
    83  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
    98  int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
   105  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
   112  int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
   119  int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
   126  int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
   133  int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
   140  int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
   147  int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
|
D | 2x8c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup():
    71  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
    85  …0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    92  …0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    99  …0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   106  …0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   113  …0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
   120  …0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
   127  …0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
   134  …0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
|
D | 3x16c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup():
    87  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
   111  …0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   121  …0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   131  …0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   141  …0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   151  …0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   161  …0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   171  …0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   181  …0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   191  …0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
   [all …]
|
D | 3x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal():
   123  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
   148  int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
   158  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
   168  int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
   178  int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
   188  int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
   198  int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
   208  int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
   218  int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
   228  int16x8_t vprod0x8 = vmull_s8(vb8x0, va0x0);
   [all …]
|
D | 3x8c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal():
    99  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
   116  int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
   126  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
   136  int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
   146  int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
   156  int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
   166  int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
   176  int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
   186  int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
|
D | 3x8c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup():
    81  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
    97  …0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   107  …0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   117  …0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   127  …0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   137  …0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
   147  …0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
   157  …0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
   167  …0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
|
D | 4x16c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup():
    99  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
   125  …0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   138  …0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   151  …0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   164  …0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   177  …0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   190  …0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   203  …0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   216  …0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   229  …0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
   [all …]
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 1x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal():
    64  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
    85  int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
    89  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
    93  int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
    97  int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
   101  int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
   105  int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
   109  int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
   113  int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
   117  int16x8_t vprod0x8 = vmull_s8(vb8x0, va0x0);
   [all …]
|
D | 1x16c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup():
    52  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
    72  …0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    76  …0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    80  …0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    84  …0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    88  …0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
    92  …0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
    96  …0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   100  …0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   104  …0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
   [all …]
|
D | 1x8c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal():
    56  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
    69  int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
    73  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
    77  int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
    81  int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
    85  int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
    89  int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
    93  int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
    97  int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
|
D | 1x8c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup():
    50  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
    62  …0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    66  …0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    70  …0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
    74  …0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
    78  …0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
    82  …0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
    86  …0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
    90  …0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
|
D | 2x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal():
    86  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
   109  int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
   116  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
   123  int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
   130  int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
   137  int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
   144  int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
   151  int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
   158  int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
   165  int16x8_t vprod0x8 = vmull_s8(vb8x0, va0x0);
   [all …]
|
D | 2x16c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup():
    62  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
    84  …0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    91  …0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    98  …0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   105  …0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   112  …0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   119  …0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   126  …0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   133  …0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   140  …0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
   [all …]
|
D | 2x8c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal():
    70  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
    85  int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
    92  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
    99  int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
   106  int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
   113  int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
   120  int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
   127  int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
   134  int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
|
D | 2x8c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup():
    58  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
    72  …0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    79  …0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    86  …0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
    93  …0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   100  …0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
   107  …0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
   114  …0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
   121  …0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
|
D | 3x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal():
   108  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
   133  int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
   143  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
   153  int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
   163  int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
   173  int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
   183  int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
   193  int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
   203  int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
   213  int16x8_t vprod0x8 = vmull_s8(vb8x0, va0x0);
   [all …]
|
D | 3x16c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup():
    72  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
    96  …0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   106  …0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   116  …0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   126  …0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   136  …0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   146  …0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   156  …0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   166  …0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   176  …0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
   [all …]
|
D | 3x8c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal():
    84  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
   101  int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
   111  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
   121  int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
   131  int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
   141  int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
   151  int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
   161  int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
   171  int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
|
D | 3x8c2-minmax-neon-mlal-padal-dup.c | in xnn_qs8_gemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup():
    66  const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;  (local)
    82  …0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
    92  …0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
   102  …0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   112  …0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
   122  …0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
   132  …0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
   142  …0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
   152  …0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
|