/external/XNNPACK/src/qs8-igemm/gen/ |
D | 4x16c16-minmax-neon-mlal-padal.c | 173 vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 174 vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 175 vprod2x0 = vmlal_s8(vprod2x0, vget_high_s8(vb0), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 176 vprod3x0 = vmlal_s8(vprod3x0, vget_high_s8(vb0), vget_high_s8(va3)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 185 vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 186 vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 187 vprod2x1 = vmlal_s8(vprod2x1, vget_high_s8(vb1), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 188 vprod3x1 = vmlal_s8(vprod3x1, vget_high_s8(vb1), vget_high_s8(va3)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 197 vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 198 vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() [all …]
|
D | 3x16c16-minmax-neon-mlal-padal.c | 147 vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 148 vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 149 vprod2x0 = vmlal_s8(vprod2x0, vget_high_s8(vb0), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 156 vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 157 vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 158 vprod2x1 = vmlal_s8(vprod2x1, vget_high_s8(vb1), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 165 vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 166 vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 167 vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 174 vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() [all …]
|
D | 4x8c16-minmax-neon-mlal-padal.c | 133 vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 134 vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 135 vprod2x0 = vmlal_s8(vprod2x0, vget_high_s8(vb0), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 136 vprod3x0 = vmlal_s8(vprod3x0, vget_high_s8(vb0), vget_high_s8(va3)); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 145 vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 146 vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 147 vprod2x1 = vmlal_s8(vprod2x1, vget_high_s8(vb1), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 148 vprod3x1 = vmlal_s8(vprod3x1, vget_high_s8(vb1), vget_high_s8(va3)); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 157 vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() 158 vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() [all …]
|
D | 3x8c16-minmax-neon-mlal-padal.c | 115 vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 116 vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 117 vprod2x0 = vmlal_s8(vprod2x0, vget_high_s8(vb0), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 124 vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 125 vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 126 vprod2x1 = vmlal_s8(vprod2x1, vget_high_s8(vb1), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 133 vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 134 vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 135 vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() 142 vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() [all …]
|
D | 2x16c16-minmax-neon-mlal-padal.c | 121 vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 122 vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 127 vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 128 vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 133 vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 134 vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 139 vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 140 vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 145 vprod0x4 = vmlal_s8(vprod0x4, vget_high_s8(vb4), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 146 vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() [all …]
|
D | 4x16c2-minmax-neon-mlal-padal-dup.c | 130 …vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 131 …vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 132 …vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 133 …vprod3x0123c0 = vmlal_s8(vprod3x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 143 …vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 144 …vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 145 …vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 146 …vprod3x4567c0 = vmlal_s8(vprod3x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 156 …vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 157 …vprod1x89ABc0 = vmlal_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() [all …]
|
D | 4x16c8-minmax-neon-mlal-padal.c | 178 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 179 vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 180 vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 181 vprod3x0 = vmlal_s8(vprod3x0, vb0x1, va3x1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 191 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 192 vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 193 vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 194 vprod3x1 = vmlal_s8(vprod3x1, vb1x1, va3x1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 204 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 205 vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() [all …]
|
D | 3x16c2-minmax-neon-mlal-padal-dup.c | 115 …vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 116 …vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 117 …vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 125 …vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 126 …vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 127 …vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 135 …vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 136 …vprod1x89ABc0 = vmlal_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 137 …vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 145 …vprod0xCDEFc0 = vmlal_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() [all …]
|
D | 3x16c8-minmax-neon-mlal-padal.c | 151 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 152 vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 153 vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 161 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 162 vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 163 vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 171 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 172 vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 173 vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 181 vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() [all …]
|
D | 2x16c8-minmax-neon-mlal-padal.c | 124 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 125 vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 131 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 132 vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 138 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 139 vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 145 vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 146 vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 152 vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 153 vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() [all …]
|
D | 4x8c2-minmax-neon-mlal-padal-dup.c | 114 …vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() 115 …vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() 116 …vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() 117 …vprod3x0123c0 = vmlal_s8(vprod3x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() 127 …vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() 128 …vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() 129 …vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() 130 …vprod3x4567c0 = vmlal_s8(vprod3x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() 140 …vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() 141 …vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() [all …]
|
D | 4x8c8-minmax-neon-mlal-padal.c | 138 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() 139 vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() 140 vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() 141 vprod3x0 = vmlal_s8(vprod3x0, vb0x1, va3x1); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() 151 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() 152 vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() 153 vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() 154 vprod3x1 = vmlal_s8(vprod3x1, vb1x1, va3x1); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() 164 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() 165 vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1); in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() [all …]
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 4x16c16-minmax-neon-mlal-padal.c | 156 vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 157 vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 158 vprod2x0 = vmlal_s8(vprod2x0, vget_high_s8(vb0), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 159 vprod3x0 = vmlal_s8(vprod3x0, vget_high_s8(vb0), vget_high_s8(va3)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 168 vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 169 vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 170 vprod2x1 = vmlal_s8(vprod2x1, vget_high_s8(vb1), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 171 vprod3x1 = vmlal_s8(vprod3x1, vget_high_s8(vb1), vget_high_s8(va3)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 180 vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 181 vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() [all …]
|
D | 3x16c16-minmax-neon-mlal-padal.c | 132 vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 133 vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 134 vprod2x0 = vmlal_s8(vprod2x0, vget_high_s8(vb0), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 141 vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 142 vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 143 vprod2x1 = vmlal_s8(vprod2x1, vget_high_s8(vb1), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 150 vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 151 vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 152 vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 159 vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() [all …]
|
D | 2x16c16-minmax-neon-mlal-padal.c | 108 vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 109 vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 114 vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 115 vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 120 vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 121 vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 126 vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 127 vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 132 vprod0x4 = vmlal_s8(vprod0x4, vget_high_s8(vb4), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 133 vprod1x4 = vmlal_s8(vprod1x4, vget_high_s8(vb4), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() [all …]
|
D | 4x8c16-minmax-neon-mlal-padal.c | 116 vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 117 vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 118 vprod2x0 = vmlal_s8(vprod2x0, vget_high_s8(vb0), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 119 vprod3x0 = vmlal_s8(vprod3x0, vget_high_s8(vb0), vget_high_s8(va3)); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 128 vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 129 vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 130 vprod2x1 = vmlal_s8(vprod2x1, vget_high_s8(vb1), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 131 vprod3x1 = vmlal_s8(vprod3x1, vget_high_s8(vb1), vget_high_s8(va3)); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 140 vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() 141 vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() [all …]
|
D | 3x8c16-minmax-neon-mlal-padal.c | 100 vprod0x0 = vmlal_s8(vprod0x0, vget_high_s8(vb0), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 101 vprod1x0 = vmlal_s8(vprod1x0, vget_high_s8(vb0), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 102 vprod2x0 = vmlal_s8(vprod2x0, vget_high_s8(vb0), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 109 vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 110 vprod1x1 = vmlal_s8(vprod1x1, vget_high_s8(vb1), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 111 vprod2x1 = vmlal_s8(vprod2x1, vget_high_s8(vb1), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 118 vprod0x2 = vmlal_s8(vprod0x2, vget_high_s8(vb2), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 119 vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 120 vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() 127 vprod0x3 = vmlal_s8(vprod0x3, vget_high_s8(vb3), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() [all …]
|
D | 3x16c8-minmax-neon-mlal-padal.c | 136 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 137 vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 138 vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 146 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 147 vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 148 vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 156 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 157 vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 158 vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 166 vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() [all …]
|
D | 4x16c2-minmax-neon-mlal-padal-dup.c | 113 …vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 114 …vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 115 …vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 116 …vprod3x0123c0 = vmlal_s8(vprod3x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 126 …vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 127 …vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 128 …vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 129 …vprod3x4567c0 = vmlal_s8(vprod3x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 139 …vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() 140 …vprod1x89ABc0 = vmlal_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup() [all …]
|
D | 4x16c8-minmax-neon-mlal-padal.c | 161 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 162 vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 163 vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 164 vprod3x0 = vmlal_s8(vprod3x0, vb0x1, va3x1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 174 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 175 vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 176 vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 177 vprod3x1 = vmlal_s8(vprod3x1, vb1x1, va3x1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 187 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 188 vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() [all …]
|
D | 3x16c2-minmax-neon-mlal-padal-dup.c | 100 …vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 101 …vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 102 …vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 110 …vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 111 …vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 112 …vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 120 …vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 121 …vprod1x89ABc0 = vmlal_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 122 …vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() 130 …vprod0xCDEFc0 = vmlal_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup() [all …]
|
D | 4x8c8-minmax-neon-mlal-padal.c | 121 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() 122 vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() 123 vprod2x0 = vmlal_s8(vprod2x0, vb0x1, va2x1); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() 124 vprod3x0 = vmlal_s8(vprod3x0, vb0x1, va3x1); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() 134 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() 135 vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() 136 vprod2x1 = vmlal_s8(vprod2x1, vb1x1, va2x1); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() 137 vprod3x1 = vmlal_s8(vprod3x1, vb1x1, va3x1); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() 147 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() 148 vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1); in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() [all …]
|
D | 2x16c8-minmax-neon-mlal-padal.c | 111 vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 112 vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 118 vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 119 vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 125 vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 126 vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 132 vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 133 vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 139 vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 140 vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() [all …]
|
D | 2x16c2-minmax-neon-mlal-padal-dup.c | 87 …vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 88 …vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 94 …vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 95 …vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 101 …vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 102 …vprod1x89ABc0 = vmlal_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 108 …vprod0xCDEFc0 = vmlal_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 109 …vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 115 …vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 116 …vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() [all …]
|
D | 4x8c2-minmax-neon-mlal-padal-dup.c | 97 …vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() 98 …vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() 99 …vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() 100 …vprod3x0123c0 = vmlal_s8(vprod3x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() 110 …vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() 111 …vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() 112 …vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() 113 …vprod3x4567c0 = vmlal_s8(vprod3x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() 123 …vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() 124 …vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret… in xnn_qs8_gemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup() [all …]
|