/external/XNNPACK/src/qs8-gemm/gen/
D | 1x8-minmax-rndnu-neon-mull-addw-dup.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup)
     67  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
     68  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
     69  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
    119  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    120  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    121  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));

D | 2x8-minmax-rndnu-neon-mull-addw-dup.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup)
     85  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
     86  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
     87  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
    162  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    163  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    164  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));

D | 1x16-minmax-rndnu-neon-mull-addw-dup.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mull_addw_dup)
     84  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
     85  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
     86  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
    174  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    175  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    176  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));

D | 3x8-minmax-rndnu-neon-mull-addw-dup.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup)
    103  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    104  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    105  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
    205  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    206  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    207  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));

D | 4x8-minmax-rndnu-neon-mull-addw-dup.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup)
    121  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    122  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    123  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
    248  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    249  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    250  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));

D | 2x16-minmax-rndnu-neon-mull-addw-dup.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16__neon_mull_addw_dup)
    113  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    114  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    115  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
    252  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    253  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    254  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));

D | 3x16-minmax-rndnu-neon-mull-addw-dup.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup)
    142  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    143  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    144  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
    330  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    331  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    332  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));

D | 4x16-minmax-rndnu-neon-mull-addw-dup.c  (all hits in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup)
    171  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    172  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    173  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
    408  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    409  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    410  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));

/external/XNNPACK/src/qs8-igemm/gen/
D | 1x8-minmax-rndnu-neon-mull-addw-dup.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mull_addw_dup)
     78  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
     79  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
     80  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
    130  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    131  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    132  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));

D | 2x8-minmax-rndnu-neon-mull-addw-dup.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8__neon_mull_addw_dup)
     98  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
     99  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    100  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
    175  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    176  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    177  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));

D | 1x16-minmax-rndnu-neon-mull-addw-dup.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mull_addw_dup)
     95  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
     96  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
     97  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
    185  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    186  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    187  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));

D | 3x8-minmax-rndnu-neon-mull-addw-dup.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_3x8__neon_mull_addw_dup)
    118  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    119  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    120  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
    220  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    221  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    222  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));

D | 4x8-minmax-rndnu-neon-mull-addw-dup.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__neon_mull_addw_dup)
    138  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    139  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    140  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
    265  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    266  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    267  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));

D | 2x16-minmax-rndnu-neon-mull-addw-dup.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16__neon_mull_addw_dup)
    126  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    127  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    128  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
    265  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    266  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    267  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));

D | 3x16-minmax-rndnu-neon-mull-addw-dup.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16__neon_mull_addw_dup)
    157  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    158  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    159  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
    345  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    346  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    347  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));

D | 4x16-minmax-rndnu-neon-mull-addw-dup.c  (all hits in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__neon_mull_addw_dup)
    188  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    189  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    190  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));
    425  const int16x8_t vprod0x01234567c3 = vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));  (local)
    426  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
    427  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));

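Every hit above is the same three-intrinsic step that the mull-addw-dup GEMM and IGEMM microkernels repeat for each lane of the activation vector: broadcast one int8 activation lane, do a widening 8-bit x 8-bit multiply against eight int8 weights, then widen-accumulate the 16-bit products into two int32x4 accumulators. Below is a minimal standalone sketch of that step; the input values and the main() harness are invented for illustration, and only the three intrinsic calls between the comments mirror the generated kernels.

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  /* Stand-in data; the real kernels load these from the packed A and B panels. */
  int8x8_t va0 = vdup_n_s8(2);            /* 8 int8 activations from row 0 */
  int8x8_t vb01234567c3 = vdup_n_s8(3);   /* 8 int8 weights for k-column 3 */
  int32x4_t vacc0x0123 = vdupq_n_s32(0);  /* accumulators, output columns 0-3 */
  int32x4_t vacc0x4567 = vdupq_n_s32(0);  /* accumulators, output columns 4-7 */

  /* Broadcast lane 3 of va0, then widening 8-bit multiply to 16-bit products. */
  const int16x8_t vprod0x01234567c3 =
      vmull_s8(vb01234567c3, vdup_lane_s8(va0, 3));
  /* Widening adds: fold the 16-bit products into the 32-bit accumulators. */
  vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c3));
  vacc0x4567 = vaddw_s16(vacc0x4567, vget_high_s16(vprod0x01234567c3));

  printf("%d\n", vgetq_lane_s32(vacc0x0123, 0));  /* prints 6 (= 2 * 3) */
  return 0;
}

Staging through int16 keeps the multiply on the cheap vmull_s8 path while still giving the accumulators 32 bits of headroom; XNNPACK's neondot kernel variants instead fuse this step into a single SDOT instruction on CPUs with the dot-product extension.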