/external/XNNPACK/src/qs8-gemm/gen/ |
D | 4x16-minmax-neon-mlal-lane.c | 92 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 93 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 94 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 95 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 96 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 97 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 98 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 99 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 103 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 104 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 3x16-minmax-neon-mlal-lane.c | 80 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 81 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 82 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 83 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 84 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 85 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 89 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 90 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 91 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 92 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() [all …]
|
D | 4x8-minmax-neon-mlal-lane.c | 84 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane() 85 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane() 86 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane() 87 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane() 88 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane() 89 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane() 90 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane() 91 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane() 95 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane() 96 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane() [all …]
|
D | 2x16-minmax-neon-mlal-lane.c | 68 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 69 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 70 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 71 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 75 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 76 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 77 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 78 vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 82 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 83 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() [all …]
|
D | 3x8-minmax-neon-mlal-lane.c | 74 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane() 75 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane() 76 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane() 77 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane() 78 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane() 79 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane() 83 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane() 84 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane() 85 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane() 86 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane() [all …]
|
D | 2x8-minmax-neon-mlal-lane.c | 64 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane() 65 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane() 66 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane() 67 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane() 71 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane() 72 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane() 73 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane() 74 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); in xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane() 78 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); in xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane() 79 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); in xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane() [all …]
|
D | 1x16-minmax-neon-mlal-lane.c | 56 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 57 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 61 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 62 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 66 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 67 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 71 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 72 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 76 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 77 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-neon-mull-addw-dup.c | 88 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 91 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 94 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 97 vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 102 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 105 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 108 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 111 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 116 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 119 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() [all …]
|
/external/XNNPACK/src/qu8-gemm/ |
D | 8x8-minmax-neon.c | 121 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() 122 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() 123 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() 124 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() 125 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() 126 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() 127 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() 128 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() 129 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() 130 vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0); in xnn_qu8_gemm_minmax_ukernel_8x8__neon() [all …]
|
D | 4x8-minmax-neon.c | 81 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qu8_gemm_minmax_ukernel_4x8__neon() 82 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qu8_gemm_minmax_ukernel_4x8__neon() 83 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qu8_gemm_minmax_ukernel_4x8__neon() 84 vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qu8_gemm_minmax_ukernel_4x8__neon() 85 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qu8_gemm_minmax_ukernel_4x8__neon() 86 vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qu8_gemm_minmax_ukernel_4x8__neon() 87 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_ukernel_4x8__neon() 88 vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qu8_gemm_minmax_ukernel_4x8__neon() 93 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qu8_gemm_minmax_ukernel_4x8__neon() 94 vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qu8_gemm_minmax_ukernel_4x8__neon() [all …]
|
/external/XNNPACK/src/qu8-igemm/ |
D | 8x8-minmax-neon.c | 147 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 0); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() 148 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 0); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() 149 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 0); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() 150 … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 0); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() 151 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 0); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() 152 … vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 0); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() 153 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() 154 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() 155 vacc4x0123 = vmlal_lane_s16(vacc4x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa4), 0); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() 156 … vacc4x4567 = vmlal_lane_s16(vacc4x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa4), 0); in xnn_qu8_igemm_minmax_ukernel_8x8__neon() [all …]
|
D | 4x8-minmax-neon.c | 99 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 0); in xnn_qu8_igemm_minmax_ukernel_4x8__neon() 100 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 0); in xnn_qu8_igemm_minmax_ukernel_4x8__neon() 101 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa1), 0); in xnn_qu8_igemm_minmax_ukernel_4x8__neon() 102 … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa1), 0); in xnn_qu8_igemm_minmax_ukernel_4x8__neon() 103 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa2), 0); in xnn_qu8_igemm_minmax_ukernel_4x8__neon() 104 … vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa2), 0); in xnn_qu8_igemm_minmax_ukernel_4x8__neon() 105 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_ukernel_4x8__neon() 106 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa3), 0); in xnn_qu8_igemm_minmax_ukernel_4x8__neon() 113 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 1); in xnn_qu8_igemm_minmax_ukernel_4x8__neon() 114 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 1); in xnn_qu8_igemm_minmax_ukernel_4x8__neon() [all …]
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 4x16-minmax-neon-mlal-lane.c | 109 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 110 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 111 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 112 … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 113 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 114 … vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 115 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 116 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 120 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 121 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 3x16-minmax-neon-mlal-lane.c | 95 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 96 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 97 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 98 … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 99 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 100 … vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 104 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 105 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 106 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 107 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() [all …]
|
D | 4x8-minmax-neon-mlal-lane.c | 101 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane() 102 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane() 103 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane() 104 … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane() 105 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane() 106 … vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane() 107 vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane() 108 … vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane() 112 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane() 113 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane() [all …]
|
D | 2x16-minmax-neon-mlal-lane.c | 81 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 82 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 83 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 84 … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 88 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 89 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 90 vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 91 … vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 95 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 96 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() [all …]
|
D | 3x8-minmax-neon-mlal-lane.c | 89 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane() 90 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane() 91 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane() 92 … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane() 93 vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane() 94 … vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane() 98 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane() 99 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane() 100 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane() 101 … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane() [all …]
|
D | 2x8-minmax-neon-mlal-lane.c | 77 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane() 78 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane() 79 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane() 80 … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0); in xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane() 84 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane() 85 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane() 86 vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1); in xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane() 87 … vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1); in xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane() 91 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane() 92 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane() [all …]
|
D | 1x16-minmax-neon-mlal-lane.c | 67 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 68 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 72 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 73 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 77 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 78 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 82 vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 83 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 87 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 88 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-neon-mull-addw-dup.c | 105 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 108 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 111 vacc2x0123 = vaddw_s16(vacc2x0123, vget_low_s16(vprod2x01234567c0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 114 vacc3x0123 = vaddw_s16(vacc3x0123, vget_low_s16(vprod3x01234567c0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 119 vacc0x89AB = vaddw_s16(vacc0x89AB, vget_low_s16(vprod0x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 122 vacc1x89AB = vaddw_s16(vacc1x89AB, vget_low_s16(vprod1x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 125 vacc2x89AB = vaddw_s16(vacc2x89AB, vget_low_s16(vprod2x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 128 vacc3x89AB = vaddw_s16(vacc3x89AB, vget_low_s16(vprod3x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 133 vacc0x0123 = vaddw_s16(vacc0x0123, vget_low_s16(vprod0x01234567c1)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 136 vacc1x0123 = vaddw_s16(vacc1x0123, vget_low_s16(vprod1x01234567c1)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() [all …]
|
D | 1x8-minmax-neon-mlal-lane.c | 65 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane() 66 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane() 70 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane() 71 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane() 75 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane() 76 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane() 80 vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3); in xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane() 81 … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3); in xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane() 86 … vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane() 91 … vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane() [all …]
|
/external/XNNPACK/src/qs8-dwconv/gen/ |
D | up32x9-minmax-neon-mul16.c | 108 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi0x01234567), vget_low_s16(vk0x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up32x9__neon_mul16() 110 vacc89AB = vmlal_s16(vacc89AB, vget_low_s16(vi0x89ABCDEF), vget_low_s16(vk0x89ABCDEF)); in xnn_qs8_dwconv_minmax_ukernel_up32x9__neon_mul16() 112 vaccGHIJ = vmlal_s16(vaccGHIJ, vget_low_s16(vi0xGHIJKLMN), vget_low_s16(vk0xGHIJKLMN)); in xnn_qs8_dwconv_minmax_ukernel_up32x9__neon_mul16() 114 vaccOPQR = vmlal_s16(vaccOPQR, vget_low_s16(vi0xOPQRSTUV), vget_low_s16(vk0xOPQRSTUV)); in xnn_qs8_dwconv_minmax_ukernel_up32x9__neon_mul16() 126 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi1x01234567), vget_low_s16(vk1x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up32x9__neon_mul16() 128 vacc89AB = vmlal_s16(vacc89AB, vget_low_s16(vi1x89ABCDEF), vget_low_s16(vk1x89ABCDEF)); in xnn_qs8_dwconv_minmax_ukernel_up32x9__neon_mul16() 130 vaccGHIJ = vmlal_s16(vaccGHIJ, vget_low_s16(vi1xGHIJKLMN), vget_low_s16(vk1xGHIJKLMN)); in xnn_qs8_dwconv_minmax_ukernel_up32x9__neon_mul16() 132 vaccOPQR = vmlal_s16(vaccOPQR, vget_low_s16(vi1xOPQRSTUV), vget_low_s16(vk1xOPQRSTUV)); in xnn_qs8_dwconv_minmax_ukernel_up32x9__neon_mul16() 144 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi2x01234567), vget_low_s16(vk2x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up32x9__neon_mul16() 146 vacc89AB = vmlal_s16(vacc89AB, vget_low_s16(vi2x89ABCDEF), vget_low_s16(vk2x89ABCDEF)); in xnn_qs8_dwconv_minmax_ukernel_up32x9__neon_mul16() [all …]
|
D | up24x9-minmax-neon-mul16.c | 104 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi0x01234567), vget_low_s16(vk0x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16() 106 vacc89AB = vmlal_s16(vacc89AB, vget_low_s16(vi0x89ABCDEF), vget_low_s16(vk0x89ABCDEF)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16() 108 vaccGHIJ = vmlal_s16(vaccGHIJ, vget_low_s16(vi0xGHIJKLMN), vget_low_s16(vk0xGHIJKLMN)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16() 118 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi1x01234567), vget_low_s16(vk1x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16() 120 vacc89AB = vmlal_s16(vacc89AB, vget_low_s16(vi1x89ABCDEF), vget_low_s16(vk1x89ABCDEF)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16() 122 vaccGHIJ = vmlal_s16(vaccGHIJ, vget_low_s16(vi1xGHIJKLMN), vget_low_s16(vk1xGHIJKLMN)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16() 132 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi2x01234567), vget_low_s16(vk2x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16() 134 vacc89AB = vmlal_s16(vacc89AB, vget_low_s16(vi2x89ABCDEF), vget_low_s16(vk2x89ABCDEF)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16() 136 vaccGHIJ = vmlal_s16(vaccGHIJ, vget_low_s16(vi2xGHIJKLMN), vget_low_s16(vk2xGHIJKLMN)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16() 146 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi3x01234567), vget_low_s16(vk3x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16() [all …]
|
D | up16x9-minmax-neon-mul16.c | 100 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi0x01234567), vget_low_s16(vk0x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16() 102 vacc89AB = vmlal_s16(vacc89AB, vget_low_s16(vi0x89ABCDEF), vget_low_s16(vk0x89ABCDEF)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16() 110 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi1x01234567), vget_low_s16(vk1x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16() 112 vacc89AB = vmlal_s16(vacc89AB, vget_low_s16(vi1x89ABCDEF), vget_low_s16(vk1x89ABCDEF)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16() 120 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi2x01234567), vget_low_s16(vk2x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16() 122 vacc89AB = vmlal_s16(vacc89AB, vget_low_s16(vi2x89ABCDEF), vget_low_s16(vk2x89ABCDEF)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16() 130 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi3x01234567), vget_low_s16(vk3x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16() 132 vacc89AB = vmlal_s16(vacc89AB, vget_low_s16(vi3x89ABCDEF), vget_low_s16(vk3x89ABCDEF)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16() 140 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16() 142 vacc89AB = vmlal_s16(vacc89AB, vget_low_s16(vi4x89ABCDEF), vget_low_s16(vk4x89ABCDEF)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16() [all …]
|
/external/libhevc/common/arm/ |
D | ihevc_resi_trans_neon_32x32.c | 137 vget_high_s16(diff_16[2][0]), vget_low_s16(diff_16[2][0])); in ihevc_resi_trans_32x32_neon() 141 vget_high_s16(diff_16[3][0]), vget_low_s16(diff_16[3][0])); in ihevc_resi_trans_32x32_neon() 162 vget_high_s16(diff_16[2][1]), vget_low_s16(diff_16[2][1])); in ihevc_resi_trans_32x32_neon() 166 vget_high_s16(diff_16[3][1]), vget_low_s16(diff_16[3][1])); in ihevc_resi_trans_32x32_neon() 239 e0_1 = vcombine_s16(vget_high_s16(e0_1), vget_low_s16(e0_1)); in ihevc_resi_trans_32x32_neon() 244 e1_1 = vcombine_s16(vget_high_s16(e1_1), vget_low_s16(e1_1)); in ihevc_resi_trans_32x32_neon() 253 vcombine_s16(vget_low_s16(ee0), vget_low_s16(ee1)); in ihevc_resi_trans_32x32_neon() 265 vtrn_s32(vreinterpret_s32_s16(vget_low_s16(eee)), in ihevc_resi_trans_32x32_neon() 274 vtrn_s16(vget_low_s16(eeee), vget_high_s16(eeee)); in ihevc_resi_trans_32x32_neon() 286 vmull_s16(vget_low_s16(g_ai2_ihevc_trans_32_01_8), eeee_00); in ihevc_resi_trans_32x32_neon() [all …]
|