/external/XNNPACK/src/qs8-igemm/gen/ |
D | 1x16-minmax-neon-mlal-lane.c | all hits in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane():
     62     const int16x8_t vxa0 = vmovl_s8(va0);   (local)
     67     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     68   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     72     vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
     73   … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
     77     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     78   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     82     vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
     83   … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
     87     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
     [all …]
|
D | 1x8-minmax-neon-mlal-lane.c | all hits in xnn_qs8_igemm_minmax_ukernel_1x8__neon_mlal_lane():
     60     const int16x8_t vxa0 = vmovl_s8(va0);   (local)
     65     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     66   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     70     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     71   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     75     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
     76   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
     80     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
     81   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
     86   … vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
     [all …]
|
D | 2x16-minmax-neon-mlal-lane.c | all hits in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane():
     74     const int16x8_t vxa0 = vmovl_s8(va0);   (local)
     81     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     82   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     88     vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
     89   … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
     95     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     96   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
    102     vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
    103   … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
    109     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
     [all …]
|
D | 2x8-minmax-neon-mlal-lane.c | all hits in xnn_qs8_igemm_minmax_ukernel_2x8__neon_mlal_lane():
     70     const int16x8_t vxa0 = vmovl_s8(va0);   (local)
     77     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     78   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     84     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     85   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     91     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
     92   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
     98     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
     99   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
    106   … vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
     [all …]
|
D | 3x16-minmax-neon-mlal-lane.c | all hits in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane():
     86     const int16x8_t vxa0 = vmovl_s8(va0);   (local)
     95     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     96   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
    104     vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    105   … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    113     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
    114   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
    122     vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
    123   … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
    131     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
     [all …]
|
D | 3x8-minmax-neon-mlal-lane.c | all hits in xnn_qs8_igemm_minmax_ukernel_3x8__neon_mlal_lane():
     80     const int16x8_t vxa0 = vmovl_s8(va0);   (local)
     89     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     90   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     98     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     99   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
    107     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
    108   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
    116     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
    117   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
    126   … vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
     [all …]
|
D | 4x16-minmax-neon-mlal-lane.c | all hits in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane():
     98     const int16x8_t vxa0 = vmovl_s8(va0);   (local)
    109     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
    110   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
    120     vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    121   … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    131     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
    132   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
    142     vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
    143   … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
    153     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
     [all …]
|
D | 4x8-minmax-neon-mlal-lane.c | all hits in xnn_qs8_igemm_minmax_ukernel_4x8__neon_mlal_lane():
     90     const int16x8_t vxa0 = vmovl_s8(va0);   (local)
    101     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
    102   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
    112     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
    113   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
    123     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
    124   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
    134     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
    135   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
    146   … vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
     [all …]
|
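Every *-neon-mlal-lane entry above hits the same widening multiply-accumulate pattern: va0 holds eight signed 8-bit activations, vmovl_s8 sign-extends them once into the 16-bit vxa0, and each weight column ck is folded in with vmlal_lane_s16, which multiplies a full 16-bit weight vector by a single broadcast lane of vxa0 and accumulates into 32-bit lanes. Below is a minimal sketch of that step; the helper name, the b[8][8] row-major weight layout, and the standalone framing are assumptions for illustration, not XNNPACK's generated code (which fully unrolls this inside its loop over the reduction dimension):

    #include <arm_neon.h>

    /* Hypothetical sketch: acc[n] += a[k] * b[k][n] for k = 0 and k = 4;
     * the remaining k reuse the same two lines with lane indices 1..3. */
    static void qs8_mlal_lane_sketch(const int8_t a[8], const int8_t b[8][8],
                                     int32x4_t* vacc0123, int32x4_t* vacc4567) {
      const int16x8_t vxa0 = vmovl_s8(vld1_s8(a));  /* 8 x s8 -> 8 x s16, done once */

      /* Column c0: lane 0 of the low half of vxa0 against weight row 0. */
      const int16x8_t vxb_c0 = vmovl_s8(vld1_s8(b[0]));
      *vacc0123 = vmlal_lane_s16(*vacc0123, vget_low_s16(vxb_c0), vget_low_s16(vxa0), 0);
      *vacc4567 = vmlal_lane_s16(*vacc4567, vget_high_s16(vxb_c0), vget_low_s16(vxa0), 0);

      /* Columns c1..c3: same two lines with lanes 1, 2, 3 of vget_low_s16(vxa0). */

      /* Column c4: vmlal_lane_s16 can only index a 4-lane vector, so the
       * kernels switch to vget_high_s16(vxa0) and restart at lane 0, which is
       * exactly the "vget_high_s16(vxa0), 0" pairing visible in the hits above. */
      const int16x8_t vxb_c4 = vmovl_s8(vld1_s8(b[4]));
      *vacc0123 = vmlal_lane_s16(*vacc0123, vget_low_s16(vxb_c4), vget_high_s16(vxa0), 0);
      *vacc4567 = vmlal_lane_s16(*vacc4567, vget_high_s16(vxb_c4), vget_high_s16(vxa0), 0);

      /* Columns c5..c7: lanes 1, 2, 3 of vget_high_s16(vxa0). */
    }

The 16-wide kernels carry two extra accumulators (vacc0x89AB, vacc0xCDEF) fed from a second weight vector vxb89ABCDEFc*, and the 2x/3x/4x variants replicate the whole block for additional activation rows vxa1..vxa3.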
D | 1x4c2-minmax-sse41-ld64.c | all hits in xnn_qs8_igemm_minmax_ukernel_1x4c2__sse41_ld64():
     61     const __m128i vxa0 = _mm_cvtepi8_epi16(va0);   (local)
     68       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
     73       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
     78       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
     83       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
     90     const __m128i vxa0 = _mm_cvtepi8_epi16(va0);   (local)
     98       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
    106       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
    114       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
|
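The 1x4c2 SSE4.1 entries (above, and repeated in the qs8-gemm listings below) use a different reduction: _mm_cvtepi8_epi16 widens eight activations once, _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(k, k, k, k)) broadcasts one 32-bit chunk (a pair of 16-bit activations) to all four dwords, and _mm_madd_epi16 multiplies that pair against a widened weight vector while summing adjacent products straight into 32-bit lanes; hence "c2", two reduction channels per step. A hedged sketch of two such steps, with an assumed function name and packed weight layout:

    #include <smmintrin.h>  /* SSE4.1 */
    #include <stdint.h>

    /* Hypothetical illustration of the c2 scheme; not the XNNPACK kernel. */
    static __m128i qs8_c2_steps_sketch(const int8_t a[8], const int8_t w[2][8],
                                       __m128i vacc0x0123) {
      const __m128i vxa0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) a));

      const __m128i vxb0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) w[0]));
      vacc0x0123 = _mm_add_epi32(vacc0x0123,
          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));

      const __m128i vxb1 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) w[1]));
      vacc0x0123 = _mm_add_epi32(vacc0x0123,
          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));

      /* _MM_SHUFFLE(2, 2, 2, 2) and (3, 3, 3, 3) with vxb2/vxb3 consume the
       * remaining two activation pairs, as in the truncated lines above. */
      return vacc0x0123;
    }

Each 1x4c2 file also shows two separate vxa0 declarations (e.g. lines 61 and 90 above); judging by the duplicated "local" hits, the first belongs to the unrolled main loop over k and the second to the remainder path that handles a final partial block.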
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 1x16-minmax-neon-mlal-lane.c | all hits in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane():
     51     const int16x8_t vxa0 = vmovl_s8(va0);   (local)
     56     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     57     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     61     vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
     62     vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
     66     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     67     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     71     vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
     72     vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
     76     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
     [all …]
|
D | 1x8-minmax-neon-mlal-lane.c | all hits in xnn_qs8_gemm_minmax_ukernel_1x8__neon_mlal_lane():
     49     const int16x8_t vxa0 = vmovl_s8(va0);   (local)
     54     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     55     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     59     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     60     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     64     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
     65     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
     69     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
     70     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
     75     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
     [all …]
|
D | 2x16-minmax-neon-mlal-lane.c | all hits in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane():
     61     const int16x8_t vxa0 = vmovl_s8(va0);   (local)
     68     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     69     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     75     vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
     76     vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
     82     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     83     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     89     vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
     90     vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
     96     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
     [all …]
|
D | 2x8-minmax-neon-mlal-lane.c | all hits in xnn_qs8_gemm_minmax_ukernel_2x8__neon_mlal_lane():
     57     const int16x8_t vxa0 = vmovl_s8(va0);   (local)
     64     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     65     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     71     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     72     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     78     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
     79     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
     85     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
     86     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
     93     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
     [all …]
|
D | 3x16-minmax-neon-mlal-lane.c | all hits in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane():
     71     const int16x8_t vxa0 = vmovl_s8(va0);   (local)
     80     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     81     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     89     vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
     90     vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
     98     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     99     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
    107     vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
    108     vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
    116     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
     [all …]
|
D | 3x8-minmax-neon-mlal-lane.c | all hits in xnn_qs8_gemm_minmax_ukernel_3x8__neon_mlal_lane():
     65     const int16x8_t vxa0 = vmovl_s8(va0);   (local)
     74     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     75     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     83     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     84     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     92     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
     93     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
    101     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
    102     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
    111     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
     [all …]
|
D | 4x16-minmax-neon-mlal-lane.c | all hits in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane():
     81     const int16x8_t vxa0 = vmovl_s8(va0);   (local)
     92     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     93     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
    103     vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    104     vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
    114     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
    115     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
    125     vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
    126     vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
    136     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
     [all …]
|
D | 4x8-minmax-neon-mlal-lane.c | all hits in xnn_qs8_gemm_minmax_ukernel_4x8__neon_mlal_lane():
     73     const int16x8_t vxa0 = vmovl_s8(va0);   (local)
     84     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     85     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     95     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     96     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
    106     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
    107     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
    117     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
    118     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
    129     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
     [all …]
|
D | 1x4c2-xw-minmax-xop.c | all hits in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__xop():
     55     const __m128i vxa0 = _mm_cvtepi8_epi16(va0);   (local)
     61       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
     65       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
     69       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
     73       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
     80     const __m128i vxa0 = _mm_cvtepi8_epi16(va0);   (local)
     87       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
     94       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
    101       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
|
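The -xop variants differ from the SSE4.1 ones only in the multiply-accumulate primitive: the truncated lines above end in three arguments (broadcast activations, vxb, vacc0x0123) because AMD XOP provides a fused multiply-add that absorbs the separate _mm_add_epi32. A one-line sketch (helper name assumed):

    #include <x86intrin.h>  /* XOP intrinsics; build with -mxop, AMD-only */

    /* SSE4.1 form: vacc = _mm_add_epi32(vacc, _mm_madd_epi16(vxa_bcast, vxb));
     * XOP form:    one fused instruction, as in the listings above. */
    static inline __m128i xop_mac_step(__m128i vxa_bcast, __m128i vxb, __m128i vacc) {
      return _mm_maddd_epi16(vxa_bcast, vxb, vacc);
    }

The -xw- ("extended weights") files read weights that were pre-widened to 16 bits at packing time, so they skip the per-step widening on the B side.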
D | 1x4c2-xw-minmax-sse41.c | all hits in xnn_qs8_gemm_xw_minmax_ukernel_1x4c2__sse41():
     50     const __m128i vxa0 = _mm_cvtepi8_epi16(va0);   (local)
     56       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
     60       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
     64       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
     68       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
     75     const __m128i vxa0 = _mm_cvtepi8_epi16(va0);   (local)
     82       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
     89       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
     96       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
|
D | 1x4c2-minmax-xop-ld64.c | all hits in xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld64():
     55     const __m128i vxa0 = _mm_cvtepi8_epi16(va0);   (local)
     62       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
     67       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
     72       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
     77       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
     84     const __m128i vxa0 = _mm_cvtepi8_epi16(va0);   (local)
     92       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
    100       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
    108       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
|
D | 1x4c2-minmax-xop-ld128.c | all hits in xnn_qs8_gemm_minmax_ukernel_1x4c2__xop_ld128():
     55     const __m128i vxa0 = _mm_cvtepi8_epi16(va0);   (local)
     64       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
     67       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
     74       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
     77       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
     84     const __m128i vxa0 = _mm_cvtepi8_epi16(va0);   (local)
     92       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
    100       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
    108       _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
|
D | 1x4c2-minmax-sse41-ld128.c | all hits in xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld128():
     50     const __m128i vxa0 = _mm_cvtepi8_epi16(va0);   (local)
     59       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
     62       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
     69       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
     72       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
     79     const __m128i vxa0 = _mm_cvtepi8_epi16(va0);   (local)
     87       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
     95       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
    103       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
|
D | 1x4c2-minmax-sse41-ld64.c | all hits in xnn_qs8_gemm_minmax_ukernel_1x4c2__sse41_ld64():
     50     const __m128i vxa0 = _mm_cvtepi8_epi16(va0);   (local)
     57       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
     62       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
     67       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
     72       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
     79     const __m128i vxa0 = _mm_cvtepi8_epi16(va0);   (local)
     87       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
     95       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
    103       _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
|
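The -ld64 and -ld128 suffixes name only the width of the weight loads: ld64 widens each weight vector from its own 64-bit load, while ld128 pulls 16 weight bytes in one 128-bit load and widens each half, which is consistent with the different bunching of hit line numbers in the two listings. A hedged sketch of the ld128-style load (the helper name and the exact high-half extraction are assumptions, not necessarily XNNPACK's code):

    #include <smmintrin.h>  /* SSE4.1 */
    #include <stdint.h>

    /* One 128-bit load feeding two widened weight vectors. */
    static inline void qs8_load_vxb_ld128(const int8_t* w, __m128i* vxb0, __m128i* vxb1) {
      const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);    /* 16 x s8 */
      *vxb0 = _mm_cvtepi8_epi16(vb01);                             /* low 8 -> s16 */
      *vxb1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(vb01, vb01));   /* high 8 -> s16 */
    }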
/external/XNNPACK/src/qu8-igemm/ |
D | 4x8-minmax-neon.c | all hits in xnn_qu8_igemm_minmax_ukernel_4x8__neon():
     90     const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));   (local)
     99     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 0);
    100   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 0);
    113     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 1);
    114   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 1);
    127     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 2);
    128   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 2);
    141     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_low_s16(vxa0), 3);
    142   … vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567), vget_low_s16(vxa0), 3);
    155   … vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567), vget_high_s16(vxa0), 0);
     [all …]
|
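The qu8 entries (the igemm hit above and the gemm hit below) reuse the same mlal-lane accumulation as the qs8 kernels; only the widening step differs. Unsigned activation bytes are zero-extended with vmovl_u8 and the u16 result is reinterpreted as s16, which is lossless because 0..255 fits in int16; in the full source the weight vectors are produced the same way, with the kernel zero point subtracted before widening. A minimal sketch (helper name assumed):

    #include <arm_neon.h>

    /* Zero-extend 8 unsigned bytes and reinterpret for signed accumulation. */
    static inline int16x8_t qu8_widen_sketch(const uint8_t* a) {
      return vreinterpretq_s16_u16(vmovl_u8(vld1_u8(a)));
    }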
/external/XNNPACK/src/qu8-gemm/ |
D | 4x8-minmax-neon.c | all hits in xnn_qu8_gemm_minmax_ukernel_4x8__neon():
     70     const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));   (local)
     81     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     82     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
     93     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
     94     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
    105     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
    106     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
    117     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
    118     vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
    129     vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
     [all …]
|