/external/XNNPACK/src/qs8-gemm/gen/
3x4c8-minmax-xop-ld128.c
    101  const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);  in xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128() (local)
    104  vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);  in xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128()
    106  vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);  in xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128()
    108  vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3);  in xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld128()
3x4c8-minmax-sse41-ld128.c
    96   const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);  in xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128() (local)
    99   vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));  in xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128()
    101  vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));  in xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128()
    103  vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));  in xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld128()
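Note on the ld128 hits above: vxb3 is produced by unpacking the high eight int8 weights of vb23 against a sign mask, giving sign-extended int16 lanes that _mm_madd_epi16 (or XOP's fused _mm_maddd_epi16) multiplies with the int16 activations before accumulating into 32-bit lanes. A minimal standalone sketch of that step, assuming vsb23 is the usual _mm_cmpgt_epi8 sign mask (the mask computation is outside the listed lines) and with the surrounding loads and loop omitted:

    #include <emmintrin.h>  /* SSE2 intrinsics used below */

    /* Sketch only: widen the high 8 int8 lanes of vb23 to int16 and fold the
     * product with the int16 activations vxa into the 32-bit accumulator. */
    static inline __m128i acc_hi_i8(__m128i vacc, __m128i vxa, __m128i vb23) {
      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23); /* 0xFF where vb23 < 0 (assumed mask) */
      const __m128i vxb3  = _mm_unpackhi_epi8(vb23, vsb23);            /* high 8 int8 -> 8 int16 */
      return _mm_add_epi32(vacc, _mm_madd_epi16(vxa, vxb3));           /* pairwise i16 products, i32 add */
    }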
3x4c8-minmax-xop-ld64.c
    106  const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);  in xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64() (local)
    108  vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);  in xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64()
    109  vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);  in xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64()
    110  vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3);  in xnn_qs8_gemm_minmax_ukernel_3x4c8__xop_ld64()
3x4c8-xw-minmax-xop.c
    102  const __m128i vxb3 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 24 * sizeof(int16_t)));  in xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop() (local)
    104  vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);  in xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop()
    105  vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);  in xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop()
    106  vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3);  in xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__xop()
3x4c8-minmax-sse41-ld64.c
    101  const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);  in xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64() (local)
    103  vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));  in xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64()
    104  vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));  in xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64()
    105  vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));  in xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64()
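Note on the ld64 hits above: these variants widen eight int8 weights at a time with the SSE4.1 _mm_cvtepi8_epi16 rather than the unpack-with-sign-mask trick; SSE4.1 then needs a separate _mm_add_epi32, while XOP fuses multiply and accumulate in _mm_maddd_epi16. A hedged sketch of the SSE4.1 path (the load and pointer handling are illustrative, not copied from the kernels):

    #include <stdint.h>
    #include <smmintrin.h>  /* SSE4.1 */

    /* Sketch only: ld64-style step for one 8-wide group of int8 weights. */
    static inline __m128i acc_ld64(__m128i vacc, __m128i vxa, const int8_t* b) {
      const __m128i vb3  = _mm_loadl_epi64((const __m128i*) b);  /* 8 int8 weights (illustrative load) */
      const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);               /* sign-extend to int16 */
      return _mm_add_epi32(vacc, _mm_madd_epi16(vxa, vxb3));     /* XOP form: _mm_maddd_epi16(vxa, vxb3, vacc) */
    }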
3x4c8-xw-minmax-sse41.c
    97   const __m128i vxb3 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 24 * sizeof(int16_t)));  in xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41() (local)
    99   vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));  in xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41()
    100  vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));  in xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41()
    101  vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));  in xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__sse41()
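Note on the xw ("extended weights") hits above: the packed weight buffer already stores int16 values, so no widening step is needed and vxb3 is a plain 16-byte load at a 24 * sizeof(int16_t) offset (the loads for vxb0..vxb2 presumably sit at 0, 8 and 16 int16 offsets, but only the vxb3 load appears in these hits). A small sketch of just that addressing, assuming a 16-byte-aligned packed buffer as implied by _mm_load_si128; the helper name is made up for illustration:

    #include <stddef.h>
    #include <stdint.h>
    #include <emmintrin.h>

    /* Sketch only: load the n-th group of 8 pre-widened int16 weights
     * (n = 3 gives the 24 * sizeof(int16_t) offset seen in the listing). */
    static inline __m128i load_xw_group(const void* w, size_t n) {
      return _mm_load_si128((const __m128i*) ((uintptr_t) w + n * 8 * sizeof(int16_t)));
    }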
4x4c2-minmax-xop-ld128.c
    113  const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);  in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128() (local)
    125  _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);  in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128()
    127  _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);  in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128()
    129  _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);  in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128()
    131  _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);  in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld128()
4x4c2-minmax-xop-ld64.c
    122  const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);  in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64() (local)
    125  _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);  in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64()
    127  _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);  in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64()
    129  _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);  in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64()
    131  _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);  in xnn_qs8_gemm_minmax_ukernel_4x4c2__xop_ld64()
4x4c2-xw-minmax-xop.c
    118  const __m128i vxb3 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 24 * sizeof(int16_t)));  in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop() (local)
    121  _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);  in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop()
    123  _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);  in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop()
    125  _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);  in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop()
    127  _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);  in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__xop()
4x4c2-xw-minmax-sse41.c
    113  const __m128i vxb3 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 24 * sizeof(int16_t)));  in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41() (local)
    116  _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41()
    118  _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41()
    120  _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41()
    122  _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_gemm_xw_minmax_ukernel_4x4c2__sse41()
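Note on the 4x4c2 hits above: vxb3 appears to hold the fourth group of two k-values for all four output columns, and each row's matching activation pair is broadcast to every 32-bit lane with _mm_shuffle_epi32(vxa, _MM_SHUFFLE(3, 3, 3, 3)) before the multiply-accumulate. A minimal sketch of that step for one row, SSE4.1 form (XOP folds the add via _mm_maddd_epi16); names are illustrative:

    #include <emmintrin.h>

    /* Sketch only: broadcast the 4th int16 pair of vxa and accumulate its
     * product with the 4-column weight group vxb3 into vacc0123. */
    static inline __m128i acc_c2_group3(__m128i vacc0123, __m128i vxa, __m128i vxb3) {
      const __m128i va3 = _mm_shuffle_epi32(vxa, _MM_SHUFFLE(3, 3, 3, 3)); /* pair 3 in every lane */
      return _mm_add_epi32(vacc0123, _mm_madd_epi16(va3, vxb3));
    }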
2x4c8-xw-minmax-xop.c
    86   const __m128i vxb3 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 24 * sizeof(int16_t)));  in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop() (local)
    88   vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);  in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop()
    89   vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);  in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__xop()
4x4c2-minmax-sse41-ld128.c
    108  const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);  in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128() (local)
    120  _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128()
    122  _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128()
    124  _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128()
    126  _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld128()
4x4c2-minmax-sse41-ld64.c
    117  const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);  in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64() (local)
    120  _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64()
    122  _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64()
    124  _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64()
    126  _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_gemm_minmax_ukernel_4x4c2__sse41_ld64()
2x4c8-minmax-sse41-ld64.c
    85   const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);  in xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64() (local)
    87   vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));  in xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64()
    88   vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));  in xnn_qs8_gemm_minmax_ukernel_2x4c8__sse41_ld64()
2x4c8-xw-minmax-sse41.c
    81   const __m128i vxb3 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 24 * sizeof(int16_t)));  in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41() (local)
    83   vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));  in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41()
    84   vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));  in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__sse41()
2x4c8-minmax-xop-ld128.c
    86   const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);  in xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128() (local)
    89   vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);  in xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128()
    91   vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);  in xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128()
/external/XNNPACK/src/qs8-igemm/gen/ |
3x4c8-minmax-xop-ld64.c
    121  const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);  in xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64() (local)
    123  vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);  in xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64()
    124  vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);  in xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64()
    125  vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3);  in xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld64()
3x4c8-minmax-xop-ld128.c
    116  const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);  in xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128() (local)
    119  vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);  in xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128()
    121  vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);  in xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128()
    123  vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3);  in xnn_qs8_igemm_minmax_ukernel_3x4c8__xop_ld128()
3x4c8-minmax-sse41-ld64.c
    116  const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);  in xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64() (local)
    118  vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));  in xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64()
    119  vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));  in xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64()
    120  vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));  in xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64()
3x4c8-minmax-sse41-ld128.c
    111  const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);  in xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128() (local)
    114  vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));  in xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128()
    116  vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));  in xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128()
    118  vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));  in xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld128()
4x4c2-minmax-sse41-ld128.c
    125  const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);  in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128() (local)
    137  _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128()
    139  _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128()
    141  _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128()
    143  _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld128()
4x4c2-minmax-sse41-ld64.c
    134  const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);  in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64() (local)
    137  _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64()
    139  _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64()
    141  _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64()
    143  _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));  in xnn_qs8_igemm_minmax_ukernel_4x4c2__sse41_ld64()
4x4c2-minmax-xop-ld128.c
    130  const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);  in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128() (local)
    142  _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);  in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128()
    144  _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);  in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128()
    146  _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);  in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128()
    148  _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);  in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128()
4x4c2-minmax-xop-ld64.c
    139  const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);  in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64() (local)
    142  _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);  in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64()
    144  _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);  in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64()
    146  _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);  in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64()
    148  _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);  in xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld64()
2x4c8-minmax-xop-ld64.c
    103  const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);  in xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64() (local)
    105  vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);  in xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64()
    106  vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);  in xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64()