/external/XNNPACK/src/qs8-vaddc/gen/ |
D | minmax-sse2-mul16-ld64-x32.c | matches in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x32():
    24  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.x_multiplier_lo);  [local]
    48  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    49  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    50  __m128i vxprod89ABCDEFhi = _mm_mulhi_epu16(vx89ABCDEF, vx_multiplier_lo);
    51  const __m128i vxprod89ABCDEFlo = _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_lo);
    52  __m128i vxprodGHIJKLMNhi = _mm_mulhi_epu16(vxGHIJKLMN, vx_multiplier_lo);
    53  const __m128i vxprodGHIJKLMNlo = _mm_mullo_epi16(vxGHIJKLMN, vx_multiplier_lo);
    54  __m128i vxprodOPQRSTUVhi = _mm_mulhi_epu16(vxOPQRSTUV, vx_multiplier_lo);
    55  const __m128i vxprodOPQRSTUVlo = _mm_mullo_epi16(vxOPQRSTUV, vx_multiplier_lo);
    62  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
    [all …]
|
D | minmax-sse41-mul16-ld64-x32.c | matches in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x32():
    24  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.x_multiplier_lo);  [local]
    44  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    45  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    46  __m128i vxprod89ABCDEFhi = _mm_mulhi_epu16(vx89ABCDEF, vx_multiplier_lo);
    47  const __m128i vxprod89ABCDEFlo = _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_lo);
    48  __m128i vxprodGHIJKLMNhi = _mm_mulhi_epu16(vxGHIJKLMN, vx_multiplier_lo);
    49  const __m128i vxprodGHIJKLMNlo = _mm_mullo_epi16(vxGHIJKLMN, vx_multiplier_lo);
    50  __m128i vxprodOPQRSTUVhi = _mm_mulhi_epu16(vxOPQRSTUV, vx_multiplier_lo);
    51  const __m128i vxprodOPQRSTUVlo = _mm_mullo_epi16(vxOPQRSTUV, vx_multiplier_lo);
    58  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
    [all …]
|
D | minmax-sse41-mul16-ld64-x24.c | matches in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x24():
    24  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.x_multiplier_lo);  [local]
    43  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    44  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    45  __m128i vxprod89ABCDEFhi = _mm_mulhi_epu16(vx89ABCDEF, vx_multiplier_lo);
    46  const __m128i vxprod89ABCDEFlo = _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_lo);
    47  __m128i vxprodGHIJKLMNhi = _mm_mulhi_epu16(vxGHIJKLMN, vx_multiplier_lo);
    48  const __m128i vxprodGHIJKLMNlo = _mm_mullo_epi16(vxGHIJKLMN, vx_multiplier_lo);
    54  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
    55  …= _mm_sub_epi16(vxprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vx89ABCDEF, 15), vx_multiplier_lo));
    56  …= _mm_sub_epi16(vxprodGHIJKLMNhi, _mm_and_si128(_mm_srai_epi16(vxGHIJKLMN, 15), vx_multiplier_lo));
    [all …]
|
D | minmax-sse2-mul16-ld64-x24.c | matches in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x24():
    24  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.x_multiplier_lo);  [local]
    46  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    47  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    48  __m128i vxprod89ABCDEFhi = _mm_mulhi_epu16(vx89ABCDEF, vx_multiplier_lo);
    49  const __m128i vxprod89ABCDEFlo = _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_lo);
    50  __m128i vxprodGHIJKLMNhi = _mm_mulhi_epu16(vxGHIJKLMN, vx_multiplier_lo);
    51  const __m128i vxprodGHIJKLMNlo = _mm_mullo_epi16(vxGHIJKLMN, vx_multiplier_lo);
    57  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
    58  …= _mm_sub_epi16(vxprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vx89ABCDEF, 15), vx_multiplier_lo));
    59  …= _mm_sub_epi16(vxprodGHIJKLMNhi, _mm_and_si128(_mm_srai_epi16(vxGHIJKLMN, 15), vx_multiplier_lo));
    [all …]
|
D | minmax-sse41-mul16-ld64-x16.c | matches in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x16():
    24  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.x_multiplier_lo);  [local]
    42  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    43  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    44  __m128i vxprod89ABCDEFhi = _mm_mulhi_epu16(vx89ABCDEF, vx_multiplier_lo);
    45  const __m128i vxprod89ABCDEFlo = _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_lo);
    50  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
    51  …= _mm_sub_epi16(vxprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vx89ABCDEF, 15), vx_multiplier_lo));
    88  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    89  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    93  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
|
D | minmax-sse2-mul16-ld64-x16.c | matches in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16():
    24  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.x_multiplier_lo);  [local]
    44  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    45  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    46  __m128i vxprod89ABCDEFhi = _mm_mulhi_epu16(vx89ABCDEF, vx_multiplier_lo);
    47  const __m128i vxprod89ABCDEFlo = _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_lo);
    52  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
    53  …= _mm_sub_epi16(vxprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vx89ABCDEF, 15), vx_multiplier_lo));
    91  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    92  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    96  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
|
D | minmax-sse41-mul16-ld64-x8.c | matches in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8():
    24  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.x_multiplier_lo);  [local]
    41  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    42  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    46  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
    73  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    74  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    78  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
|
D | minmax-sse2-mul16-ld64-x8.c | matches in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8():
    24  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.x_multiplier_lo);  [local]
    42  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    43  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    47  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
    75  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    76  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    80  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
|
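Every ukernel in this group builds its 32-bit products the same way: _mm_mullo_epi16 supplies the low 16 bits of each x * multiplier_lo product, _mm_mulhi_epu16 supplies the unsigned high 16 bits, and the _mm_srai_epi16 / _mm_and_si128 / _mm_sub_epi16 sequence on the later lines corrects the high halves for negative lanes, because a negative 16-bit lane x reads as x + 0x10000 when treated as unsigned, which inflates its unsigned high half by exactly the multiplier. Below is a minimal standalone sketch of that trick; the inputs and variable names are hypothetical, not taken from these kernels.

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Hypothetical inputs: 8 signed 16-bit lanes and a multiplier half that
   * does not fit in int16, so _mm_mulhi_epi16 alone would be wrong. */
  const int16_t x[8] = { -3, -2, -1, 0, 1, 2, 3, -32768 };
  const uint16_t m = 40000;

  const __m128i vx = _mm_loadu_si128((const __m128i*) x);
  const __m128i vm = _mm_set1_epi16((short) m);

  /* Low 16 bits of each product; identical for signed and unsigned inputs. */
  const __m128i vprodlo = _mm_mullo_epi16(vx, vm);
  /* Unsigned high 16 bits, then the sign fix used by the kernels above:
   * a negative lane reads as x + 0x10000 unsigned, so its unsigned high
   * half exceeds the signed high half by exactly m. */
  __m128i vprodhi = _mm_mulhi_epu16(vx, vm);
  vprodhi = _mm_sub_epi16(vprodhi, _mm_and_si128(_mm_srai_epi16(vx, 15), vm));

  /* Interleave the 16-bit halves into signed 32-bit products. */
  int32_t out[8];
  _mm_storeu_si128((__m128i*) &out[0], _mm_unpacklo_epi16(vprodlo, vprodhi));
  _mm_storeu_si128((__m128i*) &out[4], _mm_unpackhi_epi16(vprodlo, vprodhi));

  for (int i = 0; i < 8; i++) {
    printf("%d * %u = %d\n", x[i], (unsigned) m, out[i]);  /* == (int32_t) x[i] * m */
  }
  return 0;
}

The kernels stop at the corrected hi/lo pair and feed it into the unpack step of the full requantization; the interleave at the end of the sketch only makes the recovered 32-bit products visible.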
/external/XNNPACK/src/qs8-vadd/gen/ |
D | minmax-sse41-mul16-ld64-x32.c | matches in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32():
    25  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.x_multiplier_lo);  [local]
    49  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    51  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    53  __m128i vxprod89ABCDEFhi = _mm_mulhi_epu16(vx89ABCDEF, vx_multiplier_lo);
    55  const __m128i vxprod89ABCDEFlo = _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_lo);
    57  __m128i vxprodGHIJKLMNhi = _mm_mulhi_epu16(vxGHIJKLMN, vx_multiplier_lo);
    59  const __m128i vxprodGHIJKLMNlo = _mm_mullo_epi16(vxGHIJKLMN, vx_multiplier_lo);
    61  __m128i vxprodOPQRSTUVhi = _mm_mulhi_epu16(vxOPQRSTUV, vx_multiplier_lo);
    63  const __m128i vxprodOPQRSTUVlo = _mm_mullo_epi16(vxOPQRSTUV, vx_multiplier_lo);
    75  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
    [all …]
|
D | minmax-sse2-mul16-ld64-x32.c | matches in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32():
    25  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.x_multiplier_lo);  [local]
    57  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    59  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    61  __m128i vxprod89ABCDEFhi = _mm_mulhi_epu16(vx89ABCDEF, vx_multiplier_lo);
    63  const __m128i vxprod89ABCDEFlo = _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_lo);
    65  __m128i vxprodGHIJKLMNhi = _mm_mulhi_epu16(vxGHIJKLMN, vx_multiplier_lo);
    67  const __m128i vxprodGHIJKLMNlo = _mm_mullo_epi16(vxGHIJKLMN, vx_multiplier_lo);
    69  __m128i vxprodOPQRSTUVhi = _mm_mulhi_epu16(vxOPQRSTUV, vx_multiplier_lo);
    71  const __m128i vxprodOPQRSTUVlo = _mm_mullo_epi16(vxOPQRSTUV, vx_multiplier_lo);
    83  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
    [all …]
|
D | minmax-sse41-mul16-ld64-x24.c | matches in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24():
    25  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.x_multiplier_lo);  [local]
    47  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    49  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    51  __m128i vxprod89ABCDEFhi = _mm_mulhi_epu16(vx89ABCDEF, vx_multiplier_lo);
    53  const __m128i vxprod89ABCDEFlo = _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_lo);
    55  __m128i vxprodGHIJKLMNhi = _mm_mulhi_epu16(vxGHIJKLMN, vx_multiplier_lo);
    57  const __m128i vxprodGHIJKLMNlo = _mm_mullo_epi16(vxGHIJKLMN, vx_multiplier_lo);
    67  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
    69  …= _mm_sub_epi16(vxprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vx89ABCDEF, 15), vx_multiplier_lo));
    71  …= _mm_sub_epi16(vxprodGHIJKLMNhi, _mm_and_si128(_mm_srai_epi16(vxGHIJKLMN, 15), vx_multiplier_lo));
    [all …]
|
D | minmax-sse2-mul16-ld64-x24.c | matches in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24():
    25  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.x_multiplier_lo);  [local]
    53  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    55  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    57  __m128i vxprod89ABCDEFhi = _mm_mulhi_epu16(vx89ABCDEF, vx_multiplier_lo);
    59  const __m128i vxprod89ABCDEFlo = _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_lo);
    61  __m128i vxprodGHIJKLMNhi = _mm_mulhi_epu16(vxGHIJKLMN, vx_multiplier_lo);
    63  const __m128i vxprodGHIJKLMNlo = _mm_mullo_epi16(vxGHIJKLMN, vx_multiplier_lo);
    73  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
    75  …= _mm_sub_epi16(vxprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vx89ABCDEF, 15), vx_multiplier_lo));
    77  …= _mm_sub_epi16(vxprodGHIJKLMNhi, _mm_and_si128(_mm_srai_epi16(vxGHIJKLMN, 15), vx_multiplier_lo));
    [all …]
|
D | minmax-sse2-mul16-ld64-x16.c | matches in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16():
    25  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.x_multiplier_lo);  [local]
    49  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    51  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    53  __m128i vxprod89ABCDEFhi = _mm_mulhi_epu16(vx89ABCDEF, vx_multiplier_lo);
    55  const __m128i vxprod89ABCDEFlo = _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_lo);
    63  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
    65  …= _mm_sub_epi16(vxprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vx89ABCDEF, 15), vx_multiplier_lo));
   112  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
   114  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
   120  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
|
D | minmax-sse41-mul16-ld64-x16.c | matches in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16():
    25  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.x_multiplier_lo);  [local]
    45  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    47  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    49  __m128i vxprod89ABCDEFhi = _mm_mulhi_epu16(vx89ABCDEF, vx_multiplier_lo);
    51  const __m128i vxprod89ABCDEFlo = _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_lo);
    59  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
    61  …= _mm_sub_epi16(vxprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vx89ABCDEF, 15), vx_multiplier_lo));
   106  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
   108  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
   114  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
|
D | minmax-sse41-mul16-ld64-x8.c | matches in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8():
    25  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.x_multiplier_lo);  [local]
    43  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    45  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    51  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
    83  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    85  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    91  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
|
D | minmax-sse2-mul16-ld64-x8.c | matches in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8():
    25  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.x_multiplier_lo);  [local]
    45  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    47  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    53  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
    87  __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
    89  const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
    95  …= _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
|
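The qs8-vadd group applies the identical sign correction; it differs from qs8-vaddc only in that both addends are loaded per element rather than one being a broadcast constant. Per lane, the correction reduces to a scalar identity. The sketch below models it with a hypothetical helper (not part of XNNPACK) and checks it exhaustively against a plain 32-bit multiply:

#include <assert.h>
#include <stdint.h>

/* Hypothetical scalar model of the mul16 sign fix: multiply a signed 16-bit
 * lane x by an unsigned 16-bit multiplier half m, rebuilding the signed
 * 32-bit product from 16-bit halves the way the SSE kernels do. */
static int32_t mul16_signed_by_unsigned(int16_t x, uint16_t m) {
  const uint32_t uprod = (uint32_t) (uint16_t) x * m;  /* unsigned 16x16 -> 32 */
  const uint16_t lo = (uint16_t) uprod;                /* _mm_mullo_epi16 */
  uint16_t hi = (uint16_t) (uprod >> 16);              /* _mm_mulhi_epu16 */
  /* _mm_sub_epi16(hi, _mm_and_si128(_mm_srai_epi16(x, 15), m)): a negative
   * lane reads as x + 0x10000 unsigned, overshooting the high half by m. */
  if (x < 0) hi = (uint16_t) (hi - m);
  return (int32_t) ((uint32_t) lo | ((uint32_t) hi << 16));
}

int main(void) {
  for (int32_t x = INT16_MIN; x <= INT16_MAX; x++) {
    assert(mul16_signed_by_unsigned((int16_t) x, 40000) == x * 40000);
  }
  return 0;
}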
/external/XNNPACK/src/qs8-vaddc/ |
D | sse-mul16-ld64.c.in | matches:
    25  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.x_multiplier_lo);  [variable]
    53  __m128i vxprod${ABC[N:N+8]}hi = _mm_mulhi_epu16(vx${ABC[N:N+8]}, vx_multiplier_lo);
    54  const __m128i vxprod${ABC[N:N+8]}lo = _mm_mullo_epi16(vx${ABC[N:N+8]}, vx_multiplier_lo);
    60  …epi16(vxprod${ABC[N:N+8]}hi, _mm_and_si128(_mm_srai_epi16(vx${ABC[N:N+8]}, 15), vx_multiplier_lo));
   110  __m128i vxprod${ABC[0:8]}hi = _mm_mulhi_epu16(vx${ABC[0:8]}, vx_multiplier_lo);
   111  const __m128i vxprod${ABC[0:8]}lo = _mm_mullo_epi16(vx${ABC[0:8]}, vx_multiplier_lo);
   115  …sub_epi16(vxprod${ABC[0:8]}hi, _mm_and_si128(_mm_srai_epi16(vx${ABC[0:8]}, 15), vx_multiplier_lo));
|
/external/XNNPACK/src/qs8-vadd/ |
D | sse-mul16-ld64.c.in | matches:
    26  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.x_multiplier_lo);  [variable]
    59  __m128i vxprod${ABC[N:N+8]}hi = _mm_mulhi_epu16(vx${ABC[N:N+8]}, vx_multiplier_lo);
    61  const __m128i vxprod${ABC[N:N+8]}lo = _mm_mullo_epi16(vx${ABC[N:N+8]}, vx_multiplier_lo);
    69  …epi16(vxprod${ABC[N:N+8]}hi, _mm_and_si128(_mm_srai_epi16(vx${ABC[N:N+8]}, 15), vx_multiplier_lo));
   128  __m128i vxprod${ABC[0:8]}hi = _mm_mulhi_epu16(vx${ABC[0:8]}, vx_multiplier_lo);
   130  const __m128i vxprod${ABC[0:8]}lo = _mm_mullo_epi16(vx${ABC[0:8]}, vx_multiplier_lo);
   136  …sub_epi16(vxprod${ABC[0:8]}hi, _mm_and_si128(_mm_srai_epi16(vx${ABC[0:8]}, 15), vx_multiplier_lo));
|
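Both templates derive the lane-group suffixes seen in the gen/ files by slicing ${ABC[N:N+8]}. A speculative illustration of the naming scheme follows, assuming ABC is the alphabet string "0123456789ABCDEFGHIJKLMNOPQRSTUV" as in other XNNPACK templates; the string and the program are illustrative, not taken from the repository:

#include <stdio.h>

int main(void) {
  /* Assumed template alphabet: digit and letter characters, one per lane. */
  const char* abc = "0123456789ABCDEFGHIJKLMNOPQRSTUV";
  /* Each unrolled 8-lane group at offset N gets an 8-character suffix. */
  for (int n = 0; n < 32; n += 8) {
    printf("N = %2d -> vx%.8s, vxprod%.8shi, vxprod%.8slo\n",
           n, abc + n, abc + n, abc + n);
  }
  return 0;
}

With this reading, N = 0 yields vx01234567, N = 8 yields vx89ABCDEF, N = 16 yields vxGHIJKLMN, and N = 24 yields vxOPQRSTUV, matching the identifiers in the generated x8 through x32 kernels above.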