/external/XNNPACK/src/qs8-vadd/gen/
D | minmax-sse41-mul16-ld64-x32.c
    all matches inside xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32():
     27  const __m128i vy_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.y_multiplier_lo);  (local)
     50  __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
     52  const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
     54  __m128i vyprod89ABCDEFhi = _mm_mulhi_epu16(vy89ABCDEF, vy_multiplier_lo);
     56  const __m128i vyprod89ABCDEFlo = _mm_mullo_epi16(vy89ABCDEF, vy_multiplier_lo);
     58  __m128i vyprodGHIJKLMNhi = _mm_mulhi_epu16(vyGHIJKLMN, vy_multiplier_lo);
     60  const __m128i vyprodGHIJKLMNlo = _mm_mullo_epi16(vyGHIJKLMN, vy_multiplier_lo);
     62  __m128i vyprodOPQRSTUVhi = _mm_mulhi_epu16(vyOPQRSTUV, vy_multiplier_lo);
     64  const __m128i vyprodOPQRSTUVlo = _mm_mullo_epi16(vyOPQRSTUV, vy_multiplier_lo);
     76  …= _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
    [all …]
D | minmax-sse2-mul16-ld64-x32.c
    all matches inside xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32():
     27  const __m128i vy_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.y_multiplier_lo);  (local)
     58  __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
     60  const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
     62  __m128i vyprod89ABCDEFhi = _mm_mulhi_epu16(vy89ABCDEF, vy_multiplier_lo);
     64  const __m128i vyprod89ABCDEFlo = _mm_mullo_epi16(vy89ABCDEF, vy_multiplier_lo);
     66  __m128i vyprodGHIJKLMNhi = _mm_mulhi_epu16(vyGHIJKLMN, vy_multiplier_lo);
     68  const __m128i vyprodGHIJKLMNlo = _mm_mullo_epi16(vyGHIJKLMN, vy_multiplier_lo);
     70  __m128i vyprodOPQRSTUVhi = _mm_mulhi_epu16(vyOPQRSTUV, vy_multiplier_lo);
     72  const __m128i vyprodOPQRSTUVlo = _mm_mullo_epi16(vyOPQRSTUV, vy_multiplier_lo);
     84  …= _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
    [all …]
D | minmax-sse41-mul16-ld64-x24.c
    all matches inside xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24():
     27  const __m128i vy_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.y_multiplier_lo);  (local)
     48  __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
     50  const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
     52  __m128i vyprod89ABCDEFhi = _mm_mulhi_epu16(vy89ABCDEF, vy_multiplier_lo);
     54  const __m128i vyprod89ABCDEFlo = _mm_mullo_epi16(vy89ABCDEF, vy_multiplier_lo);
     56  __m128i vyprodGHIJKLMNhi = _mm_mulhi_epu16(vyGHIJKLMN, vy_multiplier_lo);
     58  const __m128i vyprodGHIJKLMNlo = _mm_mullo_epi16(vyGHIJKLMN, vy_multiplier_lo);
     68  …= _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
     70  …= _mm_sub_epi16(vyprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vy89ABCDEF, 15), vy_multiplier_lo));
     72  …= _mm_sub_epi16(vyprodGHIJKLMNhi, _mm_and_si128(_mm_srai_epi16(vyGHIJKLMN, 15), vy_multiplier_lo));
    [all …]
D | minmax-sse2-mul16-ld64-x24.c
    all matches inside xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24():
     27  const __m128i vy_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.y_multiplier_lo);  (local)
     54  __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
     56  const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
     58  __m128i vyprod89ABCDEFhi = _mm_mulhi_epu16(vy89ABCDEF, vy_multiplier_lo);
     60  const __m128i vyprod89ABCDEFlo = _mm_mullo_epi16(vy89ABCDEF, vy_multiplier_lo);
     62  __m128i vyprodGHIJKLMNhi = _mm_mulhi_epu16(vyGHIJKLMN, vy_multiplier_lo);
     64  const __m128i vyprodGHIJKLMNlo = _mm_mullo_epi16(vyGHIJKLMN, vy_multiplier_lo);
     74  …= _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
     76  …= _mm_sub_epi16(vyprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vy89ABCDEF, 15), vy_multiplier_lo));
     78  …= _mm_sub_epi16(vyprodGHIJKLMNhi, _mm_and_si128(_mm_srai_epi16(vyGHIJKLMN, 15), vy_multiplier_lo));
    [all …]
D | minmax-sse2-mul16-ld64-x16.c
    all matches inside xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16():
     27  const __m128i vy_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.y_multiplier_lo);  (local)
     50  __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
     52  const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
     54  __m128i vyprod89ABCDEFhi = _mm_mulhi_epu16(vy89ABCDEF, vy_multiplier_lo);
     56  const __m128i vyprod89ABCDEFlo = _mm_mullo_epi16(vy89ABCDEF, vy_multiplier_lo);
     64  …= _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
     66  …= _mm_sub_epi16(vyprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vy89ABCDEF, 15), vy_multiplier_lo));
    113  __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
    115  const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
    121  …= _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
D | minmax-sse41-mul16-ld64-x16.c
    all matches inside xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16():
     27  const __m128i vy_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.y_multiplier_lo);  (local)
     46  __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
     48  const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
     50  __m128i vyprod89ABCDEFhi = _mm_mulhi_epu16(vy89ABCDEF, vy_multiplier_lo);
     52  const __m128i vyprod89ABCDEFlo = _mm_mullo_epi16(vy89ABCDEF, vy_multiplier_lo);
     60  …= _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
     62  …= _mm_sub_epi16(vyprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vy89ABCDEF, 15), vy_multiplier_lo));
    107  __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
    109  const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
    115  …= _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
D | minmax-sse41-mul16-ld64-x8.c
    all matches inside xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8():
     27  const __m128i vy_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.y_multiplier_lo);  (local)
     44  __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
     46  const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
     52  …= _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
     84  __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
     86  const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
     92  …= _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
D | minmax-sse2-mul16-ld64-x8.c
    all matches inside xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8():
     27  const __m128i vy_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.y_multiplier_lo);  (local)
     46  __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
     48  const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
     54  …= _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
     88  __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
     90  const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
     96  …= _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
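Every generated kernel above repeats the same three-instruction pattern per 8-lane block: an unsigned high multiply (_mm_mulhi_epu16), a low multiply (_mm_mullo_epi16), and a sign fixup that subtracts vy_multiplier_lo from the high half wherever the signed input is negative. The fixup is exact because reading a negative 16-bit x as unsigned adds 2^16, which inflates the high 16 bits of the product by exactly one multiplier. Below is a standalone C sketch (not XNNPACK code; the variable names and test values are invented) that checks this identity against plain 32-bit scalar multiplication:

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int16_t x[8] = { -32768, -1234, -1, 0, 1, 1234, 32767, -7 };
  const uint16_t multiplier = 0xB5C3;  /* stand-in for one y_multiplier_lo lane */

  const __m128i vx = _mm_loadu_si128((const __m128i*) x);
  const __m128i vmul = _mm_set1_epi16((short) multiplier);

  /* High 16 bits of the product, computed as if x were unsigned. */
  __m128i vprodhi = _mm_mulhi_epu16(vx, vmul);
  /* Sign fixup: _mm_srai_epi16(vx, 15) is all-ones in negative lanes, so this
     subtracts the multiplier exactly where x < 0, yielding the signed high half. */
  vprodhi = _mm_sub_epi16(vprodhi, _mm_and_si128(_mm_srai_epi16(vx, 15), vmul));
  /* The low 16 bits are identical for signed and unsigned multiplication. */
  const __m128i vprodlo = _mm_mullo_epi16(vx, vmul);

  int16_t hi[8], lo[8];
  _mm_storeu_si128((__m128i*) hi, vprodhi);
  _mm_storeu_si128((__m128i*) lo, vprodlo);

  for (int i = 0; i < 8; i++) {
    const int32_t vec = (int32_t) (((uint32_t) (uint16_t) hi[i] << 16) | (uint16_t) lo[i]);
    const int32_t ref = (int32_t) x[i] * (int32_t) multiplier;
    printf("%6d * %5u -> %11d %s\n", x[i], (unsigned) multiplier, (int) vec,
           vec == ref ? "ok" : "MISMATCH");
  }
  return 0;
}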
/external/XNNPACK/src/qs8-vadd/
D | sse-mul16-ld64.c.in
     28  const __m128i vy_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.y_multiplier_lo);  (variable)
     60  __m128i vyprod${ABC[N:N+8]}hi = _mm_mulhi_epu16(vy${ABC[N:N+8]}, vy_multiplier_lo);
     62  const __m128i vyprod${ABC[N:N+8]}lo = _mm_mullo_epi16(vy${ABC[N:N+8]}, vy_multiplier_lo);
     70  …epi16(vyprod${ABC[N:N+8]}hi, _mm_and_si128(_mm_srai_epi16(vy${ABC[N:N+8]}, 15), vy_multiplier_lo));
    129  __m128i vyprod${ABC[0:8]}hi = _mm_mulhi_epu16(vy${ABC[0:8]}, vy_multiplier_lo);
    131  const __m128i vyprod${ABC[0:8]}lo = _mm_mullo_epi16(vy${ABC[0:8]}, vy_multiplier_lo);
    137  …sub_epi16(vyprod${ABC[0:8]}hi, _mm_and_si128(_mm_srai_epi16(vy${ABC[0:8]}, 15), vy_multiplier_lo));
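Only the y_multiplier_lo half of the 32-bit multiplier appears in these matches. Going by the _lo suffix, the template presumably also loads a y_multiplier_hi half and interleaves the 16-bit partial products into 32-bit lanes. A hedged reconstruction of that widening step (the helper name and vmul_lo/vmul_hi are hypothetical, not taken from the template):

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>

/* Multiplies eight signed 16-bit lanes of vx by a 32-bit multiplier M and
   produces the 32-bit products (modulo 2^32) in two __m128i outputs.
   Sketch only: illustrates the assumed mul16 widening scheme. */
static void widen_mul_x8(__m128i vx, uint32_t M, __m128i* out_lo4, __m128i* out_hi4) {
  const __m128i vmul_lo = _mm_set1_epi16((short) (M & 0xFFFF));  /* like y_multiplier_lo */
  const __m128i vmul_hi = _mm_set1_epi16((short) (M >> 16));     /* assumed y_multiplier_hi */

  const __m128i vprodlo = _mm_mullo_epi16(vx, vmul_lo);  /* low 16 bits of x*M_lo */
  __m128i vprodhi = _mm_mulhi_epu16(vx, vmul_lo);        /* high 16 bits, unsigned */
  /* The recurring sign fixup from the listing: make the high half signed. */
  vprodhi = _mm_sub_epi16(vprodhi, _mm_and_si128(_mm_srai_epi16(vx, 15), vmul_lo));
  /* x*M = x*M_lo + (x*M_hi << 16), so only the low 16 bits of x*M_hi matter. */
  vprodhi = _mm_add_epi16(vprodhi, _mm_mullo_epi16(vx, vmul_hi));

  /* Interleave lo/hi 16-bit halves into 32-bit lanes: lane = (hi << 16) | lo. */
  *out_lo4 = _mm_unpacklo_epi16(vprodlo, vprodhi);
  *out_hi4 = _mm_unpackhi_epi16(vprodlo, vprodhi);
}

The "minmax" in the kernel names implies a later clamp of the requantized sum to the output range; that stage lies outside what these matches show.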