/external/XNNPACK/src/q8-avgpool/

D | mp9p8q-sse2.c | references in xnn_q8_avgpool_ukernel_mp9p8q__sse2():
      33  const __m128i vzero = _mm_setzero_si128();   (local declaration)
      62  const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
      63  const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
      64  const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
      65  const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
      66  const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
      67  const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
      68  const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
      69  const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);
      70  const __m128i vxi8 = _mm_unpacklo_epi8(vi8, vzero);
      [all …]
D | up9-sse2.c | references in xnn_q8_avgpool_ukernel_up9__sse2():
      33  const __m128i vzero = _mm_setzero_si128();   (local declaration)
      86  const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
      87  const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
      88  const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
      89  const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
      90  const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
      91  const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
      92  const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
      93  const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);
      94  const __m128i vxi8 = _mm_unpacklo_epi8(vi8, vzero);
      [all …]
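Aside on the pattern above: SSE2 has no byte-to-word zero-extension instruction (pmovzxbw only arrives with SSE4.1), so these q8 kernels interleave the uint8 lanes with an all-zero register; read as 16-bit lanes, the low eight bytes come out zero-extended. A minimal self-contained sketch of the idiom, with illustrative names and data not taken from the kernels:

    #include <emmintrin.h>  /* SSE2 */
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const uint8_t in[8] = { 0, 1, 127, 128, 200, 254, 255, 42 };
      uint16_t out[8];

      const __m128i v = _mm_loadl_epi64((const __m128i*) in);  /* 8 bytes, low half */
      const __m128i vzero = _mm_setzero_si128();

      /* Interleave with zero bytes: [a0 a1 ...] -> [a0 0 a1 0 ...],
         which read as 16-bit lanes is the zero-extended [a0 a1 ...]. */
      const __m128i vx = _mm_unpacklo_epi8(v, vzero);

      _mm_storeu_si128((__m128i*) out, vx);
      for (int i = 0; i < 8; i++) printf("%u ", (unsigned) out[i]);
      printf("\n");  /* prints: 0 1 127 128 200 254 255 42 */
      return 0;
    }

The companion _mm_unpackhi_epi8(v, vzero) widens the high eight bytes the same way.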
/external/XNNPACK/src/q8-gavgpool/

D | mp7p7q-sse2.c | references in xnn_q8_gavgpool_ukernel_mp7p7q__sse2():
      40  const __m128i vzero = _mm_setzero_si128();   (local declaration)
      52  const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
      53  const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
      54  const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
      55  const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
      56  const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
      57  const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
      58  const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
      68  const __m128i vacc_lo = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
      69  const __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));
      [all …]
D | up7-sse2.c | references in xnn_q8_gavgpool_ukernel_up7__sse2():
      56  const __m128i vzero = _mm_setzero_si128();   (local declaration)
      70  const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
      71  const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
      72  const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
      73  const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
      74  const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
      75  const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
      76  const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
      86  __m128i vacc_lo = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
      87  __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));
      [all …]
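In these gavgpool kernels vzero serves twice: the unpacklo_epi8 lines widen u8 inputs to u16, and lines 68-69 / 86-87 widen the 16-bit row sum once more into two 32-bit halves seeded with a bias. The u16 intermediate cannot overflow, since seven u8 rows sum to at most 7 * 255 = 1785. A sketch of that second widening step, assuming illustrative names:

    #include <emmintrin.h>

    /* Illustrative helper mirroring lines 68-69 / 86-87: split eight u16
       partial sums into two u32x4 accumulators seeded with a bias. */
    static void widen_row_sum(__m128i vsum, __m128i vbias,
                              __m128i* vacc_lo, __m128i* vacc_hi) {
      const __m128i vzero = _mm_setzero_si128();
      /* Zero-extension suffices: the row sum is non-negative by construction. */
      *vacc_lo = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
      *vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));
    }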
/external/libaom/libaom/aom_dsp/x86/

D | sum_squares_avx2.c | references in aom_var_2d_u8_avx2():
     111  __m256i vzero = _mm256_setzero_si256();   (local declaration)
     112  __m256i v_acc_sum = vzero;
     113  __m256i v_acc_sqs = vzero;
     127  __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc[k], vzero);
     128  __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc[k], vzero);
     141  v_acc_sum = vzero;
     142  v_acc_sqs = vzero;
     148  __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc, vzero);
     149  __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc, vzero);
     164  v_acc_sum = vzero;
     [all …]
D | sum_squares_sse2.c | references in aom_var_2d_u8_sse2():
     229  __m128i vzero = _mm_setzero_si128();   (local declaration)
     230  __m128i v_acc_sum = vzero;
     231  __m128i v_acc_sqs = vzero;
     245  __m128i vsrc0 = _mm_unpacklo_epi8(vsrc[k], vzero);
     246  __m128i vsrc1 = _mm_unpackhi_epi8(vsrc[k], vzero);
     259  v_acc_sum = vzero;
     260  v_acc_sqs = vzero;
     266  __m128i vsrc0 = _mm_unpacklo_epi8(vsrc, vzero);
     267  __m128i vsrc1 = _mm_unpackhi_epi8(vsrc, vzero);
     282  v_acc_sum = vzero;
     [all …]
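The aom_var_2d_u8_* functions accumulate a running sum and sum of squares over the widened u16 lanes; the repeated resets of v_acc_sum and v_acc_sqs back to vzero (lines 141-142, 164, 259-260, 282) suggest the accumulators are flushed to wider storage periodically so they cannot overflow. How libaom forms the products is not visible in these snippets; the sketch below is one standard SSE2 way to do it (an assumption, not the library's code), using _mm_madd_epi16, which is safe here because u8-derived lanes never exceed 255:

    #include <emmintrin.h>

    /* Sketch only (not libaom's code): fold 16 pixels into running 32-bit
       sum and sum-of-squares accumulators. The caller must still flush the
       accumulators to 64-bit storage often enough. */
    static void acc_sum_sqs(__m128i vsrc, __m128i* vsum, __m128i* vsqs) {
      const __m128i vzero = _mm_setzero_si128();
      const __m128i vone16 = _mm_set1_epi16(1);
      const __m128i vsrc0 = _mm_unpacklo_epi8(vsrc, vzero);  /* 8 x u16 */
      const __m128i vsrc1 = _mm_unpackhi_epi8(vsrc, vzero);  /* 8 x u16 */
      /* madd: multiply i16 lanes, add adjacent pairs into i32.
         Lanes are <= 255, so 255*255*2 = 130050 fits comfortably. */
      *vsqs = _mm_add_epi32(*vsqs, _mm_madd_epi16(vsrc0, vsrc0));
      *vsqs = _mm_add_epi32(*vsqs, _mm_madd_epi16(vsrc1, vsrc1));
      /* madd against ones = horizontal pair-sum of the pixels. */
      *vsum = _mm_add_epi32(*vsum, _mm_madd_epi16(vsrc0, vone16));
      *vsum = _mm_add_epi32(*vsum, _mm_madd_epi16(vsrc1, vone16));
    }

One AVX2 subtlety worth noting: _mm256_unpacklo_epi8 interleaves within each 128-bit lane, so the widened lanes come out permuted relative to memory order, which is harmless when everything is summed anyway.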
/external/XNNPACK/src/q8-dwconv/

D | up8x9-sse2.c | references in xnn_q8_dwconv_ukernel_up8x9__sse2():
      25  const __m128i vzero = _mm_setzero_si128();   (local declaration)
      47  const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
      49  const __m128i vxk0 = _mm_sub_epi16(_mm_unpacklo_epi8(vk0, vzero), vkernel_zero_point);
      56  const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
      58  const __m128i vxk1 = _mm_sub_epi16(_mm_unpacklo_epi8(vk1, vzero), vkernel_zero_point);
      65  const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
      67  const __m128i vxk2 = _mm_sub_epi16(_mm_unpacklo_epi8(vk2, vzero), vkernel_zero_point);
      74  const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
      76  const __m128i vxk3 = _mm_sub_epi16(_mm_unpacklo_epi8(vk3, vzero), vkernel_zero_point);
      83  const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
      [all …]
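The depthwise kernel recentres the weights while widening: a quantized byte q encodes scale * (q - zero_point), so the kernel zero point is subtracted in 16 bits immediately after the unpack, leaving signed values in [-255, 255]. A sketch of that step with illustrative names (vkernel_zero_point is assumed to be the zero point broadcast to 16-bit lanes, as the snippet implies):

    #include <emmintrin.h>

    /* Illustrative sketch of line 49 and friends: widen kernel bytes and
       recentre them on the quantization zero point in one 16-bit step. */
    static __m128i widen_recentre_u8(__m128i vk, __m128i vkernel_zero_point) {
      const __m128i vzero = _mm_setzero_si128();
      /* Result lanes lie in [-255, 255]; multiplied by a widened input in
         [0, 255] and summed over 9 taps, everything fits easily in 32 bits. */
      return _mm_sub_epi16(_mm_unpacklo_epi8(vk, vzero), vkernel_zero_point);
    }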
/external/XNNPACK/src/q8-gemm/

D | 4x4c2-sse2.c | references in xnn_q8_gemm_ukernel_4x4c2__sse2():
      63  const __m128i vzero = _mm_setzero_si128();   (local declaration)
      67  const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
      70  const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero);
      73  const __m128i vxa2 = _mm_unpacklo_epi8(va2, vzero);
      76  const __m128i vxa3 = _mm_unpacklo_epi8(va3, vzero);
      80  const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
      92  const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
     104  const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
     116  const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
     133  const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
      [all …]
D | 2x4c8-sse2.c | references in xnn_q8_gemm_ukernel_2x4c8__sse2():
      75  const __m128i vzero = _mm_setzero_si128();   (local declaration)
      78  const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
      81  const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero);
      85  const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
      87  const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
      89  const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
      91  const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
/external/XNNPACK/src/q8-igemm/

D | 4x4c2-sse2.c | references in xnn_q8_igemm_ukernel_4x4c2__sse2():
      60  const __m128i vzero = _mm_setzero_si128();   (local declaration)
      84  const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
      87  const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero);
      90  const __m128i vxa2 = _mm_unpacklo_epi8(va2, vzero);
      93  const __m128i vxa3 = _mm_unpacklo_epi8(va3, vzero);
      97  const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
     104  const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
     111  const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
     118  const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
     130  const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
      [all …]
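Both GEMM listings above (q8-gemm and q8-igemm) cut off right before the accumulate step. The snippets show widened activations (vxa*) and widened, zero-point-adjusted weights (vxb*); a natural SSE2 follow-on, and this is an assumption since it is not shown, is _mm_madd_epi16, which multiplies the eight 16-bit lanes and sums adjacent pairs into four 32-bit lanes, i.e. a depth-2 dot product per output lane, consistent with the "c2" in 4x4c2. Sketch:

    #include <emmintrin.h>

    /* Assumed accumulate step (not shown in the listing): vxa holds widened
       activations, vxb widened weights already recentred on vb_zero_point. */
    static __m128i gemm_acc_step(__m128i vacc, __m128i vxa, __m128i vxb) {
      /* i16 x i16 multiply with horizontal pair-add into i32: one depth-2
         dot product per 32-bit lane. Inputs are bounded by 255 in magnitude,
         so each pair sum stays well inside i32. */
      return _mm_add_epi32(vacc, _mm_madd_epi16(vxa, vxb));
    }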
/external/XNNPACK/src/f32-hswish/gen/

D | neonfma-x8.c | references in xnn_f32_hswish_ukernel__neonfma_x8():
      30  const float32x4_t vzero = vdupq_n_f32(0.0f);   (local declaration)
      39  vacc0123 = vmaxq_f32(vacc0123, vzero);
      40  vacc4567 = vmaxq_f32(vacc4567, vzero);
      54  vacc0123 = vmaxq_f32(vacc0123, vzero);
      62  vacc0123 = vmaxq_f32(vacc0123, vzero);
D | psimd-x8.c | references in xnn_f32_hswish_ukernel__psimd_x8():
      30  const psimd_f32 vzero = psimd_splat_f32(0.0f);   (local declaration)
      40  vacc0123 = psimd_max_f32(vacc0123, vzero);
      41  vacc4567 = psimd_max_f32(vacc4567, vzero);
      57  vacc0123 = psimd_max_f32(vacc0123, vzero);
      66  vacc0123 = psimd_max_f32(vacc0123, vzero);
D | neon-x8.c | references in xnn_f32_hswish_ukernel__neon_x8():
      30  const float32x4_t vzero = vdupq_n_f32(0.0f);   (local declaration)
      39  vacc0123 = vmaxq_f32(vacc0123, vzero);
      40  vacc4567 = vmaxq_f32(vacc4567, vzero);
      54  vacc0123 = vmaxq_f32(vacc0123, vzero);
      62  vacc0123 = vmaxq_f32(vacc0123, vzero);
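Every file in this f32-hswish directory uses vzero only to clamp an intermediate at zero. The full formula lies outside the snippets; a common formulation consistent with them (an assumption here) is hswish(x) = x * clamp(x/6 + 1/2, 0, 1), where the vmaxq_f32 lines above are the clamp's lower edge. A NEON sketch; the neonfma variants would use vfmaq_f32 in place of vmlaq_f32:

    #include <arm_neon.h>

    /* Hedged sketch: hswish(x) = x * clamp(x/6 + 0.5, 0, 1). Only the
       vmaxq_f32 clamp below is confirmed by the listing above. */
    static float32x4_t hswish_f32x4(float32x4_t vx) {
      const float32x4_t vsixth = vdupq_n_f32(0x1.555556p-3f);  /* ~1/6 */
      const float32x4_t vhalf  = vdupq_n_f32(0.5f);
      const float32x4_t vone   = vdupq_n_f32(1.0f);
      const float32x4_t vzero  = vdupq_n_f32(0.0f);

      float32x4_t vacc = vmlaq_f32(vhalf, vx, vsixth);  /* x/6 + 0.5 */
      vacc = vmaxq_f32(vacc, vzero);  /* lower clamp: the lines shown above */
      vacc = vminq_f32(vacc, vone);   /* upper clamp */
      return vmulq_f32(vacc, vx);
    }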
D | sse-x8.c | references in xnn_f32_hswish_ukernel__sse_x8():
      30  const __m128 vzero = _mm_setzero_ps();   (local declaration)
      43  vacc0123 = _mm_max_ps(vacc0123, vzero);
      44  vacc4567 = _mm_max_ps(vacc4567, vzero);
      61  vacc0123 = _mm_max_ps(vacc0123, vzero);
      71  vacc0123 = _mm_max_ps(vacc0123, vzero);
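The per-file pattern of max sites also hints at the loop structure: in the x8 variants the clamp appears twice in the unrolled main loop (once per accumulator register), then once in a single-vector loop, then once in the partial-tail path; the x4 variants drop the paired registers. That reading is inferred from the line-number spacing, not shown directly. The same clamp step in SSE form, with constants passed in since the real kernels hoist them out of the loop:

    #include <xmmintrin.h>

    /* Hedged SSE sketch of one hswish step; matches the _mm_max_ps(vacc,
       vzero) lines above, with the rest assumed as in the NEON sketch. */
    static __m128 hswish_step_ps(__m128 vx, __m128 vsixth, __m128 vhalf,
                                 __m128 vone, __m128 vzero) {
      __m128 vacc = _mm_add_ps(_mm_mul_ps(vx, vsixth), vhalf);
      vacc = _mm_max_ps(vacc, vzero);
      vacc = _mm_min_ps(vacc, vone);
      return _mm_mul_ps(vacc, vx);
    }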
D | avx512f-x32.c | references in xnn_f32_hswish_ukernel__avx512f_x32():
      31  const __m512 vzero = _mm512_setzero_ps();   (local declaration)
      41  vacc0123456789ABCDEF = _mm512_max_ps(vacc0123456789ABCDEF, vzero);
      42  vaccGHIJKLMNOPQRSTUV = _mm512_max_ps(vaccGHIJKLMNOPQRSTUV, vzero);
      58  vacc = _mm512_max_ps(vacc, vzero);
      73  vacc = _mm512_max_ps(vacc, vzero);
D | fma3-x16.c | references in xnn_f32_hswish_ukernel__fma3_x16():
      32  const __m256 vzero = _mm256_setzero_ps();   (local declaration)
      42  vacc01234567 = _mm256_max_ps(vacc01234567, vzero);
      43  vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEF, vzero);
      59  vacc = _mm256_max_ps(vacc, vzero);
      72  vacc = _mm256_max_ps(vacc, vzero);
D | avx-x16.c | references in xnn_f32_hswish_ukernel__avx_x16():
      32  const __m256 vzero = _mm256_setzero_ps();   (local declaration)
      45  vacc01234567 = _mm256_max_ps(vacc01234567, vzero);
      46  vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEF, vzero);
      63  vacc = _mm256_max_ps(vacc, vzero);
      77  vacc = _mm256_max_ps(vacc, vzero);
D | psimd-x4.c | references in xnn_f32_hswish_ukernel__psimd_x4():
      30  const psimd_f32 vzero = psimd_splat_f32(0.0f);   (local declaration)
      38  vacc0123 = psimd_max_f32(vacc0123, vzero);
      51  vacc0123 = psimd_max_f32(vacc0123, vzero);
      60  vacc0123 = psimd_max_f32(vacc0123, vzero);
D | avx512f-x16.c | references in xnn_f32_hswish_ukernel__avx512f_x16():
      31  const __m512 vzero = _mm512_setzero_ps();   (local declaration)
      39  vacc0123456789ABCDEF = _mm512_max_ps(vacc0123456789ABCDEF, vzero);
      52  vacc = _mm512_max_ps(vacc, vzero);
      67  vacc = _mm512_max_ps(vacc, vzero);
D | neon-x4.c | references in xnn_f32_hswish_ukernel__neon_x4():
      30  const float32x4_t vzero = vdupq_n_f32(0.0f);   (local declaration)
      37  vacc0123 = vmaxq_f32(vacc0123, vzero);
      48  vacc0123 = vmaxq_f32(vacc0123, vzero);
      56  vacc0123 = vmaxq_f32(vacc0123, vzero);
D | neonfma-x4.c | references in xnn_f32_hswish_ukernel__neonfma_x4():
      30  const float32x4_t vzero = vdupq_n_f32(0.0f);   (local declaration)
      37  vacc0123 = vmaxq_f32(vacc0123, vzero);
      48  vacc0123 = vmaxq_f32(vacc0123, vzero);
      56  vacc0123 = vmaxq_f32(vacc0123, vzero);
D | sse-x4.c | references in xnn_f32_hswish_ukernel__sse_x4():
      30  const __m128 vzero = _mm_setzero_ps();   (local declaration)
      40  vacc0123 = _mm_max_ps(vacc0123, vzero);
      54  vacc0123 = _mm_max_ps(vacc0123, vzero);
      64  vacc0123 = _mm_max_ps(vacc0123, vzero);
D | fma3-x8.c | references in xnn_f32_hswish_ukernel__fma3_x8():
      32  const __m256 vzero = _mm256_setzero_ps();   (local declaration)
      40  vacc01234567 = _mm256_max_ps(vacc01234567, vzero);
      53  vacc = _mm256_max_ps(vacc, vzero);
      66  vacc = _mm256_max_ps(vacc, vzero);
D | avx-x8.c | references in xnn_f32_hswish_ukernel__avx_x8():
      32  const __m256 vzero = _mm256_setzero_ps();   (local declaration)
      42  vacc01234567 = _mm256_max_ps(vacc01234567, vzero);
      56  vacc = _mm256_max_ps(vacc, vzero);
      70  vacc = _mm256_max_ps(vacc, vzero);
/external/XNNPACK/src/q8-vadd/

D | sse2.c | references in xnn_q8_vadd_ukernel__sse2():
      32  const __m128i vzero = _mm_setzero_si128();   (local declaration)
      39  const __m128i vxa = _mm_unpacklo_epi8(va, vzero);
      40  const __m128i vxb = _mm_unpacklo_epi8(vb, vzero);
      81  const __m128i vxa = _mm_unpacklo_epi8(va, vzero);
      82  const __m128i vxb = _mm_unpacklo_epi8(vb, vzero);
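The q8 add widens both operands the same way, and everything after that (per-operand scaling to the output scale, rounding, renarrowing) falls outside the snippet. On SSE2 any 16x16 -> 32-bit multiply such a kernel needs is assembled from a mullo/mulhi pair, since no single instruction returns the full product. A hedged sketch of that idiom, not the kernel's exact code:

    #include <emmintrin.h>

    /* Standard SSE2 idiom (an assumption here, not taken from sse2.c):
       full 16x16 -> 32-bit unsigned products from a mullo/mulhi pair. */
    static void widening_mul_u16(__m128i vx, __m128i vmul,
                                 __m128i* vprod_lo, __m128i* vprod_hi) {
      const __m128i vlo = _mm_mullo_epi16(vx, vmul);  /* low 16 bits  */
      const __m128i vhi = _mm_mulhi_epu16(vx, vmul);  /* high 16 bits */
      /* Re-interleave the halves into 32-bit lanes: lane = lo | hi << 16. */
      *vprod_lo = _mm_unpacklo_epi16(vlo, vhi);
      *vprod_hi = _mm_unpackhi_epi16(vlo, vhi);
    }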