/external/XNNPACK/src/qu8-vadd/ |
D | minmax-sse2.c |
   40  const __m128i vxb = _mm_unpacklo_epi8(vb, vzero);  in xnn_qu8_vadd_minmax_ukernel__sse2() local
   47  const __m128i vb_product_lo = _mm_mullo_epi16(vxb, vb_multiplier_lo);  in xnn_qu8_vadd_minmax_ukernel__sse2()
   49  _mm_add_epi16(_mm_mulhi_epu16(vxb, vb_multiplier_lo), _mm_mullo_epi16(vxb, vb_multiplier_hi));  in xnn_qu8_vadd_minmax_ukernel__sse2()
   82  const __m128i vxb = _mm_unpacklo_epi8(vb, vzero);  in xnn_qu8_vadd_minmax_ukernel__sse2() local
   89  const __m128i vb_product_lo = _mm_mullo_epi16(vxb, vb_multiplier_lo);  in xnn_qu8_vadd_minmax_ukernel__sse2()
   91  _mm_add_epi16(_mm_mulhi_epu16(vxb, vb_multiplier_lo), _mm_mullo_epi16(vxb, vb_multiplier_hi));  in xnn_qu8_vadd_minmax_ukernel__sse2()
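These hits are the QU8 vector-add kernel's widening step: SSE2 has no 8-to-16-bit conversion, so vb is zero-extended by interleaving with a zero register, and the 32-bit product with the per-operand multiplier is assembled from 16-bit low/high partial products. A minimal standalone sketch of that pattern (the helper name and scalar arguments are illustrative, not the actual xnn_qu8_vadd_minmax_ukernel__sse2 signature):

#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>

/* Zero-extend the low 8 uint8 lanes of b to uint16 and multiply by a 32-bit
   multiplier, producing the low and high 16-bit halves of each 32-bit product
   separately (the kernel interleaves them into 32-bit accumulators later). */
static void widen_and_multiply(const uint8_t b[16], uint32_t multiplier,
                               __m128i* vb_product_lo, __m128i* vb_product_hi) {
  const __m128i vzero = _mm_setzero_si128();
  const __m128i vb = _mm_loadu_si128((const __m128i*) b);
  /* Interleaving with zero turns the low 8 bytes into 8 uint16 lanes. */
  const __m128i vxb = _mm_unpacklo_epi8(vb, vzero);

  const __m128i vb_multiplier_lo = _mm_set1_epi16((int16_t) (multiplier & 0xFFFF));
  const __m128i vb_multiplier_hi = _mm_set1_epi16((int16_t) (multiplier >> 16));

  /* 16x32-bit product built from partial products:
     low half  = mullo(vxb, m_lo)
     high half = mulhi(vxb, m_lo) + mullo(vxb, m_hi) */
  *vb_product_lo = _mm_mullo_epi16(vxb, vb_multiplier_lo);
  *vb_product_hi = _mm_add_epi16(_mm_mulhi_epu16(vxb, vb_multiplier_lo),
                                 _mm_mullo_epi16(vxb, vb_multiplier_hi));
}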
D | minmax-neon.c |
  156  const int16x8_t vxb = vreinterpretq_s16_u16(vsubl_u8(vb, vb_zero_point));  in xnn_qu8_vadd_minmax_ukernel__neon() local
  166  vacc_lo = vmlaq_s32(vacc_lo, vmovl_s16(vget_low_s16(vxb)), vb_multiplier);  in xnn_qu8_vadd_minmax_ukernel__neon()
  168  vacc_hi = vmlaq_s32(vacc_hi, vmovl_high_s16(vxb), vb_multiplier);  in xnn_qu8_vadd_minmax_ukernel__neon()
  170  vacc_hi = vmlaq_s32(vacc_hi, vmovl_s16(vget_high_s16(vxb)), vb_multiplier);  in xnn_qu8_vadd_minmax_ukernel__neon()
  199  const int16x8_t vxb = vreinterpretq_s16_u16(vsubl_u8(vb, vb_zero_point));  in xnn_qu8_vadd_minmax_ukernel__neon() local
  209  vacc_lo = vmlaq_s32(vacc_lo, vmovl_s16(vget_low_s16(vxb)), vb_multiplier);  in xnn_qu8_vadd_minmax_ukernel__neon()
  211  vacc_hi = vmlaq_s32(vacc_hi, vmovl_high_s16(vxb), vb_multiplier);  in xnn_qu8_vadd_minmax_ukernel__neon()
  213  vacc_hi = vmlaq_s32(vacc_hi, vmovl_s16(vget_high_s16(vxb)), vb_multiplier);  in xnn_qu8_vadd_minmax_ukernel__neon()
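The NEON counterpart folds the zero-point subtraction into the widening: vsubl_u8 subtracts and widens to 16 bits in one step, then each half is widened again to 32 bits and multiply-accumulated. A hedged sketch of just that step (helper name and arguments are made up for illustration; the A64-only vmovl_high_s16 is replaced by the portable vget_high_s16 form, matching lines 170/213 above):

#include <arm_neon.h>
#include <stdint.h>

/* Subtract the zero point while widening uint8 -> int16, widen again to int32
   and multiply-accumulate into the two accumulator halves. */
static void accumulate_b(const uint8_t* b, uint8_t zero_point, int32_t multiplier,
                         int32x4_t* vacc_lo, int32x4_t* vacc_hi) {
  const uint8x8_t vb = vld1_u8(b);
  const uint8x8_t vb_zero_point = vdup_n_u8(zero_point);
  const int32x4_t vb_multiplier = vdupq_n_s32(multiplier);

  /* (b - zero_point), widened to 16-bit lanes and reinterpreted as signed. */
  const int16x8_t vxb = vreinterpretq_s16_u16(vsubl_u8(vb, vb_zero_point));

  /* acc += (int32) vxb * multiplier, low and high halves separately. */
  *vacc_lo = vmlaq_s32(*vacc_lo, vmovl_s16(vget_low_s16(vxb)), vb_multiplier);
  *vacc_hi = vmlaq_s32(*vacc_hi, vmovl_s16(vget_high_s16(vxb)), vb_multiplier);
}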
/external/XNNPACK/src/qs8-igemm/ |
D | neon-mlal-lane.c.in |
   83  const int16x8_t vxb${ABC[N:N+8]}c${K} = vmovl_s8(vb${ABC[N:N+8]}c${K});
   86  …vacc${M}x${ABC[N:N+4]} = vmlal_lane_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vxb${ABC[N:N+8]}c${K}…
   87  …vacc${M}x${ABC[N+4:N+8]} = vmlal_lane_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vxb${ABC[N:N+8]}…
   92  const int16x8_t vxb${ABC[N:N+8]}c${K} = vmovl_s8(vb${ABC[N:N+8]}c${K});
   95  …vacc${M}x${ABC[N:N+4]} = vmlal_lane_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vxb${ABC[N:N+8]}c${K}…
   96  …vacc${M}x${ABC[N+4:N+8]} = vmlal_lane_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vxb${ABC[N:N+8]}…
  107  const int16x8_t vxb${ABC[N:N+8]}c0 = vmovl_s8(vb${ABC[N:N+8]}c0);
  111  …vacc${M}x${ABC[N:N+4]} = vmlal_lane_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vxb${ABC[N:N+8]}c0), …
  112  …vacc${M}x${ABC[N+4:N+8]} = vmlal_lane_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vxb${ABC[N:N+8]}…
  117  const int16x8_t vxb${ABC[N:N+8]}c1 = vmovl_s8(vb${ABC[N:N+8]}c1);
  [all …]
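This template widens signed 8-bit weights with vmovl_s8 and accumulates them against one broadcast lane of the already-widened activation using vmlal_lane_s16. A simplified single-row, lane-0 sketch (the gemm_step name is illustrative; the real template is generated with the ${M}/${N}/${K} substitutions shown above):

#include <arm_neon.h>
#include <stdint.h>

/* One K-step for one output row: widen 8 int8 weights to int16, then
   multiply-accumulate them against lane 0 of the widened activation into
   two int32x4 accumulators (columns 0-3 and 4-7). */
static void gemm_step(int16x8_t vxa0, const int8_t* w,
                      int32x4_t* vacc0x0123, int32x4_t* vacc0x4567) {
  const int8x8_t vb01234567c0 = vld1_s8(w);
  const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
  *vacc0x0123 = vmlal_lane_s16(*vacc0x0123, vget_low_s16(vxb01234567c0),
                               vget_low_s16(vxa0), 0);
  *vacc0x4567 = vmlal_lane_s16(*vacc0x4567, vget_high_s16(vxb01234567c0),
                               vget_low_s16(vxa0), 0);
}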
D | MRx4c8-wasmsimd.c.in |
   90  const v128_t vxb${N} = wasm_i16x8_widen_low_i8x16(vb${N}${N+1});
   91  const v128_t vxb${N+1} = wasm_i16x8_widen_high_i8x16(vb${N}${N+1});
   94  const v128_t vprod${M}x${N} = wasm_i16x8_mul(vxb${N}, vxa${M});
   98  const v128_t vprod${M}x${N+1} = wasm_i16x8_mul(vxb${N+1}, vxa${M});
  108  const v128_t vxb${N} = wasm_i16x8_load_8x8(w);
  110  …const v128_t vxb${N} = wasm_i16x8_load_8x8((const void*) ((uintptr_t) w + ${N * 8} * sizeof(int8_t…
  113  const v128_t vxb${N} = wasm_v128_load(w);
  115  … const v128_t vxb${N} = wasm_v128_load((const void*) ((uintptr_t) w + ${N * 8} * sizeof(int16_t)));
  118  const v128_t vprod${M}x${N} = wasm_i16x8_mul(vxa${M}, vxb${N});
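The WAsm SIMD template has two widening paths: split a packed int8x16 load into widened low and high halves, or sign-extend 8 bytes directly with the load_8x8 form. A sketch of the first path, assuming current wasm_simd128.h spellings (newer headers name the widening intrinsics wasm_i16x8_extend_*_i8x16; the template above uses the older widen_* names for the same operations):

#include <wasm_simd128.h>
#include <stdint.h>

/* Widen a packed pair of 8-wide int8 weight columns into two int16x8 vectors
   and form the 16-bit products with the widened activation. */
static void widen_and_multiply(v128_t vxa0, const int8_t* w,
                               v128_t* vprod0x0, v128_t* vprod0x1) {
  const v128_t vb01 = wasm_v128_load(w);
  const v128_t vxb0 = wasm_i16x8_extend_low_i8x16(vb01);
  const v128_t vxb1 = wasm_i16x8_extend_high_i8x16(vb01);
  *vprod0x0 = wasm_i16x8_mul(vxb0, vxa0);
  *vprod0x1 = wasm_i16x8_mul(vxb1, vxa0);
}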
D | MRx4c8-sse.c.in |
  101  const __m128i vxb${N} = _mm_unpacklo_epi8(vb${N}${N+1}, vsb${N}${N+1});
  102  const __m128i vxb${N+1} = _mm_unpackhi_epi8(vb${N}${N+1}, vsb${N}${N+1});
  106  vacc${M}x${N} = _mm_maddd_epi16(vxa${M}, vxb${N}, vacc${M}x${N});
  107  vacc${M}x${N+1} = _mm_maddd_epi16(vxa${M}, vxb${N+1}, vacc${M}x${N+1});
  109  vacc${M}x${N} = _mm_add_epi32(vacc${M}x${N}, _mm_madd_epi16(vxa${M}, vxb${N}));
  110  … vacc${M}x${N+1} = _mm_add_epi32(vacc${M}x${N+1}, _mm_madd_epi16(vxa${M}, vxb${N+1}));
  118  const __m128i vxb${N} = _mm_cvtepi8_epi16(vb${N});
  120  … const __m128i vxb${N} = _mm_unpacklo_epi8(vb${N}, _mm_cmpgt_epi8(_mm_setzero_si128(), vb${N}));
  124  vacc${M}x${N} = _mm_maddd_epi16(vxa${M}, vxb${N}, vacc${M}x${N});
  126  vacc${M}x${N} = _mm_add_epi32(vacc${M}x${N}, _mm_madd_epi16(vxa${M}, vxb${N}));
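The c8 SSE template sign-extends the int8 weights either with SSE4.1 _mm_cvtepi8_epi16 or, on plain SSE2, by interleaving with a computed sign mask, and then accumulates pairwise products with _mm_madd_epi16 (the XOP build fuses the add into _mm_maddd_epi16). A sketch of the non-XOP path for a single accumulator (the accumulate name is illustrative):

#include <emmintrin.h>   /* SSE2 */
#ifdef __SSE4_1__
#include <smmintrin.h>   /* _mm_cvtepi8_epi16 */
#endif
#include <stdint.h>

/* Sign-extend 8 int8 weights to int16 and fold the pairwise products with the
   widened activation into the int32 accumulator. */
static __m128i accumulate(__m128i vacc0x0, __m128i vxa0, const int8_t* w) {
  const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
#ifdef __SSE4_1__
  const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
#else
  /* SSE2: interleave with the sign mask (0xFF bytes where vb0 < 0). */
  const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
#endif
  return _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
}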
D | MRx4c2-sse.c.in |
   99  const __m128i vxb${K} = _mm_unpacklo_epi8(vb${K}${K+1}, vsb${K}${K+1});
  100  const __m128i vxb${K+1} = _mm_unpackhi_epi8(vb${K}${K+1}, vsb${K}${K+1});
  105  … _mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(${K}, ${K}, ${K}, ${K})), vxb${K}, vacc${M}x0123);
  108  … _mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(${K}, ${K}, ${K}, ${K})), vxb${K}));
  113  …_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(${K+1}, ${K+1}, ${K+1}, ${K+1})), vxb${K+1}, vacc${M}x0123);
  116  …mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(${K+1}, ${K+1}, ${K+1}, ${K+1})), vxb${K+1}));
  124  const __m128i vxb${K} = _mm_cvtepi8_epi16(vb${K});
  126  … const __m128i vxb${K} = _mm_unpacklo_epi8(vb${K}, _mm_cmpgt_epi8(_mm_setzero_si128(), vb${K}));
  131  … _mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(${K}, ${K}, ${K}, ${K})), vxb${K}, vacc${M}x0123);
  134  … _mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(${K}, ${K}, ${K}, ${K})), vxb${K}));
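The c2 variant differs in how the activation is fed in: each K-step broadcasts one 32-bit group of the widened activation (two int16 values) across the register with _mm_shuffle_epi32 before the multiply-add, so every K-step updates the same vacc${M}x0123 accumulator. A sketch of one K=0 step under those assumptions (the accumulate_k0 name is illustrative):

#include <emmintrin.h>  /* SSE2 */

/* Broadcast 32-bit group 0 of the widened activation to all four positions,
   then add the pairwise int16 x int16 -> int32 products with each column's
   pair of weights into the row accumulator. */
static __m128i accumulate_k0(__m128i vacc0x0123, __m128i vxa0, __m128i vxb0) {
  return _mm_add_epi32(vacc0x0123,
      _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
}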
D | MRx8c8-avx2.c.in |
   92  const __m256i vxb${N}${N+1} = _mm256_cvtepi8_epi16(vb${N}${N+1});
   95  …${M}x${N}${N+1} = _mm256_add_epi32(vacc${M}x${N}${N+1}, _mm256_madd_epi16(vxa${M}, vxb${N}${N+1}));
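The AVX2 template does the same widen-then-madd step on 256-bit registers: one _mm256_cvtepi8_epi16 covers two 8-wide weight columns at once. A minimal sketch (the accumulate name is illustrative):

#include <immintrin.h>  /* AVX2 */
#include <stdint.h>

/* Widen 16 int8 weights (two 8-wide columns) to int16 in one ymm register and
   accumulate the pairwise products with the widened activation. */
static __m256i accumulate(__m256i vacc0x01, __m256i vxa0, const int8_t* w) {
  const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
  const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
  return _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
}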
/external/XNNPACK/src/qs8-gemm/ |
D | neon-mlal-lane.c.in |
   75  const int16x8_t vxb${ABC[N:N+8]}c${K} = vmovl_s8(vb${ABC[N:N+8]}c${K});
   78  …vacc${M}x${ABC[N:N+4]} = vmlal_lane_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vxb${ABC[N:N+8]}c${K}…
   79  …vacc${M}x${ABC[N+4:N+8]} = vmlal_lane_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vxb${ABC[N:N+8]}…
   84  const int16x8_t vxb${ABC[N:N+8]}c${K} = vmovl_s8(vb${ABC[N:N+8]}c${K});
   87  …vacc${M}x${ABC[N:N+4]} = vmlal_lane_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vxb${ABC[N:N+8]}c${K}…
   88  …vacc${M}x${ABC[N+4:N+8]} = vmlal_lane_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vxb${ABC[N:N+8]}…
   99  const int16x8_t vxb${ABC[N:N+8]}c0 = vmovl_s8(vb${ABC[N:N+8]}c0);
  103  …vacc${M}x${ABC[N:N+4]} = vmlal_lane_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vxb${ABC[N:N+8]}c0), …
  104  …vacc${M}x${ABC[N+4:N+8]} = vmlal_lane_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vxb${ABC[N:N+8]}…
  109  const int16x8_t vxb${ABC[N:N+8]}c1 = vmovl_s8(vb${ABC[N:N+8]}c1);
  [all …]
D | MRx4c8-wasmsimd.c.in |
   82  const v128_t vxb${N} = wasm_i16x8_widen_low_i8x16(vb${N}${N+1});
   83  const v128_t vxb${N+1} = wasm_i16x8_widen_high_i8x16(vb${N}${N+1});
   86  const v128_t vprod${M}x${N} = wasm_i16x8_mul(vxb${N}, vxa${M});
   90  const v128_t vprod${M}x${N+1} = wasm_i16x8_mul(vxb${N+1}, vxa${M});
  100  const v128_t vxb${N} = wasm_i16x8_load_8x8(w);
  102  …const v128_t vxb${N} = wasm_i16x8_load_8x8((const void*) ((uintptr_t) w + ${N * 8} * sizeof(int8_t…
  105  const v128_t vxb${N} = wasm_v128_load(w);
  107  … const v128_t vxb${N} = wasm_v128_load((const void*) ((uintptr_t) w + ${N * 8} * sizeof(int16_t)));
  110  const v128_t vprod${M}x${N} = wasm_i16x8_mul(vxa${M}, vxb${N});
D | MRx4c8-sse.c.in |
   96  const __m128i vxb${N} = _mm_unpacklo_epi8(vb${N}${N+1}, vsb${N}${N+1});
   97  const __m128i vxb${N+1} = _mm_unpackhi_epi8(vb${N}${N+1}, vsb${N}${N+1});
  101  vacc${M}x${N} = _mm_maddd_epi16(vxa${M}, vxb${N}, vacc${M}x${N});
  102  vacc${M}x${N+1} = _mm_maddd_epi16(vxa${M}, vxb${N+1}, vacc${M}x${N+1});
  104  vacc${M}x${N} = _mm_add_epi32(vacc${M}x${N}, _mm_madd_epi16(vxa${M}, vxb${N}));
  105  vacc${M}x${N+1} = _mm_add_epi32(vacc${M}x${N+1}, _mm_madd_epi16(vxa${M}, vxb${N+1}));
  114  const __m128i vxb${N} = _mm_cvtepi8_epi16(vb${N});
  116  … const __m128i vxb${N} = _mm_unpacklo_epi8(vb${N}, _mm_cmpgt_epi8(_mm_setzero_si128(), vb${N}));
  119  const __m128i vxb${N} = _mm_load_si128((const __m128i*) w);
  121  …const __m128i vxb${N} = _mm_load_si128((const __m128i*) ((uintptr_t) w + ${N * 8} * sizeof(int16_t…
  [all …]
D | MRx4c2-sse.c.in |
   94  const __m128i vxb${K} = _mm_unpacklo_epi8(vb${K}${K+1}, vsb${K}${K+1});
   95  const __m128i vxb${K+1} = _mm_unpackhi_epi8(vb${K}${K+1}, vsb${K}${K+1});
  100  … _mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(${K}, ${K}, ${K}, ${K})), vxb${K}, vacc${M}x0123);
  103  … _mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(${K}, ${K}, ${K}, ${K})), vxb${K}));
  108  …_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(${K+1}, ${K+1}, ${K+1}, ${K+1})), vxb${K+1}, vacc${M}x0123);
  111  …mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(${K+1}, ${K+1}, ${K+1}, ${K+1})), vxb${K+1}));
  120  const __m128i vxb${K} = _mm_cvtepi8_epi16(vb${K});
  122  … const __m128i vxb${K} = _mm_unpacklo_epi8(vb${K}, _mm_cmpgt_epi8(_mm_setzero_si128(), vb${K}));
  125  const __m128i vxb${K} = _mm_load_si128((const __m128i*) w);
  127  …const __m128i vxb${K} = _mm_load_si128((const __m128i*) ((uintptr_t) w + ${K * 8} * sizeof(int16_t…
  [all …]
D | MRx8c8-avx2.c.in |
   84  const __m256i vxb${N}${N+1} = _mm256_load_si256((const __m256i*) w);
   86  …const __m256i vxb${N}${N+1} = _mm256_load_si256((const __m256i*) ((uintptr_t) w + ${N * 8} * sizeo…
   92  const __m256i vxb${N}${N+1} = _mm256_cvtepi8_epi16(vb${N}${N+1});
   95  …${M}x${N}${N+1} = _mm256_add_epi32(vacc${M}x${N}${N+1}, _mm256_madd_epi16(vxa${M}, vxb${N}${N+1}));
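Compared with the igemm entry above, the GEMM templates also show an alternative load path (lines 84/86 here, and the _mm_load_si128 lines in the SSE templates) where the weights appear to be packed as already-widened int16 values, so the widening conversion disappears and only an aligned load plus madd remains. A hedged sketch of that pre-widened path (the accumulate_xw name and the int16 weight layout are assumptions drawn from these snippets, not a confirmed description of XNNPACK's packing):

#include <immintrin.h>  /* AVX2 */
#include <stdint.h>

/* Pre-widened weights path (assumption): packing already stored the weights as
   int16, so the kernel needs only a 32-byte-aligned load before the madd,
   instead of a 128-bit load plus _mm256_cvtepi8_epi16. */
static __m256i accumulate_xw(__m256i vacc0x01, __m256i vxa0, const int16_t* w) {
  const __m256i vxb01 = _mm256_load_si256((const __m256i*) w);  /* w: 32-byte aligned */
  return _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
}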