Searched refs: va0 (Results 1 – 25 of 847), sorted by relevance

/external/XNNPACK/src/f32-gemm/gen/
1x8s4-minmax-neon.c
48 float32x4_t va0 = vld1q_f32(a0); a0 += 4; in xnn_f32_gemm_minmax_ukernel_1x8s4__neon() local
54 vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c0); in xnn_f32_gemm_minmax_ukernel_1x8s4__neon()
55 vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c0); in xnn_f32_gemm_minmax_ukernel_1x8s4__neon()
57 va0 = vextq_f32(va0, va0, 1); in xnn_f32_gemm_minmax_ukernel_1x8s4__neon()
62 vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c1); in xnn_f32_gemm_minmax_ukernel_1x8s4__neon()
63 vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c1); in xnn_f32_gemm_minmax_ukernel_1x8s4__neon()
65 va0 = vextq_f32(va0, va0, 1); in xnn_f32_gemm_minmax_ukernel_1x8s4__neon()
70 vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c2); in xnn_f32_gemm_minmax_ukernel_1x8s4__neon()
71 vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c2); in xnn_f32_gemm_minmax_ukernel_1x8s4__neon()
73 va0 = vextq_f32(va0, va0, 1); in xnn_f32_gemm_minmax_ukernel_1x8s4__neon()
[all …]
1x8s4-minmax-neonfma.c
48 float32x4_t va0 = vld1q_f32(a0); a0 += 4; in xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma() local
54 vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0); in xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma()
55 vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0); in xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma()
57 va0 = vextq_f32(va0, va0, 1); in xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma()
62 vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1); in xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma()
63 vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1); in xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma()
65 va0 = vextq_f32(va0, va0, 1); in xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma()
70 vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2); in xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma()
71 vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2); in xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma()
73 va0 = vextq_f32(va0, va0, 1); in xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma()
[all …]
1x8s4-minmax-wasmsimd-x86.c
48 v128_t va0 = wasm_v128_load(a0); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86() local
55 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86()
56 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86()
58 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86()
63 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86()
64 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86()
66 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86()
71 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86()
72 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86()
74 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86()
[all …]
1x8s4-minmax-sse.c
48 __m128 va0 = _mm_loadu_ps(a0); in xnn_f32_gemm_minmax_ukernel_1x8s4__sse() local
55 vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_1x8s4__sse()
56 vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_1x8s4__sse()
58 va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemm_minmax_ukernel_1x8s4__sse()
63 vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_1x8s4__sse()
64 vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_1x8s4__sse()
66 va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemm_minmax_ukernel_1x8s4__sse()
71 vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_1x8s4__sse()
72 vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_1x8s4__sse()
74 va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemm_minmax_ukernel_1x8s4__sse()
[all …]
1x8s4-minmax-wasmsimd-arm.c
50 v128_t va0 = wasm_v128_load(a0); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm() local
57 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm()
58 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm()
60 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm()
65 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm()
66 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm()
68 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm()
73 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm()
74 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm()
76 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm()
[all …]
1x16s4-minmax-fma3-broadcast.c
48 __m256 va0 = _mm256_broadcast_ps((const __m128*) a0); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast() local
55 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c0, vacc0x01234567); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast()
56 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast()
58 va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast()
63 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c1, vacc0x01234567); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast()
64 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast()
66 va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast()
71 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast()
72 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast()
74 va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast()
[all …]
3x8s4-minmax-wasmsimd-x86.c
64 v128_t va0 = wasm_v128_load(a0); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86() local
75 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86()
78 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86()
82 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86()
89 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86()
92 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86()
96 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86()
103 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86()
106 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86()
110 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86()
[all …]
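
All of the f32-gemm hits above follow the same "s4" shift pattern: one 4-wide register of A values (va0) is multiplied against four pre-shuffled panels of packed weights, with va0 rotated by one lane (vextq_f32 / _mm_shuffle_ps / wasm_v32x4_shuffle 1,2,3,0) between the FMA blocks. The scalar sketch below emulates that pattern for a single 1x8 tile; the direct b[kk*8 + n] indexing stands in for XNNPACK's packed weight panels and is an illustrative assumption, not the library's real packing or API.

```c
#include <stddef.h>

/* Scalar emulation of the "1x8s4" step: load four A values, run one FMA block
 * per shift against the weight panels, and rotate the A register by one lane
 * between blocks. b is assumed to be the K x 8 weight slice in row-major order,
 * which stands in for the pre-packed vb0123c0..3 / vb4567c0..3 panels. */
static void gemm_1x8s4_step(const float *a, const float *b, float acc[8], size_t k) {
  float va[4] = { a[k], a[k + 1], a[k + 2], a[k + 3] };   /* vld1q_f32(a0) */
  for (int shift = 0; shift < 4; shift++) {               /* the c0..c3 blocks */
    for (int lane = 0; lane < 4; lane++) {
      size_t kk = k + (size_t) ((lane + shift) % 4);      /* K element held in this lane */
      acc[lane]     += va[lane] * b[kk * 8 + lane];       /* vacc0x0123 */
      acc[lane + 4] += va[lane] * b[kk * 8 + lane + 4];   /* vacc0x4567 */
    }
    /* Rotate va left by one lane, as vextq_f32(va0, va0, 1) does. */
    float t = va[0]; va[0] = va[1]; va[1] = va[2]; va[2] = va[3]; va[3] = t;
  }
}
```

In the generated kernels this step runs in a loop that advances k by 4, with a separate scalar tail for the remaining K % 4 elements.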
/external/XNNPACK/src/f32-gemm/gen-inc/
1x8s4inc-minmax-wasmsimd-arm.c
52 v128_t va0 = wasm_v128_load(a0); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_arm() local
59 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_arm()
60 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_arm()
62 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_arm()
67 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_arm()
68 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_arm()
70 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_arm()
75 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_arm()
76 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_arm()
78 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_arm()
[all …]
1x8s4inc-minmax-neon.c
50 float32x4_t va0 = vld1q_f32(a0); a0 += 4; in xnn_f32_gemminc_minmax_ukernel_1x8s4__neon() local
56 vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c0); in xnn_f32_gemminc_minmax_ukernel_1x8s4__neon()
57 vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c0); in xnn_f32_gemminc_minmax_ukernel_1x8s4__neon()
59 va0 = vextq_f32(va0, va0, 1); in xnn_f32_gemminc_minmax_ukernel_1x8s4__neon()
64 vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c1); in xnn_f32_gemminc_minmax_ukernel_1x8s4__neon()
65 vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c1); in xnn_f32_gemminc_minmax_ukernel_1x8s4__neon()
67 va0 = vextq_f32(va0, va0, 1); in xnn_f32_gemminc_minmax_ukernel_1x8s4__neon()
72 vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c2); in xnn_f32_gemminc_minmax_ukernel_1x8s4__neon()
73 vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c2); in xnn_f32_gemminc_minmax_ukernel_1x8s4__neon()
75 va0 = vextq_f32(va0, va0, 1); in xnn_f32_gemminc_minmax_ukernel_1x8s4__neon()
[all …]
1x8s4inc-minmax-neonfma.c
50 float32x4_t va0 = vld1q_f32(a0); a0 += 4; in xnn_f32_gemminc_minmax_ukernel_1x8s4__neonfma() local
56 vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0); in xnn_f32_gemminc_minmax_ukernel_1x8s4__neonfma()
57 vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0); in xnn_f32_gemminc_minmax_ukernel_1x8s4__neonfma()
59 va0 = vextq_f32(va0, va0, 1); in xnn_f32_gemminc_minmax_ukernel_1x8s4__neonfma()
64 vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1); in xnn_f32_gemminc_minmax_ukernel_1x8s4__neonfma()
65 vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1); in xnn_f32_gemminc_minmax_ukernel_1x8s4__neonfma()
67 va0 = vextq_f32(va0, va0, 1); in xnn_f32_gemminc_minmax_ukernel_1x8s4__neonfma()
72 vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2); in xnn_f32_gemminc_minmax_ukernel_1x8s4__neonfma()
73 vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2); in xnn_f32_gemminc_minmax_ukernel_1x8s4__neonfma()
75 va0 = vextq_f32(va0, va0, 1); in xnn_f32_gemminc_minmax_ukernel_1x8s4__neonfma()
[all …]
1x8s4inc-minmax-sse.c
50 __m128 va0 = _mm_loadu_ps(a0); in xnn_f32_gemminc_minmax_ukernel_1x8s4__sse() local
57 vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__sse()
58 vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__sse()
60 va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__sse()
65 vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__sse()
66 vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__sse()
68 va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__sse()
73 vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__sse()
74 vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__sse()
76 va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__sse()
[all …]
1x8s4inc-minmax-wasmsimd-x86.c
50 v128_t va0 = wasm_v128_load(a0); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_x86() local
57 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_x86()
58 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_x86()
60 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_x86()
65 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_x86()
66 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_x86()
68 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_x86()
73 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_x86()
74 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_x86()
76 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_x86()
[all …]
1x16s4inc-minmax-fma3-broadcast.c
50 __m256 va0 = _mm256_broadcast_ps((const __m128*) a0); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast() local
57 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c0, vacc0x01234567); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast()
58 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast()
60 va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast()
65 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c1, vacc0x01234567); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast()
66 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast()
68 va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast()
73 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast()
74 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast()
76 va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast()
[all …]
3x8s4inc-minmax-wasmsimd-x86.c
66 v128_t va0 = wasm_v128_load(a0); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86() local
77 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86()
80 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86()
84 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86()
91 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86()
94 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86()
98 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86()
105 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86()
108 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86()
112 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86()
[all …]
3x8s4inc-minmax-sse.c
66 __m128 va0 = _mm_loadu_ps(a0); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse() local
77 vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse()
80 vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse()
84 va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse()
91 vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse()
94 vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse()
98 va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse()
105 vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse()
108 vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse()
112 va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse()
[all …]
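
The gen-inc hits show the same inner loop as the gen ones; the difference worth noting is how the accumulators start. As far as these kernels' naming convention goes, the GEMMINC variants resume from a caller-supplied partial-sum buffer instead of the packed bias, which lets a long K dimension be split across several passes. The sketch below only illustrates that initialization difference; the names bias and acc and the 8-wide tile are assumptions, not XNNPACK's exact signatures.

```c
/* Hedged sketch of the GEMM vs. GEMMINC difference suggested by the gen/ and
 * gen-inc/ hits: same multiply-accumulate loop, different accumulator start.
 * `bias` and `acc` are illustrative names, not XNNPACK's exact parameters. */
static void init_tile_gemm(float vacc[8], const float bias[8]) {
  for (int n = 0; n < 8; n++) vacc[n] = bias[n];  /* fresh pass: start from the bias */
}

static void init_tile_gemminc(float vacc[8], const float acc[8]) {
  for (int n = 0; n < 8; n++) vacc[n] = acc[n];   /* resume a partial sum from an earlier pass */
}
```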
/external/XNNPACK/src/f32-igemm/gen/
1x8s4-minmax-neon.c
60 float32x4_t va0 = vld1q_f32(a0); a0 += 4; in xnn_f32_igemm_minmax_ukernel_1x8s4__neon() local
66 vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c0); in xnn_f32_igemm_minmax_ukernel_1x8s4__neon()
67 vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c0); in xnn_f32_igemm_minmax_ukernel_1x8s4__neon()
69 va0 = vextq_f32(va0, va0, 1); in xnn_f32_igemm_minmax_ukernel_1x8s4__neon()
74 vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c1); in xnn_f32_igemm_minmax_ukernel_1x8s4__neon()
75 vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c1); in xnn_f32_igemm_minmax_ukernel_1x8s4__neon()
77 va0 = vextq_f32(va0, va0, 1); in xnn_f32_igemm_minmax_ukernel_1x8s4__neon()
82 vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c2); in xnn_f32_igemm_minmax_ukernel_1x8s4__neon()
83 vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c2); in xnn_f32_igemm_minmax_ukernel_1x8s4__neon()
85 va0 = vextq_f32(va0, va0, 1); in xnn_f32_igemm_minmax_ukernel_1x8s4__neon()
[all …]
1x8s4-minmax-wasmsimd-arm.c
63 v128_t va0 = wasm_v128_load(a0); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm() local
70 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm()
71 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0)); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm()
73 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm()
78 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm()
79 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1)); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm()
81 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm()
86 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm()
87 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2)); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm()
89 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm()
[all …]
1x8s4-minmax-neonfma.c
60 float32x4_t va0 = vld1q_f32(a0); a0 += 4; in xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma() local
66 vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0); in xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma()
67 vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0); in xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma()
69 va0 = vextq_f32(va0, va0, 1); in xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma()
74 vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1); in xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma()
75 vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1); in xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma()
77 va0 = vextq_f32(va0, va0, 1); in xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma()
82 vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2); in xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma()
83 vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2); in xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma()
85 va0 = vextq_f32(va0, va0, 1); in xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma()
[all …]
1x8s4-minmax-sse.c
61 __m128 va0 = _mm_loadu_ps(a0); in xnn_f32_igemm_minmax_ukernel_1x8s4__sse() local
68 vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_1x8s4__sse()
69 vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c0)); in xnn_f32_igemm_minmax_ukernel_1x8s4__sse()
71 va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_igemm_minmax_ukernel_1x8s4__sse()
76 vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_1x8s4__sse()
77 vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c1)); in xnn_f32_igemm_minmax_ukernel_1x8s4__sse()
79 va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_igemm_minmax_ukernel_1x8s4__sse()
84 vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_1x8s4__sse()
85 vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c2)); in xnn_f32_igemm_minmax_ukernel_1x8s4__sse()
87 va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_igemm_minmax_ukernel_1x8s4__sse()
[all …]
1x8s4-minmax-wasmsimd-x86.c
61 v128_t va0 = wasm_v128_load(a0); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86() local
68 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86()
69 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0)); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86()
71 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86()
76 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86()
77 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1)); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86()
79 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86()
84 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86()
85 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2)); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86()
87 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86()
[all …]
1x16s4-minmax-fma3-broadcast.c
61 __m256 va0 = _mm256_broadcast_ps((const __m128*) a0); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast() local
68 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c0, vacc0x01234567); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast()
69 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast()
71 va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast()
76 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c1, vacc0x01234567); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast()
77 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast()
79 va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast()
84 vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast()
85 vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast()
87 va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast()
[all …]
3x8s4-minmax-wasmsimd-arm.c
85 v128_t va0 = wasm_v128_load(a0); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm() local
96 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm()
99 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0)); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm()
103 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm()
110 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm()
113 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1)); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm()
117 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm()
124 vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm()
127 vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2)); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm()
131 va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm()
[all …]
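
The igemm hits repeat the GEMM inner loops almost verbatim; what differs, outside the snippets shown, is how a0 is obtained. In an indirect GEMM the A row is assembled from a buffer of pointers rather than read contiguously, which is how convolutions reuse these loops without an im2col copy. The sketch below shows that outer structure under assumed names (indirection, ks, kc); it is not XNNPACK's actual signature, and it uses a plain dot product in place of the s4 inner loop.

```c
#include <stddef.h>

/* Minimal sketch of the indirect-GEMM idea behind the igemm/ kernels above:
 * the logical A row is split into `ks` fragments of `kc` elements, each reached
 * through a pointer in `indirection`. All names are illustrative assumptions. */
static void igemm_row_1x8(const float **indirection, size_t ks, size_t kc,
                          const float *b /* (ks*kc) x 8, row-major */, float acc[8]) {
  for (size_t p = 0; p < ks; p++) {
    const float *a0 = indirection[p];          /* one fragment of the A row */
    for (size_t k = 0; k < kc; k++) {          /* same MAC pattern as the GEMM hits */
      for (int n = 0; n < 8; n++) {
        acc[n] += a0[k] * b[(p * kc + k) * 8 + n];
      }
    }
  }
}
```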
/external/XNNPACK/src/f16-gemm/gen/
1x16-minmax-neonfp16arith-ld64.c
50 const float16x4_t va0 = vld1_f16(a0); a0 += 4; in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64() local
56 vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64()
57 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64()
59 const float16x8_t va0c0 = vdupq_lane_f16(va0, 0); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64()
68 vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c1, va0, 1); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64()
69 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64()
71 const float16x8_t va0c1 = vdupq_lane_f16(va0, 1); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64()
80 vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64()
81 vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc2, va0, 2); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64()
83 const float16x8_t va0c2 = vdupq_lane_f16(va0, 2); in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64()
[all …]
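
The f16 hits use a different trick than the s4 kernels: va0 holds four half-precision A values (a 64-bit load, hence ld64), and vfmaq_lane_f16 broadcasts one lane at a time against full 8-wide panels of B. The sketch below shows only that lane-broadcast structure; it uses float instead of fp16 and assumes a simple 4 x 16 row-major slice of packed weights, so it is not the real data type or packing.

```c
/* Scalar sketch of the lane-broadcast pattern in the f16 "ld64" hits above:
 * each of the four loaded A values multiplies a full row of 16 packed weights,
 * as vfmaq_lane_f16(..., va0, lane) does for lanes 0..3. Uses float and a
 * 4 x 16 row-major `b` slice for portability. */
static void gemm_1x16_ld64_step(const float a[4], const float *b, float acc[16]) {
  for (int lane = 0; lane < 4; lane++) {
    for (int n = 0; n < 16; n++) {
      acc[n] += a[lane] * b[lane * 16 + n];   /* vb01234567c{lane}, vb89ABCDEFc{lane} */
    }
  }
}
```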
/external/XNNPACK/src/qs8-gemm/gen/
1x16c2-minmax-neon-mull-padal-dup.c
53 const int8x8_t va0 = vld1_s8(a0); a0 += 8; in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() local
72 …prod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0))); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
73 …prod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1))); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
74 …prod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2))); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
75 …prod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3))); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
80 …prod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0))); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
81 …prod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1))); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
82 …prod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2))); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
83 …prod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3))); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
88 …prod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0))); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
[all …]
/external/XNNPACK/src/qs8-igemm/gen/
1x16c2-minmax-neon-mull-padal-dup.c
64 const int8x8_t va0 = vld1_s8(a0); a0 += 8; in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() local
83 …prod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0))); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
84 …prod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1))); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
85 …prod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2))); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
86 …prod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3))); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
91 …prod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0))); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
92 …prod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1))); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
93 …prod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2))); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
94 …prod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3))); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
99 …prod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0))); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
[all …]
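
The qs8 hits combine three NEON steps: vdup_lane_s16 broadcasts one pair of adjacent int8 A values (the c2 in the kernel name: two K values per step), vmull_s8 widens the int8 x int8 products to int16, and the pairwise add-accumulate (padal) folds each pair into a 32-bit accumulator. The sketch below does the same arithmetic in scalar C for a single 4-column block; the two-values-per-column B layout is an assumption about the packing, not the exact XNNPACK format.

```c
#include <stdint.h>

/* Scalar version of the mull + padal + dup pattern in the qs8 hits above:
 * one pair of int8 A values is multiplied against two packed int8 weights per
 * output column, the int16 products are summed pairwise, and the result lands
 * in an int32 accumulator. The b layout (4 columns x 2 K values) is assumed. */
static void qs8_gemm_1x4c2_block(const int8_t a[2], const int8_t b[8], int32_t acc[4]) {
  for (int n = 0; n < 4; n++) {
    int16_t p0 = (int16_t) ((int16_t) a[0] * b[n * 2 + 0]);  /* vmull_s8 on the dup'd pair */
    int16_t p1 = (int16_t) ((int16_t) a[1] * b[n * 2 + 1]);
    acc[n] += (int32_t) p0 + (int32_t) p1;                    /* pairwise add into int32 */
  }
}
```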
