/external/XNNPACK/src/f32-gemm/gen/ |
D | 1x8s4-minmax-neon.c | matches in xnn_f32_gemm_minmax_ukernel_1x8s4__neon():
    48  float32x4_t va0 = vld1q_f32(a0); a0 += 4;   (local)
    54  vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c0);
    55  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c0);
    57  va0 = vextq_f32(va0, va0, 1);
    62  vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c1);
    63  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c1);
    65  va0 = vextq_f32(va0, va0, 1);
    70  vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c2);
    71  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c2);
    73  va0 = vextq_f32(va0, va0, 1);
    [all …]

D | 1x8s4-minmax-neonfma.c | matches in xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma():
    48  float32x4_t va0 = vld1q_f32(a0); a0 += 4;   (local)
    54  vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0);
    55  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0);
    57  va0 = vextq_f32(va0, va0, 1);
    62  vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1);
    63  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1);
    65  va0 = vextq_f32(va0, va0, 1);
    70  vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2);
    71  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2);
    73  va0 = vextq_f32(va0, va0, 1);
    [all …]

D | 1x8s4-minmax-wasmsimd-x86.c | matches in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_x86():
    48  v128_t va0 = wasm_v128_load(a0);   (local)
    55  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c0));
    56  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0));
    58  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    63  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c1));
    64  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1));
    66  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    71  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c2));
    72  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2));
    74  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    [all …]

D | 1x8s4-minmax-sse.c | matches in xnn_f32_gemm_minmax_ukernel_1x8s4__sse():
    48  __m128 va0 = _mm_loadu_ps(a0);   (local)
    55  vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c0));
    56  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c0));
    58  va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1));
    63  vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c1));
    64  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c1));
    66  va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1));
    71  vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c2));
    72  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c2));
    74  va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1));
    [all …]

D | 1x8s4-minmax-wasmsimd-arm.c | matches in xnn_f32_gemm_minmax_ukernel_1x8s4__wasmsimd_arm():
    50  v128_t va0 = wasm_v128_load(a0);   (local)
    57  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c0));
    58  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0));
    60  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    65  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c1));
    66  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1));
    68  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    73  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c2));
    74  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2));
    76  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    [all …]

D | 1x16s4-minmax-fma3-broadcast.c | matches in xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast():
    48  __m256 va0 = _mm256_broadcast_ps((const __m128*) a0);   (local)
    55  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c0, vacc0x01234567);
    56  vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF);
    58  va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
    63  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c1, vacc0x01234567);
    64  vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF);
    66  va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
    71  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);
    72  vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF);
    74  va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
    [all …]

D | 3x8s4-minmax-wasmsimd-x86.c | matches in xnn_f32_gemm_minmax_ukernel_3x8s4__wasmsimd_x86():
    64  v128_t va0 = wasm_v128_load(a0);   (local)
    75  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c0));
    78  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0));
    82  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    89  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c1));
    92  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1));
    96  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
   103  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c2));
   106  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2));
   110  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    [all …]

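All of the s4 matches above show the same inner-loop idiom: multiply-accumulate the current A register against one pre-shuffled block of B, then rotate A by one lane (vextq_f32(va0, va0, 1) on NEON, _mm_shuffle_ps/_mm256_permute_ps with _MM_SHUFFLE(0, 3, 2, 1) on x86, wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0) on WASM SIMD). The stand-alone SSE sketch below illustrates only that rotation step; it is not the XNNPACK kernel, and the single vb vector stands in for what is really a different packed B block on each of the four rotations.

#include <stdio.h>
#include <xmmintrin.h>

int main(void) {
  __m128 va   = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);     /* A lanes a0..a3 */
  __m128 vb   = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f); /* stand-in for one packed B block */
  __m128 vacc = _mm_setzero_ps();

  for (int c = 0; c < 4; c++) {
    vacc = _mm_add_ps(vacc, _mm_mul_ps(va, vb));            /* accumulate va * vb */
    /* rotate [a0 a1 a2 a3] into [a1 a2 a3 a0]; same effect as vextq_f32(va, va, 1) */
    va = _mm_shuffle_ps(va, va, _MM_SHUFFLE(0, 3, 2, 1));
  }

  float out[4];
  _mm_storeu_ps(out, vacc);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* prints 60 120 180 240 */
  return 0;
}
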
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 1x8s4inc-minmax-wasmsimd-arm.c | matches in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_arm():
    52  v128_t va0 = wasm_v128_load(a0);   (local)
    59  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c0));
    60  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0));
    62  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    67  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c1));
    68  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1));
    70  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    75  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c2));
    76  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2));
    78  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    [all …]

D | 1x8s4inc-minmax-neon.c | matches in xnn_f32_gemminc_minmax_ukernel_1x8s4__neon():
    50  float32x4_t va0 = vld1q_f32(a0); a0 += 4;   (local)
    56  vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c0);
    57  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c0);
    59  va0 = vextq_f32(va0, va0, 1);
    64  vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c1);
    65  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c1);
    67  va0 = vextq_f32(va0, va0, 1);
    72  vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c2);
    73  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c2);
    75  va0 = vextq_f32(va0, va0, 1);
    [all …]

D | 1x8s4inc-minmax-neonfma.c | matches in xnn_f32_gemminc_minmax_ukernel_1x8s4__neonfma():
    50  float32x4_t va0 = vld1q_f32(a0); a0 += 4;   (local)
    56  vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0);
    57  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0);
    59  va0 = vextq_f32(va0, va0, 1);
    64  vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1);
    65  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1);
    67  va0 = vextq_f32(va0, va0, 1);
    72  vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2);
    73  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2);
    75  va0 = vextq_f32(va0, va0, 1);
    [all …]

D | 1x8s4inc-minmax-sse.c | matches in xnn_f32_gemminc_minmax_ukernel_1x8s4__sse():
    50  __m128 va0 = _mm_loadu_ps(a0);   (local)
    57  vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c0));
    58  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c0));
    60  va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1));
    65  vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c1));
    66  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c1));
    68  va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1));
    73  vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c2));
    74  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c2));
    76  va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1));
    [all …]

D | 1x8s4inc-minmax-wasmsimd-x86.c | matches in xnn_f32_gemminc_minmax_ukernel_1x8s4__wasmsimd_x86():
    50  v128_t va0 = wasm_v128_load(a0);   (local)
    57  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c0));
    58  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0));
    60  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    65  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c1));
    66  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1));
    68  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    73  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c2));
    74  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2));
    76  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    [all …]

D | 1x16s4inc-minmax-fma3-broadcast.c | matches in xnn_f32_gemminc_minmax_ukernel_1x16s4__fma3_broadcast():
    50  __m256 va0 = _mm256_broadcast_ps((const __m128*) a0);   (local)
    57  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c0, vacc0x01234567);
    58  vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF);
    60  va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
    65  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c1, vacc0x01234567);
    66  vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF);
    68  va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
    73  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);
    74  vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF);
    76  va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
    [all …]

D | 3x8s4inc-minmax-wasmsimd-x86.c | matches in xnn_f32_gemminc_minmax_ukernel_3x8s4__wasmsimd_x86():
    66  v128_t va0 = wasm_v128_load(a0);   (local)
    77  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c0));
    80  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0));
    84  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    91  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c1));
    94  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1));
    98  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
   105  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c2));
   108  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2));
   112  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    [all …]

D | 3x8s4inc-minmax-sse.c | matches in xnn_f32_gemminc_minmax_ukernel_3x8s4__sse():
    66  __m128 va0 = _mm_loadu_ps(a0);   (local)
    77  vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c0));
    80  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c0));
    84  va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1));
    91  vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c1));
    94  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c1));
    98  va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1));
   105  vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c2));
   108  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c2));
   112  va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1));
    [all …]

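The gen-inc ("gemminc") entries above repeat the same s4 loop; as far as I can tell from these kernels, only the accumulator setup differs: the accumulators are seeded from a caller-supplied acc buffer rather than from a bias packed into w. A scalar sketch of that contract, with hypothetical names, no s4 packing, and purely as an illustration:

#include <stddef.h>

/* Scalar sketch (illustration only, names are hypothetical): a 1x8 "gemminc"
   microkernel starts from caller-supplied partial sums instead of a bias. */
static void gemminc_1x8_ref(size_t kc, const float* a,
                            const float* w /* packed B, 8 floats per k */,
                            const float* acc_in, float* c, float vmin, float vmax) {
  float acc[8];
  for (int n = 0; n < 8; n++) acc[n] = acc_in[n];   /* seed from previous partial results */
  for (size_t k = 0; k < kc; k++) {
    for (int n = 0; n < 8; n++) acc[n] += a[k] * w[n];
    w += 8;                                         /* next packed row of B */
  }
  for (int n = 0; n < 8; n++) {                     /* minmax clamp, as in the kernels above */
    float v = acc[n] < vmin ? vmin : acc[n];
    c[n] = v > vmax ? vmax : v;
  }
}
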
/external/XNNPACK/src/f32-igemm/gen/ |
D | 1x8s4-minmax-neon.c | matches in xnn_f32_igemm_minmax_ukernel_1x8s4__neon():
    60  float32x4_t va0 = vld1q_f32(a0); a0 += 4;   (local)
    66  vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c0);
    67  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c0);
    69  va0 = vextq_f32(va0, va0, 1);
    74  vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c1);
    75  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c1);
    77  va0 = vextq_f32(va0, va0, 1);
    82  vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123c2);
    83  vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567c2);
    85  va0 = vextq_f32(va0, va0, 1);
    [all …]

D | 1x8s4-minmax-wasmsimd-arm.c | matches in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_arm():
    63  v128_t va0 = wasm_v128_load(a0);   (local)
    70  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c0));
    71  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0));
    73  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    78  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c1));
    79  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1));
    81  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    86  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c2));
    87  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2));
    89  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    [all …]

D | 1x8s4-minmax-neonfma.c | matches in xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma():
    60  float32x4_t va0 = vld1q_f32(a0); a0 += 4;   (local)
    66  vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c0);
    67  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c0);
    69  va0 = vextq_f32(va0, va0, 1);
    74  vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c1);
    75  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c1);
    77  va0 = vextq_f32(va0, va0, 1);
    82  vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123c2);
    83  vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567c2);
    85  va0 = vextq_f32(va0, va0, 1);
    [all …]

D | 1x8s4-minmax-sse.c | matches in xnn_f32_igemm_minmax_ukernel_1x8s4__sse():
    61  __m128 va0 = _mm_loadu_ps(a0);   (local)
    68  vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c0));
    69  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c0));
    71  va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1));
    76  vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c1));
    77  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c1));
    79  va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1));
    84  vacc0x0123 = _mm_add_ps(vacc0x0123, _mm_mul_ps(va0, vb0123c2));
    85  vacc0x4567 = _mm_add_ps(vacc0x4567, _mm_mul_ps(va0, vb4567c2));
    87  va0 = _mm_shuffle_ps(va0, va0, _MM_SHUFFLE(0, 3, 2, 1));
    [all …]

D | 1x8s4-minmax-wasmsimd-x86.c | matches in xnn_f32_igemm_minmax_ukernel_1x8s4__wasmsimd_x86():
    61  v128_t va0 = wasm_v128_load(a0);   (local)
    68  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c0));
    69  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0));
    71  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    76  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c1));
    77  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1));
    79  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    84  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c2));
    85  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2));
    87  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    [all …]

D | 1x16s4-minmax-fma3-broadcast.c | matches in xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast():
    61  __m256 va0 = _mm256_broadcast_ps((const __m128*) a0);   (local)
    68  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c0, vacc0x01234567);
    69  vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc0, vacc0x89ABCDEF);
    71  va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
    76  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c1, vacc0x01234567);
    77  vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc1, vacc0x89ABCDEF);
    79  va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
    84  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567c2, vacc0x01234567);
    85  vacc0x89ABCDEF = _mm256_fmadd_ps(va0, vb89ABCDEFc2, vacc0x89ABCDEF);
    87  va0 = _mm256_permute_ps(va0, _MM_SHUFFLE(0, 3, 2, 1));
    [all …]

D | 3x8s4-minmax-wasmsimd-arm.c | matches in xnn_f32_igemm_minmax_ukernel_3x8s4__wasmsimd_arm():
    85  v128_t va0 = wasm_v128_load(a0);   (local)
    96  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c0));
    99  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c0));
   103  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
   110  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c1));
   113  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c1));
   117  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
   124  vacc0x0123 = wasm_f32x4_add(vacc0x0123, wasm_f32x4_mul(va0, vb0123c2));
   127  vacc0x4567 = wasm_f32x4_add(vacc0x4567, wasm_f32x4_mul(va0, vb4567c2));
   131  va0 = wasm_v32x4_shuffle(va0, va0, 1, 2, 3, 0);
    [all …]

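The igemm entries above are the indirect-GEMM counterparts used for convolution: a0 is fetched from an array of row pointers rather than from a flat A matrix, and the packed weights supply a fresh block for every (pointer, k) step. A scalar sketch of that addressing scheme, with hypothetical names, no vectorization, and none of the kernel's offset/zero-pointer handling:

#include <stddef.h>

/* Scalar sketch (illustration only): indirect GEMM walks an array of A row
   pointers, consuming a fresh packed block of B for every (pointer, k) step. */
static void igemm_1x8_ref(size_t ks /* number of A pointers */, size_t kc,
                          const float** a, const float* w, float* c,
                          float vmin, float vmax) {
  float acc[8];
  for (int n = 0; n < 8; n++) acc[n] = w[n];   /* bias */
  w += 8;
  for (size_t p = 0; p < ks; p++) {
    const float* a0 = a[p];                    /* indirection: one input row per pointer */
    for (size_t k = 0; k < kc; k++) {
      for (int n = 0; n < 8; n++) acc[n] += a0[k] * w[n];
      w += 8;                                  /* weights never rewind across the ks loop */
    }
  }
  for (int n = 0; n < 8; n++) {
    float v = acc[n] < vmin ? vmin : acc[n];
    c[n] = v > vmax ? vmax : v;
  }
}
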
/external/XNNPACK/src/f16-gemm/gen/ |
D | 1x16-minmax-neonfp16arith-ld64.c | matches in xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64():
    50  const float16x4_t va0 = vld1_f16(a0); a0 += 4;   (local)
    56  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c0, va0, 0);
    57  vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc0, va0, 0);
    59  const float16x8_t va0c0 = vdupq_lane_f16(va0, 0);
    68  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c1, va0, 1);
    69  vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc1, va0, 1);
    71  const float16x8_t va0c1 = vdupq_lane_f16(va0, 1);
    80  vacc0x01234567 = vfmaq_lane_f16(vacc0x01234567, vb01234567c2, va0, 2);
    81  vacc0x89ABCDEF = vfmaq_lane_f16(vacc0x89ABCDEF, vb89ABCDEFc2, va0, 2);
    83  const float16x8_t va0c2 = vdupq_lane_f16(va0, 2);
    [all …]

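The f16 ld64 kernel above loads four A values at once (vld1_f16) and then broadcasts one lane per FMA via vfmaq_lane_f16. The same idiom written with the f32 intrinsics, so it builds with any AArch64 toolchain and without the +fp16 extension, might look like the sketch below (illustration only; the vector widths and lane intrinsics differ from the real f16 kernel):

#include <arm_neon.h>

/* Sketch of the ld64 "broadcast one lane per FMA" pattern, using f32 intrinsics
   in place of the float16 ones shown above (hypothetical helper, not the kernel). */
static float32x4_t ld64_step(const float* a0, const float* b /* 4 packed groups of 4 */,
                             float32x4_t vacc) {
  const float32x4_t va0 = vld1q_f32(a0);                    /* a0[0..3] */
  vacc = vfmaq_laneq_f32(vacc, vld1q_f32(b +  0), va0, 0);  /* += a0[0] * b block c0 */
  vacc = vfmaq_laneq_f32(vacc, vld1q_f32(b +  4), va0, 1);  /* += a0[1] * b block c1 */
  vacc = vfmaq_laneq_f32(vacc, vld1q_f32(b +  8), va0, 2);  /* += a0[2] * b block c2 */
  vacc = vfmaq_laneq_f32(vacc, vld1q_f32(b + 12), va0, 3);  /* += a0[3] * b block c3 */
  return vacc;
}
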
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 1x16c2-minmax-neon-mull-padal-dup.c | matches in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup():
    53  const int8x8_t va0 = vld1_s8(a0); a0 += 8;   (local)
    72  …prod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
    73  …prod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
    74  …prod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
    75  …prod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
    80  …prod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
    81  …prod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
    82  …prod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
    83  …prod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
    88  …prod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
    [all …]

/external/XNNPACK/src/qs8-igemm/gen/ |
D | 1x16c2-minmax-neon-mull-padal-dup.c | matches in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup():
    64  const int8x8_t va0 = vld1_s8(a0); a0 += 8;   (local)
    83  …prod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
    84  …prod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
    85  …prod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
    86  …prod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
    91  …prod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
    92  …prod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
    93  …prod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
    94  …prod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
    99  …prod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
    [all …]

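Both qs8 c2 entries build their accumulators from pairs of int8 values: the vdup_lane_s16 reinterpret dance duplicates one pair of A bytes across the register, vmull_s8 widens the products to int16, and the pairwise "padal" accumulate the file names refer to folds each pair into an int32 lane. A scalar picture of one such c2 dot product, purely as an illustration:

#include <stddef.h>
#include <stdint.h>

/* Scalar picture (illustration only) of one "c2" dot product: A is consumed in
   pairs of int8 values, each pair widened to int16 products and then folded
   into a 32-bit accumulator, the scalar equivalent of vmull_s8 plus a pairwise
   add-accumulate. */
static int32_t dot_c2(const int8_t* a, const int8_t* b, size_t kc /* even */) {
  int32_t acc = 0;
  for (size_t k = 0; k < kc; k += 2) {
    const int16_t p0 = (int16_t) a[k + 0] * (int16_t) b[k + 0];  /* each product fits in int16 */
    const int16_t p1 = (int16_t) a[k + 1] * (int16_t) b[k + 1];
    acc += (int32_t) p0 + (int32_t) p1;                          /* pairwise add, widened to 32 bits */
  }
  return acc;
}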