use core::arch::wasm32::*;

/// Builds a `v128` from four `f32` values, usable in `const` contexts.
///
/// Element `a[i]` is passed as the `i`-th argument of `f32x4`, so array order is
/// preserved in the resulting vector.
pub const fn v128_from_f32x4(a: [f32; 4]) -> v128 {
    let [x, y, z, w] = a;
    f32x4(x, y, z, w)
}

/// Computes the three-component dot product of `lhs` and `rhs`, leaving the result in the
/// x lane (lane 0) of the returned `v128`.
///
/// Only the x, y and z lanes of the inputs contribute to lane 0. The other lanes of the
/// result hold leftover partial sums and should not be relied upon.
#[inline(always)]
pub(crate) fn dot3_in_x(lhs: v128, rhs: v128) -> v128 {
    let products = f32x4_mul(lhs, rhs);
    // Lane 0 is accumulated as (x + y) + z: each shuffle moves the lane that is
    // still missing from the sum into lane 0 before it is added.
    let xy_sum = f32x4_add(products, i32x4_shuffle::<1, 0, 0, 0>(products, products));
    f32x4_add(xy_sum, i32x4_shuffle::<2, 0, 0, 0>(products, products))
}

/// Computes the four-component dot product of `lhs` and `rhs`, leaving the result in the
/// x lane (lane 0) of the returned `v128`.
///
/// The other lanes of the result hold leftover partial sums and should not be relied upon.
#[inline(always)]
pub(crate) fn dot4_in_x(lhs: v128, rhs: v128) -> v128 {
    let products = f32x4_mul(lhs, rhs);
    // Fold the upper pair onto the lower pair: lane 0 = x + z, lane 1 = y + w.
    let pair_sums = f32x4_add(products, i32x4_shuffle::<2, 3, 0, 0>(products, products));
    // Move lane 1 into lane 0 and add, giving lane 0 = (x + z) + (y + w).
    f32x4_add(pair_sums, i32x4_shuffle::<1, 0, 0, 0>(pair_sums, pair_sums))
}

/// Returns the three-component dot product of `lhs` and `rhs` as a scalar `f32`.
///
/// Delegates to `dot3_in_x` and reads lane 0 of its result.
#[inline]
pub(crate) fn dot3(lhs: v128, rhs: v128) -> f32 {
    let sum_in_x = dot3_in_x(lhs, rhs);
    f32x4_extract_lane::<0>(sum_in_x)
}

/// Returns the three-component dot product of `lhs` and `rhs` replicated into all four
/// lanes of a `v128`.
#[inline]
pub(crate) fn dot3_into_v128(lhs: v128, rhs: v128) -> v128 {
    let sum = dot3_in_x(lhs, rhs);
    // Every output lane selects lane 0 of `sum`, where the dot product lives.
    i32x4_shuffle::<0, 0, 0, 0>(sum, sum)
}

/// Returns the four-component dot product of `lhs` and `rhs` as a scalar `f32`.
///
/// Delegates to `dot4_in_x` and reads lane 0 of its result.
#[inline]
pub(crate) fn dot4(lhs: v128, rhs: v128) -> f32 {
    let sum_in_x = dot4_in_x(lhs, rhs);
    f32x4_extract_lane::<0>(sum_in_x)
}

/// Returns the four-component dot product of `lhs` and `rhs` replicated into all four
/// lanes of a `v128`.
#[inline]
pub(crate) fn dot4_into_v128(lhs: v128, rhs: v128) -> v128 {
    let sum = dot4_in_x(lhs, rhs);
    // Every output lane selects lane 0 of `sum`, where the dot product lives.
    i32x4_shuffle::<0, 0, 0, 0>(sum, sum)
}