// crate minimums: sse2, x86_64

use crate::types::*;
use core::arch::x86_64::{__m128i, __m256i};

mod sse2;

#[derive(Copy, Clone)]
pub struct YesS3;
#[derive(Copy, Clone)]
pub struct NoS3;

#[derive(Copy, Clone)]
pub struct YesS4;
#[derive(Copy, Clone)]
pub struct NoS4;

#[derive(Copy, Clone)]
pub struct YesA1;
#[derive(Copy, Clone)]
pub struct NoA1;

#[derive(Copy, Clone)]
pub struct YesA2;
#[derive(Copy, Clone)]
pub struct NoA2;

#[derive(Copy, Clone)]
pub struct YesNI;
#[derive(Copy, Clone)]
pub struct NoNI;

use core::marker::PhantomData;

#[derive(Copy, Clone)]
pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
where
    sse2::u128x1_sse2<S3, S4, NI>: Swap64,
    sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
    sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
    sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
    sse2::u128x1_sse2<S3, S4, NI>: BSwap,
    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
    sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
    sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
{
    type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
    type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
    type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;

    type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
    type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
    type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
    type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;

    type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
    type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
    type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;

    #[inline(always)]
    unsafe fn instance() -> Self {
        SseMachine(PhantomData)
    }
}

#[derive(Copy, Clone)]
pub struct Avx2Machine<NI>(PhantomData<NI>);
impl<NI: Copy> Machine for Avx2Machine<NI>
where
    sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
    sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
    sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
    sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
{
    type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
    type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
    type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;

    type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>;
    type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
    type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
    type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;

    type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
    type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
    type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;

    #[inline(always)]
    unsafe fn instance() -> Self {
        Avx2Machine(PhantomData)
    }
}

pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
/// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
/// to avoid expensive SSE/VEX conflicts.
pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
pub type AVX2 = Avx2Machine<NoNI>;

/// Generic wrapper for unparameterized storage of any of the possible impls.
/// Converting into and out of this type should be essentially free, although it may be more
/// aligned than a particular impl requires.
#[allow(non_camel_case_types)]
#[derive(Copy, Clone)]
pub union vec128_storage {
    u32x4: [u32; 4],
    u64x2: [u64; 2],
    u128x1: [u128; 1],
    sse2: __m128i,
}
impl Store<vec128_storage> for vec128_storage {
    #[inline(always)]
    unsafe fn unpack(p: vec128_storage) -> Self {
        p
    }
}
impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage {
    #[inline(always)]
    fn into(self) -> &'a [u32; 4] {
        unsafe { &self.u32x4 }
    }
}
impl Into<vec128_storage> for [u32; 4] {
    #[inline(always)]
    fn into(self) -> vec128_storage {
        vec128_storage { u32x4: self }
    }
}
impl Default for vec128_storage {
    #[inline(always)]
    fn default() -> Self {
        vec128_storage { u128x1: [0] }
    }
}
impl Eq for vec128_storage {}
impl PartialEq for vec128_storage {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { self.u128x1 == rhs.u128x1 }
    }
}

#[allow(non_camel_case_types)]
#[derive(Copy, Clone)]
pub union vec256_storage {
    u32x8: [u32; 8],
    u64x4: [u64; 4],
    u128x2: [u128; 2],
    sse2: [vec128_storage; 2],
    avx: __m256i,
}
impl Into<vec256_storage> for [u64; 4] {
    #[inline(always)]
    fn into(self) -> vec256_storage {
        vec256_storage { u64x4: self }
    }
}
impl Default for vec256_storage {
    #[inline(always)]
    fn default() -> Self {
        vec256_storage { u128x2: [0, 0] }
    }
}
impl vec256_storage {
    pub fn new128(xs: [vec128_storage; 2]) -> Self {
        Self { sse2: xs }
    }
    pub fn split128(self) -> [vec128_storage; 2] {
        unsafe { self.sse2 }
    }
}
impl Eq for vec256_storage {}
impl PartialEq for vec256_storage {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { self.sse2 == rhs.sse2 }
    }
}

#[allow(non_camel_case_types)]
#[derive(Copy, Clone)]
pub union vec512_storage {
    u32x16: [u32; 16],
    u64x8: [u64; 8],
    u128x4: [u128; 4],
    sse2: [vec128_storage; 4],
    avx: [vec256_storage; 2],
}
impl Default for vec512_storage {
    #[inline(always)]
    fn default() -> Self {
        vec512_storage {
            u128x4: [0, 0, 0, 0],
        }
    }
}
impl vec512_storage {
    pub fn new128(xs: [vec128_storage; 4]) -> Self {
        Self { sse2: xs }
    }
    pub fn split128(self) -> [vec128_storage; 4] {
        unsafe { self.sse2 }
    }
}
impl Eq for vec512_storage {}
impl PartialEq for vec512_storage {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
        unsafe { self.avx == rhs.avx }
    }
}

macro_rules! impl_into {
    ($storage:ident, $array:ty, $name:ident) => {
        impl Into<$array> for $storage {
            #[inline(always)]
            fn into(self) -> $array {
                unsafe { self.$name }
            }
        }
    };
}
impl_into!(vec128_storage, [u32; 4], u32x4);
impl_into!(vec128_storage, [u64; 2], u64x2);
impl_into!(vec128_storage, [u128; 1], u128x1);
impl_into!(vec256_storage, [u32; 8], u32x8);
impl_into!(vec256_storage, [u64; 4], u64x4);
impl_into!(vec256_storage, [u128; 2], u128x2);
impl_into!(vec512_storage, [u32; 16], u32x16);
impl_into!(vec512_storage, [u64; 8], u64x8);
impl_into!(vec512_storage, [u128; 4], u128x4);

/// Generate the full set of optimized implementations to take advantage of the most important
/// hardware feature sets.
///
/// This dispatcher is suitable for maximizing throughput.
#[macro_export]
macro_rules! dispatch {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[cfg(feature = "std")]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx2")]
            unsafe fn impl_avx2($($arg: $argty),*) -> $ret {
                let ret = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*);
                _mm256_zeroupper();
                ret
            }
            #[target_feature(enable = "avx")]
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                let ret = fn_impl($crate::x86_64::AVX::instance(), $($arg),*);
                _mm256_zeroupper();
                ret
            }
            #[target_feature(enable = "sse4.1")]
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_sse41($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
            }
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_ssse3($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            unsafe {
                if is_x86_feature_detected!("avx2") {
                    impl_avx2($($arg),*)
                } else if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse4.1") {
                    impl_sse41($($arg),*)
                } else if is_x86_feature_detected!("ssse3") {
                    impl_ssse3($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    ($mach:ident, $MTy:ident, { $([$pub:tt $(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}

/// Generate only the basic implementations necessary to be able to operate efficiently on 128-bit
/// vectors on this platfrom. For x86-64, that would mean SSE2 and AVX.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
#[macro_export]
macro_rules! dispatch_light128 {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[cfg(feature = "std")]
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light128!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}

/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
/// vectors on this platfrom. For x86-64, that would mean SSE2, AVX, and AVX2.
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
#[macro_export]
macro_rules! dispatch_light256 {
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        #[cfg(feature = "std")]
        $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> $ret {
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light256!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}