// Copyright 2024 The ChromiumOS Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

use std::cmp::min;

use crate::video_frame::{VideoFrame, UV_PLANE, U_PLANE, V_PLANE, Y_PLANE};
use crate::DecodedFormat;

/// TODO(greenjustin): This entire file should be replaced with LibYUV.
use byteorder::ByteOrder;
use byteorder::LittleEndian;

#[cfg(feature = "v4l2")]
use std::arch::aarch64::*;

pub const MM21_TILE_WIDTH: usize = 16;
pub const MM21_TILE_HEIGHT: usize = 32;

/// Copies `src` into `dst` as NV12, handling padding.
pub fn nv12_copy(
    src_y: &[u8],
    src_y_stride: usize,
    dst_y: &mut [u8],
    dst_y_stride: usize,
    src_uv: &[u8],
    src_uv_stride: usize,
    dst_uv: &mut [u8],
    dst_uv_stride: usize,
    width: usize,
    height: usize,
) {
    for y in 0..height {
        dst_y[(y * dst_y_stride)..(y * dst_y_stride + width)]
            .copy_from_slice(&src_y[(y * src_y_stride)..(y * src_y_stride + width)]);
    }
    for y in 0..(height / 2) {
        dst_uv[(y * dst_uv_stride)..(y * dst_uv_stride + width)]
            .copy_from_slice(&src_uv[(y * src_uv_stride)..(y * src_uv_stride + width)]);
    }
}

/// Replace 0 padding with the last pixels of the real image. This helps reduce compression
/// artifacts caused by the sharp transition between real image data and 0.
pub fn extend_border_nv12(
    y_plane: &mut [u8],
    uv_plane: &mut [u8],
    visible_width: usize,
    visible_height: usize,
    coded_width: usize,
    coded_height: usize,
) {
    assert!(visible_width > 1);
    assert!(visible_height > 1);
    for y in 0..visible_height {
        let row_start = y * coded_width;
        for x in visible_width..coded_width {
            y_plane[row_start + x] = y_plane[row_start + x - 1]
        }
    }
    for y in visible_height..coded_height {
        let (src, dst) = y_plane.split_at_mut(y * coded_width);
        dst[0..coded_width].copy_from_slice(&src[((y - 1) * coded_width)..(y * coded_width)]);
    }
    for y in 0..(visible_height / 2) {
        let row_start = y * coded_width;
        for x in visible_width..coded_width {
            // We use minus 2 here because we want to actually repeat the last 2 UV values.
            uv_plane[row_start + x] = uv_plane[row_start + x - 2]
        }
    }
    for y in (visible_height / 2)..(coded_height / 2) {
        let (src, dst) = uv_plane.split_at_mut(y * coded_width);
        dst[0..coded_width].copy_from_slice(&src[((y - 1) * coded_width)..(y * coded_width)]);
    }
}

pub fn copy_plane(
    src: &[u8],
    src_stride: usize,
    dst: &mut [u8],
    dst_stride: usize,
    width: usize,
    height: usize,
) {
    for y in 0..height {
        dst[(y * dst_stride)..(y * dst_stride + width)]
            .copy_from_slice(&src[(y * src_stride)..(y * src_stride + width)]);
    }
}

/// Copies `src` into `dst` as I4xx (YUV tri-planar).
///
/// `sub_h` and `sub_v` enable horizontal and vertical sub-sampling, respectively. E.g, if both
/// `sub_h` and `sub_v` are `true` the data will be `4:2:0`, if only `sub_v` is `true` then it will be
/// `4:2:2`, and if both are `false` then we have `4:4:4`.
pub fn i4xx_copy(
    src_y: &[u8],
    src_y_stride: usize,
    dst_y: &mut [u8],
    dst_y_stride: usize,
    src_u: &[u8],
    src_u_stride: usize,
    dst_u: &mut [u8],
    dst_u_stride: usize,
    src_v: &[u8],
    src_v_stride: usize,
    dst_v: &mut [u8],
    dst_v_stride: usize,
    width: usize,
    height: usize,
    (sub_h, sub_v): (bool, bool),
) {
    copy_plane(src_y, src_y_stride, dst_y, dst_y_stride, width, height);

    // Align width and height of UV planes to 2 if sub-sampling is used.
    let uv_width = if sub_h { (width + 1) / 2 } else { width };
    let uv_height = if sub_v { (height + 1) / 2 } else { height };

    copy_plane(src_u, src_u_stride, dst_u, dst_u_stride, uv_width, uv_height);
    copy_plane(src_v, src_v_stride, dst_v, dst_v_stride, uv_width, uv_height);
}

/// Copies `src` into `dst` as I410, removing all padding and changing the layout from packed to
/// triplanar. Also drops the alpha channel.
pub fn y410_to_i410(
    src: &[u8],
    dst: &mut [u8],
    width: usize,
    height: usize,
    strides: [usize; 3],
    offsets: [usize; 3],
) {
    let src_lines = src[offsets[0]..].chunks(strides[0]).map(|line| &line[..width * 4]);

    let dst_y_size = width * 2 * height;
    let dst_u_size = width * 2 * height;

    let (dst_y_plane, dst_uv_planes) = dst.split_at_mut(dst_y_size);
    let (dst_u_plane, dst_v_plane) = dst_uv_planes.split_at_mut(dst_u_size);
    let dst_y_lines = dst_y_plane.chunks_mut(width * 2);
    let dst_u_lines = dst_u_plane.chunks_mut(width * 2);
    let dst_v_lines = dst_v_plane.chunks_mut(width * 2);

    for (src_line, (dst_y_line, (dst_u_line, dst_v_line))) in
        src_lines.zip(dst_y_lines.zip(dst_u_lines.zip(dst_v_lines))).take(height)
    {
        for (src, (dst_y, (dst_u, dst_v))) in src_line.chunks(4).zip(
            dst_y_line.chunks_mut(2).zip(dst_u_line.chunks_mut(2).zip(dst_v_line.chunks_mut(2))),
        ) {
            let y = LittleEndian::read_u16(&[src[1] >> 2 | src[2] << 6, src[2] >> 2 & 0b11]);
            let u = LittleEndian::read_u16(&[src[0], src[1] & 0b11]);
            let v = LittleEndian::read_u16(&[src[2] >> 4 | src[3] << 4, src[3] >> 4 & 0b11]);
            LittleEndian::write_u16(dst_y, y);
            LittleEndian::write_u16(dst_u, u);
            LittleEndian::write_u16(dst_v, v);
        }
    }
}

#[cfg(feature = "v4l2")]
// SAFETY: Verified by caller that |src| and |dst| is valid and not
// a NULL-pointer or invalid memory.
pub unsafe fn align_detile(src: *const u8, src_tile_stride: isize, dst: *mut u8, width: usize) {
    let mut vin = [0u8; MM21_TILE_WIDTH];
    let mut vout = [0u8; MM21_TILE_WIDTH];

    let bytes_per_pixel = 1;
    let mask = MM21_TILE_WIDTH - 1;

    let remainder = width & mask;
    let width_aligned_down = width & !mask;
    if width_aligned_down > 0 {
        detile_row(src, src_tile_stride, dst, width_aligned_down);
    }

    let index = (width_aligned_down / MM21_TILE_WIDTH * (src_tile_stride as usize)) as usize;
    let input_slice =
        std::slice::from_raw_parts(src.offset(index as isize), remainder * bytes_per_pixel);
    (&mut vin[0..remainder * bytes_per_pixel])
        .copy_from_slice(&input_slice[0..remainder * bytes_per_pixel]);

    detile_row(vin.as_ptr(), src_tile_stride, vout.as_mut_ptr(), MM21_TILE_WIDTH);

    let output_slice = std::slice::from_raw_parts_mut(
        dst.offset(width_aligned_down as isize),
        remainder * bytes_per_pixel,
    );
    output_slice[0..remainder * bytes_per_pixel]
        .copy_from_slice(&vout[0..remainder * bytes_per_pixel]);
}

#[cfg(feature = "v4l2")]
// SAFETY: Verified by caller that |src| and |dst| is valid and not
// a NULL-pointer or invalid memory.
pub unsafe fn detile_row(
    mut src: *const u8,
    src_tile_stride: isize,
    mut dst: *mut u8,
    width: usize,
) {
    let mut w = width;
    while w > 0 {
        let v0: uint8x16_t = vld1q_u8(src);
        src = src.offset(src_tile_stride as isize);
        w = w - MM21_TILE_WIDTH;
        vst1q_u8(dst, v0);
        dst = dst.offset(MM21_TILE_WIDTH as isize);
    }
}

// TODO(bchoobineh): Use a fuzzer to verify the correctness of this SIMD
// code compared to its Rust equivalent
// Detiles a plane of data using implementation from LibYUV::DetilePlane.
#[cfg(feature = "v4l2")]
pub fn detile_plane(
    src: &[u8],
    src_stride: usize,
    dst: &mut [u8],
    mut dst_stride: isize,
    width: usize,
    mut height: isize,
    tile_height: usize,
) -> Result<(), String> {
    let src_tile_stride = (16 * tile_height) as isize;

    if width == 0 || height == 0 || (((tile_height) & ((tile_height) - 1)) > 0) {
        return Err("Invalid width, height, or tile height is not a power of 2.".to_owned());
    }

    let mut aligned = true;
    if (width & (MM21_TILE_WIDTH - 1)) > 0 {
        aligned = false;
    }

    if src.len() < (src_stride * (height.abs() as usize)) {
        return Err("Src buffer not big enough.".to_owned());
    }

    if dst.len() < (dst_stride * height.abs()) as usize {
        return Err("Dst buffer not big enough.".to_owned());
    }

    let mut src_ptr = src.as_ptr();
    let mut dst_ptr = dst.as_mut_ptr();

    // Image inversion
    if height < 0 {
        height = -height;
        // SAFETY: Verified the validity of src buffer and height.
        unsafe {
            src_ptr = src_ptr.offset(((height - 1) * dst_stride) as isize);
        }
        dst_stride = -dst_stride;
    }

    // Detile Plane
    for y in 0..height {
        // SAFETY: Verified validity of src and dst pointers.
        unsafe {
            if aligned {
                detile_row(src_ptr, src_tile_stride, dst_ptr, width);
            } else {
                align_detile(src_ptr, src_tile_stride, dst_ptr, width);
            }
        }

        // SAFETY: Verified the validity of the src and dst buffers.
        unsafe {
            dst_ptr = dst_ptr.offset(dst_stride as isize);
            src_ptr = src_ptr.offset(MM21_TILE_WIDTH as isize);
        }
        // Advance to next row of tiles.
        if (y & (tile_height - 1) as isize) == ((tile_height - 1) as isize) {
            // SAFETY: Verified validity of the src buffers.
            unsafe {
                src_ptr = src_ptr.offset(-src_tile_stride + (src_stride * tile_height) as isize);
            }
        }
    }

    Ok(())
}

// Converts MM21 to NV12 using the implementation from LibYUV::MM21ToNV12.
#[cfg(feature = "v4l2")]
pub fn mm21_to_nv12(
    src_y: &[u8],
    src_stride_y: usize,
    dst_y: &mut [u8],
    dst_stride_y: usize,
    src_uv: &[u8],
    src_stride_uv: usize,
    dst_uv: &mut [u8],
    dst_stride_uv: usize,
    width: usize,
    height: isize,
) -> Result<(), String> {
    if width <= 0 {
        return Err("Width must be greater than 0.".to_owned());
    }

    let sign = if height < 0 { -1 } else { 1 };

    // Detile Plane Y
    detile_plane(
        src_y,
        src_stride_y,
        dst_y,
        dst_stride_y as isize,
        width,
        height,
        MM21_TILE_HEIGHT,
    )?;

    // Detile Plane UV
    detile_plane(
        src_uv,
        src_stride_uv,
        dst_uv,
        dst_stride_uv as isize,
        (width + 1) & !1,
        (height + sign) / 2,
        MM21_TILE_HEIGHT / 2,
    )
}

pub fn nv12_to_i420(
    src_y: &[u8],
    src_y_stride: usize,
    dst_y: &mut [u8],
    dst_y_stride: usize,
    src_uv: &[u8],
    src_uv_stride: usize,
    dst_u: &mut [u8],
    dst_u_stride: usize,
    dst_v: &mut [u8],
    dst_v_stride: usize,
    width: usize,
    height: usize,
) {
    copy_plane(src_y, src_y_stride, dst_y, dst_y_stride, width, height);

    // We can just assume 4:2:0 subsampling
    let aligned_width = (width + 1) & (!1);
    for y in 0..((height + 1) / 2) {
        let src_row = &src_uv[(y * src_uv_stride)..(y * src_uv_stride + aligned_width)];
        let dst_u_row = &mut dst_u[(y * dst_u_stride)..(y * dst_u_stride + aligned_width / 2)];
        let dst_v_row = &mut dst_v[(y * dst_v_stride)..(y * dst_v_stride + aligned_width / 2)];
        for x in 0..aligned_width {
            if x % 2 == 0 {
                dst_u_row[x / 2] = src_row[x];
            } else {
                dst_v_row[x / 2] = src_row[x];
            }
        }
    }
}

pub fn i420_to_nv12_chroma(src_u: &[u8], src_v: &[u8], dst_uv: &mut [u8]) {
    for i in 0..dst_uv.len() {
        if i % 2 == 0 {
            dst_uv[i] = src_u[i / 2];
        } else {
            dst_uv[i] = src_v[i / 2];
        }
    }
}

pub fn i420_to_nv12(src_y: &[u8], dst_y: &mut [u8], src_u: &[u8], src_v: &[u8], dst_uv: &mut [u8]) {
    dst_y.copy_from_slice(src_y);
    i420_to_nv12_chroma(src_u, src_v, dst_uv);
}

// TODO: Add more conversions. All supported conversion functions need to take stride parameters.
pub const SUPPORTED_CONVERSION: &'static [(DecodedFormat, DecodedFormat)] = &[
    #[cfg(feature = "v4l2")]
    (DecodedFormat::MM21, DecodedFormat::NV12),
    (DecodedFormat::NV12, DecodedFormat::NV12),
    (DecodedFormat::I420, DecodedFormat::I420),
    (DecodedFormat::I422, DecodedFormat::I422),
    (DecodedFormat::I444, DecodedFormat::I444),
];

pub fn convert_video_frame(src: &impl VideoFrame, dst: &mut impl VideoFrame) -> Result<(), String> {
    let width = min(dst.resolution().width, src.resolution().width) as usize;
    let height = min(dst.resolution().height, src.resolution().height) as usize;

    let conversion = (src.decoded_format()?, dst.decoded_format()?);
    let src_pitches = src.get_plane_pitch();
    let src_mapping = src.map().expect("Image processor src mapping failed!");
    let src_planes = src_mapping.get();
    let dst_pitches = dst.get_plane_pitch();
    let dst_mapping = dst.map_mut().expect("Image processor dst mapping failed!");
    let dst_planes = dst_mapping.get();
    match conversion {
        #[cfg(feature = "v4l2")]
        (DecodedFormat::MM21, DecodedFormat::NV12) => mm21_to_nv12(
            src_planes[Y_PLANE],
            src_pitches[Y_PLANE],
            *dst_planes[Y_PLANE].borrow_mut(),
            dst_pitches[Y_PLANE],
            src_planes[UV_PLANE],
            src_pitches[UV_PLANE],
            *dst_planes[UV_PLANE].borrow_mut(),
            dst_pitches[UV_PLANE],
            width,
            height as isize,
        ),
        (DecodedFormat::NV12, DecodedFormat::NV12) => {
            nv12_copy(
                src_planes[Y_PLANE],
                src_pitches[Y_PLANE],
                *dst_planes[Y_PLANE].borrow_mut(),
                dst_pitches[Y_PLANE],
                src_planes[UV_PLANE],
                src_pitches[UV_PLANE],
                *dst_planes[UV_PLANE].borrow_mut(),
                dst_pitches[UV_PLANE],
                width,
                height,
            );
            Ok(())
        }
        (DecodedFormat::I420, DecodedFormat::I420) => {
            i4xx_copy(
                src_planes[Y_PLANE],
                src_pitches[Y_PLANE],
                *dst_planes[Y_PLANE].borrow_mut(),
                dst_pitches[Y_PLANE],
                src_planes[U_PLANE],
                src_pitches[U_PLANE],
                *dst_planes[U_PLANE].borrow_mut(),
                dst_pitches[U_PLANE],
                src_planes[V_PLANE],
                src_pitches[V_PLANE],
                *dst_planes[V_PLANE].borrow_mut(),
                dst_pitches[V_PLANE],
                width,
                height,
                (true, true),
            );
            Ok(())
        }
        (DecodedFormat::I422, DecodedFormat::I422) => {
            i4xx_copy(
                src_planes[Y_PLANE],
                src_pitches[Y_PLANE],
                *dst_planes[Y_PLANE].borrow_mut(),
                dst_pitches[Y_PLANE],
                src_planes[U_PLANE],
                src_pitches[U_PLANE],
                *dst_planes[U_PLANE].borrow_mut(),
                dst_pitches[U_PLANE],
                src_planes[V_PLANE],
                src_pitches[V_PLANE],
                *dst_planes[V_PLANE].borrow_mut(),
                dst_pitches[V_PLANE],
                width,
                height,
                (true, false),
            );
            Ok(())
        }
        (DecodedFormat::I444, DecodedFormat::I444) => {
            i4xx_copy(
                src_planes[Y_PLANE],
                src_pitches[Y_PLANE],
                *dst_planes[Y_PLANE].borrow_mut(),
                dst_pitches[Y_PLANE],
                src_planes[U_PLANE],
                src_pitches[U_PLANE],
                *dst_planes[U_PLANE].borrow_mut(),
                dst_pitches[U_PLANE],
                src_planes[V_PLANE],
                src_pitches[V_PLANE],
                *dst_planes[V_PLANE].borrow_mut(),
                dst_pitches[V_PLANE],
                width,
                height,
                (false, false),
            );
            Ok(())
        }
        _ => Err(format!("Unsupported conversion {:?} -> {:?}", conversion.0, conversion.1)),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    #[cfg(feature = "v4l2")]
    fn test_mm21_to_nv12() {
        let test_input = include_bytes!("test_data/puppets-480x270_20230825.mm21.yuv");
        let test_expected_output = include_bytes!("test_data/puppets-480x270_20230825.nv12.yuv");

        let mut test_output = [0u8; 480 * 288 * 3 / 2];
        let (test_y_output, test_uv_output) = test_output.split_at_mut(480 * 288);
        mm21_to_nv12(
            &test_input[0..480 * 288],
            480,
            test_y_output,
            480,
            &test_input[480 * 288..480 * 288 * 3 / 2],
            480,
            test_uv_output,
            480,
            480,
            288,
        )
        .expect("Failed to detile!");
        assert_eq!(test_output, *test_expected_output);
    }

    #[test]
    fn test_nv12_to_i420() {
        let test_input = include_bytes!("test_data/puppets-480x270_20230825.nv12.yuv");
        let test_expected_output = include_bytes!("test_data/puppets-480x270_20230825.i420.yuv");

        let mut test_output = [0u8; 480 * 288 * 3 / 2];
        let (test_y_output, test_uv_output) = test_output.split_at_mut(480 * 288);
        let (test_u_output, test_v_output) = test_uv_output.split_at_mut(480 * 288 / 4);
        nv12_to_i420(
            &test_input[0..480 * 288],
            test_y_output,
            &test_input[480 * 288..480 * 288 * 3 / 2],
            test_u_output,
            test_v_output,
        );
        assert_eq!(test_output, *test_expected_output);
    }

    #[test]
    fn test_i420_to_nv12() {
        let test_input = include_bytes!("test_data/puppets-480x270_20230825.i420.yuv");
        let test_expected_output = include_bytes!("test_data/puppets-480x270_20230825.nv12.yuv");

        let mut test_output = [0u8; 480 * 288 * 3 / 2];
        let (test_y_output, test_uv_output) = test_output.split_at_mut(480 * 288);
        i420_to_nv12(
            &test_input[0..(480 * 288)],
            test_y_output,
            &test_input[(480 * 288)..(480 * 288 * 5 / 4)],
            &test_input[(480 * 288 * 5 / 4)..(480 * 288 * 3 / 2)],
            test_uv_output,
        );
        assert_eq!(test_output, *test_expected_output);
    }
}