//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///*
////----------------------------------------------------------------------------
////  File Name            : impeg2_format_conv.s
////
////  Description          : This file has the YUV420P to YUV420SP (UV and VU
////                         interleaved) format conversion implementations
////                         for the decoder on neon platform.
////
////  Reference Document   :
////
////  Revision History     :
////      Date            Author                  Detail Description
////   ------------    ----------------    ----------------------------------
////   Jul 07, 2008     Naveen Kumar T                Created
////
////-------------------------------------------------------------------------
//*/

///*
//// ----------------------------------------------------------------------------
//// Include Files
//// ----------------------------------------------------------------------------
//*/
.set log2_16                    ,       4
.set log2_2                     ,       1

.text
.include "impeg2_neon_macros.s"
///*
//// ----------------------------------------------------------------------------
//// Struct/Union Types and Define
//// ----------------------------------------------------------------------------
//*/

///*
//// ----------------------------------------------------------------------------
//// Static Global Data section variables
//// ----------------------------------------------------------------------------
//*/
////--------------------------- NONE --------------------------------------------

///*
//// ----------------------------------------------------------------------------
//// Static Prototype Functions
//// ----------------------------------------------------------------------------
//*/
//// -------------------------- NONE --------------------------------------------

///*
//// ----------------------------------------------------------------------------
//// Exported functions
//// ----------------------------------------------------------------------------
//*/

///*****************************************************************************
//*                                                                            *
//*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8()           *
//*                                                                            *
//*  Description      : Converts an image from planar YUV420P to semi-planar   *
//*                     YUV420SP with chroma interleaved in U,V order.         *
//*                                                                            *
//*  Arguments        : x0        pu1_y                                        *
//*                     x1        pu1_u                                        *
//*                     x2        pu1_v                                        *
//*                     x3        pu1_dest_y                                   *
//*                     x4        pu1_dest_uv                                  *
//*                     x5        u2_height                                    *
//*                     x6        u2_width                                     *
//*                     x7        u2_stridey                                   *
//*                     sp, #80   u2_strideu                                   *
//*                     sp, #88   u2_stridev                                   *
//*                     sp, #96   u2_dest_stride_y                             *
//*                     sp, #104  u2_dest_stride_uv                            *
//*                     sp, #112  convert_uv_only                              *
//*                     (stack offsets are relative to sp after the saves)     *
//*                                                                            *
//*  Values Returned  : None                                                   *
//*                                                                            *
//*  Register Usage   : x7-x10, x14, x16, x20, v0, v1                          *
//*                                                                            *
//*  Stack Usage      : 80 Bytes                                               *
//*                                                                            *
//*  Interruptibility : Interruptible                                          *
//*                                                                            *
//*  Known Limitations                                                         *
//*     Width  : >= 16; a non-multiple-of-16 tail is handled by copying        *
//*              the last 16 pixels of the row again (overlapping).            *
//*     Height : assumed to be even.                                           *
//*                                                                            *
//*  Revision History :                                                        *
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
//*         07 06 2010   Varshita        Draft                                 *
//*         07 06 2010   Naveen Kr T     Completed                             *
//*                                                                            *
//*****************************************************************************/
    .global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8
impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8:

    //// Register roles inside this routine:
    ////   x0 / x3  - luma source / destination pointers
    ////   x1 / x2  - U / V source pointers
    ////   x4       - interleaved chroma destination pointer
    ////   x5       - image height (kept intact until the chroma section)
    ////   x6       - image width (halved in the chroma section)
    ////   x7 / x9  - per-row source increments (luma or U / V)
    ////   x8       - per-row destination increment
    ////   x10      - luma row counter
    ////   x16      - column counter, x20 - scratch for the row tail
    //// NOTE(review): x5, x6 and x7 are consumed as full 64-bit values without
    //// an explicit extension; this presumes the caller passes them
    //// zero-extended - confirm against the C prototype.

    //// Save registers. The stack-argument offsets used below rely on these
    //// two saves moving sp down by the documented 80 bytes in total.
    push_v_regs
    stp             x19, x20, [sp, #-16]!

    ldr             w14, [sp, #112]             //// Load convert_uv_only
    cmp             w14, #1
    beq             yuv420sp_uv_chroma          //// convert_uv_only == 1: skip the luma copy

    ///* Do the preprocessing before the main loops start */
    ldr             w8, [sp, #96]               //// Load u2_dest_stride_y from stack
    uxtw            x8, w8

    sub             x7, x7, x6                  //// Source increment      = stride - width
    sub             x8, x8, x6                  //// Destination increment = stride - width

    //// Count luma rows in a scratch register. The height in x5 must stay
    //// intact because the chroma section derives its row count from it;
    //// counting x5 itself down to zero here left only one chroma row converted.
    mov             x10, x5

yuv420sp_uv_row_loop_y:
    mov             x16, x6                     //// x16 = pixels left in this row

yuv420sp_uv_col_loop_y:
    prfm            pldl1keep, [x0, #128]
    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16
    sub             x16, x16, #16
    cmp             x16, #15
    bgt             yuv420sp_uv_col_loop_y      //// at least 16 more pixels to copy

    cmp             x16, #0
    beq             yuv420sp_uv_row_loop__y     //// width was a multiple of 16
    ////Row tail: fewer than 16 pixels are left. Step both pointers back so
    ////that one more full 16-byte copy ends exactly at the end of the row.
    ////Ex: if width is 162, the loop above copies 160 pixels; both pointers
    ////are moved back by 14 to pixel 146 and pixels 146..161 are copied.
    sub             x20, x16, #16
    neg             x16, x20                    //// x16 = 16 - remaining pixels
    sub             x0, x0, x16
    sub             x3, x3, x16

    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16

yuv420sp_uv_row_loop__y:
    add             x0, x0, x7                  //// next source row
    add             x3, x3, x8                  //// next destination row
    subs            x10, x10, #1
    bgt             yuv420sp_uv_row_loop_y

yuv420sp_uv_chroma:

    //// Each chroma plane is advanced with its own stride. (Previously the
    //// value at sp+88 was loaded under the name u2_strideu and applied to
    //// both the U and the V pointer.)
    ldr             w7, [sp, #80]               //// Load u2_strideu from stack
    sxtw            x7, w7

    ldr             w9, [sp, #88]               //// Load u2_stridev from stack
    sxtw            x9, w9

    ldr             w8, [sp, #104]              //// Load u2_dest_stride_uv from stack
    sxtw            x8, w8

    sub             x7, x7, x6, lsr #1          //// U source increment    = strideu - width/2
    sub             x9, x9, x6, lsr #1          //// V source increment    = stridev - width/2
    sub             x8, x8, x6                  //// Destination increment = dest_stride_uv - width

    lsr             x6, x6, #1                  //// chroma width  = width / 2
    lsr             x5, x5, #1                  //// chroma height = height / 2

yuv420sp_uv_row_loop_uv:
    mov             x16, x6                     //// x16 = chroma samples left in this row

yuv420sp_uv_col_loop_uv:
    prfm            pldl1keep, [x1, #128]
    prfm            pldl1keep, [x2, #128]

    ld1             {v0.8b}, [x1], #8           //// v0 = 8 U samples
    ld1             {v1.8b}, [x2], #8           //// v1 = 8 V samples
    st2             {v0.8b, v1.8b}, [x4], #16   //// store interleaved U0 V0 U1 V1 ...

    sub             x16, x16, #8
    cmp             x16, #7
    bgt             yuv420sp_uv_col_loop_uv     //// at least 8 more samples per plane

    cmp             x16, #0
    beq             yuv420sp_uv_row_loop__uv    //// chroma width was a multiple of 8
    ////Row tail: fewer than 8 samples per plane are left. Step the U and V
    ////pointers back so that one more 8-byte load from each ends exactly at
    ////the end of the row; the destination steps back twice as far because
    ////every sample pair occupies two bytes there.
    sub             x20, x16, #8
    neg             x16, x20                    //// x16 = 8 - remaining samples
    sub             x1, x1, x16
    sub             x2, x2, x16
    sub             x4, x4, x16, lsl #1

    ld1             {v0.8b}, [x1], #8
    ld1             {v1.8b}, [x2], #8
    st2             {v0.8b, v1.8b}, [x4], #16

yuv420sp_uv_row_loop__uv:
    add             x1, x1, x7                  //// next U row
    add             x2, x2, x9                  //// next V row
    add             x4, x4, x8                  //// next destination row
    subs            x5, x5, #1
    bgt             yuv420sp_uv_row_loop_uv

    ////Restore the saved registers and return
    ldp             x19, x20, [sp], #16
    pop_v_regs
    ret





///*****************************************************************************
//*                                                                            *
//*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8()           *
//*                                                                            *
//*  Description      : This function converts the image from YUV420P color    *
//*                     space to 420SP color space(VU interleaved).            *
//*                     This function is similar to above function             *
//*                     IMP4D_CXA8_YUV420toYUV420SP_VU with a difference in    *
//*                     VLD1.8 for chroma - order of registers is different    *
//*                                                                            *
//*  Arguments        : x0        pu1_y                                        *
//*                     x1        pu1_u                                        *
//*                     x2        pu1_v                                        *
//*                     x3        pu1_dest_y                                   *
//*                     x4        pu1_dest_uv                                  *
//*                     x5        u2_height                                    *
//*                     x6        u2_width                                     *
//*                     x7        u2_stridey                                   *
//*                     sp, #80   u2_strideu                                   *
//*                     sp, #88   u2_stridev                                   *
//*                     sp, #96   u2_dest_stride_y                             *
//*                     sp, #104  u2_dest_stride_uv                            *
//*                     sp, #112  convert_uv_only                              *
//*                                                                            *
//*  Values Returned  : None                                                   *
//*                                                                            *
//*  Register Usage   : x8, x14, x16, x20, v0, v1                              *
//*                                                                            *
//*  Stack Usage      : 80 Bytes                                               *
//*                                                                            *
//*  Interruptibility : Interruptible                                          *
//*                                                                            *
//*  Known Limitations                                                         *
//*     Assumptions: Image Width: Assumed to be multiple of 16 and             *
//*                  greater than or equal to 16                               *
//*                  Image Height: Assumed to be even.                         *
//*                                                                            *
//*  Revision History :                                                        *
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
//*         07 06 2010   Varshita        Draft                                 *
//*         07 06 2010   Naveen Kr T     Completed                             *
//*                                                                            *
//*****************************************************************************/

    .global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8
impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8:

    //// Copies the luma plane (unless convert_uv_only == 1) and interleaves
    //// the U and V planes into one chroma plane in V,U byte order.
    ////
    //// Arguments:
    ////   pu1_y,             - x0
    ////   pu1_u,             - x1
    ////   pu1_v,             - x2
    ////   pu1_dest_y,        - x3
    ////   pu1_dest_uv,       - x4
    ////   u2_height,         - x5
    ////   u2_width,          - x6
    ////   u2_stridey,        - x7
    ////   u2_strideu,        - sp, #80   (offsets relative to sp after the saves)
    ////   u2_stridev,        - sp, #88
    ////   u2_dest_stride_y,  - sp, #96
    ////   u2_dest_stride_uv, - sp, #104
    ////   convert_uv_only    - sp, #112
    ////
    //// Register roles inside this routine:
    ////   x5       - image height (kept intact until the chroma section)
    ////   x6       - image width (halved in the chroma section)
    ////   x7 / x9  - per-row source increments (luma or U / V)
    ////   x8       - per-row destination increment
    ////   x10      - luma row counter
    ////   x14      - convert_uv_only
    ////   x16      - column counter, x20 - scratch for the row tail
    ////   v0, v1   - pixel data
    //// NOTE(review): x5, x6 and x7 are consumed as full 64-bit values without
    //// an explicit extension; this presumes the caller passes them
    //// zero-extended - confirm against the C prototype.

    //// Save registers. The stack-argument offsets used below rely on these
    //// two saves moving sp down by the documented 80 bytes in total.
    push_v_regs
    stp             x19, x20, [sp, #-16]!

    ldr             w14, [sp, #112]             //// Load convert_uv_only
    cmp             w14, #1
    beq             yuv420sp_vu_chroma          //// convert_uv_only == 1: skip the luma copy

    ///* Do the preprocessing before the main loops start */
    ldr             w8, [sp, #96]               //// Load u2_dest_stride_y from stack
    uxtw            x8, w8

    sub             x7, x7, x6                  //// Source increment      = stride - width
    sub             x8, x8, x6                  //// Destination increment = stride - width

    //// Count luma rows in a scratch register. The height in x5 must stay
    //// intact because the chroma section derives its row count from it;
    //// counting x5 itself down to zero here left only one chroma row converted.
    mov             x10, x5

yuv420sp_vu_row_loop_y:
    mov             x16, x6                     //// x16 = pixels left in this row

yuv420sp_vu_col_loop_y:
    prfm            pldl1keep, [x0, #128]
    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16
    sub             x16, x16, #16
    cmp             x16, #15
    bgt             yuv420sp_vu_col_loop_y      //// at least 16 more pixels to copy

    cmp             x16, #0
    beq             yuv420sp_vu_row_loop__y     //// width was a multiple of 16
    ////Row tail: fewer than 16 pixels are left. Step both pointers back so
    ////that one more full 16-byte copy ends exactly at the end of the row.
    ////Ex: if width is 162, the loop above copies 160 pixels; both pointers
    ////are moved back by 14 to pixel 146 and pixels 146..161 are copied.
    sub             x20, x16, #16
    neg             x16, x20                    //// x16 = 16 - remaining pixels
    sub             x0, x0, x16
    sub             x3, x3, x16

    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16

yuv420sp_vu_row_loop__y:
    add             x0, x0, x7                  //// next source row
    add             x3, x3, x8                  //// next destination row
    subs            x10, x10, #1
    bgt             yuv420sp_vu_row_loop_y

yuv420sp_vu_chroma:

    //// Each chroma plane is advanced with its own stride. (Previously the
    //// U stride was applied to the V pointer as well and u2_stridev was
    //// never read.)
    ldr             w7, [sp, #80]               //// Load u2_strideu from stack
    sxtw            x7, w7

    ldr             w9, [sp, #88]               //// Load u2_stridev from stack
    sxtw            x9, w9

    ldr             w8, [sp, #104]              //// Load u2_dest_stride_uv from stack
    sxtw            x8, w8

    sub             x7, x7, x6, lsr #1          //// U source increment    = strideu - width/2
    sub             x9, x9, x6, lsr #1          //// V source increment    = stridev - width/2
    sub             x8, x8, x6                  //// Destination increment = dest_stride_uv - width

    lsr             x6, x6, #1                  //// chroma width  = width / 2
    lsr             x5, x5, #1                  //// chroma height = height / 2

yuv420sp_vu_row_loop_uv:
    mov             x16, x6                     //// x16 = chroma samples left in this row

yuv420sp_vu_col_loop_uv:
    prfm            pldl1keep, [x1, #128]
    prfm            pldl1keep, [x2, #128]

    ld1             {v1.8b}, [x1], #8           //// v1 = 8 U samples
    ld1             {v0.8b}, [x2], #8           //// v0 = 8 V samples
    st2             {v0.8b, v1.8b}, [x4], #16   //// store interleaved V0 U0 V1 U1 ...

    sub             x16, x16, #8
    cmp             x16, #7
    bgt             yuv420sp_vu_col_loop_uv     //// at least 8 more samples per plane

    cmp             x16, #0
    beq             yuv420sp_vu_row_loop__uv    //// chroma width was a multiple of 8
    ////Row tail: fewer than 8 samples per plane are left. Step the U and V
    ////pointers back so that one more 8-byte load from each ends exactly at
    ////the end of the row; the destination steps back twice as far because
    ////every sample pair occupies two bytes there.
    sub             x20, x16, #8
    neg             x16, x20                    //// x16 = 8 - remaining samples
    sub             x1, x1, x16
    sub             x2, x2, x16
    sub             x4, x4, x16, lsl #1

    ld1             {v1.8b}, [x1], #8
    ld1             {v0.8b}, [x2], #8
    st2             {v0.8b, v1.8b}, [x4], #16

yuv420sp_vu_row_loop__uv:
    add             x1, x1, x7                  //// next U row
    add             x2, x2, x9                  //// next V row
    add             x4, x4, x8                  //// next destination row
    subs            x5, x5, #1
    bgt             yuv420sp_vu_row_loop_uv

    ////Restore the saved registers and return
    ldp             x19, x20, [sp], #16
    pop_v_regs
    ret