@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@ *******************************************************************************
@ * @file
@ *  ihevc_padding_neon.s
@ *
@ * @brief
@ *  contains function definitions for padding
@ *
@ * @author
@ *  naveen sr
@ *
@ * @par list of functions:
@ *  - ihevc_pad_left_luma_a9q()
@ *  - ihevc_pad_left_chroma_a9q()
@ *  - ihevc_pad_right_luma_a9q()
@ *  - ihevc_pad_right_chroma_a9q()
@ *
@ * @remarks
@ *  syntax : GNU as, ARM/NEON, '@' starts a comment.
@ *  all four functions take (r0 = pu1_src, r1 = src_strd, r2 = ht,
@ *  r3 = pad_size), return nothing, save and restore r4-r11 and overwrite
@ *  r0, r2, q0-q3 and the condition flags.
@ *  every function stores exactly 80 bytes per row, independent of pad_size.
@ *
@ *******************************************************************************
@*/

@ Stores the 16-byte vector {dlo, dhi} five times at consecutive addresses
@ starting at [ptr], i.e. 5 * 16 = 80 bytes. ptr is advanced by 64 bytes (the
@ first four stores use post-increment, the last one does not).
.macro pad_store_80_bytes dlo, dhi, ptr
    vst1.8      {\dlo, \dhi}, [\ptr]!       @ bytes  0..15
    vst1.8      {\dlo, \dhi}, [\ptr]!       @ bytes 16..31
    vst1.8      {\dlo, \dhi}, [\ptr]!       @ bytes 32..47
    vst1.8      {\dlo, \dhi}, [\ptr]!       @ bytes 48..63
    vst1.8      {\dlo, \dhi}, [\ptr]        @ bytes 64..79
.endm

.text
.align 4

@/**
@*******************************************************************************
@*
@* @brief
@*  padding (luma block) at the left of a 2d array
@*
@* @par description:
@*  for every row, the byte at pu1_src (left column of the array) is replicated
@*  into the 80 bytes starting at (pu1_src - pad_size)
@*
@* @param[in] pu1_src
@*  uword8 pointer to the left-most byte of the first row
@*
@* @param[in] src_strd
@*  integer source stride, in bytes
@*
@* @param[in] ht
@*  integer height of the array (number of rows); nothing is written when
@*  ht <= 0
@*
@* @param[in] pad_size
@*  integer padding size in bytes. it is only used to locate the start of the
@*  pad area; 80 bytes are always stored per row, so callers are expected to
@*  pass 80
@*
@* @returns
@*  none
@*
@* @remarks
@*  rows are processed four at a time; the remaining (ht % 4) rows are
@*  processed one at a time
@*
@*******************************************************************************
@*/
@void ihevc_pad_left_luma(uword8 *pu1_src,
@                         word32 src_strd,
@                         word32 ht,
@                         word32 pad_size)
@**************variables vs registers*************************
@   r0      => *pu1_src (advanced by src_strd after every row)
@   r1      => src_strd
@   r2      => ht (rows still to be padded)
@   r3      => pad_size
@   r4-r7   => store pointers for rows 0..3 of the current group
@   r8-r11  => byte to replicate for rows 0..3 of the current group

    .globl      ihevc_pad_left_luma_a9q
    .type       ihevc_pad_left_luma_a9q, %function

ihevc_pad_left_luma_a9q:
    stmfd       sp!, {r4-r11, lr}           @ r4-r11 are callee-saved and used below

    cmp         r2, #4
    blt         tail_luma_left              @ fewer than 4 rows: skip the unrolled loop

loop_start_luma_left:
    sub         r4, r0, r3                  @ r4 = pu1_src - pad_size (row 0 pad start)

    ldrb        r8, [r0]                    @ left-most byte of row 0
    add         r0, r0, r1
    ldrb        r9, [r0]                    @ left-most byte of row 1
    add         r0, r0, r1
    ldrb        r10, [r0]                   @ left-most byte of row 2
    add         r0, r0, r1
    ldrb        r11, [r0]                   @ left-most byte of row 3
    add         r0, r0, r1                  @ r0 -> row 0 of the next group

    add         r5, r4, r1                  @ pad start of row 1
    add         r6, r5, r1                  @ pad start of row 2
    add         r7, r6, r1                  @ pad start of row 3

    vdup.u8     q0, r8                      @ 16 copies of each row's byte
    vdup.u8     q1, r9
    vdup.u8     q2, r10
    vdup.u8     q3, r11

    sub         r2, r2, #4                  @ four rows consumed

    pad_store_80_bytes d0, d1, r4
    pad_store_80_bytes d2, d3, r5
    pad_store_80_bytes d4, d5, r6
    pad_store_80_bytes d6, d7, r7

    cmp         r2, #4
    bge         loop_start_luma_left        @ at least one more full group of 4 rows

tail_luma_left:
    cmp         r2, #0
    ble         end_luma_left               @ no rows left (also covers ht <= 0)

loop_row_luma_left:
    ldrb        r8, [r0]                    @ left-most byte of this row
    sub         r4, r0, r3                  @ pad start of this row
    add         r0, r0, r1                  @ next row
    vdup.u8     q0, r8
    pad_store_80_bytes d0, d1, r4
    subs        r2, r2, #1
    bne         loop_row_luma_left

end_luma_left:
    ldmfd       sp!, {r4-r11, pc}           @ restore registers and return

    .size       ihevc_pad_left_luma_a9q, . - ihevc_pad_left_luma_a9q


@/**
@*******************************************************************************
@*
@* @brief
@*  padding (chroma block) at the left of a 2d array
@*
@* @par description:
@*  for every row, the 16-bit value at pu1_src (left-most pair of bytes,
@*  presumably one interleaved chroma sample pair) is replicated into the
@*  80 bytes starting at (pu1_src - pad_size)
@*
@* @param[in] pu1_src
@*  uword8 pointer to the left-most byte pair of the first row
@*
@* @param[in] src_strd
@*  integer source stride, in bytes
@*
@* @param[in] ht
@*  integer height of the array (number of rows); nothing is written when
@*  ht <= 0
@*
@* @param[in] pad_size
@*  integer padding size in bytes. it is only used to locate the start of the
@*  pad area; 80 bytes (40 pairs) are always stored per row, so callers are
@*  expected to pass 80
@*
@* @returns
@*  none
@*
@* @remarks
@*  rows are processed four at a time; the remaining (ht % 4) rows are
@*  processed one at a time
@*
@*******************************************************************************
@*/
@void ihevc_pad_left_chroma(uword8 *pu1_src,
@                           word32 src_strd,
@                           word32 ht,
@                           word32 pad_size)
@**************variables vs registers*************************
@   r0      => *pu1_src (advanced by src_strd after every row)
@   r1      => src_strd
@   r2      => ht (rows still to be padded)
@   r3      => pad_size
@   r4-r7   => store pointers for rows 0..3 of the current group
@   r8-r11  => 16-bit pair to replicate for rows 0..3 of the current group

    .globl      ihevc_pad_left_chroma_a9q
    .type       ihevc_pad_left_chroma_a9q, %function

ihevc_pad_left_chroma_a9q:
    stmfd       sp!, {r4-r11, lr}           @ r4-r11 are callee-saved and used below

    cmp         r2, #4
    blt         tail_chroma_left            @ fewer than 4 rows: skip the unrolled loop

loop_start_chroma_left:
    sub         r4, r0, r3                  @ r4 = pu1_src - pad_size (row 0 pad start)

    ldrh        r8, [r0]                    @ left-most byte pair of row 0
    add         r0, r0, r1
    ldrh        r9, [r0]                    @ left-most byte pair of row 1
    add         r0, r0, r1
    ldrh        r10, [r0]                   @ left-most byte pair of row 2
    add         r0, r0, r1
    ldrh        r11, [r0]                   @ left-most byte pair of row 3
    add         r0, r0, r1                  @ r0 -> row 0 of the next group

    add         r5, r4, r1                  @ pad start of row 1
    add         r6, r5, r1                  @ pad start of row 2
    add         r7, r6, r1                  @ pad start of row 3

    vdup.u16    q0, r8                      @ 8 copies of each row's byte pair
    vdup.u16    q1, r9
    vdup.u16    q2, r10
    vdup.u16    q3, r11

    sub         r2, r2, #4                  @ four rows consumed

    pad_store_80_bytes d0, d1, r4
    pad_store_80_bytes d2, d3, r5
    pad_store_80_bytes d4, d5, r6
    pad_store_80_bytes d6, d7, r7

    cmp         r2, #4
    bge         loop_start_chroma_left      @ at least one more full group of 4 rows

tail_chroma_left:
    cmp         r2, #0
    ble         end_chroma_left             @ no rows left (also covers ht <= 0)

loop_row_chroma_left:
    ldrh        r8, [r0]                    @ left-most byte pair of this row
    sub         r4, r0, r3                  @ pad start of this row
    add         r0, r0, r1                  @ next row
    vdup.u16    q0, r8
    pad_store_80_bytes d0, d1, r4
    subs        r2, r2, #1
    bne         loop_row_chroma_left

end_chroma_left:
    ldmfd       sp!, {r4-r11, pc}           @ restore registers and return

    .size       ihevc_pad_left_chroma_a9q, . - ihevc_pad_left_chroma_a9q


@/**
@*******************************************************************************
@*
@* @brief
@*  padding (luma block) at the right of a 2d array
@*
@* @par description:
@*  for every row, the byte at (pu1_src - 1) (right column of the array) is
@*  replicated into the 80 bytes starting at pu1_src
@*
@* @param[in] pu1_src
@*  uword8 pointer to the first byte to be padded in the first row, i.e. the
@*  byte just after the right-most byte of the row
@*
@* @param[in] src_strd
@*  integer source stride, in bytes
@*
@* @param[in] ht
@*  integer height of the array (number of rows); nothing is written when
@*  ht <= 0
@*
@* @param[in] pad_size
@*  integer padding size in bytes. it is not read by this routine; 80 bytes
@*  are always stored per row, so callers are expected to pass 80
@*
@* @returns
@*  none
@*
@* @remarks
@*  rows are processed four at a time; the remaining (ht % 4) rows are
@*  processed one at a time
@*
@*******************************************************************************
@*/
@void ihevc_pad_right_luma(uword8 *pu1_src,
@                          word32 src_strd,
@                          word32 ht,
@                          word32 pad_size)
@
@ c reference:
@   for each row in [0, ht):
@       memset(pu1_src, *(pu1_src - 1), pad_size)
@       pu1_src += src_strd
@
@**************variables vs registers*************************
@   r0      => *pu1_src (advanced by src_strd after every row)
@   r1      => src_strd
@   r2      => ht (rows still to be padded)
@   r3      => pad_size (unused)
@   r4-r7   => store pointers for rows 0..3 of the current group
@   r8-r11  => byte to replicate for rows 0..3 of the current group

    .globl      ihevc_pad_right_luma_a9q
    .type       ihevc_pad_right_luma_a9q, %function

ihevc_pad_right_luma_a9q:
    stmfd       sp!, {r4-r11, lr}           @ r4-r11 are callee-saved and used below

    cmp         r2, #4
    blt         tail_luma_right             @ fewer than 4 rows: skip the unrolled loop

loop_start_luma_right:
    mov         r4, r0                      @ r4 = pad start of row 0

    ldrb        r8, [r0, #-1]               @ right-most byte of row 0
    add         r0, r0, r1
    ldrb        r9, [r0, #-1]               @ right-most byte of row 1
    add         r0, r0, r1
    ldrb        r10, [r0, #-1]              @ right-most byte of row 2
    add         r0, r0, r1
    ldrb        r11, [r0, #-1]              @ right-most byte of row 3
    add         r0, r0, r1                  @ r0 -> row 0 of the next group

    add         r5, r4, r1                  @ pad start of row 1
    add         r6, r5, r1                  @ pad start of row 2
    add         r7, r6, r1                  @ pad start of row 3

    vdup.u8     q0, r8                      @ 16 copies of each row's byte
    vdup.u8     q1, r9
    vdup.u8     q2, r10
    vdup.u8     q3, r11

    sub         r2, r2, #4                  @ four rows consumed

    pad_store_80_bytes d0, d1, r4
    pad_store_80_bytes d2, d3, r5
    pad_store_80_bytes d4, d5, r6
    pad_store_80_bytes d6, d7, r7

    cmp         r2, #4
    bge         loop_start_luma_right       @ at least one more full group of 4 rows

tail_luma_right:
    cmp         r2, #0
    ble         end_luma_right              @ no rows left (also covers ht <= 0)

loop_row_luma_right:
    ldrb        r8, [r0, #-1]               @ right-most byte of this row
    mov         r4, r0                      @ pad start of this row
    add         r0, r0, r1                  @ next row
    vdup.u8     q0, r8
    pad_store_80_bytes d0, d1, r4
    subs        r2, r2, #1
    bne         loop_row_luma_right

end_luma_right:
    ldmfd       sp!, {r4-r11, pc}           @ restore registers and return

    .size       ihevc_pad_right_luma_a9q, . - ihevc_pad_right_luma_a9q


@/**
@*******************************************************************************
@*
@* @brief
@*  padding (chroma block) at the right of a 2d array
@*
@* @par description:
@*  for every row, the 16-bit value at (pu1_src - 2) (right-most pair of bytes,
@*  presumably one interleaved chroma sample pair) is replicated into the
@*  80 bytes starting at pu1_src
@*
@* @param[in] pu1_src
@*  uword8 pointer to the first byte to be padded in the first row, i.e. the
@*  byte just after the right-most byte pair of the row
@*
@* @param[in] src_strd
@*  integer source stride, in bytes
@*
@* @param[in] ht
@*  integer height of the array (number of rows); nothing is written when
@*  ht <= 0
@*
@* @param[in] pad_size
@*  integer padding size in bytes. it is not read by this routine; 80 bytes
@*  (40 pairs) are always stored per row, so callers are expected to pass 80
@*
@* @returns
@*  none
@*
@* @remarks
@*  rows are processed four at a time; the remaining (ht % 4) rows are
@*  processed one at a time
@*
@*******************************************************************************
@*/
@void ihevc_pad_right_chroma(uword8 *pu1_src,
@                            word32 src_strd,
@                            word32 ht,
@                            word32 pad_size)
@**************variables vs registers*************************
@   r0      => *pu1_src (advanced by src_strd after every row)
@   r1      => src_strd
@   r2      => ht (rows still to be padded)
@   r3      => pad_size (unused)
@   r4-r7   => store pointers for rows 0..3 of the current group
@   r8-r11  => 16-bit pair to replicate for rows 0..3 of the current group

    .globl      ihevc_pad_right_chroma_a9q
    .type       ihevc_pad_right_chroma_a9q, %function

ihevc_pad_right_chroma_a9q:
    stmfd       sp!, {r4-r11, lr}           @ r4-r11 are callee-saved and used below

    cmp         r2, #4
    blt         tail_chroma_right           @ fewer than 4 rows: skip the unrolled loop

loop_start_chroma_right:
    mov         r4, r0                      @ r4 = pad start of row 0

    ldrh        r8, [r0, #-2]               @ right-most byte pair of row 0
    add         r0, r0, r1
    ldrh        r9, [r0, #-2]               @ right-most byte pair of row 1
    add         r0, r0, r1
    ldrh        r10, [r0, #-2]              @ right-most byte pair of row 2
    add         r0, r0, r1
    ldrh        r11, [r0, #-2]              @ right-most byte pair of row 3
    add         r0, r0, r1                  @ r0 -> row 0 of the next group

    add         r5, r4, r1                  @ pad start of row 1
    add         r6, r5, r1                  @ pad start of row 2
    add         r7, r6, r1                  @ pad start of row 3

    vdup.u16    q0, r8                      @ 8 copies of each row's byte pair
    vdup.u16    q1, r9
    vdup.u16    q2, r10
    vdup.u16    q3, r11

    sub         r2, r2, #4                  @ four rows consumed

    pad_store_80_bytes d0, d1, r4
    pad_store_80_bytes d2, d3, r5
    pad_store_80_bytes d4, d5, r6
    pad_store_80_bytes d6, d7, r7

    cmp         r2, #4
    bge         loop_start_chroma_right     @ at least one more full group of 4 rows

tail_chroma_right:
    cmp         r2, #0
    ble         end_chroma_right            @ no rows left (also covers ht <= 0)

loop_row_chroma_right:
    ldrh        r8, [r0, #-2]               @ right-most byte pair of this row
    mov         r4, r0                      @ pad start of this row
    add         r0, r0, r1                  @ next row
    vdup.u16    q0, r8
    pad_store_80_bytes d0, d1, r4
    subs        r2, r2, #1
    bne         loop_row_chroma_right

end_chroma_right:
    ldmfd       sp!, {r4-r11, pc}           @ restore registers and return

    .size       ihevc_pad_right_chroma_a9q, . - ihevc_pad_right_chroma_a9q