@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_default_weighted_pred_a9q.s
@*
@* @brief
@*  Contains function definitions for default weighted prediction.
@*
@* @author
@*  Kaushik Senthoor R
@*
@* @par List of Functions:
@*
@*  - ih264_default_weighted_pred_luma_a9q()
@*  - ih264_default_weighted_pred_chroma_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@*******************************************************************************
@* @function
@*  ih264_default_weighted_pred_luma_a9q()
@*
@* @brief
@*  This routine performs the default weighted prediction as described in sec
@*  8.4.2.3.1 titled "Default weighted sample prediction process" for luma.
@*
@* @par Description:
@*  This function gets two ht x wd blocks, calculates their rounded-average and
@*  stores it in the destination block.
@*
@* @param[in] pu1_src1:
@*  UWORD8 Pointer to the buffer containing the first input block.
@*
@* @param[in] pu1_src2:
@*  UWORD8 Pointer to the buffer containing the second input block.
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination where the output block is stored.
@*
@* @param[in] src_strd1
@*  Stride of the first input buffer
@*
@* @param[in] src_strd2
@*  Stride of the second input buffer
@*
@* @param[in] dst_strd
@*  Stride of the destination buffer
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  None
@*
@* @remarks
@*  (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
@*
@*******************************************************************************
@*
@void ih264_default_weighted_pred_luma_a9q(UWORD8 *pu1_src1,
@                                          UWORD8 *pu1_src2,
@                                          UWORD8 *pu1_dst,
@                                          WORD32 src_strd1,
@                                          WORD32 src_strd2,
@                                          WORD32 dst_strd,
@                                          WORD32 ht,
@                                          WORD32 wd)
@
@**************Variables Vs Registers*****************************************
@   r0      => pu1_src1
@   r1      => pu1_src2
@   r2      => pu1_dst
@   r3      => src_strd1
@   [sp]    => src_strd2 (r4)
@   [sp+4]  => dst_strd  (r5)
@   [sp+8]  => ht        (r6)
@   [sp+12] => wd        (r7)
@
@   Dispatch: wd == 16 -> loop_16 (8 rows per iteration)
@             wd == 8  -> loop_8  (4 rows per iteration)
@             any other wd falls through to loop_4 (4 bytes wide,
@             4 rows per iteration)
@   Each loop body runs at least once and repeats while the remaining
@   height is still greater than zero after the decrement.
@
@   NEON usage: loop_4 and loop_8 touch d0-d7 only. loop_16 uses q0-q15,
@   which includes d8-d15, so d8-d15 are saved and restored on that path
@   alone.
@

        .text
        .p2align 2

        .global ih264_default_weighted_pred_luma_a9q

ih264_default_weighted_pred_luma_a9q:

    stmfd         sp!, {r4-r7, r14}         @save r4-r7 and the return address (20 bytes)
    ldr           r7, [sp, #32]             @Load wd        (20 bytes pushed + 12)
    ldr           r4, [sp, #20]             @Load src_strd2 (20 bytes pushed + 0)
    ldr           r5, [sp, #24]             @Load dst_strd  (20 bytes pushed + 4)
    cmp           r7, #16
    ldr           r6, [sp, #28]             @Load ht        (ldr leaves the flags from cmp intact)
    beq           loop_16_entry             @branch if wd is 16
    cmp           r7, #8
    beq           loop_8                    @branch if wd is 8

loop_4:                                     @each iteration processes four rows
                                            @rows 1-2 are packed in d0/d2, rows 3-4 in d1/d3

    vld1.32       d0[0], [r0], r3           @load row 1 in source 1
    vld1.32       d0[1], [r0], r3           @load row 2 in source 1
    vld1.32       d2[0], [r1], r4           @load row 1 in source 2
    vld1.32       d2[1], [r1], r4           @load row 2 in source 2

    vld1.32       d1[0], [r0], r3           @load row 3 in source 1
    vld1.32       d1[1], [r0], r3           @load row 4 in source 1
    vrhadd.u8     d0, d0, d2                @rounded average of rows 1-2
    vld1.32       d3[0], [r1], r4           @load row 3 in source 2
    vld1.32       d3[1], [r1], r4           @load row 4 in source 2

    subs          r6, r6, #4                @decrement ht by 4
    vst1.32       d0[0], [r2], r5           @store row 1 in destination
    vst1.32       d0[1], [r2], r5           @store row 2 in destination
    vrhadd.u8     d1, d1, d3                @rounded average of rows 3-4
    vst1.32       d1[0], [r2], r5           @store row 3 in destination
    vst1.32       d1[1], [r2], r5           @store row 4 in destination

    bgt           loop_4                    @if greater than 0 repeat the loop again

    b             end_loops

loop_8:                                     @each iteration processes four rows

    vld1.8        d0, [r0], r3              @load row 1 in source 1
    vld1.8        d4, [r1], r4              @load row 1 in source 2
    vld1.8        d1, [r0], r3              @load row 2 in source 1
    vld1.8        d5, [r1], r4              @load row 2 in source 2
    vld1.8        d2, [r0], r3              @load row 3 in source 1
    vrhadd.u8     q0, q0, q2                @rounded average of rows 1-2 (q0 = d0:d1, q2 = d4:d5)
    vld1.8        d6, [r1], r4              @load row 3 in source 2
    vld1.8        d3, [r0], r3              @load row 4 in source 1
    vrhadd.u8     d2, d2, d6                @rounded average of row 3
    vld1.8        d7, [r1], r4              @load row 4 in source 2

    subs          r6, r6, #4                @decrement ht by 4
    vst1.8        d0, [r2], r5              @store row 1 in destination
    vrhadd.u8     d3, d3, d7                @rounded average of row 4
    vst1.8        d1, [r2], r5              @store row 2 in destination
    vst1.8        d2, [r2], r5              @store row 3 in destination
    vst1.8        d3, [r2], r5              @store row 4 in destination

    bgt           loop_8                    @if greater than 0 repeat the loop again

    b             end_loops

loop_16_entry:

    vpush         {d8-d15}                  @q4-q7 are used below and must be preserved for the caller

loop_16:                                    @each iteration processes eight rows

    vld1.8        {q0}, [r0], r3            @load row 1 in source 1
    vld1.8        {q8}, [r1], r4            @load row 1 in source 2
    vld1.8        {q1}, [r0], r3            @load row 2 in source 1
    vld1.8        {q9}, [r1], r4            @load row 2 in source 2
    vrhadd.u8     q0, q0, q8                @rounded average of row 1
    vld1.8        {q2}, [r0], r3            @load row 3 in source 1
    vld1.8        {q10}, [r1], r4           @load row 3 in source 2
    vrhadd.u8     q1, q1, q9                @rounded average of row 2
    vld1.8        {q3}, [r0], r3            @load row 4 in source 1
    vld1.8        {q11}, [r1], r4           @load row 4 in source 2
    vrhadd.u8     q2, q2, q10               @rounded average of row 3
    vld1.8        {q4}, [r0], r3            @load row 5 in source 1
    vld1.8        {q12}, [r1], r4           @load row 5 in source 2
    vrhadd.u8     q3, q3, q11               @rounded average of row 4
    vld1.8        {q5}, [r0], r3            @load row 6 in source 1
    vld1.8        {q13}, [r1], r4           @load row 6 in source 2
    vrhadd.u8     q4, q4, q12               @rounded average of row 5
    vld1.8        {q6}, [r0], r3            @load row 7 in source 1
    vld1.8        {q14}, [r1], r4           @load row 7 in source 2
    vrhadd.u8     q5, q5, q13               @rounded average of row 6
    vld1.8        {q7}, [r0], r3            @load row 8 in source 1
    vld1.8        {q15}, [r1], r4           @load row 8 in source 2

    vrhadd.u8     q6, q6, q14               @rounded average of row 7
    vst1.8        {q0}, [r2], r5            @store row 1 in destination
    vst1.8        {q1}, [r2], r5            @store row 2 in destination
    vrhadd.u8     q7, q7, q15               @rounded average of row 8
    vst1.8        {q2}, [r2], r5            @store row 3 in destination
    vst1.8        {q3}, [r2], r5            @store row 4 in destination
    subs          r6, r6, #8                @decrement ht by 8
    vst1.8        {q4}, [r2], r5            @store row 5 in destination
    vst1.8        {q5}, [r2], r5            @store row 6 in destination
    vst1.8        {q6}, [r2], r5            @store row 7 in destination
    vst1.8        {q7}, [r2], r5            @store row 8 in destination

    bgt           loop_16                   @if greater than 0 repeat the loop again

    vpop          {d8-d15}                  @restore the callee-saved NEON registers

end_loops:

    ldmfd         sp!, {r4-r7, r15}         @restore r4-r7 and return (saved r14 is loaded into pc)


@*******************************************************************************
@* @function
@*  ih264_default_weighted_pred_chroma_a9q()
@*
@* @brief
@*  This routine performs the default weighted prediction as described in sec
@*  8.4.2.3.1 titled "Default weighted sample prediction process" for chroma.
@*
@* @par Description:
@*  This function gets two ht x wd blocks, calculates their rounded-average and
@*  stores it in the destination block for U and V.
@*
@* @param[in] pu1_src1:
@*  UWORD8 Pointer to the buffer containing the first input block.
@*
@* @param[in] pu1_src2:
@*  UWORD8 Pointer to the buffer containing the second input block.
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination where the output block is stored.
@*
@* @param[in] src_strd1
@*  Stride of the first input buffer
@*
@* @param[in] src_strd2
@*  Stride of the second input buffer
@*
@* @param[in] dst_strd
@*  Stride of the destination buffer
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  None
@*
@* @remarks
@*  (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
@*
@*******************************************************************************
@*
@void ih264_default_weighted_pred_chroma_a9q(UWORD8 *pu1_src1,
@                                            UWORD8 *pu1_src2,
@                                            UWORD8 *pu1_dst,
@                                            WORD32 src_strd1,
@                                            WORD32 src_strd2,
@                                            WORD32 dst_strd,
@                                            WORD32 ht,
@                                            WORD32 wd)
@
@**************Variables Vs Registers*****************************************
@   r0      => pu1_src1
@   r1      => pu1_src2
@   r2      => pu1_dst
@   r3      => src_strd1
@   [sp]    => src_strd2 (r4)
@   [sp+4]  => dst_strd  (r5)
@   [sp+8]  => ht        (r6)
@   [sp+12] => wd        (r7)
@
@   Dispatch: wd == 8 -> loop_8_uv (16 bytes per row, 4 rows per iteration)
@             wd == 4 -> loop_4_uv ( 8 bytes per row, 2 rows per iteration)
@             any other wd falls through to loop_2_uv (4 bytes per row,
@             2 rows per iteration)
@   NOTE(review): every path moves 2*wd bytes per row, which is consistent
@   with interleaved U/V samples - confirm against the callers.
@
@   NEON usage: q0-q3 and q8-q11 only. None of d8-d15 is written, so no
@   NEON register has to be saved or restored.
@

        .global ih264_default_weighted_pred_chroma_a9q

ih264_default_weighted_pred_chroma_a9q:

    stmfd         sp!, {r4-r7, r14}         @save r4-r7 and the return address (20 bytes)
    ldr           r7, [sp, #32]             @Load wd        (20 bytes pushed + 12)
    ldr           r4, [sp, #20]             @Load src_strd2 (20 bytes pushed + 0)
    ldr           r5, [sp, #24]             @Load dst_strd  (20 bytes pushed + 4)
    cmp           r7, #8
    ldr           r6, [sp, #28]             @Load ht        (ldr leaves the flags from cmp intact)
    beq           loop_8_uv                 @branch if wd is 8
    cmp           r7, #4
    beq           loop_4_uv                 @branch if wd is 4

loop_2_uv:                                  @each iteration processes two rows
                                            @both rows of a source are packed into one d register

    vld1.32       d0[0], [r0], r3           @load row 1 in source 1
    vld1.32       d0[1], [r0], r3           @load row 2 in source 1

    vld1.32       d1[0], [r1], r4           @load row 1 in source 2
    vld1.32       d1[1], [r1], r4           @load row 2 in source 2

    vrhadd.u8     d0, d0, d1                @rounded average of rows 1-2

    subs          r6, r6, #2                @decrement ht by 2
    vst1.32       d0[0], [r2], r5           @store row 1 in destination
    vst1.32       d0[1], [r2], r5           @store row 2 in destination

    bgt           loop_2_uv                 @if greater than 0 repeat the loop again

    b             end_loops_uv

loop_4_uv:                                  @each iteration processes two rows

    vld1.8        d0, [r0], r3              @load row 1 in source 1
    vld1.8        d2, [r1], r4              @load row 1 in source 2
    vld1.8        d1, [r0], r3              @load row 2 in source 1
    vrhadd.u8     d0, d0, d2                @rounded average of row 1
    vld1.8        d3, [r1], r4              @load row 2 in source 2

    vrhadd.u8     d1, d1, d3                @rounded average of row 2
    vst1.8        d0, [r2], r5              @store row 1 in destination
    subs          r6, r6, #2                @decrement ht by 2
    vst1.8        d1, [r2], r5              @store row 2 in destination

    bgt           loop_4_uv                 @if greater than 0 repeat the loop again

    b             end_loops_uv

loop_8_uv:                                  @each iteration processes four rows
                                            @source 2 rows live in q8-q11 (caller-saved)

    vld1.8        {q0}, [r0], r3            @load row 1 in source 1
    vld1.8        {q8}, [r1], r4            @load row 1 in source 2
    vld1.8        {q1}, [r0], r3            @load row 2 in source 1
    vrhadd.u8     q0, q0, q8                @rounded average of row 1
    vld1.8        {q9}, [r1], r4            @load row 2 in source 2
    vld1.8        {q2}, [r0], r3            @load row 3 in source 1
    vrhadd.u8     q1, q1, q9                @rounded average of row 2
    vld1.8        {q10}, [r1], r4           @load row 3 in source 2
    vld1.8        {q3}, [r0], r3            @load row 4 in source 1
    vrhadd.u8     q2, q2, q10               @rounded average of row 3
    vld1.8        {q11}, [r1], r4           @load row 4 in source 2

    vst1.8        {q0}, [r2], r5            @store row 1 in destination
    vrhadd.u8     q3, q3, q11               @rounded average of row 4
    vst1.8        {q1}, [r2], r5            @store row 2 in destination
    subs          r6, r6, #4                @decrement ht by 4
    vst1.8        {q2}, [r2], r5            @store row 3 in destination
    vst1.8        {q3}, [r2], r5            @store row 4 in destination

    bgt           loop_8_uv                 @if greater than 0 repeat the loop again

end_loops_uv:

    ldmfd         sp!, {r4-r7, r15}         @restore r4-r7 and return (saved r14 is loaded into pc)