///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_inter_pred_chroma_vert_neon_w16inp_neon.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are coded using neon intrinsics and can be compiled using
//*  rvct
//*
//* //author
//*  yogeswaran rs / parthiban
//*
//* //par list of functions:
//*
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
///**
//*******************************************************************************
//*
//* //brief
//*  chroma interprediction filter for 16bit vertical input.
//*
//* //par description:
//*  applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
//*  the elements pointed to by 'pi2_src' and writes to the location pointed
//*  to by 'pu1_dst'. the input is 16 bits. the filter output is downshifted
//*  by 12 and clipped to lie between 0 and 255.
//*
//*  assumptions : the function is optimized considering the fact that width
//*  and height are multiples of 2.
//*
//* //param[in] pi2_src
//*  word16 pointer to the source
//*
//* //param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_inter_pred_chroma_vert_w16inp(word16 *pi2_src,
//                                         uword8 *pu1_dst,
//                                         word32 src_strd,
//                                         word32 dst_strd,
//                                         word8 *pi1_coeff,
//                                         word32 ht,
//                                         word32 wd)

//**************variables vs registers*****************************************
// Arguments on entry (AArch64 procedure call standard):
//  x0 => *pi2_src      16-bit source samples
//  x1 => *pu1_dst      8-bit destination samples
//  w2 => src_strd      in 16-bit elements; widened and doubled to bytes below
//  w3 => dst_strd      in bytes
//  x4 => *pi1_coeff    8-bit filter coefficients; the first four are used
//  w5 => ht
//  w6 => wd            each row reads 4*wd source bytes, writes 2*wd dst bytes
//
// Working registers:
//  v16-v19 => coefficients 0..3, sign-extended to 16 bits and broadcast
//  x4      => source read pointer, starts one row above pi2_src
//  x0      => per-column source row walker
//  x20     => scratch (callee-saved, hence the stp/ldp pair)
//
// Result per sample: four-tap sum in 32 bits, saturating narrow with >>6,
// then rounding >>6 with unsigned saturation to 8 bits (12 bits in total).
//
// Path selection: wd & 3 != 0, or ht & 3 != 0 -> two rows per pass
//                 otherwise                   -> pipelined 4x4 blocks

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_vert_w16inp_av8

.type ihevc_inter_pred_chroma_vert_w16inp_av8, %function

ihevc_inter_pred_chroma_vert_w16inp_av8:

    stp         x19, x20, [sp, #-16]!       // x20 is scratch below; pushed as a pair so sp stays 16-byte aligned

    // The four word32 arguments arrive in w registers; the upper halves of the
    // matching x registers are unspecified, so widen them before any 64-bit
    // address or counter arithmetic uses them.
    sxtw        x2, w2                      // src_strd
    sxtw        x3, w3                      // dst_strd
    sxtw        x5, w5                      // ht
    sxtw        x6, w6                      // wd

    lsl         x2, x2, #1                  // src_strd in bytes (16-bit source elements)
    ld1         {v0.8b}, [x4]               // reads 8 coefficient bytes, only lanes 0..3 are used
                                            // NOTE(review): over-reads if the array holds just 4 bytes - confirm with callers
    sub         x4, x0, x2                  // x4 = pi2_src - one row (first filter tap)
    sxtl        v0.8h, v0.8b                // widen coefficients to signed 16 bit

    tst         x6, #3                      // wd a multiple of 4 ?
    dup         v16.4h, v0.h[0]             // coeff_0
    dup         v17.4h, v0.h[1]             // coeff_1
    dup         v18.4h, v0.h[2]             // coeff_2
    dup         v19.4h, v0.h[3]             // coeff_3

    bne         core_loop_ht_2              // no -> two-row path (dup does not touch the flags)

    tst         x5, #3                      // ht a multiple of 4 ?
    beq         core_loop_ht_4              // yes -> 4x4 pipelined path, else fall through

// ---------------------------------------------------------------------------
// Two-row path: each inner pass produces 4 samples on two output rows.
// ---------------------------------------------------------------------------
core_loop_ht_2:
    lsl         x7, x2, #1                  // two source rows in bytes
    lsl         x12, x3, #1                 // 2*dst_strd
    lsl         x9, x6, #2                  // 4*wd = source bytes consumed per row
    sub         x6, x12, x6, lsl #1         // dst step to next row pair: 2*dst_strd - 2*wd
    sub         x8, x7, x9                  // src step to next row pair: two rows - 4*wd
    mov         x12, x9                     // x12 = source bytes left in this row

inner_loop_ht_2:
    add         x0, x4, x2                  // x0 = row below the first tap
    ld1         {v0.4h}, [x4], #8           // row -1, x4 moves on 4 samples
    smull       v0.4s, v0.4h, v16.4h        // out0  = row-1 * coeff_0
    subs        x12, x12, #8                // 8 source bytes done; flags consumed by the bgt below
    ld1         {v2.4h}, [x0], x2           // row 0
    smull       v7.4s, v2.4h, v16.4h        // out1  = row0 * coeff_0
    ld1         {v3.4h}, [x0], x2           // row 1
    smlal       v0.4s, v2.4h, v17.4h        // out0 += row0 * coeff_1
    ld1         {v6.4h}, [x0], x2           // row 2
    smlal       v7.4s, v3.4h, v17.4h        // out1 += row1 * coeff_1
    ld1         {v2.4h}, [x0]               // row 3 (v2 reused)
    add         x7, x1, x3                  // x7 = second output row
    smlal       v0.4s, v3.4h, v18.4h        // out0 += row1 * coeff_2
    smlal       v7.4s, v6.4h, v18.4h        // out1 += row2 * coeff_2
    smlal       v0.4s, v6.4h, v19.4h        // out0 += row2 * coeff_3
    smlal       v7.4s, v2.4h, v19.4h        // out1 += row3 * coeff_3
    sqshrn      v0.4h, v0.4s, #6            // saturating narrow, >> 6
    sqshrn      v30.4h, v7.4s, #6           // saturating narrow, >> 6
    sqrshrun    v0.8b, v0.8h, #6            // rounding >> 6, clip to unsigned 8 bit
    sqrshrun    v30.8b, v30.8h, #6          // rounding >> 6, clip to unsigned 8 bit
    st1         {v0.s}[0], [x1], #4         // 4 bytes to output row 0
    st1         {v30.s}[0], [x7]            // 4 bytes to output row 1
    bgt         inner_loop_ht_2             // more columns in this row pair

    // row pair finished
    subs        x5, x5, #2                  // two rows done
    add         x1, x1, x6                  // pu1_dst += 2*dst_strd - 2*wd
    mov         x12, x9                     // reload per-row byte counter (4*wd)
    add         x4, x4, x8                  // src += two rows - 4*wd
    bgt         inner_loop_ht_2             // next row pair
    b           end_loops

// ---------------------------------------------------------------------------
// 4x4 path: software pipelined. Each step finishes a block of 4 samples x 4
// rows. Accumulators: v30/v28/v26/v24 = output rows 0/1/2/3 of the block.
// Source rows: v0 = row -1, v1..v6 = rows 0..5.
// ---------------------------------------------------------------------------
core_loop_ht_4:
    lsl         x7, x2, #2                  // four source rows in bytes
    lsl         x12, x3, #2                 // 4*dst_strd
    lsr         x11, x6, #1                 // wd / 2
    sub         x14, x12, x6, lsl #1        // dst step to next 4-row band: 4*dst_strd - 2*wd
    sub         x8, x7, x6, lsl #2          // src step to next 4-row band: four rows - 4*wd
    mul         x12, x5, x11                // ht * wd/2 = 4 * number of blocks
    sub         x12, x12, #4                // keep one block back for the epilog
    lsl         x11, x6, #1                 // x11 = 2*wd samples left in this band

prolog:
    add         x0, x4, x2                  // x0 = row below the first tap
    ld1         {v0.4h}, [x4], #8           // row -1
    ld1         {v1.4h}, [x0], x2           // row 0
    subs        x11, x11, #4                // 4 samples taken; le => band exhausted
    ld1         {v2.4h}, [x0], x2           // row 1
    smull       v30.4s, v0.4h, v16.4h       // out0  = row-1 * coeff_0
    ld1         {v3.4h}, [x0], x2           // row 2
    smlal       v30.4s, v1.4h, v17.4h
    smlal       v30.4s, v2.4h, v18.4h
    add         x9, x1, x3                  // x9 = output row 1 of this block
    smlal       v30.4s, v3.4h, v19.4h
    ld1         {v4.4h}, [x0], x2           // row 3

    smull       v28.4s, v1.4h, v16.4h       // out1  = row0 * coeff_0
    add         x20, x4, x8
    csel        x4, x20, x4, le             // band exhausted -> src to next band
    smlal       v28.4s, v2.4h, v17.4h
    ld1         {v5.4h}, [x0], x2           // row 4
    smlal       v28.4s, v3.4h, v18.4h
    ld1         {v6.4h}, [x0], x2           // row 5
    smlal       v28.4s, v4.4h, v19.4h
    lsl         x20, x6, #1
    csel        x11, x20, x11, le           // band exhausted -> reload 2*wd

    sqshrn      v30.4h, v30.4s, #6          // saturating narrow, >> 6
    smull       v26.4s, v2.4h, v16.4h       // out2  = row1 * coeff_0
    add         x0, x4, x2                  // row walker for the next block
    smlal       v26.4s, v3.4h, v17.4h
    smlal       v26.4s, v4.4h, v18.4h
    ld1         {v0.4h}, [x4], #8           // next block: row -1
    smlal       v26.4s, v5.4h, v19.4h

    sqrshrun    v30.8b, v30.8h, #6          // rounding >> 6, clip to unsigned 8 bit
    sqshrn      v28.4h, v28.4s, #6          // saturating narrow, >> 6
    ld1         {v1.4h}, [x0], x2           // next block: row 0
    smull       v24.4s, v3.4h, v16.4h       // out3  = row2 * coeff_0
    st1         {v30.s}[0], [x1], #4        // store output row 0
    smlal       v24.4s, v4.4h, v17.4h
    ld1         {v2.4h}, [x0], x2           // next block: row 1
    smlal       v24.4s, v5.4h, v18.4h
    ld1         {v3.4h}, [x0], x2           // next block: row 2
    smlal       v24.4s, v6.4h, v19.4h
    add         x20, x1, x14
    csel        x1, x20, x1, le             // band exhausted -> dst to next band

    sqshrn      v26.4h, v26.4s, #6          // saturating narrow, >> 6
    subs        x12, x12, #4                // one block accounted for
    sqrshrun    v28.8b, v28.8h, #6          // rounding >> 6, clip to unsigned 8 bit

    beq         epilog                      // only the held-back block remains

kernel_4:
    smull       v30.4s, v0.4h, v16.4h       // out0 of the current block
    subs        x11, x11, #4                // 4 samples taken; le => band exhausted
    smlal       v30.4s, v1.4h, v17.4h
    st1         {v28.s}[0], [x9], x3        // previous block: output row 1
    smlal       v30.4s, v2.4h, v18.4h
    smlal       v30.4s, v3.4h, v19.4h

    sqshrn      v24.4h, v24.4s, #6          // previous block: narrow row 3
    sqrshrun    v26.8b, v26.8h, #6          // previous block: clip row 2
    ld1         {v4.4h}, [x0], x2           // row 3
    smull       v28.4s, v1.4h, v16.4h       // out1 of the current block
    smlal       v28.4s, v2.4h, v17.4h
    smlal       v28.4s, v3.4h, v18.4h
    smlal       v28.4s, v4.4h, v19.4h
    st1         {v26.s}[0], [x9], x3        // previous block: output row 2
    add         x20, x4, x8
    csel        x4, x20, x4, le             // band exhausted -> src to next band
    lsl         x20, x6, #1
    csel        x11, x20, x11, le           // band exhausted -> reload 2*wd

    sqshrn      v30.4h, v30.4s, #6          // saturating narrow, >> 6
    sqrshrun    v24.8b, v24.8h, #6          // previous block: clip row 3
    ld1         {v5.4h}, [x0], x2           // row 4
    smull       v26.4s, v2.4h, v16.4h       // out2 of the current block
    ld1         {v6.4h}, [x0], x2           // row 5
    smlal       v26.4s, v3.4h, v17.4h
    st1         {v24.s}[0], [x9]            // previous block: output row 3
    add         x0, x4, x2                  // row walker for the next block
    smlal       v26.4s, v4.4h, v18.4h
    ld1         {v0.4h}, [x4], #8           // next block: row -1
    smlal       v26.4s, v5.4h, v19.4h

    sqshrn      v28.4h, v28.4s, #6          // saturating narrow, >> 6
    sqrshrun    v30.8b, v30.8h, #6          // rounding >> 6, clip to unsigned 8 bit
    ld1         {v1.4h}, [x0], x2           // next block: row 0
    smull       v24.4s, v3.4h, v16.4h       // out3 of the current block
    add         x9, x1, x3                  // x9 = output row 1 of the current block
    ld1         {v2.4h}, [x0], x2           // next block: row 1
    smlal       v24.4s, v4.4h, v17.4h
    ld1         {v3.4h}, [x0], x2           // next block: row 2
    smlal       v24.4s, v5.4h, v18.4h

    st1         {v30.s}[0], [x1], #4        // current block: output row 0
    smlal       v24.4s, v6.4h, v19.4h

    sqshrn      v26.4h, v26.4s, #6          // saturating narrow, >> 6
    sqrshrun    v28.8b, v28.8h, #6          // rounding >> 6, clip to unsigned 8 bit
    add         x20, x1, x14
    csel        x1, x20, x1, le             // band exhausted -> dst to next band (flags from subs x11)

    subs        x12, x12, #4                // one block accounted for
    bgt         kernel_4                    // more blocks before the held-back one

epilog:
    smull       v30.4s, v0.4h, v16.4h       // out0 of the last block
    st1         {v28.s}[0], [x9], x3        // previous block: output row 1
    smlal       v30.4s, v1.4h, v17.4h
    smlal       v30.4s, v2.4h, v18.4h
    smlal       v30.4s, v3.4h, v19.4h

    sqshrn      v24.4h, v24.4s, #6          // previous block: narrow row 3
    sqrshrun    v26.8b, v26.8h, #6          // previous block: clip row 2
    smull       v28.4s, v1.4h, v16.4h       // out1 of the last block
    ld1         {v4.4h}, [x0], x2           // row 3
    smlal       v28.4s, v2.4h, v17.4h
    st1         {v26.s}[0], [x9], x3        // previous block: output row 2
    smlal       v28.4s, v3.4h, v18.4h
    smlal       v28.4s, v4.4h, v19.4h

    sqshrn      v30.4h, v30.4s, #6          // saturating narrow, >> 6
    sqrshrun    v24.8b, v24.8h, #6          // previous block: clip row 3
    smull       v26.4s, v2.4h, v16.4h       // out2 of the last block
    ld1         {v5.4h}, [x0], x2           // row 4
    smlal       v26.4s, v3.4h, v17.4h
    smlal       v26.4s, v4.4h, v18.4h
    smlal       v26.4s, v5.4h, v19.4h

    sqshrn      v28.4h, v28.4s, #6          // saturating narrow, >> 6
    sqrshrun    v30.8b, v30.8h, #6          // rounding >> 6, clip to unsigned 8 bit
    st1         {v24.s}[0], [x9]            // previous block: output row 3
    smull       v24.4s, v3.4h, v16.4h       // out3 of the last block
    smlal       v24.4s, v4.4h, v17.4h
    add         x9, x1, x3                  // x9 = output row 1 of the last block
    ld1         {v6.4h}, [x0], x2           // row 5
    smlal       v24.4s, v5.4h, v18.4h
    smlal       v24.4s, v6.4h, v19.4h
    st1         {v30.s}[0], [x1], #4        // last block: output row 0

    sqrshrun    v28.8b, v28.8h, #6          // rounding >> 6, clip to unsigned 8 bit
    sqshrn      v26.4h, v26.4s, #6          // saturating narrow, >> 6
    st1         {v28.s}[0], [x9], x3        // last block: output row 1
    sqrshrun    v26.8b, v26.8h, #6          // rounding >> 6, clip to unsigned 8 bit

    sqshrn      v24.4h, v24.4s, #6          // saturating narrow, >> 6
    st1         {v26.s}[0], [x9], x3        // last block: output row 2
    sqrshrun    v24.8b, v24.8h, #6          // rounding >> 6, clip to unsigned 8 bit

    st1         {v24.s}[0], [x9]            // last block: output row 3

end_loops:
    ldp         x19, x20, [sp], #16         // restore callee-saved pair pushed on entry
    ret