///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///*******************************************************************************
//* @file
//*  ihevc_deblk_luma_horz.s
//*
//* @brief
//*  contains the luma deblocking filter for one 4-sample segment of a
//*  horizontal edge. the function is hand written AArch64 (armv8) neon
//*  assembly in gnu assembler syntax.
//*
//* @author
//*  anand s
//*
//* @par list of functions:
//*  - ihevc_deblk_luma_horz_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************/

.text
.align 4

.extern gai4_ihevc_tc_table
.extern gai4_ihevc_beta_table
.globl ihevc_deblk_luma_horz_av8

.type ihevc_deblk_luma_horz_av8, %function

//------------------------------------------------------------------------------
// ihevc_deblk_luma_horz_av8
//
// Deblocks four luma samples along a horizontal edge. The edge lies between
// row -1 (P side, above) and row 0 (Q side, below); row k starts at
// x0 + k * x1 and four consecutive bytes are processed per row.
//
// Inputs (names taken from the pseudo-code comments in this file):
//   x0      source address (first sample of row 0)
//   w1      source stride in bytes
//   w2      bs
//   w3      quant_param_p
//   w4      quant_param_q
//   w5      beta_offset_div2
//   w6      tc_offset_div2
//   w7      flag p  (P side is written only when this is 1)
//   [sp]    flag q  (Q side is written only when this is 1)
//
// NOTE(review): w1-w7 and the stack argument are assumed to be 32-bit ints in
// the C prototype (the original code already treats w5/w6/w7/[sp] that way) -
// confirm against the header that declares this function.
//
// Output: rows -3..-1 and/or 0..2 (strong filter) or rows -2..-1 and/or 0..1
// (normal filter) are updated in place. Nothing is returned.
//
// Callee-saved registers used and therefore spilled: x19-x22, d9-d15
// (d8 is spilled only to keep the stack 16-byte aligned).
//
// NEON register map once the rows have been loaded:
//   v22..v29   rows -4, -3, -2, -1, 0, 1, 2, 3 (4 bytes, replicated in both
//              32-bit lanes of the d register)
//   v1         2 * tc in every byte (strong filter clip range)
//------------------------------------------------------------------------------
ihevc_deblk_luma_horz_av8:
    // stmfd sp!, {x3-x12,x14}

    // AAPCS64 leaves bits [63:32] of a 32-bit argument unspecified, while the
    // code below consumes these values as 64-bit registers (address
    // arithmetic on x1, index arithmetic and clamping on x2..x6).
    sxtw        x1,w1                       // src stride
    sxtw        x2,w2                       // bs
    sxtw        x3,w3                       // quant_param_p
    sxtw        x4,w4                       // quant_param_q
    sxtw        x5,w5                       // beta_offset_div2
    sxtw        x6,w6                       // tc_offset_div2

    stp         d8,d9,[sp,#-16]!            // Storing d9 using { sub sp,sp,#8; str d9,[sp] } is giving bus error.
                                            // d8 is used as dummy register and stored along with d9 using stp.
                                            // d8 is not used in the function.
    stp         d10,d11,[sp,#-16]!
    stp         d12,d13,[sp,#-16]!
    stp         d14,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!
    stp         x21, x22,[sp,#-16]!

    mov         x21,x7                      // keep flag p (x7 is reused as scratch)
    ldr         w22,[sp,#96]                // flag q: first stack argument, above the 6 * 16 bytes pushed here

    // qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
    add         x3,x3,x4
    add         x3,x3,#1
    asr         x3,x3,#1
    add         x7,x3,x5,lsl #1             // qp_luma + (beta_offset_div2 << 1)
    add         x3,x3,x6,lsl #1             // qp_luma + (tc_offset_div2 << 1)

    // beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51);
    cmp         x7,#0x33
    mov         x20,#0x33
    csel        x7, x20, x7,gt
    bgt         l1.1532                     // already clamped high, skip the low clamp
    cmp         x7,#0x0
    mov         x20,#0x0
    csel        x7, x20, x7,lt              // x7 has the beta_index value

l1.1532:
    // tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53);
    // bic x2,x2,#1
    asr         x2,x2,#1
    add         x3,x3,x2,lsl #1
    cmp         x3,#0x35
    mov         x20,#0x35
    csel        x3, x20, x3,gt
    bgt         l1.1564                     // already clamped high, skip the low clamp
    cmp         x3,#0x0
    mov         x20,#0x0
    csel        x3, x20, x3,lt              // x3 has the tc_index value

l1.1564:
    adrp        x2, :got:gai4_ihevc_beta_table
    ldr         x2, [x2, #:got_lo12:gai4_ihevc_beta_table]
    adrp        x4, :got:gai4_ihevc_tc_table
    ldr         x4, [x4, #:got_lo12:gai4_ihevc_tc_table]
    ldr         w5, [x2,x7,lsl #2]          // beta
    ldr         w6, [x4,x3,lsl #2]          // tc

    cmp         x6,#0
    beq         l1.2404                     // tc == 0 : nothing to filter

    // Load rows -3..2 (4 bytes each). The scalar copy of each word is masked
    // down to its low byte, i.e. the sample in column 0.
    movi        v0.4h, #0x2
    lsl         x7,x6,#1                    // 2 * tc
    add         x14,x1,x1,lsl #1            // 3 * stride
    neg         x19,x14
    ldr         w8, [x0,x19]                // row -3
    dup         v1.8b,w7                    // v1 = 2 * tc in every byte
    lsl         x19,x1,#1
    neg         x19,x19
    ldr         w10, [x0,x19]               // row -2
    dup         v23.2s,w8                   // row -3
    neg         x19,x1
    ldr         w11, [x0,x19]               // row -1
    dup         v24.2s,w10                  // row -2
    and         x8,x8,#0xff
    ldr         w12, [x0,#0]                // row 0
    dup         v25.2s,w11                  // row -1
    and         x10,x10,#0xff
    ldr         w9, [x0,x1]                 // row 1
    dup         v26.2s,w12                  // row 0
    and         x11,x11,#0xff
    lsl         x19,x1,#1
    ldr         w2, [x0,x19]                // row 2
    dup         v27.2s,w9                   // row 1
    and         x12,x12,#0xff
    dup         v28.2s,w2                   // row 2
    and         x9,x9,#0xff
    and         x2,x2,#0xff

    // dq0 = abs( row2[0] - 2 * row1[0] + row0[0] );
    add         x12,x12,x2
    subs        x9,x12,x9,lsl #1            // dq0 value is stored in x9
    csneg       x9,x9,x9,pl

    // dp0 = abs( row-3[0] - 2 * row-2[0] + row-1[0] );
    add         x8,x8,x11
    subs        x8,x8,x10,lsl #1
    csneg       x8,x8,x8,pl                 // dp0 value is stored in x8

    // Same measurements for column 3.
    add         x3,x1,x1,lsl #1             // 3 * stride
    add         x14,x0,#3                   // address of column 3 in row 0

    neg         x19,x3
    ldrb        w2,[x14,x19]                // row -3
    lsl         x19,x1,#1
    neg         x19,x19
    ldrb        w10,[x14,x19]               // row -2
    neg         x19,x1
    ldrb        w11,[x14,x19]               // row -1
    ldrb        w12,[x14,#0]                // row 0
    ldrb        w3,[x14,x1]                 // row 1
    lsl         x19,x1,#1
    ldrb        w4,[x14,x19]                // row 2

    // dq3 = abs( row2[3] - 2 * row1[3] + row0[3] );
    add         x12,x12,x4
    subs        x12,x12,x3,lsl #1           // dq3 value is stored in x12
    csneg       x12,x12,x12,pl

    // dp3 = abs( row-3[3] - 2 * row-2[3] + row-1[3] );
    add         x2,x2,x11
    subs        x11,x2,x10,lsl #1
    csneg       x11,x11,x11,pl              // dp3 value is stored in x11

    // d0 = dp0 + dq0;
    // d3 = dp3 + dq3;
    add         x3,x8,x9                    // x3 has the d0 value
    add         x4,x11,x12                  // x4 has the d3 value

    // dp = dp0 + dp3;
    // dq = dq0 + dq3;
    add         x14,x8,x11                  // x14 has the value dp
    add         x12,x12,x9                  // x12 has the value dq

    // d = d0 + d3;
    add         x11, x3, x4                 // x11 has the value d

    cmp         x11,x5
    bge         l1.2404                     // filter only if (d < beta)

    // registers which must stay intact from here: x0,x1,x3,x4,x5,x6,x11,x12,x14
    // registers free for use: x2,x7,x8,x9,x10

    // Strong filter decision for column 0. The Q side strong filter results
    // are computed speculatively in between the scalar tests.
    // if( (2 * d0 < (beta >> 2))
    //     && ( abs(row3[0] - row0[0]) + abs(row-1[0] - row-4[0]) < (beta >> 3) )
    //     && abs(row0[0] - row-1[0]) < ( (5 * tc + 1) >> 1 ) )
    asr         x10,x5,#2
    uqadd       v30.8b, v26.8b , v1.8b      // q0 + 2 * tc
    cmp         x10,x3,lsl #1
    uqsub       v31.8b, v26.8b , v1.8b      // q0 - 2 * tc
    ble         l1.1840
    add         x10,x1,x1,lsl #1            // 3 * stride
    uaddl       v6.8h, v25.8b , v26.8b
    neg         x19,x1
    ldr         w2, [x0,x19,lsl #2]         // row -4

    neg         x19, x1
    ldrb        w7,[x0,x19]                 // row -1, column 0
    dup         v22.2s,w2                   // row -4
    uaddw       v7.8h, v6.8h , v27.8b
    ldrb        w3,[x0,#0]                  // row 0, column 0
    uqadd       v16.8b, v27.8b , v1.8b      // q1 + 2 * tc
    and         x2,x2,#0xff
    mul         v12.8h, v7.8h, v0.4h[0]
    ldr         w8, [x0,x10]                // row 3
    uaddl       v10.8h, v24.8b , v28.8b
    subs        x2,x2,x7
    uqsub       v17.8b, v27.8b , v1.8b      // q1 - 2 * tc
    dup         v29.2s,w8                   // row 3
    and         x8,x8,#0xff
    add         v12.8h, v12.8h , v10.8h
    csneg       x2,x2,x2,pl
    rshrn       v20.8b, v12.8h,#3
    subs        x8,x8,x3
    csneg       x8,x8,x8,pl
    umin        v18.8b, v20.8b , v30.8b
    add         x8,x8,x2

    cmp         x8,x5,asr #3
    bge         l1.1840
    uaddw       v14.8h, v7.8h , v28.8b
    subs        x7,x3,x7
    umax        v4.8b, v18.8b , v31.8b      // v4 = strong filtered row 0
    csneg       x7,x7,x7,pl
    uqadd       v30.8b, v28.8b , v1.8b      // q2 + 2 * tc
    mov         x10,#5
    rshrn       v21.8b, v14.8h,#2
    mul         x10, x10, x6
    uqsub       v31.8b, v28.8b , v1.8b      // q2 - 2 * tc
    add         x10, x10,#1
    cmp         x7,x10,asr #1
    umin        v18.8b, v21.8b , v16.8b
    bge         l1.1840

    // Strong filter decision for column 3 (same three tests, using d3).
    umax        v5.8b, v18.8b , v17.8b      // v5 = strong filtered row 1
    asr         x10,x5,#2
    uaddl       v16.8h, v29.8b , v28.8b
    cmp         x10,x4,lsl #1
    ble         l1.1840

    add         x10,x1,x1,lsl #1            // 3 * stride
    mul         v16.8h, v16.8h, v0.4h[0]
    add         x4,x0,#3                    // address of column 3 in row 0

    lsl         x19,x1,#2
    neg         x19,x19
    ldrb        w2,[x4,x19]                 // row -4, column 3
    add         v16.8h, v16.8h , v14.8h
    neg         x19,x1
    ldrb        w7,[x4,x19]                 // row -1, column 3
    rshrn       v19.8b, v16.8h,#3
    ldrb        w3,[x4,#0]                  // row 0, column 3
    ldrb        w8,[x4,x10]                 // row 3, column 3
    // ubfx x7,x2,#24,#8 @ has the -1 value
    // and x2,#0xff @ has the -4 value
    // ubfx x8,x3,#24,#8 @ has the 3 value
    // and x3,#0xff @ x4 has the 0 value

    subs        x8,x8,x3
    umin        v18.8b, v19.8b , v30.8b
    csneg       x8,x8,x8,pl
    uaddl       v6.8h, v25.8b , v24.8b
    subs        x2,x2,x7
    umax        v3.8b, v18.8b , v31.8b      // v3 = strong filtered row 2
    csneg       x2,x2,x2,pl
    uaddw       v7.8h, v6.8h , v26.8b
    add         x8,x8,x2
    uqadd       v30.8b, v25.8b , v1.8b      // p0 + 2 * tc
    cmp         x8,x5,asr #3
    uqsub       v31.8b, v25.8b , v1.8b      // p0 - 2 * tc
    bge         l1.1840
    mul         v12.8h, v7.8h, v0.4h[0]
    subs        x7,x3,x7
    uqadd       v16.8b, v24.8b , v1.8b      // p1 + 2 * tc
    csneg       x7,x7,x7,pl
    uaddl       v10.8h, v23.8b , v27.8b
    mov         x10,#5
    uqsub       v17.8b, v24.8b , v1.8b      // p1 - 2 * tc
    mul         x10, x10, x6
    add         v12.8h, v12.8h , v10.8h
    add         x10, x10,#1
    rshrn       v20.8b, v12.8h,#3
    cmp         x7,x10,asr #1
    uaddw       v14.8h, v7.8h , v23.8b
    bge         l1.1840
    umin        v18.8b, v20.8b , v30.8b
    mov         x2,#2                       // de = 2 : strong filter
    uqadd       v30.8b, v23.8b , v1.8b      // p2 + 2 * tc
    mov         w4,w21                      // flag p
    umax        v2.8b, v18.8b , v31.8b      // v2 = strong filtered row -1
    mov         w5,w22                      // flag q
    rshrn       v21.8b, v14.8h,#2
    b           end_dep_deq_decision_horz

    // x2 has the value of de
    // x6 has the value of tc
    // x5 has the value of beta
    // x14 has the value of dp
    // x12 has the value of dq
    // x0 has the value of source address
    // x1 has the src stride

l1.1840:
    // Normal filter: decide dep / deq (whether the second sample away from
    // the edge on the P / Q side is modified as well).
    mov         x2,#1                       // de = 1 : normal filter

    mov         x11,x5                      // beta
    mov         w4,w21                      // flag p
    mov         w5,w22                      // flag q

    cmp         x6,#1
    mov         x20,#0
    csel        x9, x20, x9,eq              // tc == 1 : dep = 0
    mov         x20,#0
    csel        x10, x20, x10,eq            // tc == 1 : deq = 0
    beq         end_dep_deq_decision_horz

    and         x7,x4,x5
    cmp         x7,#1
    beq         both_flags_set_horz
    cmp         x4,#0
    beq         set_flag_dep_zero_horz

    // only flag p is set: dep = (dp < ((beta + (beta >> 1)) >> 3)), deq = 0
    add         x8,x11,x11,asr #1
    mov         x10,#0
    asr         x8,x8,#3
    cmp         x8,x14
    mov         x20,#1
    csel        x9, x20, x9,gt
    mov         x20,#0
    csel        x9, x20, x9,le
    b           end_dep_deq_decision_horz

set_flag_dep_zero_horz:
    // flag p is clear: dep = 0, deq = (dq < ((beta + (beta >> 1)) >> 3))
    add         x8,x11,x11,asr #1
    mov         x9,#0
    asr         x8,x8,#3
    cmp         x8,x12
    mov         x20,#1
    csel        x10, x20, x10,gt
    mov         x20,#0
    csel        x10, x20, x10,le
    b           end_dep_deq_decision_horz

both_flags_set_horz:
    // both flags set: evaluate dep and deq against the same threshold
    add         x8,x11,x11,asr #1
    asr         x8,x8,#3
    cmp         x8,x14
    mov         x20,#1
    csel        x9, x20, x9,gt
    mov         x20,#0
    csel        x9, x20, x9,le
    cmp         x8,x12
    mov         x20,#1
    csel        x10, x20, x10,gt
    mov         x20,#0
    csel        x10, x20, x10,le

end_dep_deq_decision_horz:
    // x0  = source address
    // x1  = stride
    // x2  = de
    // x4  = flag p
    // x5  = flag q
    // x6  = tc
    // x9  = dep
    // x10 = deq

    // add x14,x1,x1,lsl #1
    // lsl x7,x6,#1
    // vdup.8 d1,x7
    // vmov.i16 d0,#0x2
    umin        v18.8b, v21.8b , v16.8b
    cmp         x2,#1
    uqsub       v31.8b, v23.8b , v1.8b      // p2 - 2 * tc
    beq         l1.2408                     // de == 1 : normal filter
    uaddl       v7.8h, v23.8b , v22.8b

    cmp         x5,#1
    // Skip the Q side stores when flag q is clear, but still honour flag p:
    // branching straight to strong_filtering_p would rewrite the P side even
    // when flag p is 0, unlike the normal filter path which tests both flags.
    bne         strong_filtering_check_p_horz

strong_filtering_q:
    mov         x12,x0
    st1         {v4.s}[0],[x12],x1          // row 0
    st1         {v5.s}[0],[x12],x1          // row 1
    st1         {v3.s}[0],[x12]             // row 2

strong_filtering_check_p_horz:
    cmp         x4,#1
    bne         l1.2404

strong_filtering_p:
    umax        v5.8b, v18.8b , v17.8b      // v5 = strong filtered row -2
    mov         x12,x0
    mul         v7.8h, v7.8h, v0.4h[0]
    sub         x20,x1,#0
    neg         x11, x20                    // -stride
    add         v16.8h, v7.8h , v14.8h
    add         x12,x12,x11
    rshrn       v19.8b, v16.8h,#3
    st1         {v2.s}[0],[x12],x11         // row -1
    umin        v18.8b, v19.8b , v30.8b
    st1         {v5.s}[0],[x12],x11         // row -2
    umax        v3.8b, v18.8b , v31.8b      // v3 = strong filtered row -3
    st1         {v3.s}[0],[x12]             // row -3

l1.2404:
    // Common exit: restore callee-saved registers in reverse push order.
    // ldmfd sp!, {x3-x12,pc}
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16              // Loading d9 using { ldr d9,[sp]; add sp,sp,#8 } is giving bus error.
                                            // d8 is used as dummy register and loaded along with d9 using ldp.
                                            // d8 is not used in the function.
    ret

    // x4  = flag p
    // x5  = flag q
    // x6  = tc
    // x9  = dep
    // x10 = deq

    // v22 = row -4
    // v23 = row -3
    // v24 = row -2
    // v25 = row -1
    // v26 = row 0
    // v27 = row 1
    // v28 = row 2
    // v29 = row 3

l1.2408:
    // Normal filter.
    // delta = ( 9 * (row0 - row-1) - 3 * (row1 - row-2) + 8 ) >> 4;
    movi        v0.4h, #0x9
    usubl       v10.8h, v26.8b , v25.8b
    mul         v10.8h, v10.8h, v0.4h[0]
    movi        v0.4h, #0x3
    usubl       v12.8h, v27.8b , v24.8b
    mul         v12.8h, v12.8h, v0.4h[0]
    dup         v30.8b,w6                   // duplicating the +tc value
    sub         x20,x6,#0
    neg         x12, x20
    dup         v31.8b,w12                  // duplicating the -tc value
    sub         v10.8h, v10.8h , v12.8h
    srshr       v10.8h, v10.8h,#4

    abs         v7.8h, v10.8h
    xtn         v9.8b, v7.8h                // v9 = abs(delta), narrowed to bytes

    sqxtn       v10.8b, v10.8h              // v10 = delta saturated to signed bytes

    smin        v11.8b, v10.8b , v30.8b
    smax        v7.8b, v31.8b , v11.8b      // v7 = clip3(delta, -tc, tc)

    uxtl        v6.8h, v25.8b
    saddw       v4.8h, v6.8h , v7.8b
    sqxtun      v12.8b, v4.8h               // v12 = clip_u8(row-1 + delta)
    uxtl        v6.8h, v26.8b
    ssubw       v4.8h, v6.8h , v7.8b
    sqxtun      v13.8b, v4.8h               // v13 = clip_u8(row0 - delta)

    mov         x11,#0xa
    mul         x12, x11, x6
    dup         v2.8b,w12                   // v2 has the 10 * tc value
    mov         v18.8b, v24.8b              // default: row -2 is left unchanged
    dup         v0.8b,w6
    sshr        v0.8b,v0.8b,#1              // v0 =  (tc >> 1)
    neg         v1.8b, v0.8b                // v1 = -(tc >> 1)

    cmp         x4,#1
    bne         l1.2724                     // flag p clear : skip the P side
    cmp         x9,#1
    bne         l1.2700                     // dep clear : only row -1 changes

    // delta_p = clip3( (((row-3 + row-1 + 1) >> 1) - row-2 + delta) >> 1,
    //                  -(tc >> 1), (tc >> 1) );
    uaddl       v14.8h, v23.8b , v25.8b
    rshrn       v14.8b, v14.8h,#1
    usubl       v14.8h, v14.8b , v24.8b
    saddw       v14.8h, v14.8h , v7.8b
    sqshrn      v14.8b, v14.8h,#1
    smin        v15.8b, v14.8b , v0.8b
    smax        v14.8b, v1.8b , v15.8b      // v14 has the delta_p value

    // v14 = clip_u8(row-2 + delta_p);
    uxtl        v16.8h, v24.8b
    saddw       v16.8h, v16.8h , v14.8b
    sqxtun      v14.8b, v16.8h

    // keep the original sample wherever abs(delta) >= 10 * tc
    cmhs        v18.8b,v9.8b,v2.8b
    bsl         v18.8b,v24.8b,v14.8b

l1.2700:
    mov         x12,x0
    sub         x20,x1,#0
    neg         x11, x20                    // -stride
    add         x12,x12,x11
    cmhs        v19.8b,v9.8b,v2.8b          // mask: abs(delta) >= 10 * tc
    bsl         v19.8b,v25.8b,v12.8b
    st1         {v19.s}[0],[x12],x11        // row -1
    st1         {v18.s}[0],[x12]            // row -2

l1.2724:
    cmp         x5,#1
    bne         l1.2404                     // flag q clear : done
    cmp         x10,#1
    mov         v18.8b, v27.8b              // default: row 1 is left unchanged
    bne         l1.2852                     // deq clear : only row 0 changes

    // delta_q = clip3( (((row0 + row2 + 1) >> 1) - row1 - delta) >> 1,
    //                  -(tc >> 1), (tc >> 1) );
    uaddl       v14.8h, v26.8b , v28.8b
    rshrn       v14.8b, v14.8h,#1
    usubl       v14.8h, v14.8b , v27.8b
    ssubw       v14.8h, v14.8h , v7.8b
    sqshrn      v14.8b, v14.8h,#1
    smin        v15.8b, v14.8b , v0.8b
    smax        v14.8b, v1.8b , v15.8b      // v14 has the delta_q value

    // v14 = clip_u8(row1 + delta_q);
    uxtl        v16.8h, v27.8b
    saddw       v16.8h, v16.8h , v14.8b
    sqxtun      v14.8b, v16.8h

    // keep the original sample wherever abs(delta) >= 10 * tc
    cmhs        v18.8b,v9.8b,v2.8b
    bsl         v18.8b,v27.8b,v14.8b

l1.2852:
    mov         x12,x0
    cmhs        v19.8b,v9.8b,v2.8b          // mask: abs(delta) >= 10 * tc
    bsl         v19.8b,v26.8b,v13.8b
    st1         {v19.s}[0],[x12],x1         // row 0
    st1         {v18.s}[0],[x12]            // row 1

    // ldmfd sp!, {x3-x12,x15}
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16              // Loading d9 using { ldr d9,[sp]; add sp,sp,#8 } is giving bus error.
                                            // d8 is used as dummy register and loaded along with d9 using ldp.
                                            // d8 is not used in the function.
    ret