///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//*
//file
//*  ihevcd_itrans_recon_dc_chroma.s
//*
//*
//brief
//*  contains function definitions itrans and recon for dc only case
//*
//*
//author
//*  ittiam
//*
//*
//par list of functions:
//*  ihevcd_itrans_recon_dc_chroma_av8
//*
//*
//remarks
//*  none
//*
//*******************************************************************************/

.text
.include "ihevc_neon_macros.s"

.globl ihevcd_itrans_recon_dc_chroma_av8
.type ihevcd_itrans_recon_dc_chroma_av8, %function

//------------------------------------------------------------------------------
// void ihevcd_itrans_recon_dc_chroma(uword8 *pu1_pred,
//                                    uword8 *pu1_dst,
//                                    word32 pred_strd,
//                                    word32 dst_strd,
//                                    word32 log2_trans_size,
//                                    word16 i2_coeff_value)
//
// Syntax: GNU as, AArch64.
//
// In:    x0 = pu1_pred
//        x1 = pu1_dst
//        w2 = pred_strd        (32-bit, sign-extended below)
//        w3 = dst_strd         (32-bit, sign-extended below)
//        w4 = log2_trans_size  (only the shift amount is used)
//        w5 = i2_coeff_value   (16-bit, sign-extended below)
//
// Operation:
//   trans_size = 1 << log2_trans_size
//   t          = clip_s16((64   + coeff * 64) >> 7)
//   dc         = clip_s16((2048 + t     * 64) >> 12)
//   For every row of the block, dc is added to the bytes at even offsets
//   from the row start and the sum is saturated to 0..255; the bytes at odd
//   offsets are copied from pred to dst unchanged. Each row covers
//   2 * trans_size bytes.
//
// Clobbers: x0-x13, v0-v7, v16-v31, flags. v8-v15 are used by the 8-row path;
//           push_v_regs / pop_v_regs come from ihevc_neon_macros.s (not
//           visible here) and are presumed to save/restore the callee-saved
//           halves of v8-v15 -- TODO confirm against that file.
//------------------------------------------------------------------------------
ihevcd_itrans_recon_dc_chroma_av8:

    push_v_regs

    // The upper bits of sub-64-bit integer arguments are unspecified on
    // entry, so widen everything that is later used as a 64-bit value.
    sxth        x5, w5                      // x5 = (word16) i2_coeff_value
    sxtw        x2, w2                      // x2 = pred_strd, used as post-index
    sxtw        x3, w3                      // x3 = dst_strd,  used as post-index

    mov         x10, #1
    lsl         x4, x10, x4                 // x4 = trans_size = 1 << log2_trans_size

    mov         x12, #32767                 // x12 = upper clip bound
    mov         x13, #-32768                // x13 = lower clip bound

    // Stage 1: t = clip_s16((64 + coeff * 64) >> 7)
    mov         x6, #64                     // 1 << (shift1 - 1)
    add         x8, x6, x5, lsl #6
    asr         x8, x8, #7
    cmp         x8, x12
    csel        x8, x12, x8, gt
    cmp         x8, x13
    csel        x8, x13, x8, lt

    // Stage 2: dc = clip_s16((2048 + t * 64) >> 12)
    mov         x7, #2048                   // 1 << (shift2 - 1)
    add         x5, x7, x8, lsl #6
    asr         x6, x5, #12
    cmp         x6, x12
    csel        x6, x12, x6, gt
    cmp         x6, x13
    csel        x6, x13, x6, lt

    // x6 = dc value, x4 = trans_size
    dup         v0.8h, w6                   // v0 = dc in every 16-bit lane

    mov         x8, x4                      // x8 = rows remaining
    cmp         x4, #4
    beq         row_loop_4chroma

    // ------------------------------------------------------------------
    // trans_size != 4: process tiles of 8 rows x 16 bytes.
    // ld2 de-interleaves each 16-byte row: the even bytes land in the first
    // register of the pair (modified), the odd bytes in the second (stored
    // back untouched by st2).
    // ------------------------------------------------------------------
row_loop_chroma:
    mov         x9, x4                      // x9 = columns remaining in this row band

col_loop_chroma:
    mov         x7, x0                      // x7 walks the 8 pred rows of the tile
    ld2         {v2.8b, v3.8b}, [x7], x2
    ld2         {v4.8b, v5.8b}, [x7], x2
    ld2         {v6.8b, v7.8b}, [x7], x2
    ld2         {v8.8b, v9.8b}, [x7], x2
    ld2         {v10.8b, v11.8b}, [x7], x2
    ld2         {v12.8b, v13.8b}, [x7], x2
    ld2         {v14.8b, v15.8b}, [x7], x2
    ld2         {v16.8b, v17.8b}, [x7]
    add         x0, x0, #16                 // next tile to the right in pred

    uaddw       v30.8h, v0.8h, v2.8b        // dc + pred, widened to 16 bit
    uaddw       v28.8h, v0.8h, v4.8b
    uaddw       v26.8h, v0.8h, v6.8b
    uaddw       v24.8h, v0.8h, v8.8b
    uaddw       v22.8h, v0.8h, v10.8b
    uaddw       v20.8h, v0.8h, v12.8b
    uaddw       v18.8h, v0.8h, v14.8b

    mov         x11, x1                     // x11 walks the 8 dst rows of the tile

    sqxtun      v2.8b, v30.8h               // saturate to 0..255
    sqxtun      v4.8b, v28.8h
    sqxtun      v6.8b, v26.8h
    sqxtun      v8.8b, v24.8h

    uaddw       v30.8h, v0.8h, v16.8b       // 8th row (v30 is free again here)

    sqxtun      v10.8b, v22.8h
    sqxtun      v12.8b, v20.8h
    sqxtun      v14.8b, v18.8h
    sqxtun      v16.8b, v30.8h

    st2         {v2.8b, v3.8b}, [x11], x3
    st2         {v4.8b, v5.8b}, [x11], x3
    st2         {v6.8b, v7.8b}, [x11], x3
    st2         {v8.8b, v9.8b}, [x11], x3
    st2         {v10.8b, v11.8b}, [x11], x3
    st2         {v12.8b, v13.8b}, [x11], x3
    st2         {v14.8b, v15.8b}, [x11], x3
    st2         {v16.8b, v17.8b}, [x11]
    add         x1, x1, #16                 // next tile to the right in dst

    subs        x9, x9, #8
    bgt         col_loop_chroma

    // Move both pointers down 8 rows and back to the left edge
    // (2 * trans_size bytes were consumed horizontally).
    // The add/sub below do not touch the flags set by subs.
    subs        x8, x8, #8
    add         x0, x0, x2, lsl #3
    add         x1, x1, x3, lsl #3
    sub         x0, x0, x4, lsl #1
    sub         x1, x1, x4, lsl #1
    bgt         row_loop_chroma
    b           end_loops_chroma

    // ------------------------------------------------------------------
    // trans_size == 4: a single 4-row x 8-byte tile.
    // Only the 8 bytes of each row that are actually written back are
    // loaded. The DC value is placed in the even 16-bit lanes only
    // ({dc,0,dc,0,...}), so bytes at odd offsets pass through the
    // widen/add/saturate sequence unchanged.
    // ------------------------------------------------------------------
row_loop_4chroma:
    uxth        w7, w6                      // w7 = 0x0000:dc
    dup         v1.4s, w7                   // v1.8h = {dc, 0, dc, 0, dc, 0, dc, 0}

    ld1         {v2.8b}, [x0], x2
    ld1         {v4.8b}, [x0], x2
    ld1         {v6.8b}, [x0], x2
    ld1         {v16.8b}, [x0]

    uaddw       v30.8h, v1.8h, v2.8b
    uaddw       v28.8h, v1.8h, v4.8b
    uaddw       v26.8h, v1.8h, v6.8b
    uaddw       v24.8h, v1.8h, v16.8b

    sqxtun      v2.8b, v30.8h
    sqxtun      v4.8b, v28.8h
    sqxtun      v6.8b, v26.8h
    sqxtun      v16.8b, v24.8h

    st1         {v2.8b}, [x1], x3
    st1         {v4.8b}, [x1], x3
    st1         {v6.8b}, [x1], x3
    st1         {v16.8b}, [x1]

end_loops_chroma:
    pop_v_regs
    ret