@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_weighted_pred_uni.s
@*
@* @brief
@*  contains function definitions for weighted prediction used in inter
@*  prediction
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*  - ihevc_weighted_pred_uni()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@/**
@*******************************************************************************
@*
@* @brief
@*  does uni-weighted prediction on the array pointed to by pi2_src and stores
@*  the result at the location pointed to by pu1_dst. assumptions : the
@*  function is optimized considering the fact that width and height are
@*  multiples of 2.
@*
@* @par description:
@*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) + off0
@*
@* @param[in] pi2_src
@*  pointer to the source
@*
@* @param[out] pu1_dst
@*  pointer to the destination
@*
@* @param[in] src_strd
@*  source stride
@*
@* @param[in] dst_strd
@*  destination stride
@*
@* @param[in] wgt0
@*  weight to be multiplied to the source
@*
@* @param[in] off0
@*  offset to be added after rounding and shifting
@*
@* @param[in] shift
@*  (14 bit depth) + log2_weight_denominator
@*
@* @param[in] lvl_shift
@*  added before shift and offset
@*
@* @param[in] ht
@*  height of the source
@*
@* @param[in] wd
@*  width of the source
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@ C prototype (this file exports the routine as ihevc_weighted_pred_uni_a9q):
@void ihevc_weighted_pred_uni(word16 *pi2_src,
@                             uword8 *pu1_dst,
@                             word32 src_strd,
@                             word32 dst_strd,
@                             word32 wgt0,
@                             word32 off0,
@                             word32 shift,
@                             word32 lvl_shift,
@                             word32 ht,
@                             word32 wd)

@**************variables vs registers*****************************************
@ r0-r3 hold the first four arguments on entry; r4-r9 are loaded from the
@ caller's stack at the start of the function.
@   r0 => *pi2_src
@   r1 => *pu1_dst
@   r2 =>  src_strd
@   r3 =>  dst_strd
@   r4 =>  wgt0
@   r5 =>  off0
@   r6 =>  shift
@   r7 =>  lvl_shift
@   r8 =>  ht
@   r9 =>  wd

@ Byte offsets, relative to sp after the prologue, of the arguments that the
@ caller passed on the stack.  The prologue pushes ten core registers
@ (r4-r12 and r14, 40 bytes) followed by d8-d15 (64 bytes), so the first
@ stacked argument (wgt0) is found at sp + 104 and the remaining ones follow
@ at 4-byte intervals.
.equ    wgt0_offset,        104
.equ    off0_offset,        108
.equ    shift_offset,       112
.equ    lvl_shift_offset,   116
.equ    ht_offset,          120
.equ    wd_offset,          124

.text
.align 4                                    @on ARM the operand is a power of two: 16-byte boundary

.globl ihevc_weighted_pred_uni_a9q

.type ihevc_weighted_pred_uni_a9q, %function

@-----------------------------------------------------------------------------
@ ihevc_weighted_pred_uni_a9q
@
@ Uni-directional weighted prediction, 16-bit source to 8-bit destination.
@ For every sample the routine evaluates
@
@     tmp = src * (int16_t)wgt0
@           + lvl_shift * wgt0 + (off0 << shift) + (1 << (shift - 1))
@     dst = saturate_u8( tmp >> shift )            (arithmetic shift)
@
@ In:    r0 = pi2_src   pointer to the 16-bit source samples
@        r1 = pu1_dst   pointer to the 8-bit destination samples
@        r2 = src_strd  source stride in 16-bit elements (doubled to bytes here)
@        r3 = dst_strd  destination stride in bytes
@        stack: wgt0, off0, shift, lvl_shift, ht, wd (see the offsets above)
@ Out:   none (result is written through pu1_dst)
@ Saved: r4-r12, lr and d8-d15 are pushed on entry and restored on exit.
@
@ The loops consume one 4x4 block per pass and step both counters by 4, so
@ wd and ht are expected to be multiples of 4.  Nothing is read or written
@ when ht <= 0 or wd <= 0.
@
@ NOTE: "add ..., lsl rN" (register-shifted register operand) needs the ARM
@ (A32) instruction set; the file is not assemblable as Thumb.
@-----------------------------------------------------------------------------
ihevc_weighted_pred_uni_a9q:

    stmfd       sp!, {r4-r12, r14}          @save core registers and the return address (40 bytes)
    vpush       {d8  -  d15}                @save NEON registers d8-d15 (64 bytes)

    ldr         r4,[sp,#wgt0_offset]        @r4 = wgt0
    ldr         r7,[sp,#lvl_shift_offset]   @r7 = lvl_shift
    mov         r11,#1
    ldr         r5,[sp,#off0_offset]        @r5 = off0
    mul         r10,r7,r4                   @r10 = lvl_shift * wgt0
    ldr         r6,[sp,#shift_offset]       @r6 = shift
    ldr         r8,[sp,#ht_offset]          @r8 = ht, rows still to be processed
    add         r10,r10,r5,lsl r6           @r10 += off0 << shift
    ldr         r9,[sp,#wd_offset]          @r9 = wd, columns still to be processed in this band
    sub         r12,r6,#1                   @r12 = shift - 1
    vmov.s16    d0[0],r4                    @d0[0] = (int16_t)wgt0, scalar operand of the vmull below
    lsl         r2,r2,#1                    @r2 = source stride in bytes (samples are 16 bit)
    vdup.u32    q14,r6                      @q14 = shift in every lane
    add         r10,r10,r11,lsl r12         @r10 += 1 << (shift - 1), the rounding term
    vdup.s32    q15,r10                     @q15 = combined additive constant in every lane
    vneg.s32    q14,q14                     @q14 = -shift, so that vshl shifts right
    lsl         r4,r9,#1                    @r4 = 2 * wd = source bytes consumed per row

    cmp         r8,#0                       @ht and wd are signed: skip everything unless ht > 0
    ble         end_loops
    cmp         r9,#0                       @... and unless wd > 0
    ble         end_loops

outer_loop:                                 @one pass per band of 4 rows
core_loop:                                  @one pass per 4x4 block inside the band
    add         r5,r0,r2                    @r5 = source pointer of row 1 (r0 + stride in bytes)
    add         r6,r1,r3                    @r6 = destination pointer of row 1
    vld1.s16    {d1},[r0]!                  @row 0: load 4 samples, advance pi2_src by 8 bytes
    vld1.s16    {d2},[r5],r2                @row 1: load 4 samples, step r5 to row 2
    vmull.s16   q2,d1,d0[0]                 @row 0: src * wgt0, widened to 32 bit

    vadd.i32    q2,q2,q15                   @row 0: add the combined constant
    vld1.s16    {d8},[r5],r2                @row 2: load 4 samples, step r5 to row 3

    vmull.s16   q3,d2,d0[0]                 @row 1: src * wgt0
    vld1.s16    {d9},[r5],r2                @row 3: load 4 samples

    vshl.s32    q2,q2,q14                   @row 0: arithmetic shift right by shift
    vadd.i32    q3,q3,q15                   @row 1: add the combined constant

    vmull.s16   q5,d8,d0[0]                 @row 2: src * wgt0
    vqmovun.s32 d4,q2                       @row 0: saturate signed 32 bit to unsigned 16 bit

    vadd.i32    q5,q5,q15                   @row 2: add the combined constant
    vmov.s32    d5,d4                       @row 0: mirror d4 into d5 so q2 is fully defined for the narrow

    vshl.s32    q3,q3,q14                   @row 1: arithmetic shift right by shift

    vmull.s16   q6,d9,d0[0]                 @row 3: src * wgt0
    vqmovn.u16  d4,q2                       @row 0: saturate unsigned 16 bit to unsigned 8 bit

    vshl.s32    q5,q5,q14                   @row 2: arithmetic shift right by shift
    vqmovun.s32 d6,q3                       @row 1: saturate signed 32 bit to unsigned 16 bit

    vadd.i32    q6,q6,q15                   @row 3: add the combined constant
    vmov.s32    d7,d6                       @row 1: mirror d6 into d7

    vqmovun.s32 d10,q5                      @row 2: saturate signed 32 bit to unsigned 16 bit

    vshl.s32    q6,q6,q14                   @row 3: arithmetic shift right by shift
    vst1.32     {d4[0]},[r1]!               @row 0: store 4 pixels, advance pu1_dst by 4 bytes
    vmov.s32    d11,d10                     @row 2: mirror d10 into d11

    vqmovn.u16  d6,q3                       @row 1: saturate unsigned 16 bit to unsigned 8 bit
    vst1.32     {d6[0]},[r6],r3             @row 1: store 4 pixels, step r6 to row 2

    vqmovn.u16  d10,q5                      @row 2: saturate unsigned 16 bit to unsigned 8 bit
    vqmovun.s32 d12,q6                      @row 3: saturate signed 32 bit to unsigned 16 bit

    vmov.s32    d13,d12                     @row 3: mirror d12 into d13
    vst1.32     {d10[0]},[r6],r3            @row 2: store 4 pixels, step r6 to row 3
    vqmovn.u16  d12,q6                      @row 3: saturate unsigned 16 bit to unsigned 8 bit

    subs        r9,r9,#4                    @4 columns done; flags feed the bgt below (vst1 leaves them intact)
    vst1.32     {d12[0]},[r6],r3            @row 3: store 4 pixels
    bgt         core_loop                   @next 4x4 block of this band

end_core_loop:
    rsb         r11,r4,r2,lsl #2            @r11 = 4 * (stride in bytes) - 2 * wd
    subs        r8,r8,#4                    @4 rows done; flags feed the bgt below (no flag-setting op in between)
    add         r0,r0,r11                   @pi2_src -> first column of the next band
    asr         r9,r4,#1                    @r9 = wd, reload the column counter
    rsb         r12,r9,r3,lsl #2            @r12 = 4 * dst_strd - wd
    add         r1,r1,r12                   @pu1_dst -> first column of the next band
    bgt         outer_loop                  @next band of 4 rows

end_loops:
    vpop        {d8  -  d15}                @restore NEON registers
    ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (pc <- saved lr)
