@ NOTE(review): non-source navigation chrome from an HTML code-viewer scrape
@ removed here; only the assembly source follows.
@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@*
@* @brief
@*     interprediction luma function for copy
@*
@* @par description:
@*   copies the array of width 'wd' and height 'ht' from the location pointed
@*   by 'src' to the location pointed by 'dst'
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_copy_w16out (
@                                uword8 *pu1_src,
@                                word16 *pi2_dst,
@                                word32 src_strd,
@                                word32 dst_strd,
@                                word8 *pi1_coeff,
@                                word32 ht,
@                                word32 wd   )

@**************variables vs registers*****************************************
@   r0 => *pu1_src
@   r1 => *pi2_dst
@   r2 =>  src_strd
@   r3 =>  dst_strd
@   r7 =>  ht
@   r12 => wd

@ Stack offsets of the arguments passed on the stack (args 5..7).
@ The prologue pushes r4-r12,r14 (10 regs = 40 bytes) and d8-d15
@ (8 d-regs = 64 bytes), so the 5th argument sits at sp + 104.
.equ    coeff_offset,   104                 @ pi1_coeff (never read in this routine)
.equ    ht_offset,      108                 @ ht
.equ    wd_offset,      112                 @ wd

.text
.align 4


.globl ihevc_inter_pred_luma_copy_w16out_a9q

.type ihevc_inter_pred_luma_copy_w16out_a9q, %function

@-----------------------------------------------------------------------------
@ void ihevc_inter_pred_luma_copy_w16out_a9q(uword8 *pu1_src,  /* r0 */
@                                            word16 *pi2_dst,  /* r1 */
@                                            word32  src_strd, /* r2 */
@                                            word32  dst_strd, /* r3 */
@                                            word8  *pi1_coeff,/* stack, unused */
@                                            word32  ht,       /* stack */
@                                            word32  wd)       /* stack */
@
@ Copies a wd x ht block of 8-bit samples into a 16-bit destination,
@ widening each sample and shifting it left by 6 (HEVC 16-bit
@ intermediate precision).  Rows are processed four at a time, so ht is
@ assumed to be a multiple of 4; wd is handled in multiples of 8 on the
@ fast path and multiples of 4 otherwise (tst r12,#7 below).
@ pi1_coeff (coeff_offset) is never read.
@-----------------------------------------------------------------------------
ihevc_inter_pred_luma_copy_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @ save callee-saved core registers
    vpush       {d8 - d15}                  @ save callee-saved NEON registers
    ldr         r12,[sp,#wd_offset]         @ r12 = wd
    ldr         r7,[sp,#ht_offset]          @ r7  = ht
    cmp         r7,#0                       @ nothing to do if ht <= 0
    ble         end_loops
    tst         r12,#7                      @ wd a multiple of 8?
    beq         core_loop_wd_8              @ yes: take the 8-wide path
    sub         r11,r12,#4                  @ r11 = wd - 4 (row-rewind amount)
    lsls        r6,r3,#1                    @ r6 = dst_strd in bytes (16-bit dst)

@ ---- 4-wide path: 4 columns x 4 rows per inner iteration -------------------
outer_loop_wd_4:
    subs        r4,r12,#0                   @ r4 = wd; also tests wd <= 0
    ble         end_inner_loop_wd_4

inner_loop_wd_4:
    vld1.8      {d0},[r0]                   @ row 0: load 8 bytes (4 used)
    add         r5,r0,r2                    @ r5 = pu1_src + src_strd (row 1)
    vmovl.u8    q0,d0                       @ widen u8 -> u16
    add         r10,r1,r6                   @ r10 = pi2_dst + dst_strd (row 1)
    subs        r4,r4,#4                    @ 4 columns consumed
    vshl.i64    q0,q0,#6                    @ <<6; 64-bit shift is safe: each
                                            @ 16-bit lane holds <= 255, so no
                                            @ bits cross lane boundaries
    vld1.8      {d22},[r5],r2               @ row 1 load, advance to row 2
    add         r0,r0,#4                    @ pu1_src += 4
    vst1.64     {d0},[r1]                   @ store 4 x 16-bit results (row 0)
    add         r1,r1,#8                    @ pi2_dst += 4 samples (8 bytes)
    vmovl.u8    q11,d22                     @ widen row 1
    vld1.8      {d24},[r5],r2               @ row 2 load, advance to row 3
    vshl.i64    q11,q11,#6                  @ row 1 <<6
    vmovl.u8    q12,d24                     @ widen row 2
    vst1.64     {d22},[r10],r6              @ store row 1
    vshl.i64    q12,q12,#6                  @ row 2 <<6
    vld1.8      {d26},[r5],r2               @ row 3 load
    vst1.64     {d24},[r10],r6              @ store row 2
    vmovl.u8    q13,d26                     @ widen row 3
    vshl.i64    q13,q13,#6                  @ row 3 <<6
    vst1.64     {d26},[r10],r6              @ store row 3
    bgt         inner_loop_wd_4             @ more columns in this row group?

end_inner_loop_wd_4:
    subs        r7,r7,#4                    @ ht -= 4 (four rows done)
    sub         r0,r5,r11                   @ rewind src to column 0 of next group
    sub         r1,r10,r11,lsl #1           @ rewind dst likewise (x2: 16-bit)
    bgt         outer_loop_wd_4

end_loops:
    vpop        {d8 - d15}                  @ restore NEON registers
    ldmfd       sp!,{r4-r12,r15}            @ restore core regs and return (pc)


@ ---- 8-wide path: software-pipelined, 8 columns x 4 rows per iteration -----
core_loop_wd_8:
    @sub            r11,r12,#8
    lsls        r5,r3,#1                    @ r5 = dst_strd in bytes
    rsb         r11,r12,r3, lsl #2          @ r11 = dst_strd*4 - wd (dst skip)
    rsb         r8,r12,r2,lsl #2            @ r8  = src_strd*4 - wd (src skip)
    mov         r4,r12, lsr #3              @ r4 = wd / 8
    mul         r7, r4                      @ r7 = ht * (wd/8) = total 8x1 tiles
    sub         r4,r12,#0                   @ r4 = wd (column countdown)
    sub         r7,r7,#4                    @ reserve one 4-row group for epilog

prolog:
    add         r6,r0,r2                    @ r6 = src row 1
    add         r10,r1,r5                   @ r10 = dst row 1
    vld1.8      {d8},[r0]!                  @ row 0 load, src += 8
    vld1.8      {d10},[r6],r2               @ row 1 load
    vld1.8      {d12},[r6],r2               @ row 2 load
    vld1.8      {d14},[r6],r2               @ row 3 load
    vmovl.u8    q8,d8                       @ widen rows 0..3
    vmovl.u8    q9,d10
    vmovl.u8    q10,d12
    vmovl.u8    q11,d14
    subs        r4,r4,#8                    @ 8 columns consumed
    vshl.i16    q0,q8,#6                    @ rows 0..3 <<6
    vshl.i16    q1,q9,#6
    vshl.i16    q2,q10,#6
    vshl.i16    q3,q11,#6
    addle       r0,r0,r8                    @ row group done: skip to next one
    add         r6,r0,r2                    @ prefetch next group's rows 1..3
    vld1.8      {d8},[r0]!                  @ next row 0 load, src += 8
    vld1.8      {d10},[r6],r2
    vld1.8      {d12},[r6],r2
    vld1.8      {d14},[r6],r2

    vst1.16     {d0,d1},[r1]!               @ store row 0, dst += 8 samples
    addle       r1,r1,r11,lsl #1            @ row group done: skip dst (x2 bytes)
    suble       r4,r12,#0                   @ ...and reset column countdown

    subs        r7,r7,#4                    @ tiles remaining -= 4

    blt         epilog_end                  @ < 0: only stores left
    beq         epilog                      @ = 0: one widen+store pass left


outer_loop_wd_8:

    vst1.16     {d2,d3},[r10],r5            @ store prev row 1
    vmovl.u8    q8,d8                       @ widen current row 0

    vst1.16     {d4,d5},[r10],r5            @ store prev row 2
    vmovl.u8    q9,d10                      @ widen current row 1

    vst1.16     {d6,d7},[r10],r5            @ store prev row 3
    vmovl.u8    q10,d12                     @ widen current row 2

    vmovl.u8    q11,d14                     @ widen current row 3

    subs        r4,r4,#8                    @ 8 columns consumed
    addle       r0,r0,r8                    @ row group done: skip src

    add         r6,r0,r2                    @ next group's row 1

    vld1.8      {d8},[r0]!                  @ load next row 0, src += 8
    vshl.i16    q0,q8,#6                    @ current rows <<6

    vld1.8      {d10},[r6],r2               @ load next row 1
    vshl.i16    q1,q9,#6

    vld1.8      {d12},[r6],r2               @ load next row 2
    vshl.i16    q2,q10,#6

    vld1.8      {d14},[r6],r2               @ load next row 3
    add         r10,r1,r5                   @ r10 = dst row 1 for next stores

    vshl.i16    q3,q11,#6

    vst1.16     {d0,d1},[r1]!               @ store current row 0, dst += 8

    addle       r1,r1,r11,lsl #1            @ row group done: skip dst
    suble       r4,r12,#0                   @ ...and reset column countdown

    subs        r7,r7,#4                    @ tiles remaining -= 4
    bgt         outer_loop_wd_8

epilog:
    @ Drain the pipeline: widen/shift/store the last loaded group,
    @ no further loads.
    vst1.16     {d2,d3},[r10],r5            @ store prev row 1
    vmovl.u8    q8,d8
    vst1.16     {d4,d5},[r10],r5            @ store prev row 2
    vmovl.u8    q9,d10
    vst1.16     {d6,d7},[r10],r5            @ store prev row 3
    vmovl.u8    q10,d12
    vmovl.u8    q11,d14
    @add        r6,r0,r2                @pu1_src_tmp += src_strd

    vshl.i16    q0,q8,#6
    vshl.i16    q1,q9,#6
    vshl.i16    q2,q10,#6
    add         r10,r1,r5                   @ dst row 1 for the final stores
    vshl.i16    q3,q11,#6

    vst1.16     {d0,d1},[r1]!               @ store final row 0
epilog_end:
    vst1.16     {d2,d3},[r10],r5            @ store final rows 1..3
    vst1.16     {d4,d5},[r10],r5
    vst1.16     {d6,d7},[r10],r5


    vpop        {d8 - d15}                  @ restore NEON registers
    ldmfd       sp!,{r4-r12,r15}            @ restore core regs and return (pc)

