• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@/**
20@*******************************************************************************
21@*
22@* @brief
23@*     interprediction luma function for copy
24@*
25@* @par description:
26@*   copies the array of width 'wd' and height 'ht' from the  location pointed
27@*   by 'src' to the location pointed by 'dst'
28@*
29@* @param[in] pu1_src
30@*  uword8 pointer to the source
31@*
32@* @param[out] pu1_dst
33@*  uword8 pointer to the destination
34@*
35@* @param[in] src_strd
36@*  integer source stride
37@*
38@* @param[in] dst_strd
39@*  integer destination stride
40@*
41@* @param[in] pi1_coeff
42@*  word8 pointer to the filter coefficients
43@*
44@* @param[in] ht
45@*  integer height of the array
46@*
47@* @param[in] wd
48@*  integer width of the array
49@*
50@* @returns
51@*
52@* @remarks
53@*  none
54@*
55@*******************************************************************************
56@*/
57@void ihevc_inter_pred_luma_copy (
58@                            uword8 *pu1_src,
59@                            uword8 *pu1_dst,
60@                            word32 src_strd,
61@                            word32 dst_strd,
62@                            word8 *pi1_coeff,
63@                            word32 ht,
64@                            word32 wd   )
65
66@**************variables vs registers*****************************************
67@   r0 => *pu1_src
68@   r1 => *pu1_dst
69@   r2 =>  src_strd
70@   r3 =>  dst_strd
71@   r7 =>  ht
72@   r12 => wd
73
@ Byte offsets, measured from sp once the prologue of
@ ihevc_inter_pred_luma_copy_a9q has run, of the arguments that are passed on
@ the stack. The prologue pushes ten core registers (40 bytes) and d8-d15
@ (64 bytes), so the first stack argument sits at 40 + 64 = 104.
.equ    wd_offset,      112                 @wd
.equ    ht_offset,      108                 @ht
.equ    coeff_offset,   104                 @pi1_coeff (never loaded by the routine below)

.text
.align 4

.globl ihevc_inter_pred_luma_copy_a9q
.type ihevc_inter_pred_luma_copy_a9q, %function
87
@-----------------------------------------------------------------------------
@ ihevc_inter_pred_luma_copy_a9q
@
@ Copies a block of 'wd' bytes by 'ht' rows from pu1_src to pu1_dst.
@
@ In:    r0 = pu1_src            r1 = pu1_dst
@        r2 = src_strd           r3 = dst_strd
@        [sp,#ht_offset] = ht    [sp,#wd_offset] = wd   (offsets valid after
@        the prologue below); pi1_coeff is never read.
@
@ Work:  r4  = columns still to copy in the current band of four rows
@        r5  = source row pointer for rows 1..3 of the band
@        r6  = destination row pointer for rows 1..3 of the band
@        r7  = rows still to copy
@        r11 = wd - (bytes copied per inner iteration), used to rewind to
@              column 0 of the next band
@        r12 = wd
@        d0-d3 / q0-q3 hold the bytes in flight.
@
@ Path selection: wd multiple of 16 -> 16 bytes per load/store,
@                 wd multiple of 8  ->  8 bytes per load/store,
@                 anything else     ->  4 bytes per load/store.
@
@ Every path copies four rows per outer iteration and loops while the
@ remaining row count is still positive, so a final partial band is copied as
@ four full rows.
@ NOTE(review): this presumably relies on callers passing ht as a multiple of
@ 4 and wd as a multiple of 4 - TODO confirm against the callers.
@ NOTE(review): only d0-d7 are written here, so the d8-d15 save/restore looks
@ unnecessary; dropping it would require lowering the *_offset constants by 64.
@-----------------------------------------------------------------------------
ihevc_inter_pred_luma_copy_a9q:
    stmfd       sp!,{r4-r12,r14}            @save core registers and return address (40 bytes)
    vpush       {d8 - d15}                  @save d8-d15 (64 bytes)
    ldr         r12,[sp,#wd_offset]         @r12 = wd
    ldr         r7,[sp,#ht_offset]          @r7  = ht
    cmp         r7,#0
    ble         end_loops                   @ht <= 0: nothing to copy
    cmp         r12,#0
    ble         end_loops                   @wd <= 0: nothing to copy (tested once, not per band)
    tst         r12,#15
    beq         core_loop_wd_16             @wd is a multiple of 16
    tst         r12,#7
    beq         core_loop_wd_8              @wd is a multiple of 8
    sub         r11,r12,#4                  @r11 = wd - 4 (rewind amount for the 4-byte path)

outer_loop_wd_4:
    mov         r4,r12                      @r4 = columns left in this band

inner_loop_wd_4:
    vld1.32     {d0[0]},[r0]                @row 0: load 4 bytes
    add         r5,r0,r2                    @r5 = src pointer of row 1
    add         r6,r1,r3                    @r6 = dst pointer of row 1
    vst1.32     {d0[0]},[r1]                @row 0: store 4 bytes
    vld1.32     {d0[0]},[r5],r2             @row 1: load, then r5 += src_strd
    add         r0,r0,#4                    @pu1_src += 4 (next column group)
    vst1.32     {d0[0]},[r6],r3             @row 1: store, then r6 += dst_strd
    vld1.32     {d0[0]},[r5],r2             @row 2: load
    subs        r4,r4,#4                    @columns left -= 4 (flags feed the bgt below)
    vst1.32     {d0[0]},[r6],r3             @row 2: store
    vld1.32     {d0[0]},[r5],r2             @row 3: load
    add         r1,r1,#4                    @pu1_dst += 4 (next column group)
    vst1.32     {d0[0]},[r6],r3             @row 3: store
    bgt         inner_loop_wd_4

    subs        r7,r7,#4                    @rows left -= 4
    sub         r0,r5,r11                   @pu1_src = column 0 of the next band
    sub         r1,r6,r11                   @pu1_dst = column 0 of the next band
    bgt         outer_loop_wd_4

end_loops:                                  @single exit shared by all three paths
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @restore registers and return (pc <- saved lr)


core_loop_wd_8:
    sub         r11,r12,#8                  @r11 = wd - 8 (rewind amount for the 8-byte path)

outer_loop_wd_8:
    mov         r4,r12                      @r4 = columns left in this band

inner_loop_wd_8:
    add         r5,r0,r2                    @r5 = src pointer of row 1
    vld1.8      {d0},[r0]!                  @row 0: load 8 bytes, pu1_src += 8
    add         r6,r1,r3                    @r6 = dst pointer of row 1
    vst1.8      {d0},[r1]!                  @row 0: store 8 bytes, pu1_dst += 8
    vld1.8      {d1},[r5],r2                @row 1: load, then r5 += src_strd
    vst1.8      {d1},[r6],r3                @row 1: store, then r6 += dst_strd
    subs        r4,r4,#8                    @columns left -= 8 (flags feed the bgt below)
    vld1.8      {d2},[r5],r2                @row 2: load
    vst1.8      {d2},[r6],r3                @row 2: store
    vld1.8      {d3},[r5],r2                @row 3: load
    vst1.8      {d3},[r6],r3                @row 3: store
    bgt         inner_loop_wd_8

    subs        r7,r7,#4                    @rows left -= 4
    sub         r0,r5,r11                   @pu1_src = column 0 of the next band
    sub         r1,r6,r11                   @pu1_dst = column 0 of the next band
    bgt         outer_loop_wd_8
    b           end_loops


core_loop_wd_16:
    sub         r11,r12,#16                 @r11 = wd - 16 (rewind amount for the 16-byte path)

outer_loop_wd_16:
    mov         r4,r12                      @r4 = columns left in this band

inner_loop_wd_16:
    add         r5,r0,r2                    @r5 = src pointer of row 1
    vld1.8      {q0},[r0]!                  @row 0: load 16 bytes, pu1_src += 16
    add         r6,r1,r3                    @r6 = dst pointer of row 1
    vst1.8      {q0},[r1]!                  @row 0: store 16 bytes, pu1_dst += 16
    vld1.8      {q1},[r5],r2                @row 1: load, then r5 += src_strd
    vst1.8      {q1},[r6],r3                @row 1: store, then r6 += dst_strd
    subs        r4,r4,#16                   @columns left -= 16 (flags feed the bgt below)
    vld1.8      {q2},[r5],r2                @row 2: load
    vst1.8      {q2},[r6],r3                @row 2: store
    vld1.8      {q3},[r5],r2                @row 3: load
    vst1.8      {q3},[r6],r3                @row 3: store
    bgt         inner_loop_wd_16

    subs        r7,r7,#4                    @rows left -= 4
    sub         r0,r5,r11                   @pu1_src = column 0 of the next band
    sub         r1,r6,r11                   @pu1_dst = column 0 of the next band
    bgt         outer_loop_wd_16
    b           end_loops
192
193
194
195
196
197