• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@*******************************************************************************
20@* @file
21@*  ihevc_intra_pred_luma_mode2_neon.s
22@*
23@* @brief
24@*  contains the function definition for luma intra prediction mode 2.
25@* the function is written in arm neon assembly using gnu assembler
26
27@* syntax
28@*
29@* @author
30@*  yogeswaran rs
31@*
32@* @par list of functions:
33@*  - ihevc_intra_pred_luma_mode2_a9q()
34@*
35@* @remarks
36@*  none
37@*
38@*******************************************************************************
39@*/
40@/**
41@*******************************************************************************
42@*
43@* @brief
44@*    luma intra prediction for mode 2
45@*
46@* @par description:
47@*    pu1_dst[row * dst_strd + col] = pu1_ref[2 * nt - 2 - row - col]
48@* @param[in] pu1_ref
49@*  uword8 pointer to the source
50@*
51@* @param[out] pu1_dst
52@*  uword8 pointer to the destination
53@*
54@* @param[in] src_strd
55@*  integer source stride
56@*
57@* @param[in] dst_strd
58@*  integer destination stride
59@*
60@* @param[in] pi1_coeff
61@*  (not an argument of ihevc_intra_pred_luma_mode2; see the prototype below)
62@*
63@* @param[in] nt
64@*  size of transform block
65@*
66@* @param[in] mode
67@*  type of filtering
68@*
69@* @returns
70@*
71@* @remarks
72@*  none
73@*
74@*******************************************************************************
75@*/
76
77@void ihevc_intra_pred_luma_mode2(uword8 *pu1_ref,
78@                                 word32 src_strd,
79@                                 uword8 *pu1_dst,
80@                                 word32 dst_strd,
81@                                 word32 nt,
82@                                 word32 mode)
83@
84@**************variables vs registers*****************************************
85@r0 => *pu1_ref
86@r1 => src_strd
87@r2 => *pu1_dst
88@r3 => dst_strd
89
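90@stack contents from #104 (offset valid after the stmfd and vpush in the prologue)
91@   nt      (loaded from [sp, #nt_offset])
92@   mode    (follows nt; never loaded by this function)
93@   (pi1_coeff is not an argument of this function)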
94
@ nt_offset is the distance in bytes from sp to the first stack argument (nt)
@ once the function below has executed its stmfd of ten core registers
@ (10 * 4 = 40 bytes) and its vpush of d8-d15 (8 * 8 = 64 bytes): 40 + 64 = 104.
@ It must be updated if either register-save list changes.
95.equ    nt_offset,      104
96
97.text
98.align 4                                  @ on ARM targets gas reads the operand as a power of two (2^4 = 16 bytes)
99
100
101
102
103.globl ihevc_intra_pred_luma_mode2_a9q    @ make the entry point visible to the linker
104
105.type ihevc_intra_pred_luma_mode2_a9q, %function  @ tag the symbol as a function
106
@-----------------------------------------------------------------------------
@ ihevc_intra_pred_luma_mode2_a9q
@
@ Fills an nt x nt block at pu1_dst such that
@     pu1_dst[row * dst_strd + col] = pu1_ref[2 * nt - 2 - row - col]
@ i.e. every destination row is a byte-reversed window of pu1_ref, and the
@ window moves back by one sample for each successive row.
@
@ In:     r0 = pu1_ref, r2 = pu1_dst, r3 = dst_strd,
@         nt at [sp, #nt_offset] after the two register saves below.
@         r1 (src_strd) is overwritten before it is ever read and mode is
@         never loaded from the stack.
@ Paths:  nt == 4 -> mode2_4 (four rows of 4 bytes).
@         otherwise the block is handled as 8x8 tiles, left to right inside a
@         band of 8 rows, then band by band downwards
@         (assumes nt is a multiple of 8 - TODO confirm against callers).
@ Saved:  r4-r12, lr and d8-d15 are pushed on entry and restored at end_func.
@
@ Register roles in the 8x8 path:
@     r4  = nt                     r5  = 4 * dst_strd (store post-increment)
@     r0  = load pointer, even rows of a tile (0,2,4,6)
@     r10 = load pointer, odd rows of a tile  (1,3,5,7), always r0 - 1
@     r8  = -2, post-index step of both load pointers
@     r6/r7/r9/r14 = store pointers for rows 0/1/2/3, reused for rows 4/5/6/7
@     r2  = destination address of the tile that will be stored next
@     r1  = work counter, nt * (nt / 8), reduced by 8 per tile
@     r11 = column counter on the store side, r12 = column counter on the load side
@     d0-d7 = raw rows, d8-d15 = byte-reversed rows ready to store
@-----------------------------------------------------------------------------
107ihevc_intra_pred_luma_mode2_a9q:
108
109    stmfd       sp!, {r4-r12, r14}          @save r4-r12 and lr (10 words = 40 bytes)
110    vpush       {d8 - d15}                  @save d8-d15 (64 bytes); nt is now at sp + nt_offset
111    ldr         r4,[sp,#nt_offset]          @r4 = nt
112    mov         r8,#-2                      @each load pointer steps back 2 bytes per load
113
114    cmp         r4,#4
115    beq         mode2_4                     @4x4 block has its own short path
116
117    add         r0,r0,r4,lsl #1             @r0 = pu1_ref + 2 * nt
118
119    sub         r0,r0,#9                    @r0 = pu1_ref + 2*nt - 9: lowest byte of the 8-byte window for row 0
120    add         r10,r0,#-1                  @r10 = window for row 1 (one sample further back)
121
@ Prologue: load and reverse the first 8x8 tile, and set up counters and
@ store pointers while the loads are in flight.
122prologue_cpy_32:
123
124    vld1.8      {d0},[r0],r8                @row 0, then r0 -= 2
125    mov         r11,r4                      @r11 = nt: columns left in the band being stored
126
127    vld1.8      {d1},[r10],r8               @row 1, then r10 -= 2
128    mov         r6, r2                      @r6 = store pointer for row 0
129
130    vld1.8      {d2},[r0],r8                @row 2
131    vld1.8      {d3},[r10],r8               @row 3
132    lsr         r1, r4, #3                  @r1 = nt / 8 (src_strd in r1 is discarded here)
133
134    vld1.8      {d4},[r0],r8                @row 4
135    vld1.8      {d5},[r10],r8               @row 5
136    vld1.8      {d6},[r0],r8                @row 6
137    mul         r1, r4, r1                  @r1 = nt * (nt / 8), i.e. 8 per 8x8 tile
138
139    vld1.8      {d7},[r10],r8               @row 7
140    add         r7,r6,r3                    @r7 = store pointer for row 1
141
142    vrev64.8    d8,d0                       @reverse the 8 bytes so the highest address lands in column 0
143    vrev64.8    d9,d1
144    lsl         r5, r3, #2                  @r5 = 4 * dst_strd: takes each store pointer from row k to row k + 4
145
146    vrev64.8    d10,d2
147    vrev64.8    d11,d3
148    add         r9,r7,r3                    @r9 = store pointer for row 2
149
150    vrev64.8    d12,d4
151    subs        r1,r1,#8                    @account for the tile already loaded; Z set if it was the only one
152
153    vrev64.8    d13,d5
154    vrev64.8    d14,d6
155    vrev64.8    d15,d7
156    add         r14,r9,r3                   @r14 = store pointer for row 3
157
158    beq         epilogue_mode2              @flags still from the subs above: single tile, just store it
159
160    sub         r12,r4,#8                   @r12 = columns left in the band being loaded (first tile done)
161
@ Kernel: each pass stores the tile held in d8-d15 and, interleaved with the
@ stores, loads and reverses the next tile. The conditional instructions
@ depend on the most recent subs; the NEON instructions in between do not
@ change the flags, so the order of the scalar instructions must be kept.
162kernel_mode2:
163
164    vst1.8      {d8},[r6],r5                @row 0, r6 -> row 4
165    vst1.8      {d9},[r7],r5                @row 1, r7 -> row 5
166    subs        r11,r11,#8                  @one more tile of this band stored; gt = more tiles to the right
167
168    vst1.8      {d10},[r9],r5               @row 2, r9 -> row 6
169    addgt       r2,r2,#8                    @same band: next tile is 8 columns to the right
170
171    vst1.8      {d11},[r14],r5              @row 3, r14 -> row 7
172    vst1.8      {d12},[r6],r5               @row 4, r6 now 8 rows below the tile start
173    movle       r11,r4                      @band finished: reset the store-side column counter
174
175    vst1.8      {d13},[r7],r5               @row 5
176    vst1.8      {d14},[r9],r5               @row 6
177    addle       r2, r2, r3, lsl #2          @NOTE(review): r2 is rewritten by the suble below under the same condition, so this result looks unused
178
179    vst1.8      {d15},[r14],r5              @row 7
180    vld1.8      {d0},[r0],r8                @start loading the next tile: row 0
181    sub         r14,r4,#8                   @r14 reused as scratch = nt - 8 (all stores through r14 are done)
182
183    vld1.8      {d1},[r10],r8               @row 1
184    vld1.8      {d2},[r0],r8                @row 2
185    addle       r2, r2, #8                  @NOTE(review): also overwritten by the suble below
186
187    vld1.8      {d3},[r10],r8               @row 3
188    vld1.8      {d4},[r0],r8                @row 4
189    suble       r2, r6, r14                 @band finished: r2 = (tile start + 8 rows) - (nt - 8) = column 0 of the next band
190
191    vld1.8      {d5},[r10],r8               @row 5
192    subs        r12,r12,#8                  @one more tile of this band loaded; le = it was the last in its band
193
194    vld1.8      {d6},[r0],r8                @row 6
195    mov         r6, r2                      @r6 = store pointer for row 0 of the next tile
196
197    vld1.8      {d7},[r10],r8               @row 7
198    addle       r0, r0, r4                  @band finished on the load side: together with the suble below, r0 += nt - 8
199
200    vrev64.8    d8,d0
201    add         r7, r6, r3                  @r7 = store pointer for row 1
202
203    vrev64.8    d9,d1
204    suble       r0, r0, #8                  @r0 now addresses row 0 of the first tile of the next band
205
206    vrev64.8    d10,d2
207    movle       r12,r4                      @reset the load-side column counter
208
209    vrev64.8    d11,d3
210    add         r9, r7, r3                  @r9 = store pointer for row 2
211
212    vrev64.8    d12,d4
213    add         r10,r0,#-1                  @re-derive the odd-row load pointer from r0
214
215    vrev64.8    d13,d5
216    subs        r1, r1, #8                  @one tile of work consumed; Z set when the tile just loaded is the last
217
218    vrev64.8    d14,d6
219    add         r14, r9, r3                 @r14 = store pointer for row 3
220
221    vrev64.8    d15,d7
222
223    bne         kernel_mode2                @flags from the subs on r1 above
224
@ Epilogue: store the final tile that is still held in d8-d15.
225epilogue_mode2:
226
227    vst1.8      {d8},[r6],r5                @rows 0-3 ...
228    vst1.8      {d9},[r7],r5
229    vst1.8      {d10},[r9],r5
230    vst1.8      {d11},[r14],r5
231    vst1.8      {d12},[r6],r5               @... then rows 4-7 through the same four pointers
232    vst1.8      {d13},[r7],r5
233    vst1.8      {d14},[r9],r5
234    vst1.8      {d15},[r14],r5
235
236    b           end_func
237
@ nt == 4: each row needs 4 samples. Eight bytes are loaded per row and
@ reversed, and only the low 32-bit lane of the result (the four highest
@ addressed source bytes) is stored.
@ NOTE(review): the loads start at pu1_ref - 1 down to pu1_ref - 4, i.e. they
@ read bytes below pu1_ref that are never stored - presumably the caller's
@ reference buffer is readable there; confirm.
238mode2_4:
239
240    mov         r8,#-2                      @same value r8 already received at function entry
241    sub         r0,r0,#1                    @r0 = pu1_ref - 1: bytes 4..7 of the load are pu1_ref[3..6] (row 0)
242    add         r10,r0,#-1                  @r10 = pu1_ref - 2 (row 1)
243
244    vld1.8      {d0},[r0],r8                @row 0 source, r0 -> pu1_ref - 3
245    add         r5,r2,r3                    @r5 = destination of row 1
246    vld1.8      {d2},[r10],r8               @row 1 source, r10 -> pu1_ref - 4
247    add         r6,r5,r3                    @r6 = destination of row 2
248    vld1.8      {d4},[r0]                   @row 2 source
249    add         r7,r6,r3                    @r7 = destination of row 3
250    vld1.8      {d6},[r10]                  @row 3 source
251
252    vrev64.8    d1,d0                       @after the reverse, lane 0 holds source bytes 7,6,5,4
253    vrev64.8    d3,d2
254
255
256
257    vst1.32     {d1[0]},[r2]                @row 0: 4 bytes
258    vrev64.8    d5,d4
259    vst1.32     {d3[0]},[r5]                @row 1
260    vrev64.8    d7,d6
261    vst1.32     {d5[0]},[r6]                @row 2
262    vst1.32     {d7[0]},[r7]                @row 3
263
@ Common exit for both paths.
264end_func:
265    vpop        {d8 - d15}                  @restore d8-d15
266    ldmfd       sp!,{r4-r12,r15}            @restore r4-r12 and pop the saved lr into pc, which returns
267
268
269
270
271
272
273
274