@ (code-viewer navigation chrome removed: Home / Line# / Scopes# / Navigate# / Raw / Download)
@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_chroma_horz_neon.s
@*
@* @brief
@*  contains function definition for intra prediction  interpolation filters
@*
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*  - ihevc_intra_pred_luma_horz()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@
@/**
@*******************************************************************************
@*
@* @brief
@*     intra prediction interpolation filter for horizontal luma variable.
@*
@* @par description:
@*      horizontal intraprediction(mode 10) with reference samples location
@*      pointed by 'pu1_ref' to the tu block  location pointed by 'pu1_dst'  refer
@*      to section 8.4.4.2.6 in the standard (special case)
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  integer transform block size
@*
@* @param[in] mode
@*  integer intraprediction mode
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@void ihevc_intra_pred_chroma_horz(uword8 *pu1_ref,
@                                  word32 src_strd,
@                                  uword8 *pu1_dst,
@                                  word32 dst_strd,
@                                  word32 nt,
@                                  word32 mode)
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 =>  src_strd
@r2 => *pu1_dst
@r3 =>  dst_strd

87.equ    nt_offset,      104
88
89.text
90.align 4
91
92
93
94
95.globl ihevc_intra_pred_chroma_horz_a9q
96
97.type ihevc_intra_pred_chroma_horz_a9q, %function
98
99ihevc_intra_pred_chroma_horz_a9q:
100
101    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
102    vpush       {d8 - d15}
103
104    ldr         r4,[sp,#nt_offset]          @loads nt
105
106    lsl         r6,r4,#2                    @four_nt
107
108    add         r12,r0,r6                   @*pu1_ref[four_nt]
109    cmp         r4,#4                       @if nt == 4
110    beq         core_loop_4
111
112    cmp         r4,#8                       @if nt == 8
113    beq         core_loop_8
114
115    @cmp            r4,#16                          @if nt == 16
116    @beq            core_loop_16
117
118    sub         r12,r12,#16                 @move to 16th value pointer
119    add         r9,r2,#16
120
121core_loop_16:
122    vld1.16     {q0},[r12]                  @load 16 values. d1[7] will have the 1st value.
123    sub         r12,r12,#16
124    vld1.16     {q5},[r12]                  @load 16 values. d1[7] will have the 1st value.
125
126    vdup.16     q1,d1[3]                    @duplicate the i value.
127
128    vdup.16     q2,d1[2]                    @duplicate the ii value.
129    vdup.16     q3,d1[1]                    @duplicate the iii value.
130    vst1.16     {q1},[r2],r3                @store in 1st row 0-16 columns
131    vst1.16     {q1},[r9],r3                @store in 1st row 16-32 columns
132
133    vdup.16     q4,d1[0]
134    vst1.16     {q2},[r2],r3
135    vst1.16     {q2},[r9],r3
136
137    vdup.16     q1,d0[3]
138    vst1.16     {q3},[r2],r3
139    vst1.16     {q3},[r9],r3
140
141    vdup.16     q2,d0[2]
142    vst1.16     {q4},[r2],r3
143    vst1.16     {q4},[r9],r3
144
145    vdup.16     q3,d0[1]
146    vst1.16     {q1},[r2],r3
147    vst1.16     {q1},[r9],r3
148
149    vdup.16     q4,d0[0]
150    vst1.16     {q2},[r2],r3
151    vst1.16     {q2},[r9],r3
152
153    vdup.16     q1,d11[3]
154    vst1.16     {q3},[r2],r3
155    vst1.16     {q3},[r9],r3
156
157    vdup.16     q2,d11[2]
158    vst1.16     {q4},[r2],r3
159    vst1.16     {q4},[r9],r3
160
161    vdup.16     q3,d11[1]
162    vst1.16     {q1},[r2],r3
163    vst1.16     {q1},[r9],r3
164
165    vdup.16     q4,d11[0]
166    vst1.16     {q2},[r2],r3
167    vst1.16     {q2},[r9],r3
168
169    vdup.16     q1,d10[3]
170    vst1.16     {q3},[r2],r3
171    vst1.16     {q3},[r9],r3
172
173    vdup.16     q2,d10[2]
174    vst1.16     {q4},[r2],r3
175    vst1.16     {q4},[r9],r3
176
177    vdup.16     q3,d10[1]
178    vst1.16     {q1},[r2],r3
179    vst1.16     {q1},[r9],r3
180    sub         r12,r12,#16                 @move to 16th value pointer
181
182    vdup.16     q4,d10[0]
183    vst1.16     {q2},[r2],r3
184    vst1.16     {q2},[r9],r3
185
186    subs        r4,r4,#16                   @decrement the loop count by 16
187    vst1.16     {q3},[r2],r3
188    vst1.16     {q3},[r9],r3
189
190    vst1.16     {q4},[r2],r3
191    vst1.16     {q4},[r9],r3
192    bgt         core_loop_16
193    vpop        {d8 - d15}
194    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
195    b           endloop
196
197core_loop_8:
198    ldrb        lr,[r12],#1                 @pu1_ref[two_nt]
199    @vld1.8     {q15},[r12]                     @pu1_ref[two_nt + 1 + col]
200
201    vdup.8      d28,lr
202    sub         r12,r12,#17
203    vld1.8      {q0},[r12]
204
205    sub         r12,r12,#16
206    vld1.8      {q15},[r12]
207    vdup.16     q5,d1[3]
208    @vmovl.u8   q13,d26
209
210    vdup.16     q1,d1[2]
211    @vsubl.u8   q12,d30,d28
212
213    vdup.16     q2,d1[1]
214    @vshr.s16   q12,q12,#1
215
216    vdup.16     q3,d1[0]
217    @vqadd.s16  q11,q13,q12
218
219    vdup.16     q4,d0[3]
220    @vqmovun.s16 d22,q11
221
222    vst1.16     {q5},[r2],r3
223
224    vdup.16     q5,d0[2]
225    @vsubl.u8   q12,d31,d28
226
227    vdup.16     q6,d0[1]
228    @vshr.s16   q12,q12,#1
229
230    vdup.16     q7,d0[0]
231    @vqadd.s16  q11,q13,q12
232
233    vdup.16     q8,d0[3]
234    @vqmovun.s16 d22,q11
235
236    vst1.16     {q1},[r2],r3
237    @sub            r2,r2,#8
238
239    vst1.16     {q2},[r2],r3
240
241    vst1.16     {q3},[r2],r3
242    vst1.16     {q4},[r2],r3
243    vst1.16     {q5},[r2],r3
244
245    @vdup.8     q1,d0[2]
246    vst1.16     {q6},[r2],r3
247
248    @vdup.8     q2,d0[1]
249    vst1.16     {q7},[r2],r3
250
251    @vdup.8     q3,d0[0]
252    @vst1.8     {q7},[r2],r3
253
254    @vdup.8     q4,d0[3]
255    @vst1.8     {q8},[r2],r3
256
257    @vdup.8     q5,d0[2]
258    @vst1.8     {q1},[r2],r3
259
260    @vdup.8     q6,d0[1]
261    @vst1.8     {q2},[r2],r3
262
263    @vdup.8     q7,d0[0]
264    @vst1.8     {q3},[r2],r3
265
266    @vst1.8     {q4},[r2],r3
267    @vst1.8     {q5},[r2],r3
268    @vst1.8     {q6},[r2],r3
269    @vst1.8     {q7},[r2],r3
270    vpop        {d8 - d15}
271
272    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
273    b           endloop
274
275
276core_loop_4:
277    ldrb        lr,[r12]                    @pu1_ref[two_nt]
278    add         r12,r12,#1                  @pu1_ref[two_nt + 1]
279    @vld1.8     {d30},[r12]                     @pu1_ref[two_nt + 1 + col]
280
281    sub         r12,r12,#9
282    vld1.8      {d0},[r12]
283    sub         r12,r12,#8
284    vld1.8      {d30},[r12]
285    vdup.16     d26,d0[3]
286    vdup.8      d28,lr
287
288    vdup.16     d3,d0[2]
289    vmovl.u8    q13,d26
290
291    vdup.16     d4,d0[1]
292    vsubl.u8    q12,d30,d28
293
294    vdup.16     d5,d0[0]
295    vshr.s16    q12,q12,#1
296
297    vdup.16     d6,d0[3]
298    vqadd.s16   q11,q13,q12
299
300    vdup.16     d7,d0[2]
301    vqmovun.s16 d22,q11
302
303    vst1.8      {d6},[r2],r3
304    vst1.8      {d3},[r2],r3
305
306    vdup.16     d8,d0[1]
307    vst1.8      {d4},[r2],r3
308    vst1.8      {d5},[r2],r3
309
310    vdup.16     d9,d0[0]
311    @vst1.8     {d6},[r2],r3
312    @vst1.8     {d7},[r2],r3
313
314    @vst1.8     {d8},[r2],r3
315    @vst1.8     {d9},[r2],r3
316    vpop        {d8 - d15}
317    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
318    b           endloop
319
320
321@core_loop_4
322    ldrb        lr,[r12]                    @pu1_ref[two_nt]
323    add         r12,r12,#1                  @pu1_ref[two_nt + 1]
324    vld1.8      {d30},[r12]                 @pu1_ref[two_nt + 1 + col]
325
326    sub         r12,r12,#5
327    vld1.8      {d0},[r12]
328    vdup.8      d28,lr
329    vdup.8      d26,d0[3]
330    vmovl.u8    q13,d26
331
332    vdup.8      d3,d0[2]
333    vsubl.u8    q12,d30,d28
334
335    vdup.8      d4,d0[1]
336    vshr.s16    q12,q12,#1
337
338    vdup.8      d5,d0[0]
339    vqadd.s16   q11,q13,q12
340
341    vqmovun.s16 d22,q11
342
343    vst1.32     {d22[0]},[r2],r3
344    vst1.32     {d3[0]},[r2],r3
345    vst1.32     {d4[0]},[r2],r3
346    vst1.32     {d5[0]},[r2],r3
347
348    vpop        {d8 - d15}
349    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
350
351endloop:
352
353
354