• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@*******************************************************************************
20@* @file
21@*  ihevc_intra_pred_chroma_planar.s
22@*
23@* @brief
24@*  contains function definitions for planar intra prediction.
25@* functions are coded using neon  intrinsics and can be compiled using
26@*
27@* rvct
28@*
29@* @author
30@*  akshaya mukund
31@*
32@* @par list of functions:
33@*
34@*
35@* @remarks
36@*  none
37@*
38@*******************************************************************************
39@*/
40@/**
41@*******************************************************************************
42@*
43@* @brief
44@*    chroma intraprediction filter for planar input
45@*
46@* @par description:
47@*
48@* @param[in] pu1_ref
49@*  uword8 pointer to the source
50@*
51@* @param[out] pu1_dst
52@*  uword8 pointer to the destination
53@*
54@* @param[in] src_strd
55@*  integer source stride
56@*
57@* @param[in] dst_strd
58@*  integer destination stride
59@*
60@* @param[in] pi1_coeff
61@*  word8 pointer to the planar coefficients
62@*
63@* @param[in] nt
64@*  size of tranform block
65@*
66@* @param[in] mode
67@*  type of filtering
68@*
69@* @returns
70@*
71@* @remarks
72@*  none
73@*
74@*******************************************************************************
75@*/
76
77@void ihevc_intra_pred_chroma_planar(uword8* pu1_ref,
78@                                  word32 src_strd,
79@                                  uword8* pu1_dst,
80@                                  word32 dst_strd,
81@                                  word32 nt,
82@                                  word32 mode,
83@                  word32 pi1_coeff)
84@**************variables vs registers*****************************************
85@r0 => *pu1_ref
86@r1 => src_strd
87@r2 => *pu1_dst
88@r3 => dst_strd
89
90@stack contents from #104
91@   nt
92@   mode
93@   pi1_coeff
94
@ Offset from sp to the stacked fifth argument (nt) inside the function:
@ stmfd {r4-r12,r14} pushes 10 words (40 bytes) and vpush {d8-d15} pushes
@ 64 bytes, so the caller's stack arguments sit at sp + 104.
95.equ    nt_offset,      104
96
97.text
98.align 4
99
100
101
102
103.globl ihevc_intra_pred_chroma_planar_a9q
104.extern gau1_ihevc_planar_factor
105
@ PC-relative displacement of the planar weight table (supplies the
@ per-column (col+1) factors).  The "- 8" compensates for the ARM pipeline:
@ when "add r11,r11,pc" executes at ulbl1, pc reads as ulbl1 + 8.
106gau1_ihevc_planar_factor_addr:
107.long gau1_ihevc_planar_factor - ulbl1 - 8
108
109.type ihevc_intra_pred_chroma_planar_a9q, %function
110
@ Planar intra prediction for interleaved Cb/Cr chroma.
@ Args: r0 = pu1_ref, r1 = src_strd, r2 = pu1_dst, r3 = dst_strd,
@       stack: nt (transform size), mode (see header comment above).
@ Each output sample is computed as
@   ((nt-1-row)*src[2nt+1+col] + (row+1)*src[nt-1]
@  + (nt-1-col)*src[2nt-1-row] + (col+1)*src[3nt+1] + nt) >> (log2(nt)+1)
@ Reference-array offsets are doubled throughout (lsl #1 and #-2 steps)
@ because each chroma sample is a 2-byte interleaved Cb/Cr pair.
@ Register roles: r4=nt, r6=&src[2nt-1] (walks downward per row),
@ r14=&src[2nt+1] (column refs), r12=weight table, r1=row counter,
@ r2=dst cursor, r3=dst_strd.
111ihevc_intra_pred_chroma_planar_a9q:
112
113    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
114    vpush       {d8 - d15}
115
116    ldr         r4,[sp,#nt_offset]          @loads nt
117    ldr         r11, gau1_ihevc_planar_factor_addr @loads table of coeffs
118ulbl1:
119    add         r11,r11,pc
120
121    clz         r5, r4
122    rsb         r5, r5, #32                 @r5 = 32 - clz(nt) = log2(nt) + 1 (nt is a power of 2)
123    vdup.16     q7, r5                      @final right-shift amount
124    vneg.s16    q7, q7                      @shr value (so vneg)
125    vdup.8      d2, r4                      @nt
126    vdup.s16    q8, r4                      @nt
127
128    sub         r6, r4, #1                  @nt-1
129    add         r6, r0,r6,lsl #1            @2*(nt-1)
130    ldr         r7, [r6]
131    vdup.s16    d0, r7                      @src[nt-1] (low halfword of r7 = Cb,Cr pair)
132
133    add         r6, r4, r4,lsl #1           @3nt
134    add         r6, r6, #1                  @3nt + 1
135    lsl         r6,r6,#1                    @2*(3nt + 1)
136
137    add         r6, r6, r0
138    ldr         r7, [r6]
139    vdup.s16    d1, r7                      @src[3nt+1]
140
141
142    add         r6, r4, r4                  @2nt
143    add         r14, r6, #1                 @2nt+1
144    lsl         r14,#1                      @2*(2nt+1)
145    sub         r6, r6, #1                  @2nt-1
146    lsl         r6,#1                       @2*(2nt-1)
147    add         r6, r6, r0                  @&src[2nt-1]
148    add         r14, r14, r0                @&src[2nt+1]
149
150    mov         r8, #1                      @row+1 (row is first 0)
151    sub         r9, r4, r8                  @nt-1-row (row is first 0)
152
153    vdup.s8     d5, r8                      @row + 1
154    vdup.s8     d6, r9                      @nt - 1 - row
155    vmov        d7, d5                      @mov #1 to d7 to used for inc for row+1 and dec for nt-1-row
156
157    add         r12, r11, #1                @coeffs (to be reloaded after every row)
158    mov         r1, r4                      @nt (row counter) (dec after every row)
159    mov         r5, r2                      @dst (to be reloaded after every row and inc by dst_strd)
160    mov         r10, #8                     @increment for the coeffs
161    mov         r0, r14                     @&src[2nt+1] (to be reloaded after every row)
162
163    cmp         r4, #4
164    beq         tf_sz_4
165
166
167
168    mov         r10,r6                      @save &src[2nt-1] for the second (right-half) pass when nt == 16
@ nt == 8 or 16: produce 8 chroma pairs (16 bytes) per row, 4 rows per loop
@ iteration, with the multiply/accumulate chains of successive rows
@ interleaved to hide NEON latency.
169tf_sz_8_16:
170    vld1.s8     {d10,d11}, [r14]!           @load src[2nt+1+col]
171    vld1.s8     d8, [r12]!
172    vmov        d9,d8
173    vzip.8      d8,d9                       @duplicate each (col+1) weight for the Cb and Cr lanes
174    vsub.s8     d30, d2, d8                 @[nt-1-col]
175    vsub.s8     d31, d2, d9
176
177
178
179
@ One iteration computes rows row..row+3.  d4/d3 ping-pong the duplicated
@ src[2nt-1-row] value between rows; d5/d6 (and d18/d19) hold the running
@ (row+1) / (nt-1-row) weight vectors, updated by +/- d7 (all-ones).
180loop_sz_8_16:
181
182    ldr         r7, [r6], #-2               @src[2nt-1-row] (dec to take into account row)
183    vmull.u8    q6, d5, d0                  @(row+1)    *   src[nt-1]
184    ldr         r11, [r6], #-2              @src[2nt-1-row] (dec to take into account row)
185    vmlal.u8    q6, d6, d10                 @(nt-1-row) *   src[2nt+1+col]
186    vdup.s16    d4, r7                      @src[2nt-1-row]
187    vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
188    vdup.s16    d3, r11                     @src[2nt-1-row]
189    vmlal.u8    q6, d30, d4                 @(nt-1-col) *   src[2nt-1-row]
190
191
192
193    vmull.u8    q14,d5,d0
194    ldr         r7, [r6], #-2               @src[2nt-1-row] (dec to take into account row)
195    vmlal.u8    q14,d6,d11
196    vadd.s8     d18, d5, d7                 @row++ [(row+1)++]c
197
198
199    vmlal.u8    q14,d31,d4
200    vsub.s8     d19, d6, d7                 @[nt-1-row]--
201    vmlal.u8    q14,d9,d1
202    vdup.s16    d4, r7                      @src[2nt-1-row]
203
204    vmull.u8    q13, d18, d0                @(row+1)    *   src[nt-1]
205    vadd.i16    q6, q6, q8                  @add (nt)
206    vmlal.u8    q13, d19, d10               @(nt-1-row) *   src[2nt+1+col]
207    vshl.s16    q6, q6, q7                  @shr
208    vmlal.u8    q13, d8, d1                 @(col+1)    *   src[3nt+1]
209    vadd.i16    q14,q14,q8
210    vmlal.u8    q13, d30, d3                @(nt-1-col) *   src[2nt-1-row]
211    vshl.s16    q14,q14,q7
212
213
214
215
216
217    vmull.u8    q12,d18,d0
218    vadd.s8     d5, d18, d7                 @row++ [(row+1)++]
219    vmlal.u8    q12,d19,d11
220    vsub.s8     d6, d19, d7                 @[nt-1-row]--
221    vmlal.u8    q12,d9,d1
222    vmovn.i16   d12, q6
223    vmlal.u8    q12,d31,d3
224    vmovn.i16   d13,q14
225
226
227
228
229    vadd.i16    q13, q13, q8                @add (nt)
230    vmull.u8    q11, d5, d0                 @(row+1)    *   src[nt-1]
231    vshl.s16    q13, q13, q7                @shr
232    vmlal.u8    q11, d6, d10                @(nt-1-row) *   src[2nt+1+col]
233    vst1.s32    {d12,d13}, [r2], r3
234    vmlal.u8    q11, d8, d1                 @(col+1)    *   src[3nt+1]
235    vadd.i16    q12,q12,q8
236    vmlal.u8    q11, d30, d4                @(nt-1-col) *   src[2nt-1-row]
237    vshl.s16    q12,q12,q7
238
239    vmull.u8    q10,d5,d0
240    vadd.s8     d18, d5, d7                 @row++ [(row+1)++]c
241    vmlal.u8    q10,d6,d11
242    vsub.s8     d19, d6, d7                 @[nt-1-row]--
243    vmlal.u8    q10,d31,d4
244
245    ldr         r11, [r6], #-2              @src[2nt-1-row] (dec to take into account row)
246    vmlal.u8    q10,d9,d1
247    vdup.s16    d3, r11                     @src[2nt-1-row]
248    vadd.i16    q11, q11, q8                @add (nt)
249
250    vmull.u8    q6, d18, d0                 @(row+1)    *   src[nt-1]
251    vmovn.i16   d26, q13
252    vmlal.u8    q6, d19, d10                @(nt-1-row) *   src[2nt+1+col]
253    vmovn.i16   d27,q12
254
255    vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
256    vshl.s16    q11, q11, q7                @shr
257
258    vmlal.u8    q6, d30, d3                 @(nt-1-col) *   src[2nt-1-row]
259    vadd.i16    q10,q10,q8
260
261    vmull.u8    q14,d18,d0
262    vst1.s32    {d26,d27}, [r2], r3
263
264    vmlal.u8    q14,d19,d11
265    vadd.s8     d5, d18, d7                 @row++ [(row+1)++]
266
267    vsub.s8     d6, d19, d7                 @[nt-1-row]--
268    vmlal.u8    q14,d9,d1
269
270    vmlal.u8    q14,d31,d3
271    vshl.s16    q10,q10,q7
272
273
274    vadd.i16    q6, q6 ,q8                  @add (nt)
275    vmovn.i16   d22, q11
276
277
278    vadd.i16    q14,q14,q8
279    vmovn.i16   d23,q10
280
281
282    vshl.s16    q6, q6, q7                  @shr
283    vst1.s32    {d22,d23}, [r2], r3
284    vshl.s16    q14,q14,q7
285
286
287
288
289
290    vmovn.i16   d20, q6
291    vmovn.i16   d21,q14
292
293    vst1.s32    {d20,d21}, [r2], r3
294
295
296    subs        r1, r1, #4                  @4 rows per iteration
297
298    bne         loop_sz_8_16
299
300
301
302
303    cmp         r4,#16
304
305    bne         end_loop
306
@ nt == 16: each output row is 32 bytes, but the loop above wrote only the
@ left 16.  Rewind dst 16 rows, step 16 bytes right, reset the row weights
@ and reference pointers, and rerun the loop for columns 8..15.
307
308    sub         r4,#16
309    vdup.s8     d5, r8                      @row + 1
310    vdup.s8     d6, r9                      @nt - 1 - row
311    vmov        d7, d5                      @mov #1 to d7 to used for inc for row+1 and dec for nt-1-row
312
313    mov         r6,r10                      @restore &src[2nt-1]
314    mov         r1,#16
315    sub         r2,r2,r3,lsl #4             @rewind dst by 16 rows
316    add         r2,r2,#16                   @step to the right half of each row
317
318    vld1.s8     {d10,d11}, [r14]!           @load src[2nt+1+col]
319    vld1.s8     d8, [r12]!
320    vmov        d9,d8
321    vzip.8      d8,d9                       @duplicate each (col+1) weight for the Cb and Cr lanes
322    vsub.s8     d30, d2, d8                 @[nt-1-col]
323    vsub.s8     d31, d2, d9
324
325    beq         loop_sz_8_16                @always taken here: flags still from "cmp r4,#16" (sub above sets none)
326
327
@ nt == 4: one row (4 chroma pairs = 8 bytes) per iteration.
328
329tf_sz_4:
330    vld1.s8     d10, [r14]                  @load src[2nt+1+col]
331    vld1.s8     d8, [r12], r10              @load 8 coeffs [col+1]
332    vmov        d9,d8
333    vzip.8      d8,d9                       @duplicate each (col+1) weight for the Cb and Cr lanes
334loop_sz_4:
335    @mov        r10, #4             @reduce inc to #4 for 4x4
336    ldr         r7, [r6], #-2               @src[2nt-1-row] (dec to take into account row)
337    vdup.s16    d4, r7                      @src[2nt-1-row]
338
339    vsub.s8     d9, d2, d8                  @[nt-1-col]
340
341    vmull.u8    q6, d5, d0                  @(row+1)    *   src[nt-1]
342    vmlal.u8    q6, d6, d10                 @(nt-1-row) *   src[2nt+1+col]
343    vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
344    vmlal.u8    q6, d9, d4                  @(nt-1-col) *   src[2nt-1-row]
345@   vadd.i16    q6, q6, q8          @add (nt)
346@   vshl.s16    q6, q6, q7          @shr
347@   vmovn.i16   d12, q6
348    vrshrn.s16  d12,q6,#3                   @rounding narrow: (sum+4)>>3 == (sum+nt)>>(log2(nt)+1) for nt==4
349
350    vst1.s32    {d12}, [r2], r3
351
352    vadd.s8     d5, d5, d7                  @row++ [(row+1)++]
353    vsub.s8     d6, d6, d7                  @[nt-1-row]--
354    subs        r1, r1, #1
355
356    bne         loop_sz_4
357
358end_loop:
359    vpop        {d8 - d15}
360    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp (saved r14 pops into pc => return)
361
362
363
364
365
366
367
368