1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@*******************************************************************************
20@* @file
21@*  ihevc_intra_pred_filters_planar.s
22@*
@* @brief
@*  contains function definitions for planar intra prediction.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
28@*
29@* @author
30@*  akshaya mukund
31@*
32@* @par list of functions:
33@*
34@*
35@* @remarks
36@*  none
37@*
38@*******************************************************************************
39@*/
40@/**
41@*******************************************************************************
42@*
43@* @brief
@*    chroma intraprediction filter for planar input
45@*
46@* @par description:
47@*
48@* @param[in] pu1_ref
49@*  uword8 pointer to the source
50@*
51@* @param[out] pu1_dst
52@*  uword8 pointer to the destination
53@*
54@* @param[in] src_strd
55@*  integer source stride
56@*
57@* @param[in] dst_strd
58@*  integer destination stride
59@*
60@* @param[in] pi1_coeff
61@*  word8 pointer to the planar coefficients
62@*
63@* @param[in] nt
64@*  size of tranform block
65@*
66@* @param[in] mode
67@*  type of filtering
68@*
69@* @returns
70@*
71@* @remarks
72@*  none
73@*
74@*******************************************************************************
75@*/
76
@void ihevc_intra_pred_chroma_planar(uword8* pu1_ref,
@                                    word32 src_strd,
@                                    uword8* pu1_dst,
@                                    word32 dst_strd,
@                                    word32 nt,
@                                    word32 mode,
@                                    word32 pi1_coeff)
84@**************variables vs registers*****************************************
85@r0 => *pu1_ref
86@r1 => src_strd
87@r2 => *pu1_dst
88@r3 => dst_strd
89
90@stack contents from #40
91@   nt
92@   mode
93@   pi1_coeff
94
95.text
96.align 4
97
98
99
100
101.globl ihevc_intra_pred_chroma_planar_a9q
102.extern gau1_ihevc_planar_factor
103
104gau1_ihevc_planar_factor_addr:
105.long gau1_ihevc_planar_factor - ulbl1 - 8
106
107.type ihevc_intra_pred_chroma_planar_a9q, %function
108
109ihevc_intra_pred_chroma_planar_a9q:
110
111    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
112
113    ldr         r4,[sp,#40]                 @loads nt
114    ldr         r11, gau1_ihevc_planar_factor_addr @loads table of coeffs
115ulbl1:
116    add         r11,r11,pc
117
118    clz         r5, r4
119    rsb         r5, r5, #32
120    vdup.16     q7, r5
121    vneg.s16    q7, q7                      @shr value (so vneg)
122    vdup.8      d2, r4                      @nt
123    vdup.s16    q8, r4                      @nt
124
125    sub         r6, r4, #1                  @nt-1
126    add         r6, r0,r6,lsl #1            @2*(nt-1)
127    ldr         r7, [r6]
128    vdup.s16    d0, r7                      @src[nt-1]
129
130    add         r6, r4, r4,lsl #1           @3nt
131    add         r6, r6, #1                  @3nt + 1
132    lsl         r6,r6,#1                    @2*(3nt + 1)
133
134    add         r6, r6, r0
135    ldr         r7, [r6]
136    vdup.s16    d1, r7                      @src[3nt+1]
137
138
139    add         r6, r4, r4                  @2nt
140    add         r14, r6, #1                 @2nt+1
141    lsl         r14,#1                      @2*(2nt+1)
142    sub         r6, r6, #1                  @2nt-1
143    lsl         r6,#1                       @2*(2nt-1)
144    add         r6, r6, r0                  @&src[2nt-1]
145    add         r14, r14, r0                @&src[2nt+1]
146
147    mov         r8, #1                      @row+1 (row is first 0)
148    sub         r9, r4, r8                  @nt-1-row (row is first 0)
149
150    vdup.s8     d5, r8                      @row + 1
151    vdup.s8     d6, r9                      @nt - 1 - row
152    vmov        d7, d5                      @mov #1 to d7 to used for inc for row+1 and dec for nt-1-row
153
154    add         r12, r11, #1                @coeffs (to be reloaded after every row)
155    mov         r1, r4                      @nt (row counter) (dec after every row)
156    mov         r5, r2                      @dst (to be reloaded after every row and inc by dst_strd)
157    mov         r10, #8                     @increment for the coeffs
158    mov         r0, r14                     @&src[2nt+1] (to be reloaded after every row)
159
160    cmp         r4, #4
161    beq         tf_sz_4
162
163
164
165    mov         r10,r6
166tf_sz_8_16:
167    vld1.s8     {d10,d11}, [r14]!           @load src[2nt+1+col]
168    vld1.s8     d8, [r12]!
169    vmov        d9,d8
170    vzip.8      d8,d9
171    vsub.s8     d30, d2, d8                 @[nt-1-col]
172    vsub.s8     d31, d2, d9
173
174
175
176
177loop_sz_8_16:
178
179    ldr         r7, [r6], #-2               @src[2nt-1-row] (dec to take into account row)
180    vmull.u8    q6, d5, d0                  @(row+1)    *   src[nt-1]
181    ldr         r11, [r6], #-2              @src[2nt-1-row] (dec to take into account row)
182    vmlal.u8    q6, d6, d10                 @(nt-1-row) *   src[2nt+1+col]
183    vdup.s16    d4, r7                      @src[2nt-1-row]
184    vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
185    vdup.s16    d3, r11                     @src[2nt-1-row]
186    vmlal.u8    q6, d30, d4                 @(nt-1-col) *   src[2nt-1-row]
187
188
189
190    vmull.u8    q14,d5,d0
191    ldr         r7, [r6], #-2               @src[2nt-1-row] (dec to take into account row)
192    vmlal.u8    q14,d6,d11
193    vadd.s8     d18, d5, d7                 @row++ [(row+1)++]c
194
195
196    vmlal.u8    q14,d31,d4
197    vsub.s8     d19, d6, d7                 @[nt-1-row]--
198    vmlal.u8    q14,d9,d1
199    vdup.s16    d4, r7                      @src[2nt-1-row]
200
201    vmull.u8    q13, d18, d0                @(row+1)    *   src[nt-1]
202    vadd.i16    q6, q6, q8                  @add (nt)
203    vmlal.u8    q13, d19, d10               @(nt-1-row) *   src[2nt+1+col]
204    vshl.s16    q6, q6, q7                  @shr
205    vmlal.u8    q13, d8, d1                 @(col+1)    *   src[3nt+1]
206    vadd.i16    q14,q14,q8
207    vmlal.u8    q13, d30, d3                @(nt-1-col) *   src[2nt-1-row]
208    vshl.s16    q14,q14,q7
209
210
211
212
213
214    vmull.u8    q12,d18,d0
215    vadd.s8     d5, d18, d7                 @row++ [(row+1)++]
216    vmlal.u8    q12,d19,d11
217    vsub.s8     d6, d19, d7                 @[nt-1-row]--
218    vmlal.u8    q12,d9,d1
219    vmovn.i16   d12, q6
220    vmlal.u8    q12,d31,d3
221    vmovn.i16   d13,q14
222
223
224
225
226    vadd.i16    q13, q13, q8                @add (nt)
227    vmull.u8    q11, d5, d0                 @(row+1)    *   src[nt-1]
228    vshl.s16    q13, q13, q7                @shr
229    vmlal.u8    q11, d6, d10                @(nt-1-row) *   src[2nt+1+col]
230    vst1.s32    {d12,d13}, [r2], r3
231    vmlal.u8    q11, d8, d1                 @(col+1)    *   src[3nt+1]
232    vadd.i16    q12,q12,q8
233    vmlal.u8    q11, d30, d4                @(nt-1-col) *   src[2nt-1-row]
234    vshl.s16    q12,q12,q7
235
236    vmull.u8    q10,d5,d0
237    vadd.s8     d18, d5, d7                 @row++ [(row+1)++]c
238    vmlal.u8    q10,d6,d11
239    vsub.s8     d19, d6, d7                 @[nt-1-row]--
240    vmlal.u8    q10,d31,d4
241
242    ldr         r11, [r6], #-2              @src[2nt-1-row] (dec to take into account row)
243    vmlal.u8    q10,d9,d1
244    vdup.s16    d3, r11                     @src[2nt-1-row]
245    vadd.i16    q11, q11, q8                @add (nt)
246
247    vmull.u8    q6, d18, d0                 @(row+1)    *   src[nt-1]
248    vmovn.i16   d26, q13
249    vmlal.u8    q6, d19, d10                @(nt-1-row) *   src[2nt+1+col]
250    vmovn.i16   d27,q12
251
252    vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
253    vshl.s16    q11, q11, q7                @shr
254
255    vmlal.u8    q6, d30, d3                 @(nt-1-col) *   src[2nt-1-row]
256    vadd.i16    q10,q10,q8
257
258    vmull.u8    q14,d18,d0
259    vst1.s32    {d26,d27}, [r2], r3
260
261    vmlal.u8    q14,d19,d11
262    vadd.s8     d5, d18, d7                 @row++ [(row+1)++]
263
264    vsub.s8     d6, d19, d7                 @[nt-1-row]--
265    vmlal.u8    q14,d9,d1
266
267    vmlal.u8    q14,d31,d3
268    vshl.s16    q10,q10,q7
269
270
271    vadd.i16    q6, q6 ,q8                  @add (nt)
272    vmovn.i16   d22, q11
273
274
275    vadd.i16    q14,q14,q8
276    vmovn.i16   d23,q10
277
278
279    vshl.s16    q6, q6, q7                  @shr
280    vst1.s32    {d22,d23}, [r2], r3
281    vshl.s16    q14,q14,q7
282
283
284
285
286
287    vmovn.i16   d20, q6
288    vmovn.i16   d21,q14
289
290    vst1.s32    {d20,d21}, [r2], r3
291
292
293    subs        r1, r1, #4
294
295    bne         loop_sz_8_16
296
297
298
299
300    cmp         r4,#16
301
302    bne         end_loop
303
304
305    sub         r4,#16
306    vdup.s8     d5, r8                      @row + 1
307    vdup.s8     d6, r9                      @nt - 1 - row
308    vmov        d7, d5                      @mov #1 to d7 to used for inc for row+1 and dec for nt-1-row
309
310    mov         r6,r10
311    mov         r1,#16
312    sub         r2,r2,r3,lsl #4
313    add         r2,r2,#16
314
315    vld1.s8     {d10,d11}, [r14]!           @load src[2nt+1+col]
316    vld1.s8     d8, [r12]!
317    vmov        d9,d8
318    vzip.8      d8,d9
319    vsub.s8     d30, d2, d8                 @[nt-1-col]
320    vsub.s8     d31, d2, d9
321
322    beq         loop_sz_8_16
323
324
325
326tf_sz_4:
327    vld1.s8     d10, [r14]                  @load src[2nt+1+col]
328    vld1.s8     d8, [r12], r10              @load 8 coeffs [col+1]
329    vmov        d9,d8
330    vzip.8      d8,d9
331loop_sz_4:
332    @mov        r10, #4             @reduce inc to #4 for 4x4
333    ldr         r7, [r6], #-2               @src[2nt-1-row] (dec to take into account row)
334    vdup.s16    d4, r7                      @src[2nt-1-row]
335
336    vsub.s8     d9, d2, d8                  @[nt-1-col]
337
338    vmull.u8    q6, d5, d0                  @(row+1)    *   src[nt-1]
339    vmlal.u8    q6, d6, d10                 @(nt-1-row) *   src[2nt+1+col]
340    vmlal.u8    q6, d8, d1                  @(col+1)    *   src[3nt+1]
341    vmlal.u8    q6, d9, d4                  @(nt-1-col) *   src[2nt-1-row]
342@   vadd.i16    q6, q6, q8          @add (nt)
343@   vshl.s16    q6, q6, q7          @shr
344@   vmovn.i16   d12, q6
345    vrshrn.s16  d12,q6,#3
346
347    vst1.s32    {d12}, [r2], r3
348
349    vadd.s8     d5, d5, d7                  @row++ [(row+1)++]
350    vsub.s8     d6, d6, d7                  @[nt-1-row]--
351    subs        r1, r1, #1
352
353    bne         loop_sz_4
354
355end_loop:
356    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
357
358
359
360
361
362
363
364