• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/*******************************************************************************
19@* @file
20@*  ihevc_deblk_luma_horz.s
21@*
22@* @brief
23@*  contains the function definition for the luma deblocking filter of a horizontal edge.
24@* the function is coded in arm neon assembly and can be compiled using
25
26@* rvct
27@*
28@* @author
29@*  anand s
30@*
31@* @par list of functions:
32@*
33@*
34@* @remarks
35@*  none
36@*
37@*******************************************************************************/
38
39.equ    qp_q_offset,                108
40.equ    beta_offset_div2_offset,    112
41.equ    tc_offset_div2_offset,      116
42.equ    filter_p_offset,            120
43.equ    filter_q_offset,            124
44
45.text
46.align 4
47
48
49
50
51
52.extern gai4_ihevc_tc_table
53.extern gai4_ihevc_beta_table
54.globl ihevc_deblk_luma_horz_a9q
55
56gai4_ihevc_tc_table_addr:
57.long gai4_ihevc_tc_table  - ulbl1 - 8
58
59gai4_ihevc_beta_table_addr:
60.long gai4_ihevc_beta_table  - ulbl2 - 8
61
62.type ihevc_deblk_luma_horz_a9q, %function
63
64ihevc_deblk_luma_horz_a9q:
65    stmfd       sp!, {r3-r12,lr}
66    vpush       {d8  -  d15}
67
68    ldr         r4,[sp,#qp_q_offset]
69    ldr         r5,[sp,#beta_offset_div2_offset]
70
71    add         r3,r3,r4
72    add         r3,r3,#1
73    ldr         r6, [sp,#tc_offset_div2_offset]
74    asr         r3,r3,#1
75    add         r7,r3,r5,lsl #1
76    add         r3,r3,r6,lsl #1
77    cmp         r7,#0x33
78    movgt       r7,#0x33
79    bgt         l1.1532
80    cmp         r7,#0x0
81    movlt       r7,#0x0                     @ r7 has the beta_index value
82l1.1532:
83    @     bic      r2,r2,#1
84    asr         r2,r2,#1
85
86    add         r3,r3,r2,lsl #1
87    cmp         r3,#0x35
88    movgt       r3,#0x35
89    bgt         l1.1564
90    cmp         r3,#0x0
91    movlt       r3,#0x0                     @ r3 has the tc_index value
92
93    @    qp_luma = (quant_param_p + quant_param_q + 1) >> 1@
94    @    beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)@
95    @    tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)@
96
97l1.1564:
98    ldr         r2,gai4_ihevc_beta_table_addr
99ulbl2:
100    add         r2,r2,pc
101    ldr         r4,gai4_ihevc_tc_table_addr
102ulbl1:
103    add         r4,r4,pc
104
105    ldr         r5,[r2,r7,lsl #2]           @ beta
106    ldr         r6,[r4,r3,lsl #2]           @ tc
107
108
109
110    cmp         r6,#0
111    beq         l1.2404
112    vmov.i16    d0,#0x2
113    lsl         r7,r6,#1
114    add         r14,r1,r1,lsl #1
115    ldr         r8,[r0,-r14]                @ -3 value
116    vdup.8      d1,r7
117    ldr         r10,[r0,-r1,lsl #1]         @-2 value
118    vdup.32     d23,r8                      @ -3 value
119    ldr         r11,[r0,-r1]                @-1 value
120    vdup.32     d24,r10                     @ -2 value
121    and         r8,#0xff
122    ldr         r12,[r0,#0]                 @ 0 value
123    vdup.32     d25, r11                    @-1 value
124    and         r10,#0xff
125    ldr         r9,[r0,r1]                  @ 1 value
126    vdup.32     d26,r12                     @ 0 value
127    and         r11,#0xff
128    ldr         r2,[r0,r1,lsl #1]           @ 2 value
129    vdup.32     d27,r9                      @ 1value
130    and         r12,#0xff
131    vdup.32     d28,r2                      @ 2 value
132    and         r9,#0xff
133    and         r2,#0xff
134
135    add         r12,r12,r2
136    subs        r9,r12,r9,lsl #1            @ dq0 value is stored in r9
137    rsbmi       r9,r9,#0
138    @dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )@
139
140    add         r8,r8,r11
141    subs        r8,r8,r10,lsl #1
142    rsbmi       r8,r8,#0                    @ dp0 value is stored in r8
143    @  dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )@
144
145
146
147    add         r3,r1,r1,lsl #1
148    add         r14,r0,#3
149
150
151    ldrb        r2,[r14,-r3]                @ -2 value
152    ldrb        r10,[r14,-r1,lsl #1]        @ -2 value
153    ldrb        r11,[r14,-r1]               @ -1 value
154    ldrb        r12,[r14,#0]                @ 0 value
155    ldrb        r3,[r14,r1]                 @ 1 value
156    ldrb        r4,[r14,r1,lsl #1]          @ 2 value
157
158
159    add         r12,r12,r4
160    subs        r12,r12,r3,lsl #1           @ dq3value is stored in r12
161    rsbmi       r12,r12,#0
162    @    dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )@
163
164
165    add         r2,r2,r11
166    subs        r11,r2,r10,lsl #1
167    rsbmi       r11,r11,#0                  @ dp3 value is stored in r8
168    @    dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2]   + pu1_src[3 * src_strd - 1] )@
169
170
171
172    add         r3,r8,r9                    @ r3 has the d0 value
173    add         r4,r11,r12                  @ r4 has the d3 value
174
175
176    @    d0 = dp0 + dq0@
177    @    d3 = dp3 + dq3@
178
179    add         r14,r8,r11                  @ r13 has the value dp
180    add         r12,r12,r9                  @ r12 has the value  dq
181    @    dp = dp0 + dp3@
182    @   dq = dq0 + dq3@
183
184    add         r11, r3, r4                 @ r3 has the value d
185
186    @   d = d0 + d3@
187
188
189    cmp         r11,r5
190    bge         l1.2404
191
192    @    if(d < beta)
193
194
195    @ registers which cannont be altered : r3,r4 r5,r6,r12,r13,r0,r1,r11
196
197    @ registers for use: r2,r7,r8,r9,r10,
198
199    asr         r10,r5,#2
200    vqadd.u8    d30,d26,d1
201    cmp         r10,r3,lsl #1
202    vqsub.u8    d31,d26,d1
203    ble         l1.1840
204    add         r10,r1,r1,lsl #1
205    vaddl.u8    q3,d25,d26
206    ldr         r2,[r0,-r1,lsl #2]          @ has the -4 value
207    ldrb        r7,[r0,-r1]                 @ has the -1 value
208    vdup.32     d22,r2                      @ -4 value
209    vaddw.u8    q4,q3,d27
210    ldrb        r3,[r0,#0]                  @ r4 has the 0 value
211    vqadd.u8    d16,d27,d1
212    and         r2,#0xff
213    vmul.i16    q6,q4,d0[0]
214    ldr         r8,[r0,r10]                 @ has the 3 value
215    vaddl.u8    q5,d24,d28
216    subs        r2,r2,r7
217    vqsub.u8    d17,d27,d1
218    vdup.32     d29,r8                      @ 3 value
219    and         r8,#0xff
220    vadd.i16    q6,q6,q5
221    rsbmi       r2,r2,#0
222    vrshrn.i16  d20,q6,#3
223    subs        r8,r8,r3
224    rsbmi       r8,r8,#0
225    vmin.u8     d18,d20,d30
226    add         r8,r8,r2
227
228    cmp         r8,r5,asr #3
229    bge         l1.1840
230    vaddw.u8    q7,q4,d28
231    subs        r7,r3,r7
232    vmax.u8     d4,d18,d31
233    rsbmi       r7,r7,#0
234    vqadd.u8    d30,d28,d1
235    mov         r10,#5
236    vrshrn.i16  d21,q7,#2
237    mul         r10,r10,r6
238    vqsub.u8    d31,d28,d1
239    add         r10,#1
240    cmp         r7,r10,asr #1
241    vmin.u8     d18,d21,d16
242    bge         l1.1840
243
244
245    @        if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4])  < (beta >> 3) )
246    @            && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )
247
248    vmax.u8     d5,d18,d17
249    asr         r10,r5,#2
250    vaddl.u8    q8,d29,d28
251    cmp         r10,r4,lsl #1
252    ble         l1.1840
253
254    add         r10,r1,r1,lsl #1
255    vmul.i16    q8,q8,d0[0]
256    add         r4,r0,#3
257
258
259    ldrb        r2,[r4,-r1,lsl #2]
260    vadd.i16    q8,q8,q7
261    ldrb        r7,[r4,-r1]
262    vrshrn.i16  d19,q8,#3
263    ldrb        r3,[r4,#0]
264    ldrb        r8,[r4,r10]
265    @   ubfx   r7,r2,#24,#8           @ has the -1 value
266    @  and    r2,#0xff               @ has the -4 value
267    @  ubfx   r8,r3,#24,#8           @ has the 3 value
268    @  and    r3,#0xff               @ r4 has the 0 value
269
270
271
272    subs        r8,r8,r3
273    vmin.u8     d18,d19,d30
274    rsbmi       r8,r8,#0
275    vaddl.u8    q3,d25,d24
276    subs        r2,r2,r7
277    vmax.u8     d3,d18,d31
278    rsbmi       r2,r2,#0
279    vaddw.u8    q4,q3,d26
280    add         r8,r8,r2
281    vqadd.u8    d30,d25,d1
282    cmp         r8,r5,asr #3
283    vqsub.u8    d31,d25,d1
284    bge         l1.1840
285    vmul.i16    q6,q4,d0[0]
286    subs        r7,r3,r7
287    vqadd.u8    d16,d24,d1
288    rsbmi       r7,r7,#0
289    vaddl.u8    q5,d23,d27
290    mov         r10,#5
291    vqsub.u8    d17,d24,d1
292    mul         r10,r10,r6
293    vadd.i16    q6,q6,q5
294    add         r10,#1
295    vrshrn.i16  d20,q6,#3
296    cmp         r7,r10,asr #1
297    vaddw.u8    q7,q4,d23
298    bge         l1.1840
299    vmin.u8     d18,d20,d30
300    mov         r2,#2
301    vqadd.u8    d30,d23,d1
302    ldr         r4,[sp,#filter_p_offset]         @ loading the filter_flag_p
303    vmax.u8     d2,d18,d31
304    ldr         r5,[sp,#filter_q_offset]         @ loading the filter_flag_q
305    vrshrn.i16  d21,q7,#2
306    b           end_dep_deq_decision_horz
307    @ r2 has the value of de
308    @ r6 has teh value of tc
309    @ r5 has the value of beta
310    @ r14 has the value of dp
311    @ r12 has the value of dq
312    @ r0 has the value of source address
313    @ r1 has the src stride
314
315l1.1840:
316    mov         r2,#1
317
318    mov         r11,r5
319    ldr         r4,[sp,#filter_p_offset]         @ loading the filter_flag_p
320    ldr         r5,[sp,#filter_q_offset]         @ loading the filter_flag_q
321
322    cmp         r6,#1
323    moveq       r9,#0
324    moveq       r10,#0
325    beq         end_dep_deq_decision_horz
326
327    and         r7,r4,r5
328    cmp         r7,#1
329    beq         both_flags_set_horz
330    cmp         r4,#0
331    beq         set_flag_dep_zero_horz
332
333
334    add         r8,r11,r11,asr #1
335    mov         r10,#0
336    asr         r8,#3
337    cmp         r8,r14
338    movgt       r9,#1
339    movle       r9,#0
340    b           end_dep_deq_decision_horz
341set_flag_dep_zero_horz:
342
343    add         r8,r11,r11,asr #1
344    mov         r9,#0
345    asr         r8,#3
346    cmp         r8,r12
347    movgt       r10,#1
348    movle       r10,#0
349    b           end_dep_deq_decision_horz
350
351both_flags_set_horz:
352    add         r8,r11,r11,asr #1
353    asr         r8,#3
354    cmp         r8,r14
355    movgt       r9,#1
356    movle       r9,#0
357    cmp         r8,r12
358    movgt       r10,#1
359    movle       r10,#0
360end_dep_deq_decision_horz:
361
362    @r0=source address
363    @r1=stride
364    @ r2 =de
365    @ r4=flag p
366    @r5= flag q
367    @r6 =tc
368    @ r9 =dep
369    @ r10=deq
370
371
372
373    @   add     r14,r1,r1,lsl #1
374    @   lsl     r7,r6,#1
375    @   vdup.8  d1,r7
376    @   vmov.i16  d0,#0x2
377    vmin.u8     d18,d21,d16
378    cmp         r2,#1
379    vqsub.u8    d31,d23,d1
380    beq         l1.2408
381    vaddl.u8    q4,d23,d22
382    cmp         r5,#1
383
384    bne         strong_filtering_p
385
386strong_filtering_q:
387    mov         r12,r0
388    vst1.32     d4[0],[r12],r1
389    vst1.32     d5[0],[r12],r1
390    vst1.32     d3[0],[r12]
391    cmp         r4,#1
392    bne         l1.2404
393strong_filtering_p:
394    vmax.u8     d5,d18,d17
395    mov         r12,r0
396    vmul.i16    q4,q4,d0[0]
397    rsb         r11,r1,#0
398    vadd.i16    q8,q4,q7
399    add         r12,r12,r11
400    vrshrn.i16  d19,q8,#3
401    vst1.32     d2[0],[r12],r11
402    vmin.u8     d18,d19,d30
403    vst1.32     d5[0],[r12],r11
404    vmax.u8     d3,d18,d31
405    vst1.32     d3[0],[r12]
406
407l1.2404:
408    vpop        {d8  -  d15}
409    ldmfd       sp!, {r3-r12,pc}
410
411    @ r4=flag p
412    @r5= flag q
413    @r6 =tc
414    @ r9 =dep
415    @ r10=deq
416
417
418    @       d22          -4 value
419
420    @d23        @ -3 value
421
422    @   vdup.32 d24,r11         @ -2 value
423
424    @   vdup.32 d25, r11        @-1 value
425
426    @   vdup.32 d26,r11         @ 0 value
427
428    @   vdup.32 d27,r11         @ 1value
429
430    @   vdup.32 d28,r11         @ 2 value
431
432    @   vdup.32 d29,r11         @ 3 value
433
434l1.2408:
435
436    vmov.i16    d0,#0x9
437
438    vsubl.u8    q5,d26,d25
439
440    vmul.i16    q5,q5,d0[0]
441
442    vmov.i16    d0,#0x3
443
444    vsubl.u8    q6,d27,d24
445    vmul.i16    q6,q6,d0[0]
446
447
448    vdup.8      d30,r6                      @ duplicating the +tc value
449
450    rsb         r12,r6,#0
451    vdup.8      d31,r12                     @ duplicating the -tc value
452
453
454
455    vsub.i16    q5,q5,q6
456
457
458
459    vrshr.s16   q5,q5,#4
460    @   delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@
461
462    vabs.s16    q4,q5
463    vmovn.i16   d9,q4
464    @ storing the absolute values of delta in d9
465
466    vqmovn.s16  d10,q5
467    @ storing the clipped values of delta in d16
468
469
470    vmin.s8     d11,d10,d30
471    vmax.s8     d8,d31,d11                  @ d8 has the value  delta = clip3(delta, -tc, tc)@
472
473
474    vmovl.u8    q3,d25
475
476    vaddw.s8    q2,q3,d8
477
478    vqmovun.s16 d12,q2
479    vmovl.u8    q3,d26
480    vsubw.s8    q2,q3,d8
481    vqmovun.s16 d13,q2
482
483
484    mov         r11,#0xa
485    mul         r12,r11,r6
486    vdup.8      d2,r12                      @ d2 has the 10*tc value
487    vmov        d18,d24
488    vdup.8      d0,r6
489    vshr.s8     d0,#1
490    vneg.s8     d1,d0
491
492    cmp         r4,#1
493    bne         l1.2724
494    cmp         r9,#1
495    bne         l1.2700
496
497    @ d12 and d13 have the value temp_p0 and temp_q0
498    vaddl.u8    q7,d23,d25
499    vrshrn.u16  d14,q7,#1
500    vsubl.u8    q7,d14,d24
501    vaddw.s8    q7,q7,d8
502    vqshrn.s16  d14,q7,#1
503    vmin.s8     d15,d14,d0
504    vmax.s8     d14,d1,d15
505
506    @ d14 has the delta p value
507    vmovl.u8    q8,d24
508    vaddw.s8    q8,q8,d14
509    vqmovun.s16 d14,q8
510
511    @  d14 =tmp_p1 = clip_u8(pu1_src[-2 * src_strd] + delta_p)@
512    vcge.u8     d18,d9,d2
513    vbsl        d18,d24,d14
514
515l1.2700:
516    mov         r12,r0
517    rsb         r11,r1,#0
518    add         r12,r11
519    vcge.u8     d19,d9,d2
520    vbsl        d19,d25,d12
521    vst1.32     {d19[0]},[r12],r11
522    vst1.32     {d18[0]},[r12]
523l1.2724:
524    cmp         r5,#1
525    bne         l1.2404
526    cmp         r10,#1
527    vmov        d18, d27
528    bne         l1.2852
529
530    vaddl.u8    q7,d26,d28
531    vrshrn.u16  d14,q7,#1
532    vsubl.u8    q7,d14,d27
533    vsubw.s8    q7,q7,d8
534    vqshrn.s16  d14,q7,#1
535    vmin.s8     d15,d14,d0
536    vmax.s8     d14,d1,d15
537@ d14 has the delta p value
538    vmovl.u8    q8,d27
539    vaddw.s8    q8,q8,d14
540    vqmovun.s16 d14,q8
541    vcge.u8     d18,d9,d2
542    vbsl        d18,d27,d14
543l1.2852:
544    mov         r12,r0
545    vcge.u8     d19,d9,d2
546    vbsl        d19,d26,d13
547    vst1.32     {d19[0]},[r12],r1
548    vst1.32     {d18[0]},[r12]
549
550    vpop        {d8  -  d15}
551    ldmfd       sp!, {r3-r12,r15}
552
553
554
555