• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@/*******************************************************************************
20@* @file
21@*  ihevc_deblk_luma_vert.s
22@*
23@* @brief
24@*  contains function definitions for deblocking of vertical luma edges.
25@* functions are coded in neon assembly and can be compiled using
26
27@* rvct
28@*
29@* @author
30@*  anand s
31@*
32@* @par list of functions:
33@*  - ihevc_deblk_luma_vert_a9q()
34@*
35@* @remarks
36@*  none
37@*
38@*******************************************************************************/
39
40.equ    qp_q_offset,                44
41.equ    beta_offset_div2_offset,    48
42.equ    tc_offset_div2_offset,      52
43.equ    filter_p_offset,            56
44.equ    filter_q_offset,            60
45
46.text
47.align 4
48
49
50
51
52
53.extern gai4_ihevc_tc_table
54.extern gai4_ihevc_beta_table
55
56.globl ihevc_deblk_luma_vert_a9q
57
58gai4_ihevc_tc_table_addr:
59.long gai4_ihevc_tc_table   - ulbl1 - 8
60
61gai4_ihevc_beta_table_addr:
62.long gai4_ihevc_beta_table   - ulbl2 - 8
63
64.type ihevc_deblk_luma_vert_a9q, %function
65
66ihevc_deblk_luma_vert_a9q:
67
68    push        {r3-r12,lr}
69    ldr         r4,[sp,#qp_q_offset]
70    ldr         r5,[sp,#beta_offset_div2_offset]
71
72    add         r3,r3,r4
73    add         r3,r3,#1
74    ldr         r6, [sp,#tc_offset_div2_offset]
75    asr         r3,r3,#1
76    add         r7,r3,r5,lsl #1
77    add         r3,r3,r6,lsl #1
78    cmp         r7,#0x33
79    movgt       r7,#0x33
80    bgt         l1.56
81    cmp         r7,#0x0
82    movlt       r7,#0x0                     @ r7 has the beta_index value
83l1.56:
84
85@     bic      r2,r2,#1
86    asr         r2,r2,#1
87
88    add         r3,r3,r2,lsl #1
89    cmp         r3,#0x35
90    movgt       r3,#0x35
91    bgt         l1.88
92    cmp         r3,#0x0
93    movlt       r3,#0x0                     @ r3 has the tc_index value
94
95@    qp_luma = (quant_param_p + quant_param_q + 1) >> 1@
96@    beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)@
97@    tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)@
98
99l1.88:
100    ldr         r2,gai4_ihevc_beta_table_addr
101ulbl2:
102    add         r2,r2,pc
103    vmov.i8     d18,#0x2
104    ldr         r4,gai4_ihevc_tc_table_addr
105ulbl1:
106    add         r4,r4,pc
107
108    ldr         r5,[r2,r7,lsl #2]           @ beta
109    vmov.i16    q8,#0x2
110    ldr         r6,[r4,r3,lsl #2]           @ tc
111    lsl         r8,r6,#1
112    cmp         r6,#0
113    vdup.8      d19,r8
114    sub         r7,r0,#4
115    vmov.i8     d23,#0x3
116    beq         l1.964
117
118
119    vld1.8      {d24},[r7],r1
120    ldrb        r8,[r0,#-3]                 @ -3 value
121    vld1.8      {d1},[r7],r1
122    ldrb        r10,[r0,#-2]                @-2 value
123    vld1.8      {d2},[r7],r1
124    ldrb        r11,[r0,#-1]                @-1 value
125    vld1.8      {d0},[r7]
126    ldrb        r12,[r0,#0]                 @ 0 value
127    ldrb        r9,[r0,#1]                  @ 1 value
128    vtrn.8      d24,d1
129    ldrb        r2,[r0,#2]                  @ 2 value
130    vtrn.8      d2,d0
131    add         r12,r12,r2
132    subs        r9,r12,r9,lsl #1            @ dq0 value is stored in r9
133    rsbmi       r9,r9,#0
134@dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )@
135    vtrn.16     d24,d2
136    add         r8,r8,r11
137    vtrn.16     d1,d0
138    subs        r8,r8,r10,lsl #1
139    rsbmi       r8,r8,#0                    @ dp0 value is stored in r8
140@  dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )@
141
142
143
144    add         r14,r1,r1,lsl #1
145    add         r14,r0,r14
146
147    vdup.32     d4,d24[1]
148    ldrb        r2,[r14,#-3]                @ -2 value
149    vdup.32     d7,d2[1]
150    ldrb        r10,[r14,#-2]               @ -2 value
151    vdup.32     d3,d2[0]
152    ldrb        r11,[r14,#-1]               @ -1 value
153    vdup.32     d5,d1[1]
154    ldrb        r12,[r14,#0]                @ 0 value
155    vdup.32     d6,d1[0]
156    ldrb        r3,[r14,#1]                 @ 1 value
157    vdup.32     d2,d0[0]
158    ldrb        r4,[r14,#2]                 @ 2 value
159
160
161    add         r12,r12,r4
162    subs        r12,r12,r3,lsl #1           @ dq3value is stored in r12
163    rsbmi       r12,r12,#0
164@    dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )@
165
166
167    add         r2,r2,r11
168    subs        r11,r2,r10,lsl #1
169    rsbmi       r11,r11,#0                  @ dp3 value is stored in r8
170@    dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2]   + pu1_src[3 * src_strd - 1] )@
171
172
173
174    add         r3,r8,r9                    @ r3 has the d0 value
175    add         r4,r11,r12                  @ r4 has the d3 value
176
177
178@    d0 = dp0 + dq0@
179@    d3 = dp3 + dq3@
180
181    add         r14,r8,r11                  @ r13 has the value dp
182    add         r12,r12,r9                  @ r12 has the value  dq
183@    dp = dp0 + dp3@
184@   dq = dq0 + dq3@
185
186    add         r11, r3, r4                 @ r3 has the value d
187
188@   d = d0 + d3@
189
190
191    cmp         r11,r5
192    vdup.32     d22,d0[1]
193    bge         l1.964
194
195@    if(d < beta)
196
197
198    @ registers which cannont be altered : r3,r4 r5,r6,r12,r13,r0,r1,r11
199
200    @ registers for use: r2,r7,r8,r9,r10,
201    vqsub.u8    d30,d7,d19
202    asr         r10,r5,#2
203    vqadd.u8    d31,d7,d19
204    cmp         r10,r3,lsl #1
205    vaddl.u8    q0,d5,d4
206    ble         l1.336
207
208    ldrb        r2,[r0,#-4]
209    vaddw.u8    q0,q0,d2
210    ldrb        r7,[r0,#-1]
211    vmull.u8    q10,d7,d23
212    ldrb        r3,[r0,#0]
213    vmlal.u8    q10,d22,d18
214    ldrb        r8,[r0,#3]
215@   ubfx   r7,r2,#24,#8           @ has the -1 value
216@  and    r2,#0xff               @ has the -4 value
217@  ubfx   r8,r3,#24,#8           @ has the 3 value
218@  and    r3,#0xff               @ r4 has the 0 value
219
220    vadd.i16    q10,q10,q0
221    subs        r8,r8,r3
222    vrshrn.i16  d22,q10,#3
223    rsbmi       r8,r8,#0
224    subs        r2,r2,r7
225    vmin.u8     d21,d22,d31
226    rsbmi       r2,r2,#0
227    vmax.u8     d22,d21,d30
228    add         r8,r8,r2
229    vaddl.u8    q10,d7,d3
230    cmp         r8,r5,asr #3
231    vmla.i16    q10,q0,q8
232    bge         l1.336
233    vaddw.u8    q0,q0,d7
234    subs        r7,r3,r7
235    vrshrn.i16  d20,q10,#3
236    rsbmi       r7,r7,#0
237    vrshrn.i16  d0,q0,#2
238    mov         r10,#5
239    vqadd.u8    d30,d5,d19
240    mul         r10,r10,r6
241    vqsub.u8    d31,d5,d19
242    add         r10,#1
243    cmp         r7,r10,asr #1
244    bge         l1.336
245
246
247@        if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4])  < (beta >> 3) )
248@            && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )
249
250
251    asr         r10,r5,#2
252    vqsub.u8    d25,d4,d19
253    cmp         r10,r4,lsl #1
254    vqadd.u8    d21,d4,d19
255    ble         l1.336
256    vmin.u8     d26,d20,d21
257    add         r4,r1,r1,lsl #1
258    add         r4,r4,r0
259    vmax.u8     d20,d26,d25
260    ldrb        r2,[r4,#-4]
261    vmin.u8     d19,d0,d30
262    ldrb        r7,[r4,#-1]
263    vmax.u8     d21,d19,d31
264    ldrb        r3,[r4,#0]
265    lsl         r10,r6,#1
266    ldrb        r8,[r4,#3]
267@   ubfx   r7,r2,#24,#8           @ has the -1 value
268@  and    r2,#0xff               @ has the -4 value
269@  ubfx   r8,r3,#24,#8           @ has the 3 value
270@  and    r3,#0xff               @ r4 has the 0 value
271    vaddl.u8    q0,d2,d3
272    vdup.8      d19,r10
273    subs        r8,r8,r3
274    vaddw.u8    q0,q0,d4
275    rsbmi       r8,r8,#0
276    vqadd.u8    d30,d2,d19
277    subs        r2,r2,r7
278    vqsub.u8    d31,d2,d19
279    rsbmi       r2,r2,#0
280    vaddl.u8    q13,d5,d6
281    add         r8,r8,r2
282    vmla.i16    q13,q0,q8
283    cmp         r8,r5,asr #3
284    bge         l1.336
285    vrshrn.i16  d26,q13,#3
286    subs        r7,r3,r7
287    vqadd.u8    d27,d3,d19
288    rsbmi       r7,r7,#0
289    vqsub.u8    d28,d3,d19
290    mov         r10,#5
291    vmin.u8     d16,d26,d30
292    mul         r10,r10,r6
293    add         r10,#1
294    cmp         r7,r10,asr #1
295    vmax.u8     d26,d16,d31
296    bge         l1.336
297    vqadd.u8    d30,d6,d19
298
299    mov         r2,#2
300    ldr         r4,[sp,#filter_p_offset]        @ loading the filter_flag_p
301    vqsub.u8    d31,d6,d19
302    ldr         r5,[sp,#filter_q_offset]        @ loading the filter_flag_q
303    b           end_dep_deq_decision
304@ r2 has the value of de
305@ r6 has teh value of tc
306@ r5 has the value of beta
307@ r14 has the value of dp
308@ r12 has the value of dq
309@ r0 has the value of source address
310@ r1 has the src stride
311
312l1.336:
313    mov         r2,#1
314l1.424:
315    mov         r11,r5
316    ldr         r4,[sp,#filter_p_offset]        @ loading the filter_flag_p
317    ldr         r5,[sp,#filter_q_offset]        @ loading the filter_flag_q
318
319    cmp         r6,#1
320    moveq       r9,#0
321    moveq       r10,#0
322    beq         end_dep_deq_decision
323
324    and         r7,r4,r5
325
326    cmp         r7,#1
327    beq         both_flags_set
328    cmp         r4,#0
329    beq         set_flag_dep_zero
330
331
332    add         r8,r11,r11,asr #1
333    mov         r10,#0
334    asr         r8,#3
335    cmp         r8,r14
336    movgt       r9,#1
337    movle       r9,#0
338    b           end_dep_deq_decision
339set_flag_dep_zero:
340
341    add         r8,r11,r11,asr #1
342    mov         r9,#0
343    asr         r8,#3
344    cmp         r8,r12
345    movgt       r10,#1
346    movle       r10,#0
347    b           end_dep_deq_decision
348
349both_flags_set:
350    add         r8,r11,r11,asr #1
351    asr         r8,#3
352    cmp         r8,r14
353    movgt       r9,#1
354    movle       r9,#0
355    cmp         r8,r12
356    movgt       r10,#1
357    movle       r10,#0
358end_dep_deq_decision:
359
360@r0=source address
361@r1=stride
362@ r2 =de
363@ r4=flag p
364@r5= flag q
365@r6 =tc
366@ r9 =dep
367@ r10=deq
368@   b   l1.964
369
370
371    cmp         r2,#2
372@ r4 has the value of de
373    bne         l1.968
374
375    cmp         r5,#0
376    beq         l1.780
377@ r5 has the flag of q
378
379    add         r3,r0,#2
380    vst1.8      {d22[0]},[r3],r1
381
382    vst1.8      {d22[1]},[r3],r1
383
384    vst1.8      {d22[2]},[r3],r1
385
386    vst1.8      {d22[3]},[r3]
387    add         r3,r0,r1
388    vtrn.8      d20,d21
389
390    vst1.16     {d20[0]},[r0]
391    vst1.16     {d21[0]},[r3],r1
392    vst1.16     {d20[1]},[r3],r1
393    vst1.16     {d21[1]},[r3]
394
395
396l1.780:
397    cmp         r4,#0
398    beq         l1.964
399    @ r5 has the flag p
400
401
402    vdup.32     d7,d24[0]
403    sub         r3,r0,#1
404    vaddw.u8    q8,q0,d6
405    add         r7,r3,r1
406    vrshrn.i16  d2,q8,#2
407    vst1.8      {d26[0]},[r3]
408    sub         r0,r0,#3
409    vmin.u8     d16,d2,d27
410    vst1.8      {d26[1]},[r7],r1
411    vmull.u8    q1,d6,d23
412    vmlal.u8    q1,d7,d18
413    vst1.8      {d26[2]},[r7],r1
414    vmax.u8     d5,d16,d28
415    vst1.8      {d26[3]},[r7]
416    vadd.i16    q0,q1,q0
417    vrshrn.i16  d0,q0,#3
418
419
420    vmin.u8     d1,d0,d30
421    vmax.u8     d0,d1,d31
422
423    vtrn.8      d0,d5
424    vst1.16     {d0[0]},[r0],r1
425    vst1.16     {d5[0]},[r0],r1
426    vst1.16     {d0[1]},[r0],r1
427    vst1.16     {d5[1]},[r0]
428l1.964:
429    pop         {r3-r12,pc}
430l1.968:
431
432
433    vmov.i16    q0,#0x9
434    rsb         r11,r6,#0
435    cmp         r4,#0
436    @ checks for the flag p
437    vmov.i16    q8,#0x3
438    vmov.i8     d24,#0x1
439
440
441    vdup.8      d30,r11
442    and         r11,r6,#0xff
443    vdup.8      d31,r11
444
445    vsubl.u8    q9,d4,d2
446    vmul.i16    q9,q9,q0
447    vsubl.u8    q0,d5,d3
448
449
450
451    vmul.i16    q8,q0,q8
452    vsub.i16    q8,q9,q8
453    vrshr.s16   q8,q8,#4
454@   delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@
455
456    vabs.s16    q0,q8
457    vmovn.i16   d0,q0
458    @ storing the absolute values of delta in d0
459
460    vqmovn.s16  d16,q8
461    @ storing the clipped values of delta in d16
462
463    vmov.i8     d1,#0xa
464    vdup.8      d21,r11
465    vmul.i8     d1,d1,d21
466    @ d1 stores the value (10 * tc)
467
468@if(abs(delta) < 10 * tc)
469
470    vmin.s8     d18,d16,d31
471    vmax.s8     d20,d18,d30
472
473@ delta = clip3(delta, -tc, tc)@
474    vmovl.s8    q8,d20
475    vmovl.u8    q9,d2
476    vadd.i16    q9,q9,q8
477
478    vqmovun.s16 d22,q9
479    vmovl.u8    q9,d4
480    vsub.i16    q8,q9,q8
481    vqmovun.s16 d23,q8
482@ tmp_p0 = clip_u8(pu1_src[-1] + delta)@
483@  tmp_q0 = clip_u8(pu1_src[0] - delta)@
484    beq         l1.1272
485
486
487
488    cmp         r9,#1
489    bne         l1.1212
490@ checks for the flag dep
491
492    asr         r3,r6,#1
493
494
495    vaddl.u8    q8,d6,d2
496    vaddw.u8    q8,q8,d24
497    vdup.8      d18,r3
498    rsb         r3,r3,#0
499    vdup.8      d19,r3
500    vshr.u16    q8,q8,#1
501    vmovn.i16   d16,q8
502
503    vsubl.u8    q8,d16,d3
504    vaddw.s8    q8,q8,d20
505    vshr.s16    q8,q8,#1
506    vqmovn.s16  d16,q8
507
508    vmin.s8     d17,d16,d18
509    vmax.s8     d16,d19,d17
510
511
512
513
514    vmovl.u8    q9,d3
515    vmovl.s8    q8,d16
516    vadd.i16    q8,q9,q8
517
518    vqmovun.s16 d16,q8
519    vmov        d30,d3
520    vcge.u8     d3,d0,d1
521
522
523    vbsl        d3,d30,d16
524l1.1212:
525    vdup.8      d16,r11
526    sub         r12,r0,#3
527    sub         r3,r0,#1
528@     vmul.i8  d16,d16,d1
529    vtrn.8      d6,d3
530    vst1.16     {d6[0]},[r12],r1
531    vcge.u8     d16,d0,d1
532    vst1.16     {d3[0]},[r12],r1
533    vbsl        d16,d2,d22
534    vst1.8      {d16[0]},[r3],r1
535    vst1.8      {d16[1]},[r3],r1
536    vst1.16     {d6[1]},[r12],r1
537    vst1.8      {d16[2]},[r3],r1
538    vst1.16     {d3[1]},[r12]
539    vst1.8      {d16[3]},[r3]
540l1.1272:
541    cmp         r5,#0
542    beq         l1.964
543    @ checks for the flag q
544    cmp         r10,#1
545    bne         l1.1412
546    @ checks for the flag deq
547    vmov        d2,d7
548    asr         r3,r6,#1
549
550    vdup.8      d6,r3
551    rsb         r3,r3,#0
552    vdup.8      d16,r3
553    vaddl.u8    q1,d2,d4
554    vaddw.u8    q1,q1,d24
555    vshr.u16    q1,q1,#1
556    vmovn.i16   d2,q1
557
558    vsubl.u8    q1,d2,d5
559    vsubw.s8    q1,q1,d20
560    vshr.s16    q1,q1,#1
561    vqmovn.s16  d3,q1
562
563    vmin.s8     d2,d3,d6
564    vmax.s8     d3,d16,d2
565    @  vdup.8   d6,r2
566    @   vmul.i8  d6,d6,d1
567
568
569
570    vmovl.u8    q8,d5
571    vmovl.s8    q1,d3
572    vadd.i16    q1,q8,q1
573    vqmovun.s16 d3,q1
574    vmov        d30,d5
575    vcge.u8     d5,d0,d1
576
577
578    vbsl        d5,d30,d3
579l1.1412:
580    @  vdup.8   d2,r2
581    add         r3,r0,#2
582    add         r11,r3,r1
583    @   vmul.i8  d1,d2,d1
584    vst1.8      {d7[0]},[r3]
585    vst1.8      {d7[1]},[r11],r1
586    vst1.8      {d7[2]},[r11],r1
587    vcge.u8     d0,d0,d1
588    vst1.8      {d7[3]},[r11]
589    vbsl        d0,d4,d23
590    vtrn.8      d0,d5
591    vst1.16     {d0[0]},[r0],r1
592    vst1.16     {d5[0]},[r0],r1
593    vst1.16     {d0[1]},[r0],r1
594    vst1.16     {d5[1]},[r0]
595    pop         {r3-r12,pc}
596
597
598
599