• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@/*******************************************************************************
20@* @file
21@*  ihevc_deblk_luma_vert.s
22@*
23@* @brief
24@*  contains the function definition for luma deblocking of a vertical edge.
25@* the function is coded using neon assembly and can be compiled using
26
27@* rvct
28@*
29@* @author
30@*  anand s
31@*
32@* @par list of functions:
33@*  - ihevc_deblk_luma_vert_a9q()
34@*
35@* @remarks
36@*  none
37@*
38@*******************************************************************************/
39
40.text
41.align 4
42
43
44
45
46@ lookup tables defined outside this file; the function indexes both as 32-bit words (ldr ...,lsl #2)
47.extern gai4_ihevc_tc_table
48.extern gai4_ihevc_beta_table
49
50.globl ihevc_deblk_luma_vert_a9q
51@ pc-relative literals: each word is (table - (ulblN + 8)); the function loads it and adds pc at ulblN to rebuild the table address
52gai4_ihevc_tc_table_addr:
53.long gai4_ihevc_tc_table   - ulbl1 - 8      @ the 8 matches pc reading as (instruction address + 8) in ARM state
54
55gai4_ihevc_beta_table_addr:
56.long gai4_ihevc_beta_table   - ulbl2 - 8    @ same scheme, relative to the add at ulbl2
57
58.type ihevc_deblk_luma_vert_a9q, %function
59
60ihevc_deblk_luma_vert_a9q:
@ ---------------------------------------------------------------------------
@ ihevc_deblk_luma_vert_a9q
@
@ Deblocks four rows of luma pixels across a vertical edge. For every row the
@ eight pixels pu1_src[-4] .. pu1_src[3] are read; depending on the decisions
@ below up to three pixels on each side of the edge are rewritten in place.
@
@ Arguments as read by the code (names taken from the pseudo-code comments):
@   r0          pu1_src          address of pu1_src[0] in row 0
@   r1          src_strd         byte distance between consecutive rows
@   r2          bs               only (bs >> 1) is used, for the tc index
@   r3          quant_param_p
@   [sp,#0x2c]  quant_param_q    (offsets are relative to sp after the push)
@   [sp,#0x30]  beta_offset_div2
@   [sp,#0x34]  tc_offset_div2
@   [sp,#0x38]  filter_flag_p    0 => pixels at negative offsets are not written
@   [sp,#0x3c]  filter_flag_q    0 => pixels at offsets >= 0 are not written
@
@ Register usage: r3-r12 and lr are saved on entry and restored on exit.
@ NEON registers d0-d7 and d16-d31 are used without being saved; d8-d15 are
@ not touched.
@
@ Flow:
@   1. derive beta and tc from the two lookup tables; return if tc == 0
@   2. load 4 rows x 8 pixels and transpose them into per-column registers
@   3. compute d = dp0 + dq0 + dp3 + dq3 from rows 0 and 3; return if d >= beta
@   4. decide de (2 = strong filter, 1 = normal filter), dep and deq
@   5. apply the selected filter and store the results
@ ---------------------------------------------------------------------------
61
62    push        {r3-r12,lr}                 @ 11 words = 0x2c bytes, so the stack arguments start at [sp,#0x2c]
63    ldr         r4,[sp,#0x2c]               @ r4 = quant_param_q
64    ldr         r5,[sp,#0x30]               @ r5 = beta_offset_div2
65
66    add         r3,r3,r4
67    add         r3,r3,#1
68    ldr         r6, [sp,#0x34]              @ r6 = tc_offset_div2
69    asr         r3,r3,#1                    @ r3 = qp_luma = (quant_param_p + quant_param_q + 1) >> 1
70    add         r7,r3,r5,lsl #1             @ r7 = qp_luma + (beta_offset_div2 << 1)
71    add         r3,r3,r6,lsl #1             @ r3 = qp_luma + (tc_offset_div2 << 1)
72    cmp         r7,#0x33                    @ clip beta index to [0, 51]
73    movgt       r7,#0x33
74    bgt         l1.56
75    cmp         r7,#0x0
76    movlt       r7,#0x0                     @ r7 has the beta_index value
77l1.56:
78
79@     bic      r2,r2,#1
80    asr         r2,r2,#1                    @ r2 = bs >> 1
81
82    add         r3,r3,r2,lsl #1             @ r3 += 2 * (bs >> 1)
83    cmp         r3,#0x35                    @ clip tc index to [0, 53]
84    movgt       r3,#0x35
85    bgt         l1.88
86    cmp         r3,#0x0
87    movlt       r3,#0x0                     @ r3 has the tc_index value
88
89@    qp_luma = (quant_param_p + quant_param_q + 1) >> 1@
90@    beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)@
91@    tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)@
92
93l1.88:
94    ldr         r2,gai4_ihevc_beta_table_addr   @ r2 = pc-relative offset of the beta table
95ulbl2:
96    add         r2,r2,pc                    @ r2 = address of gai4_ihevc_beta_table
97    vmov.i8     d18,#0x2                    @ d18 = 2 in every byte (multiplier for the strong filter)
98    ldr         r4,gai4_ihevc_tc_table_addr     @ r4 = pc-relative offset of the tc table
99ulbl1:
100    add         r4,r4,pc                    @ r4 = address of gai4_ihevc_tc_table
101
102    ldr         r5,[r2,r7,lsl #2]           @ beta
103    vmov.i16    q8,#0x2                     @ q8 = 2 in every 16-bit lane
104    ldr         r6,[r4,r3,lsl #2]           @ tc
105    lsl         r8,r6,#1                    @ r8 = 2 * tc
106    cmp         r6,#0
107    vdup.8      d19,r8                      @ d19 = 2 * tc in every byte (strong filter clip range)
108    sub         r7,r0,#4                    @ r7 = address of pu1_src[-4] in row 0
109    vmov.i8     d23,#0x3                    @ d23 = 3 in every byte (multiplier for the strong filter)
110    beq         l1.964                      @ tc == 0 : return without filtering
111@ load rows 0..3 (8 pixels each) with neon while the arm side reads rows 0 and 3 for the decisions
112
113    vld1.8      {d24},[r7],r1               @ d24 = row 0, pu1_src[-4..3]
114    ldrb        r8,[r0,#-3]                 @ -3 value
115    vld1.8      {d1},[r7],r1                @ d1 = row 1
116    ldrb        r10,[r0,#-2]                @-2 value
117    vld1.8      {d2},[r7],r1                @ d2 = row 2
118    ldrb        r11,[r0,#-1]                @-1 value
119    vld1.8      {d0},[r7]                   @ d0 = row 3
120    ldrb        r12,[r0,#0]                 @ 0 value
121    ldrb        r9,[r0,#1]                  @ 1 value
122    vtrn.8      d24,d1                      @ first step of the 4x8 transpose (rows 0/1)
123    ldrb        r2,[r0,#2]                  @ 2 value
124    vtrn.8      d2,d0                       @ first step of the 4x8 transpose (rows 2/3)
125    add         r12,r12,r2
126    subs        r9,r12,r9,lsl #1            @ dq0 value is stored in r9
127    rsbmi       r9,r9,#0
128@dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )@
129    vtrn.16     d24,d2                      @ second step of the transpose
130    add         r8,r8,r11
131    vtrn.16     d1,d0                       @ second step of the transpose
132    subs        r8,r8,r10,lsl #1
133    rsbmi       r8,r8,#0                    @ dp0 value is stored in r8
134@  dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )@
135@ after the transpose each 32-bit half holds one pixel column for rows 0..3 (low half, high half):
136@   d24 = {pu1_src[-4], pu1_src[0]}   d1 = {pu1_src[-3], pu1_src[1]}
137@   d2  = {pu1_src[-2], pu1_src[2]}   d0 = {pu1_src[-1], pu1_src[3]}
138    add         r14,r1,r1,lsl #1            @ r14 = 3 * src_strd
139    add         r14,r0,r14                  @ r14 = address of pu1_src[0] in row 3
140
141    vdup.32     d4,d24[1]                   @ d4 = column pu1_src[0]  (rows 0..3, repeated in both halves)
142    ldrb        r2,[r14,#-3]                @ -3 value
143    vdup.32     d7,d2[1]                    @ d7 = column pu1_src[2]
144    ldrb        r10,[r14,#-2]               @ -2 value
145    vdup.32     d3,d2[0]                    @ d3 = column pu1_src[-2]
146    ldrb        r11,[r14,#-1]               @ -1 value
147    vdup.32     d5,d1[1]                    @ d5 = column pu1_src[1]
148    ldrb        r12,[r14,#0]                @ 0 value
149    vdup.32     d6,d1[0]                    @ d6 = column pu1_src[-3]
150    ldrb        r3,[r14,#1]                 @ 1 value
151    vdup.32     d2,d0[0]                    @ d2 = column pu1_src[-1]
152    ldrb        r4,[r14,#2]                 @ 2 value
153
154
155    add         r12,r12,r4
156    subs        r12,r12,r3,lsl #1           @ dq3value is stored in r12
157    rsbmi       r12,r12,#0
158@    dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )@
159
160
161    add         r2,r2,r11
162    subs        r11,r2,r10,lsl #1
163    rsbmi       r11,r11,#0                  @ dp3 value is stored in r11
164@    dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2]   + pu1_src[3 * src_strd - 1] )@
165
166
167
168    add         r3,r8,r9                    @ r3 has the d0 value
169    add         r4,r11,r12                  @ r4 has the d3 value
170
171
172@    d0 = dp0 + dq0@
173@    d3 = dp3 + dq3@
174
175    add         r14,r8,r11                  @ r14 has the value dp
176    add         r12,r12,r9                  @ r12 has the value  dq
177@    dp = dp0 + dp3@
178@   dq = dq0 + dq3@
179
180    add         r11, r3, r4                 @ r11 has the value d
181
182@   d = d0 + d3@
183
184
185    cmp         r11,r5
186    vdup.32     d22,d0[1]                   @ d22 = column pu1_src[3]
187    bge         l1.964                      @ d >= beta : return without filtering
188
189@    if(d < beta)
190
191@ strong-filter decision for row 0, interleaved with the neon strong-filter arithmetic for the q side
192    @ registers which cannot be altered : r3,r4 r5,r6,r12,r13,r0,r1,r11 (dp lives in r14, so r14 must be preserved too)
193
194    @ registers for use: r2,r7,r8,r9,r10,
195    vqsub.u8    d30,d7,d19                  @ d30 = pu1_src[2] - 2*tc (saturating)
196    asr         r10,r5,#2                   @ r10 = beta >> 2
197    vqadd.u8    d31,d7,d19                  @ d31 = pu1_src[2] + 2*tc (saturating)
198    cmp         r10,r3,lsl #1               @ (beta >> 2) against 2 * d0
199    vaddl.u8    q0,d5,d4                    @ q0 = pu1_src[1] + pu1_src[0]
200    ble         l1.336                      @ 2 * d0 >= (beta >> 2) : normal filter
201
202    ldrb        r2,[r0,#-4]                 @ r2 = pu1_src[-4] (row 0)
203    vaddw.u8    q0,q0,d2                    @ q0 += pu1_src[-1]
204    ldrb        r7,[r0,#-1]                 @ r7 = pu1_src[-1]
205    vmull.u8    q10,d7,d23                  @ q10 = 3 * pu1_src[2]
206    ldrb        r3,[r0,#0]                  @ r3 = pu1_src[0]
207    vmlal.u8    q10,d22,d18                 @ q10 += 2 * pu1_src[3]
208    ldrb        r8,[r0,#3]                  @ r8 = pu1_src[3]
209@   ubfx   r7,r2,#24,#8           @ has the -1 value
210@  and    r2,#0xff               @ has the -4 value
211@  ubfx   r8,r3,#24,#8           @ has the 3 value
212@  and    r3,#0xff               @ r4 has the 0 value
213
214    vadd.i16    q10,q10,q0                  @ q10 = 2*src[3] + 3*src[2] + src[1] + src[0] + src[-1]
215    subs        r8,r8,r3
216    vrshrn.i16  d22,q10,#3                  @ d22 = (q10 + 4) >> 3 : filtered pu1_src[2]
217    rsbmi       r8,r8,#0                    @ r8 = abs(pu1_src[3] - pu1_src[0])
218    subs        r2,r2,r7
219    vmin.u8     d21,d22,d31
220    rsbmi       r2,r2,#0                    @ r2 = abs(pu1_src[-4] - pu1_src[-1])
221    vmax.u8     d22,d21,d30                 @ d22 = filtered pu1_src[2] clipped to pu1_src[2] +/- 2*tc
222    add         r8,r8,r2
223    vaddl.u8    q10,d7,d3                   @ q10 = pu1_src[2] + pu1_src[-2]
224    cmp         r8,r5,asr #3                @ against beta >> 3
225    vmla.i16    q10,q0,q8                   @ q10 += 2 * (src[1] + src[0] + src[-1])
226    bge         l1.336                      @ flatness test failed for row 0 : normal filter
227    vaddw.u8    q0,q0,d7                    @ q0 = src[-1] + src[0] + src[1] + src[2]
228    subs        r7,r3,r7
229    vrshrn.i16  d20,q10,#3                  @ d20 = (q10 + 4) >> 3 : filtered pu1_src[0] (not yet clipped)
230    rsbmi       r7,r7,#0                    @ r7 = abs(pu1_src[0] - pu1_src[-1])
231    vrshrn.i16  d0,q0,#2                    @ d0 = (q0 + 2) >> 2 : filtered pu1_src[1] (not yet clipped)
232    mov         r10,#5
233    vqadd.u8    d30,d5,d19                  @ d30 = pu1_src[1] + 2*tc
234    mul         r10,r10,r6                  @ r10 = 5 * tc
235    vqsub.u8    d31,d5,d19                  @ d31 = pu1_src[1] - 2*tc
236    add         r10,#1
237    cmp         r7,r10,asr #1               @ against (5 * tc + 1) >> 1
238    bge         l1.336                      @ edge step too large for row 0 : normal filter
239
240@ row 0 passed all three tests; the same three tests are repeated below for row 3 (d3 in r4)
241@        if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4])  < (beta >> 3) )
242@            && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )
243
244
245    asr         r10,r5,#2                   @ r10 = beta >> 2
246    vqsub.u8    d25,d4,d19                  @ d25 = pu1_src[0] - 2*tc
247    cmp         r10,r4,lsl #1               @ (beta >> 2) against 2 * d3
248    vqadd.u8    d21,d4,d19                  @ d21 = pu1_src[0] + 2*tc
249    ble         l1.336                      @ 2 * d3 >= (beta >> 2) : normal filter
250    vmin.u8     d26,d20,d21
251    add         r4,r1,r1,lsl #1             @ r4 = 3 * src_strd
252    add         r4,r4,r0                    @ r4 = address of pu1_src[0] in row 3
253    vmax.u8     d20,d26,d25                 @ d20 = filtered pu1_src[0], clipped
254    ldrb        r2,[r4,#-4]                 @ r2 = pu1_src[-4] (row 3)
255    vmin.u8     d19,d0,d30                  @ d19 is used as scratch here and reloaded with 2*tc below
256    ldrb        r7,[r4,#-1]                 @ r7 = pu1_src[-1] (row 3)
257    vmax.u8     d21,d19,d31                 @ d21 = filtered pu1_src[1], clipped
258    ldrb        r3,[r4,#0]                  @ r3 = pu1_src[0] (row 3)
259    lsl         r10,r6,#1                   @ r10 = 2 * tc
260    ldrb        r8,[r4,#3]                  @ r8 = pu1_src[3] (row 3)
261@   ubfx   r7,r2,#24,#8           @ has the -1 value
262@  and    r2,#0xff               @ has the -4 value
263@  ubfx   r8,r3,#24,#8           @ has the 3 value
264@  and    r3,#0xff               @ r4 has the 0 value
265    vaddl.u8    q0,d2,d3                    @ q0 = pu1_src[-1] + pu1_src[-2]
266    vdup.8      d19,r10                     @ d19 = 2 * tc in every byte again
267    subs        r8,r8,r3
268    vaddw.u8    q0,q0,d4                    @ q0 += pu1_src[0]
269    rsbmi       r8,r8,#0                    @ r8 = abs(pu1_src[3] - pu1_src[0]) for row 3
270    vqadd.u8    d30,d2,d19                  @ d30 = pu1_src[-1] + 2*tc
271    subs        r2,r2,r7
272    vqsub.u8    d31,d2,d19                  @ d31 = pu1_src[-1] - 2*tc
273    rsbmi       r2,r2,#0                    @ r2 = abs(pu1_src[-4] - pu1_src[-1]) for row 3
274    vaddl.u8    q13,d5,d6                   @ q13 = pu1_src[1] + pu1_src[-3]
275    add         r8,r8,r2
276    vmla.i16    q13,q0,q8                   @ q13 += 2 * (src[-1] + src[-2] + src[0])
277    cmp         r8,r5,asr #3                @ against beta >> 3
278    bge         l1.336                      @ flatness test failed for row 3 : normal filter
279    vrshrn.i16  d26,q13,#3                  @ d26 = (q13 + 4) >> 3 : filtered pu1_src[-1] (not yet clipped)
280    subs        r7,r3,r7
281    vqadd.u8    d27,d3,d19                  @ d27 = pu1_src[-2] + 2*tc
282    rsbmi       r7,r7,#0                    @ r7 = abs(pu1_src[0] - pu1_src[-1]) for row 3
283    vqsub.u8    d28,d3,d19                  @ d28 = pu1_src[-2] - 2*tc
284    mov         r10,#5
285    vmin.u8     d16,d26,d30
286    mul         r10,r10,r6                  @ r10 = 5 * tc
287    add         r10,#1
288    cmp         r7,r10,asr #1               @ against (5 * tc + 1) >> 1
289    vmax.u8     d26,d16,d31                 @ d26 = filtered pu1_src[-1], clipped
290    bge         l1.336                      @ edge step too large for row 3 : normal filter
291    vqadd.u8    d30,d6,d19                  @ d30 = pu1_src[-3] + 2*tc
292
293    mov         r2,#2                       @ de = 2 : strong filter selected
294    ldr         r4,[sp,#0x38]               @ loading the filter_flag_p
295    vqsub.u8    d31,d6,d19                  @ d31 = pu1_src[-3] - 2*tc
296    ldr         r5,[sp,#0x3c]               @ loading the filter_flag_q
297    b           end_dep_deq_decision
298@ r2 has the value of de
299@ r6 has the value of tc
300@ r5 has the value of beta
301@ r14 has the value of dp
302@ r12 has the value of dq
303@ r0 has the value of source address
304@ r1 has the src stride
305
306l1.336:
307    mov         r2,#1                       @ de = 1 : normal filter selected
308l1.424:                                     @ no branch in the visible source targets this label
309    mov         r11,r5                      @ r11 = beta (r5 is reused for filter_flag_q below)
310    ldr         r4,[sp,#0x38]               @ loading the filter_flag_p
311    ldr         r5,[sp,#0x3c]               @ loading the filter_flag_q
312
313    cmp         r6,#1                       @ tc == 1 : dep = deq = 0
314    moveq       r9,#0
315    moveq       r10,#0
316    beq         end_dep_deq_decision
317
318    and         r7,r4,r5
319
320    cmp         r7,#1                       @ (filter_flag_p & filter_flag_q) == 1 ?
321    beq         both_flags_set
322    cmp         r4,#0
323    beq         set_flag_dep_zero           @ filter_flag_p == 0 : dep = 0, only deq is evaluated
324
325@ filter_flag_p != 0 and both_flags_set not taken : only dep is evaluated, deq = 0
326    add         r8,r11,r11,asr #1           @ r8 = beta + (beta >> 1)
327    mov         r10,#0                      @ deq = 0
328    asr         r8,#3                       @ r8 = (beta + (beta >> 1)) >> 3
329    cmp         r8,r14
330    movgt       r9,#1                       @ dep = 1 when dp < r8
331    movle       r9,#0
332    b           end_dep_deq_decision
333set_flag_dep_zero:
334
335    add         r8,r11,r11,asr #1           @ r8 = beta + (beta >> 1)
336    mov         r9,#0                       @ dep = 0
337    asr         r8,#3                       @ r8 = (beta + (beta >> 1)) >> 3
338    cmp         r8,r12
339    movgt       r10,#1                      @ deq = 1 when dq < r8
340    movle       r10,#0
341    b           end_dep_deq_decision
342
343both_flags_set:
344    add         r8,r11,r11,asr #1           @ r8 = beta + (beta >> 1)
345    asr         r8,#3                       @ r8 = (beta + (beta >> 1)) >> 3
346    cmp         r8,r14
347    movgt       r9,#1                       @ dep = 1 when dp < r8
348    movle       r9,#0
349    cmp         r8,r12
350    movgt       r10,#1                      @ deq = 1 when dq < r8
351    movle       r10,#0
352end_dep_deq_decision:
353
354@r0=source address
355@r1=stride
356@ r2 =de
357@ r4=flag p
358@r5= flag q
359@r6 =tc
360@ r9 =dep
361@ r10=deq
362@   b   l1.964
363
364
365    cmp         r2,#2
366@ r2 has the value of de
367    bne         l1.968                      @ de != 2 : go to the normal filter
368@ strong filter: store the results computed during the decision phase
369    cmp         r5,#0
370    beq         l1.780
371@ r5 has the flag of q
372
373    add         r3,r0,#2                    @ r3 = address of pu1_src[2] in row 0
374    vst1.8      {d22[0]},[r3],r1            @ filtered pu1_src[2], row 0
375
376    vst1.8      {d22[1]},[r3],r1            @ row 1
377
378    vst1.8      {d22[2]},[r3],r1            @ row 2
379
380    vst1.8      {d22[3]},[r3]               @ row 3
381    add         r3,r0,r1                    @ r3 = address of pu1_src[0] in row 1
382    vtrn.8      d20,d21                     @ interleave filtered pu1_src[0] / pu1_src[1] into per-row byte pairs
383
384    vst1.16     {d20[0]},[r0]               @ row 0 : pu1_src[0], pu1_src[1]
385    vst1.16     {d21[0]},[r3],r1            @ row 1
386    vst1.16     {d20[1]},[r3],r1            @ row 2
387    vst1.16     {d21[1]},[r3]               @ row 3
388
389
390l1.780:
391    cmp         r4,#0
392    beq         l1.964
393    @ r4 has the flag p
394
395
396    vdup.32     d7,d24[0]                   @ d7 = column pu1_src[-4]
397    sub         r3,r0,#1                    @ r3 = address of pu1_src[-1] in row 0
398    vaddw.u8    q8,q0,d6                    @ q8 = src[-1] + src[-2] + src[0] + src[-3]
399    add         r7,r3,r1                    @ r7 = address of pu1_src[-1] in row 1
400    vrshrn.i16  d2,q8,#2                    @ d2 = (q8 + 2) >> 2 : filtered pu1_src[-2] (not yet clipped)
401    vst1.8      {d26[0]},[r3]               @ filtered pu1_src[-1], row 0
402    sub         r0,r0,#3                    @ r0 = address of pu1_src[-3] in row 0
403    vmin.u8     d16,d2,d27
404    vst1.8      {d26[1]},[r7],r1            @ row 1
405    vmull.u8    q1,d6,d23                   @ q1 = 3 * pu1_src[-3]
406    vmlal.u8    q1,d7,d18                   @ q1 += 2 * pu1_src[-4]
407    vst1.8      {d26[2]},[r7],r1            @ row 2
408    vmax.u8     d5,d16,d28                  @ d5 = filtered pu1_src[-2] clipped to pu1_src[-2] +/- 2*tc
409    vst1.8      {d26[3]},[r7]               @ row 3
410    vadd.i16    q0,q1,q0                    @ q0 = 2*src[-4] + 3*src[-3] + src[-2] + src[-1] + src[0]
411    vrshrn.i16  d0,q0,#3                    @ d0 = (q0 + 4) >> 3 : filtered pu1_src[-3]
412
413
414    vmin.u8     d1,d0,d30
415    vmax.u8     d0,d1,d31                   @ d0 = filtered pu1_src[-3] clipped to pu1_src[-3] +/- 2*tc
416
417    vtrn.8      d0,d5                       @ interleave filtered pu1_src[-3] / pu1_src[-2] into per-row byte pairs
418    vst1.16     {d0[0]},[r0],r1             @ row 0 : pu1_src[-3], pu1_src[-2]
419    vst1.16     {d5[0]},[r0],r1             @ row 1
420    vst1.16     {d0[1]},[r0],r1             @ row 2
421    vst1.16     {d5[1]},[r0]                @ row 3
422l1.964:
423    pop         {r3-r12,pc}                 @ common return : restore registers and return
424l1.968:
425@ normal filter (de == 1)
426
427    vmov.i16    q0,#0x9                     @ q0 = 9 in every 16-bit lane
428    rsb         r11,r6,#0                   @ r11 = -tc
429    cmp         r4,#0                       @ flags are consumed by the beq l1.1272 further down
430    @ checks for the flag p
431    vmov.i16    q8,#0x3                     @ q8 = 3 in every 16-bit lane
432    vmov.i8     d24,#0x1                    @ d24 = 1 in every byte (rounding term for the averages below)
433
434
435    vdup.8      d30,r11                     @ d30 = -tc in every byte
436    and         r11,r6,#0xff
437    vdup.8      d31,r11                     @ d31 = tc in every byte
438
439    vsubl.u8    q9,d4,d2                    @ q9 = pu1_src[0] - pu1_src[-1]
440    vmul.i16    q9,q9,q0                    @ q9 *= 9
441    vsubl.u8    q0,d5,d3                    @ q0 = pu1_src[1] - pu1_src[-2]
442
443
444
445    vmul.i16    q8,q0,q8                    @ q8 = 3 * (pu1_src[1] - pu1_src[-2])
446    vsub.i16    q8,q9,q8
447    vrshr.s16   q8,q8,#4                    @ rounding shift supplies the + 8
448@   delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@
449
450    vabs.s16    q0,q8
451    vmovn.i16   d0,q0
452    @ storing the absolute values of delta in d0
453
454    vqmovn.s16  d16,q8
455    @ storing the clipped values of delta in d16
456
457    vmov.i8     d1,#0xa
458    vdup.8      d21,r11
459    vmul.i8     d1,d1,d21
460    @ d1 stores the value (10 * tc)
461
462@if(abs(delta) < 10 * tc)
463
464    vmin.s8     d18,d16,d31
465    vmax.s8     d20,d18,d30                 @ d20 = delta clipped to [-tc, tc]
466
467@ delta = clip3(delta, -tc, tc)@
468    vmovl.s8    q8,d20                      @ q8 = clipped delta, sign-extended to 16 bits
469    vmovl.u8    q9,d2
470    vadd.i16    q9,q9,q8                    @ q9 = pu1_src[-1] + delta
471
472    vqmovun.s16 d22,q9                      @ d22 = tmp_p0
473    vmovl.u8    q9,d4
474    vsub.i16    q8,q9,q8                    @ q8 = pu1_src[0] - delta
475    vqmovun.s16 d23,q8                      @ d23 = tmp_q0
476@ tmp_p0 = clip_u8(pu1_src[-1] + delta)@
477@  tmp_q0 = clip_u8(pu1_src[0] - delta)@
478    beq         l1.1272                     @ filter_flag_p == 0 (from the cmp r4,#0 above) : skip the p side
479
480
481
482    cmp         r9,#1
483    bne         l1.1212
484@ checks for the flag dep
485
486    asr         r3,r6,#1                    @ r3 = tc >> 1
487
488
489    vaddl.u8    q8,d6,d2                    @ q8 = pu1_src[-3] + pu1_src[-1]
490    vaddw.u8    q8,q8,d24                   @ q8 += 1
491    vdup.8      d18,r3                      @ d18 = tc >> 1 in every byte
492    rsb         r3,r3,#0
493    vdup.8      d19,r3                      @ d19 = -(tc >> 1) in every byte
494    vshr.u16    q8,q8,#1
495    vmovn.i16   d16,q8                      @ d16 = (pu1_src[-3] + pu1_src[-1] + 1) >> 1
496
497    vsubl.u8    q8,d16,d3                   @ q8 = d16 - pu1_src[-2]
498    vaddw.s8    q8,q8,d20                   @ q8 += delta
499    vshr.s16    q8,q8,#1
500    vqmovn.s16  d16,q8
501
502    vmin.s8     d17,d16,d18
503    vmax.s8     d16,d19,d17                 @ d16 = p-side delta clipped to [-(tc >> 1), tc >> 1]
504
505
506
507
508    vmovl.u8    q9,d3
509    vmovl.s8    q8,d16
510    vadd.i16    q8,q9,q8                    @ q8 = pu1_src[-2] + p-side delta
511
512    vqmovun.s16 d16,q8                      @ d16 = filtered pu1_src[-2], saturated to u8
513    vmov        d30,d3                      @ d30 = original pu1_src[-2]
514    vcge.u8     d3,d0,d1                    @ d3 = mask : abs(delta) >= 10 * tc
515
516
517    vbsl        d3,d30,d16                  @ d3 = mask ? original pu1_src[-2] : filtered pu1_src[-2]
518l1.1212:
519    vdup.8      d16,r11                     @ d16 is overwritten by the vcge below before it is read
520    sub         r12,r0,#3                   @ r12 = address of pu1_src[-3] in row 0
521    sub         r3,r0,#1                    @ r3 = address of pu1_src[-1] in row 0
522@     vmul.i8  d16,d16,d1
523    vtrn.8      d6,d3                       @ interleave pu1_src[-3] (unchanged) / pu1_src[-2] into per-row byte pairs
524    vst1.16     {d6[0]},[r12],r1            @ row 0 : pu1_src[-3], pu1_src[-2]
525    vcge.u8     d16,d0,d1                   @ d16 = mask : abs(delta) >= 10 * tc
526    vst1.16     {d3[0]},[r12],r1            @ row 1
527    vbsl        d16,d2,d22                  @ d16 = mask ? original pu1_src[-1] : tmp_p0
528    vst1.8      {d16[0]},[r3],r1            @ pu1_src[-1], row 0
529    vst1.8      {d16[1]},[r3],r1            @ row 1
530    vst1.16     {d6[1]},[r12],r1            @ row 2 : pu1_src[-3], pu1_src[-2]
531    vst1.8      {d16[2]},[r3],r1            @ row 2
532    vst1.16     {d3[1]},[r12]               @ row 3 : pu1_src[-3], pu1_src[-2]
533    vst1.8      {d16[3]},[r3]               @ row 3
534l1.1272:
535    @   ldr      r3,[sp,#0x38]
536    cmp         r5,#0
537    beq         l1.964
538    @ checks for the flag q
539    cmp         r10,#1
540    bne         l1.1412
541    @ checks for the flag deq
542    vmov        d2,d7                       @ d2 = pu1_src[2]
543    asr         r3,r6,#1                    @ r3 = tc >> 1
544
545    vdup.8      d6,r3                       @ d6 = tc >> 1 in every byte
546    rsb         r3,r3,#0
547    vdup.8      d16,r3                      @ d16 = -(tc >> 1) in every byte
548    vaddl.u8    q1,d2,d4                    @ q1 = pu1_src[2] + pu1_src[0]
549    vaddw.u8    q1,q1,d24                   @ q1 += 1
550    vshr.u16    q1,q1,#1
551    vmovn.i16   d2,q1                       @ d2 = (pu1_src[2] + pu1_src[0] + 1) >> 1
552
553    vsubl.u8    q1,d2,d5                    @ q1 = d2 - pu1_src[1]
554    vsubw.s8    q1,q1,d20                   @ q1 -= delta
555    vshr.s16    q1,q1,#1
556    vqmovn.s16  d3,q1
557
558    vmin.s8     d2,d3,d6
559    vmax.s8     d3,d16,d2                   @ d3 = q-side delta clipped to [-(tc >> 1), tc >> 1]
560    @  vdup.8   d6,r2
561    @   vmul.i8  d6,d6,d1
562
563
564
565    vmovl.u8    q8,d5
566    vmovl.s8    q1,d3
567    vadd.i16    q1,q8,q1                    @ q1 = pu1_src[1] + q-side delta
568    vqmovun.s16 d3,q1                       @ d3 = filtered pu1_src[1], saturated to u8
569    vmov        d30,d5                      @ d30 = original pu1_src[1]
570    vcge.u8     d5,d0,d1                    @ d5 = mask : abs(delta) >= 10 * tc
571
572
573    vbsl        d5,d30,d3                   @ d5 = mask ? original pu1_src[1] : filtered pu1_src[1]
574l1.1412:
575    @  vdup.8   d2,r2
576    add         r3,r0,#2                    @ r3 = address of pu1_src[2] in row 0
577    add         r11,r3,r1                   @ r11 = address of pu1_src[2] in row 1
578    @   vmul.i8  d1,d2,d1
579    vst1.8      {d7[0]},[r3]                @ pu1_src[2] written back unchanged, row 0
580    vst1.8      {d7[1]},[r11],r1            @ row 1
581    vst1.8      {d7[2]},[r11],r1            @ row 2
582    vcge.u8     d0,d0,d1                    @ d0 = mask : abs(delta) >= 10 * tc
583    vst1.8      {d7[3]},[r11]               @ row 3
584    vbsl        d0,d4,d23                   @ d0 = mask ? original pu1_src[0] : tmp_q0
585    vtrn.8      d0,d5                       @ interleave pu1_src[0] / pu1_src[1] into per-row byte pairs
586    vst1.16     {d0[0]},[r0],r1             @ row 0 : pu1_src[0], pu1_src[1]
587    vst1.16     {d5[0]},[r0],r1             @ row 1
588    vst1.16     {d0[1]},[r0],r1             @ row 2
589    vst1.16     {d5[1]},[r0]                @ row 3
590    pop         {r3-r12,pc}                 @ restore registers and return
591
592
593
594