@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**

@/**
@******************************************************************************
@* @file
@*  ihevc_inter_pred_luma_horz_w16out.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded using neon intrinsics and can be compiled using

@* rvct
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_luma_horz_w16out()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*   interprediction luma filter for horizontal 16bit output
@*
@* @par description:
@*     applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
@*     to the elements pointed by 'pu1_src' and writes to the location pointed
@*     by 'pu1_dst'. no downshifting or clipping is done and the output is used
@*     as an input for vertical filtering or weighted prediction. assumptions:
@*     the function is optimized considering the fact width is a multiple of 4 or
@*     8. if width is a multiple of 4 then height should be a multiple of 2; width 8
@*     is optimized further.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_horz_w16out(uword8 *pu1_src,
@                                word16 *pi2_dst,
@                                word32 src_strd,
@                                word32 dst_strd,
@                                word8 *pi1_coeff,
@                                word32 ht,
@                                word32 wd

@register usage within the function:
@r0 - free
@r1 - dst_ptr
@r2 - src_strd
@r3 - dst_strd
@r4 - src_ptr2
@r5 - inner loop counter
@r6 - dst_ptr2
@r7 - free
@r8 - dst_strd2
@r9 - src_strd1
@r10 - wd
@r11 - #1
@r12 - src_ptr1
@r14 - loop_counter

111.equ    coeff_offset,   104
112.equ    ht_offset,      108
113.equ    wd_offset,      112
114
115.text
116.align 4
117.syntax unified
118
119
120
121
122.globl ihevc_inter_pred_luma_horz_w16out_a9q
123
124.type ihevc_inter_pred_luma_horz_w16out_a9q, %function
125
126ihevc_inter_pred_luma_horz_w16out_a9q:
127
128    bic         r14, #1                     @ clearing bit[0], so that it goes back to mode
129    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
130    vpush       {d8 - d15}
131    ldr         r4,[sp,#coeff_offset]                 @loads pi1_coeff
132    ldr         r7,[sp,#ht_offset]                 @loads ht
133
134
135    vld1.8      {d0},[r4]                   @coeff = vld1_s8(pi1_coeff)
136    sub         r14,r7,#0                   @checks for ht == 0
137    vabs.s8     d2,d0                       @vabs_s8(coeff)
138    mov         r11,#1
139    ldr         r10,[sp,#wd_offset]                @loads wd
140    vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
141    sub         r12,r0,#3                   @pu1_src - 3
142    vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
143    add         r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd
144    vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
145    rsb         r9,r10,r2,lsl #1            @2*src_strd - wd
146    vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
147    rsb         r8,r10,r3                   @dst_strd - wd
148    vdup.8      d28,d2[4]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)
149
150    vdup.8      d29,d2[5]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)
151    and         r7,r14,#1                   @calculating ht_residue ht_residue = (ht & 1)
152    vdup.8      d30,d2[6]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)
153    sub         r14,r14,r7                  @decrement height by ht_residue(residue value is calculated outside)
154    vdup.8      d31,d2[7]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)
155
156    cmp         r7,#1
157    beq         odd_height_decision
158
159even_height_decision:
160    mov         r7,r1
161    cmp         r10,#4
162    ble         outer_loop_4
163
164    cmp         r10,#24
165    moveq       r10,#16
166    addeq       r8,#8
167    addeq       r9,#8
168
169    cmp         r10,#16
170    bge         outer_loop_16_branch
171
172    cmp         r10,#12
173    addeq       r8,#4
174    addeq       r9,#4
175outer_loop_8_branch:
176    b           outer_loop_8
177
178outer_loop_16_branch:
179    b           outer_loop_16
180
181
182odd_height_decision:
183    cmp         r10,#24
184    beq         outer_loop_8_branch
185    cmp         r10,#12
186    beq         outer_loop_4
187    b           even_height_decision
188
189outer_loop4_residual:
190    sub         r12,r0,#3                   @pu1_src - 3
191    mov         r1,r7
192    add         r1,#16
193    mov         r10,#4
194    add         r12,#8
195    mov         r14,#16
196    add         r8,#4
197    add         r9,#4
198
199outer_loop_4:
200    add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
201    add         r4,r12,r2                   @pu1_src + src_strd
202
203    subs        r5,r10,#0                   @checks wd
204    ble         end_inner_loop_4
205
206inner_loop_4:
207    vld1.u32    {d0},[r12],r11              @vector load pu1_src
208    vld1.u32    {d1},[r12],r11
209    vld1.u32    {d2},[r12],r11
210    vld1.u32    {d3},[r12],r11
211    vld1.u32    {d4},[r12],r11
212    vld1.u32    {d5},[r12],r11
213    vld1.u32    {d6},[r12],r11
214    vld1.u32    {d7},[r12],r11
215    @add       r12,r12,#4                      @increment the input pointer
216    sub         r12,r12,#4
217    @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
218    @vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
219    @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
220
221    @vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
222    @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
223    @vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
224    @vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
225    vld1.u32    {d12},[r4],r11              @vector load pu1_src + src_strd
226    vld1.u32    {d13},[r4],r11
227    vzip.32     d0,d12                      @vector zip the i iteration and ii interation in single register
228    vld1.u32    {d14},[r4],r11
229    vzip.32     d1,d13
230    vld1.u32    {d15},[r4],r11
231    vzip.32     d2,d14
232    vld1.u32    {d16},[r4],r11
233    vzip.32     d3,d15
234    vld1.u32    {d17},[r4],r11
235    vzip.32     d4,d16
236    vld1.u32    {d18},[r4],r11
237    vzip.32     d5,d17
238    vld1.u32    {d19},[r4],r11
239    sub         r4,r4,#4
240    @ add       r4,r4,#4                        @increment the input pointer
241    @ vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
242    @ vext.u8   d15,d12,d13,#3                  @vector extract of src[0_3]
243    @ vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
244    @ vext.u8   d17,d12,d13,#5                  @vector extract of src[0_5]
245    @ vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
246    @ vext.u8   d19,d12,d13,#7                  @vector extract of src[0_7]
247    @vext.u8   d13,d12,d13,#1                  @vector extract of src[0_1]
248
249
250
251
252
253
254
255    vzip.32     d6,d18
256    vzip.32     d7,d19
257
258    vmull.u8    q4,d1,d25                   @arithmetic operations for ii iteration in the same time
259    vmlsl.u8    q4,d0,d24
260    vmlsl.u8    q4,d2,d26
261    vmlal.u8    q4,d3,d27
262    vmlal.u8    q4,d4,d28
263    vmlsl.u8    q4,d5,d29
264    vmlal.u8    q4,d6,d30
265    vmlsl.u8    q4,d7,d31
266
267    @ vqrshrun.s16 d8,q4,#6                     @narrow right shift and saturating the result
268    vst1.64     {d8},[r1]!                  @store the i iteration result which is in upper part of the register
269    vst1.64     {d9},[r6]!                  @store the ii iteration result which is in lower part of the register
270    subs        r5,r5,#4                    @decrement the wd by 4
271    bgt         inner_loop_4
272
273end_inner_loop_4:
274    subs        r14,r14,#2                  @decrement the ht by 4
275    add         r12,r12,r9                  @increment the input pointer 2*src_strd-wd
276    add         r1,r6,r8,lsl #1             @increment the output pointer 2*dst_strd-wd
277    bgt         outer_loop_4
278
279
280height_residue_4:
281
282    ldr         r7,[sp,#ht_offset]                 @loads ht
283    and         r7,r7,#1                    @calculating ht_residue ht_residue = (ht & 1)
284    cmp         r7,#0
285    beq         end_loops
286
287outer_loop_height_residue_4:
288
289
290    subs        r5,r10,#0                   @checks wd
291    ble         end_inner_loop_height_residue_4
292
293inner_loop_height_residue_4:
294    vld1.u32    {d0},[r12],r11              @vector load pu1_src
295    vld1.u32    {d1},[r12],r11
296
297
298
299
300
301
302    @ vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
303    @ vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
304    @ vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
305
306
307
308    @add        r12,r12,#4                      @increment the input pointer
309    @ vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
310    @ vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
311    @ vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
312    @ vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
313    vld1.u32    {d2},[r12],r11
314    vmull.u8    q4,d1,d25                   @arithmetic operations for ii iteration in the same time
315    vld1.u32    {d3},[r12],r11
316    vmlsl.u8    q4,d0,d24
317    vld1.u32    {d4},[r12],r11
318    vmlsl.u8    q4,d2,d26
319    vld1.u32    {d5},[r12],r11
320    vmlal.u8    q4,d3,d27
321    vld1.u32    {d6},[r12],r11
322    vmlal.u8    q4,d4,d28
323    vld1.u32    {d7},[r12],r11
324    vmlsl.u8    q4,d5,d29
325    sub         r12,r12,#4
326    vmlal.u8    q4,d6,d30
327    vmlsl.u8    q4,d7,d31                   @store the i iteration result which is in upper part of the register
328    subs        r5,r5,#4                    @decrement the wd by 4
329    vst1.64     {d8},[r1]!
330    bgt         inner_loop_height_residue_4
331
332end_inner_loop_height_residue_4:
333    subs        r7,r7,#1                    @decrement the ht by 4
334    rsb         r9,r10,r2
335    add         r12,r12,r9                  @increment the input pointer src_strd-wd
336    add         r1,r1,r8                    @increment the output pointer dst_strd-wd
337    bgt         outer_loop_height_residue_4
338    vpop        {d8 - d15}
339    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
340
341outer_loop8_residual:
342    sub         r12,r0,#3                   @pu1_src - 3
343    mov         r1,r7
344    mov         r14,#32
345    add         r1,#32
346    add         r12,#16
347    mov         r10,#8
348    add         r8,#8
349    add         r9,#8
350
351outer_loop_8:
352
353    add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
354    add         r4,r12,r2                   @pu1_src + src_strd
355    subs        r5,r10,#0                   @checks wd
356
357    ble         end_inner_loop_8
358
359inner_loop_8:
360    vld1.u32    {d0},[r12],r11              @vector load pu1_src
361    vld1.u32    {d1},[r12],r11
362    vld1.u32    {d2},[r12],r11
363    vld1.u32    {d3},[r12],r11
364
365
366
367
368
369    @ vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
370    @ vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
371    @ vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
372    @ vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
373    @ vext.u8   d6,d0,d1,#6                     @vector extract of src [0_6]
374    @ vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
375    @ vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
376    @ vext.u8   d14,d12,d13,#2
377
378    @vext.u8    d15,d12,d13,#3                  @vector extract of src[0_3]
379    @ vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
380    @ vext.u8   d17,d12,d13,#5                  @vector extract of src[0_5]
381    @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
382    @vext.u8    d19,d12,d13,#7                  @vector extract of src[0_7]
383    @vext.u8   d13,d12,d13,#1                  @vector extract of src[0_1]
384    vld1.u32    {d4},[r12],r11
385    vmull.u8    q4,d1,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
386    vld1.u32    {d5},[r12],r11
387    vmlal.u8    q4,d3,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
388    vld1.u32    {d6},[r12],r11
389    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
390    vld1.u32    {d7},[r12],r11
391    vmlsl.u8    q4,d2,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
392    vld1.u32    {d12},[r4],r11              @vector load pu1_src + src_strd
393    vmlal.u8    q4,d4,d28                   @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
394    vld1.u32    {d13},[r4],r11
395    vmlsl.u8    q4,d5,d29                   @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
396    vld1.u32    {d14},[r4],r11
397    vmlal.u8    q4,d6,d30                   @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
398    vld1.u32    {d15},[r4],r11
399    vmlsl.u8    q4,d7,d31                   @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
400    vld1.u32    {d16},[r4],r11              @vector load pu1_src + src_strd
401
402    vmull.u8    q5,d15,d27                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@
403    vld1.u32    {d17},[r4],r11
404    vmlsl.u8    q5,d14,d26                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
405    vld1.u32    {d18},[r4],r11
406    vmlal.u8    q5,d16,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
407    vld1.u32    {d19},[r4],r11              @vector load pu1_src + src_strd
408    vmlsl.u8    q5,d17,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
409    @ vqrshrun.s16  d20,q4,#6                       @right shift and saturating narrow result 1
410    vmlal.u8    q5,d18,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
411    vmlsl.u8    q5,d19,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
412    vst1.16     {q4},[r1]!                  @store the result pu1_dst
413    vmlsl.u8    q5,d12,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
414    vmlal.u8    q5,d13,d25                  @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
415
416
417
418    @ vqrshrun.s16 d8,q5,#6                     @right shift and saturating narrow result 2
419    subs        r5,r5,#8                    @decrement the wd loop
420    vst1.16     {q5},[r6]!                  @store the result pu1_dst
421    cmp         r5,#4
422    bgt         inner_loop_8
423
424end_inner_loop_8:
425    subs        r14,r14,#2                  @decrement the ht loop
426    add         r12,r12,r9                  @increment the src pointer by 2*src_strd-wd
427    add         r1,r6,r8,lsl #1             @increment the dst pointer by 2*dst_strd-wd
428    bgt         outer_loop_8
429
430
431
432
433
434    ldr         r10,[sp,#wd_offset]                @loads wd
435    cmp         r10,#12
436
437    beq         outer_loop4_residual
438
439    ldr         r7,[sp,#ht_offset]                 @loads ht
440    and         r7,r7,#1
441    cmp         r7,#1
442    beq         height_residue_4
443
444
445    vpop        {d8 - d15}
446    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
447
448
449
450
451
452outer_loop_16:
453    str         r0, [sp, #-4]!
454    str         r7, [sp, #-4]!
455    add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
456    add         r4,r12,r2                   @pu1_src + src_strd
457    and         r0, r12, #31
458    sub         r5,r10,#0                   @checks wd
459    pld         [r12, r2, lsl #1]
460    vld1.u32    {q0},[r12],r11              @vector load pu1_src
461    pld         [r4, r2, lsl #1]
462    vld1.u32    {q1},[r12],r11
463    vld1.u32    {q2},[r12],r11
464    vld1.u32    {q3},[r12],r11
465    vld1.u32    {q6},[r12],r11
466    vmull.u8    q4,d2,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
467    vld1.u32    {q7},[r12],r11
468    vmlal.u8    q4,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
469    vld1.u32    {q8},[r12],r11
470    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
471    vld1.u32    {q9},[r12],r11
472    vmlsl.u8    q4,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
473    vmlal.u8    q4,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
474    vmlsl.u8    q4,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
475    vmlal.u8    q4,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
476    vmlsl.u8    q4,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
477
478
479inner_loop_16:
480
481
482    subs        r5,r5,#16
483    vmull.u8    q10,d3,d25
484
485    add         r12,#8
486    vmlsl.u8    q10,d1,d24
487
488    vld1.u32    {q0},[r4],r11               @vector load pu1_src
489    vmlal.u8    q10,d7,d27
490
491    vld1.u32    {q1},[r4],r11
492    vmlsl.u8    q10,d5,d26
493
494    vld1.u32    {q2},[r4],r11
495    vmlal.u8    q10,d13,d28
496
497    vld1.u32    {q3},[r4],r11
498    vmlal.u8    q10,d17,d30
499
500    vld1.u32    {q6},[r4],r11
501    vmlsl.u8    q10,d15,d29
502
503    vld1.u32    {q7},[r4],r11
504    vmlsl.u8    q10,d19,d31
505
506    vld1.u32    {q8},[r4],r11
507    vmull.u8    q5,d2,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
508
509    vld1.u32    {q9},[r4],r11
510    vmlal.u8    q5,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
511
512    add         r4,#8
513    vmlsl.u8    q5,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
514    pld         [r12, r2, lsl #2]
515    pld         [r4, r2, lsl #2]
516    vst1.8      {q4},[r1]!                  @store the result pu1_dst
517    vmlsl.u8    q5,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
518
519    addeq       r12,r12,r9                  @increment the src pointer by 2*src_strd-wd
520    vmlal.u8    q5,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
521
522    addeq       r4,r12,r2                   @pu1_src + src_strd
523    vmlsl.u8    q5,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
524
525@   and         r7, r12, #31
526    vmlal.u8    q5,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
527
528    subeq       r14,r14,#2
529    vmlsl.u8    q5,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
530
531    @cmp            r7, r0
532    vmull.u8    q11,d3,d25
533
534@   pld     [r12, r2, lsl #2]
535    vmlsl.u8    q11,d1,d24
536
537    vst1.16     {q10},[r1]!
538    vmlal.u8    q11,d7,d27
539
540@   pld     [r4, r2, lsl #2]
541    vmlsl.u8    q11,d5,d26
542
543@   mov         r0, r7
544    vmlal.u8    q11,d13,d28
545
546    cmp         r14,#0
547    vmlal.u8    q11,d17,d30
548
549    vst1.16     {q5},[r6]!
550    vmlsl.u8    q11,d15,d29
551
552    vmlsl.u8    q11,d19,d31
553
554    beq         epilog_16
555
556    vld1.u32    {q0},[r12],r11              @vector load pu1_src
557    vld1.u32    {q1},[r12],r11
558    vld1.u32    {q2},[r12],r11
559    vld1.u32    {q3},[r12],r11
560    vld1.u32    {q6},[r12],r11
561    vmull.u8    q4,d2,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
562    vld1.u32    {q7},[r12],r11
563    vmlal.u8    q4,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
564    vld1.u32    {q8},[r12],r11
565    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
566    vld1.u32    {q9},[r12],r11
567    vmlsl.u8    q4,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
568    vmlal.u8    q4,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
569    cmp         r5,#0
570    vmlsl.u8    q4,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
571    moveq       r5,r10
572    vmlal.u8    q4,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
573    vst1.8      {q11},[r6]!                 @store the result pu1_dst
574    vmlsl.u8    q4,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
575    addeq       r1,r6,r8,lsl #1
576    addeq       r6,r1,r3,lsl #1             @pu1_dst + dst_strd
577    b           inner_loop_16
578
579
580epilog_16:
581@   vqrshrun.s16 d11,q11,#6
582    vst1.8      {q11},[r6]!                 @store the result pu1_dst
583
584    ldr         r7, [sp], #4
585    ldr         r0, [sp], #4
586    ldr         r10,[sp,#wd_offset]
587    cmp         r10,#24
588    beq         outer_loop8_residual
589    add         r1,r6,r8,lsl #1
590    ldr         r7,[sp,#ht_offset]                 @loads ht
591    and         r7,r7,#1
592    cmp         r7,#1
593    beq         height_residue_4
594
595end_loops:
596    vpop        {d8 - d15}
597    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp
598
599
600
601
602
603
604
605
606
607