@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_horz_neon.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@* functions are coded in neon assembly and can be assembled using rvct
@*
@* @author
@*  yogeswaran rs / akshaya mukund
@*
@* @par list of functions:
@*  - ihevc_inter_pred_chroma_horz_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    chroma inter prediction filter for horizontal input
@*
@* @par description:
@*    applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
@*    to the elements pointed to by 'pu1_src' and writes the result to the
@*    location pointed to by 'pu1_dst'. the output is downshifted by 6 and
@*    clipped to 8 bits.
@*    assumptions: the function is optimized assuming the width is a multiple
@*    of 2, 4 or 8. if the width is 2, the height should be a multiple of 2.
@*    widths of 4 and 8 are optimized further.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_chroma_horz(uword8 *pu1_src,
@                                   uword8 *pu1_dst,
@                                   word32 src_strd,
@                                   word32 dst_strd,
@                                   word8 *pi1_coeff,
@                                   word32 ht,
@                                   word32 wd)
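@
@ a minimal c reference sketch of the computation (added for clarity;
@ 'chroma_horz_ref' is a hypothetical name, not part of this library).
@ chroma samples are interleaved cb/cr, so one filter tap steps by two
@ bytes and a width of wd chroma pairs covers 2*wd bytes per row:
@
@ void chroma_horz_ref(uint8_t *pu1_src, uint8_t *pu1_dst,
@                      int src_strd, int dst_strd,
@                      int8_t *pi1_coeff, int ht, int wd)
@ {
@     for(int y = 0; y < ht; y++)
@         for(int x = 0; x < 2 * wd; x++)
@         {
@             int sum = 0;
@             for(int k = 0; k < 4; k++)            /* 4-tap filter      */
@                 sum += pi1_coeff[k] *
@                        pu1_src[y * src_strd + x + 2 * (k - 1)];
@             sum = (sum + 32) >> 6;                /* round, shift by 6 */
@             pu1_dst[y * dst_strd + x] =           /* clip to 8 bits    */
@                 sum < 0 ? 0 : (sum > 255 ? 255 : sum);
@         }
@ }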
@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pu1_dst
@r2 =>  src_strd
@r3 =>  dst_strd

.equ    coeff_offset,   104
.equ    ht_offset,      108
.equ    wd_offset,      112
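
@ the offsets above follow from the prologue below: stmfd of r4-r12,r14
@ pushes 10 words (40 bytes) and vpush of d8-d15 pushes 64 bytes, so the
@ stacked arguments pi1_coeff, ht and wd sit at sp + 104, sp + 108 and
@ sp + 112 respectively.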

.text
.align 4


.globl ihevc_inter_pred_chroma_horz_a9q

.type ihevc_inter_pred_chroma_horz_a9q, %function

ihevc_inter_pred_chroma_horz_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8 - d15}

    ldr         r4,[sp,#coeff_offset]       @loads pi1_coeff
    ldr         r7,[sp,#ht_offset]          @loads ht
    ldr         r10,[sp,#wd_offset]         @loads wd

    vld1.8      {d0},[r4]                   @coeff = vld1_s8(pi1_coeff)
    subs        r14,r7,#0                   @checks for ht == 0
    vabs.s8     d2,d0                       @vabs_s8(coeff)
    mov         r11,#2                      @tap-to-tap source step (2, as cb/cr are interleaved)
    ble         end_loops

    vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         r12,r0,#2                   @pu1_src - 2
    vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add         r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd
    vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)

    tst         r10,#3                      @checks whether wd is a multiple of 4
    mov         r5,r10,lsl #1               @2wd

    vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
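
@ d24-d27 hold the absolute values of the four filter taps. the mac
@ sequences below use vmlsl for taps 0 and 3 and vmull/vmlal for taps
@ 1 and 2, hard-coding the (-, +, +, -) sign pattern of the hevc
@ chroma filter coefficients so that only unsigned multiplies are needed.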

    bne         outer_loop_4
    cmp         r10,#12
    beq         skip_16

    cmp         r10,#8
    bge         outer_loop_16
skip_16:
    tst         r7,#3

    sub         r9,r0,#2
    beq         outer_loop_ht_4             @ht is a multiple of 4

    b           outer_loop_8


outer_loop_16:
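@ this path filters two rows at a time, 16 output bytes per row per
@ iteration: r6 = 16 - dst_strd steps the store pointer from the second
@ row back up to the next 16 columns of the first row, and
@ r8 = 2*dst_strd - 2wd advances to the next row pair once a full row
@ has been covered.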
    mov         r10,r5                      @2wd
    mul         r14,r14,r10                 @ht * 2wd = total number of outputs

    rsb         r6,r3,#16                   @16 - dst_strd

    add         r4,r12,r2
    mov         r9,#10
    and         r0, r12, #31
    rsb         r8,r5,r3,lsl #1             @2*dst_strd - 2wd
    pld         [r12, r2, lsl #1]


    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    pld         [r4, r2, lsl #1]
    vld1.u32    {q1},[r12],r11              @vector load pu1_src

    vld1.u32    {q2},[r12],r11              @vector load pu1_src

    vld1.u32    {q3},[r12],r9               @vector load pu1_src


    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_1], coeffabs_1)@
    vld1.u32    {q4},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vld1.u32    {q5},[r4],r11               @vector load pu1_src
    vmlal.u8    q15,d4,d26                  @mul_res = vmlal_u8(src[0_2], coeffabs_2)@
    vld1.u32    {q6},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d6,d27                  @mul_res = vmlsl_u8(src[0_3], coeffabs_3)@
    vld1.u32    {q7},[r4],r9                @vector load pu1_src
    vmull.u8    q14,d3,d25

    vmlsl.u8    q14,d1,d24

    vmlal.u8    q14,d5,d26

    vmlsl.u8    q14,d7,d27


    cmp         r14,#32
    beq         epilog_end
    sub         r14,#64

inner_loop_16:


@    bgt            l_2

@   pld         [r12, r2, lsl #1]
@   pld         [r4, r2, lsl #1]

    pld         [r12, r2, lsl #2]
    pld         [r4, r2, lsl #2]

    subs        r10,r10,#16

    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_1], coeffabs_1)@

    addeq       r12,r12,r8
    addeq       r4,r12,r2
    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

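@ vqrshrun.s16 ..., #6 narrows each 16-bit result with rounding,
@ computing (x + 32) >> 6 saturated to the unsigned 8-bit range, i.e.
@ the downshift-by-6 and clip-to-8-bits step described in the header.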
    vqrshrun.s16 d30,q15,#6                 @right shift and saturating narrow result 1

    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    vqrshrun.s16 d31,q14,#6                 @right shift and saturating narrow result 2


    vld1.u32    {q1},[r12],r11              @vector load pu1_src
    vmlal.u8    q11,d12,d26                 @mul_res = vmlal_u8(src[0_2], coeffabs_2)@


    vld1.u32    {q2},[r12],r11              @vector load pu1_src
    vmlsl.u8    q11,d14,d27                 @mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    vld1.u32    {q3},[r12],r9               @vector load pu1_src
    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_1], coeffabs_1)@

    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vst1.16     {q15}, [r1],r3              @store the result pu1_dst
    vmlal.u8    q10,d13,d26                 @mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    vld1.u32    {q4},[r4],r11               @vector load pu1_src
    vmlsl.u8    q10,d15,d27                 @mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    vld1.u32    {q5},[r4],r11               @vector load pu1_src
    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_1], coeffabs_1)@

    vld1.u32    {q6},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {q7},[r4],r9                @vector load pu1_src
    vmlal.u8    q15,d4,d26                  @mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    cmp         r10,#0
    vqrshrun.s16 d22,q11,#6
    vqrshrun.s16 d23,q10,#6

    vmlsl.u8    q15,d6,d27                  @mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    moveq       r10,r5                      @2wd
    vmull.u8    q14,d3,d25

    vst1.16     {q11},[r1],r6               @store the result pu1_dst
    vmlsl.u8    q14,d1,d24

    addeq       r1,r1,r8
    vmlal.u8    q14,d5,d26

    subs        r14,r14,#32                 @32 outputs per iteration (2 rows x 16)
    vmlsl.u8    q14,d7,d27

@     mov           r0, r7

    bgt         inner_loop_16


    add         r14,r14,#64
    cmp         r14,#32
    beq         epilog_end

epilog:
    vqrshrun.s16 d30,q15,#6
    vqrshrun.s16 d31,q14,#6


    vst1.16     {q15}, [r1],r3              @store the result pu1_dst
    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_1], coeffabs_1)@


    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    subs        r10,r10,#16                 @decrement the wd loop
    vmlal.u8    q11,d12,d26                 @mul_res = vmlal_u8(src[0_2], coeffabs_2)@
    addeq       r12,r12,r8
    vmlsl.u8    q11,d14,d27                 @mul_res = vmlsl_u8(src[0_3], coeffabs_3)@
    moveq       r10,r5                      @2wd


    addeq       r4,r12,r2
    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_1], coeffabs_1)@
    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vld1.u32    {q1},[r12],r11              @vector load pu1_src
    vmlal.u8    q10,d13,d26                 @mul_res = vmlal_u8(src[0_2], coeffabs_2)@
    vld1.u32    {q2},[r12],r11              @vector load pu1_src
    vmlsl.u8    q10,d15,d27                 @mul_res = vmlsl_u8(src[0_3], coeffabs_3)@
    vld1.u32    {q3},[r12],r9               @vector load pu1_src
    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_1], coeffabs_1)@


    vld1.u32    {q4},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vld1.u32    {q5},[r4],r11               @vector load pu1_src
    vmlal.u8    q15,d4,d26                  @mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    vmlsl.u8    q15,d6,d27                  @mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    vld1.u32    {q6},[r4],r11               @vector load pu1_src
    vmull.u8    q14,d3,d25
    vld1.u32    {q7},[r4],r9                @vector load pu1_src
    vmlsl.u8    q14,d1,d24
    vqrshrun.s16 d22,q11,#6
    vqrshrun.s16 d23,q10,#6

    vst1.16     {q11},[r1],r6               @store the result pu1_dst
    vmlal.u8    q14,d5,d26

    vmlsl.u8    q14,d7,d27
    addeq       r1,r1,r8

epilog_end:
    vqrshrun.s16 d30,q15,#6
    vqrshrun.s16 d31,q14,#6


    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_1], coeffabs_1)@
    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vmlal.u8    q11,d12,d26                 @mul_res = vmlal_u8(src[0_2], coeffabs_2)@
    vmlsl.u8    q11,d14,d27                 @mul_res = vmlsl_u8(src[0_3], coeffabs_3)@


    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_1], coeffabs_1)@
    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    vmlal.u8    q10,d13,d26                 @mul_res = vmlal_u8(src[0_2], coeffabs_2)@
    vmlsl.u8    q10,d15,d27                 @mul_res = vmlsl_u8(src[0_3], coeffabs_3)@
    vqrshrun.s16 d22,q11,#6
    vqrshrun.s16 d23,q10,#6


    vst1.16     {q15}, [r1],r3              @store the result pu1_dst

    vst1.16     {q11},[r1]                  @store the result pu1_dst


    b           end_loops


outer_loop_8:


    add         r6,r1,r3                    @pu1_dst + dst_strd
    mov         r7,r5                       @2wd
    add         r4,r12,r2                   @pu1_src + src_strd


inner_loop_8:
    @vld1.u32  {d0,d1},[r12],r11            @vector load pu1_src
    vld1.u32    {d0},[r12],r11              @vector load pu1_src
    vld1.u32    {d1},[r12],r11              @vector load pu1_src
    vld1.u32    {d2},[r12],r11              @vector load pu1_src
    vld1.u32    {d3},[r12],r11              @vector load pu1_src

    @vext.u8   d2,d0,d1,#2                  @vector extract of src[0_2]
    vmull.u8    q4,d1,d25                   @mul_res = vmull_u8(src[0_1], coeffabs_1)@
    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    @vext.u8   d4,d0,d1,#4                  @vector extract of src[0_4]
    @vext.u8   d6,d0,d1,#6                  @vector extract of src[0_6]
    vmlal.u8    q4,d2,d26                   @mul_res = vmlal_u8(src[0_2], coeffabs_2)@
    vmlsl.u8    q4,d3,d27                   @mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    vld1.u32    {d4},[r4],r11               @vector load pu1_src
    vld1.u32    {d5},[r4],r11               @vector load pu1_src
    vld1.u32    {d6},[r4],r11               @vector load pu1_src
    vld1.u32    {d7},[r4],r11               @vector load pu1_src
    @vld1.u32  {d12,d13},[r4],r11           @vector load pu1_src + src_strd
    @vext.u8   d14,d12,d13,#2               @vector extract of src[0_2]
    vmull.u8    q5,d5,d25                   @mul_res = vmull_u8(src[0_1], coeffabs_1)@
    vmlsl.u8    q5,d4,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
    @vext.u8   d16,d12,d13,#4               @vector extract of src[0_4]
    @vext.u8   d18,d12,d13,#6               @vector extract of src[0_6]
    vqrshrun.s16 d8,q4,#6                   @right shift and saturating narrow result 1
    vmlal.u8    q5,d6,d26                   @mul_res = vmlal_u8(src[0_2], coeffabs_2)@
    vmlsl.u8    q5,d7,d27                   @mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    vst1.8      {d8},[r1]!                  @store the result pu1_dst

    vqrshrun.s16 d10,q5,#6                  @right shift and saturating narrow result 2
    subs        r7,r7,#8                    @decrement the wd loop
    vst1.8      {d10},[r6]!                 @store the result pu1_dst
    bgt         inner_loop_8

    sub         r12,r12,r5                  @rewind to the start of the row
    subs        r14,r14,#2                  @decrement the ht loop
    sub         r1,r1,r5
    add         r12,r12,r2,lsl #1           @advance src by two rows
    add         r1,r1,r3,lsl #1             @advance dst by two rows
    bgt         outer_loop_8
    b           end_loops
@taken when ht is a multiple of 4
outer_loop_ht_4:

    mov         r7,r5                       @2wd

prologue_ht_4:

inner_loop_ht_4:

    mov         r12,r9
    mov         r4,r1

    sub         r8, r2, #6                  @src_strd - 6

    vld1.u32    {d0},[r12],r11              @(1)vector load pu1_src
    vld1.u32    {d1},[r12],r11              @(1)vector load pu1_src
    vld1.u32    {d2},[r12],r11              @(1)vector load pu1_src
    @vld1.u32  {d3},[r12],r2                @(1)vector load pu1_src
    vld1.u32    {d3},[r12],r8               @(1)vector load pu1_src

    @sub       r12, r12, #6                 @(1)

    vld1.u32    {d4},[r12],r11              @(2)vector load pu1_src
    vld1.u32    {d5},[r12],r11              @(2)vector load pu1_src
    vld1.u32    {d6},[r12],r11              @(2)vector load pu1_src
    @vld1.u32  {d7},[r12],r2                @(2)vector load pu1_src
    vld1.u32    {d7},[r12],r8               @(2)vector load pu1_src

    @sub       r12, r12, #6                 @(2)

    vld1.u32    {d14},[r12],r11             @(3)vector load pu1_src
    vmull.u8    q4,d1,d25                   @(1)mul_res = vmull_u8(src[0_1], coeffabs_1)@

    vld1.u32    {d15},[r12],r11             @(3)vector load pu1_src
    vmlsl.u8    q4,d0,d24                   @(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d16},[r12],r11             @(3)vector load pu1_src
    vmlal.u8    q4,d2,d26                   @(1)mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    @vld1.u32  {d17},[r12],r2               @(3)vector load pu1_src
    vld1.u32    {d17},[r12],r8              @(3)vector load pu1_src
    vmlsl.u8    q4,d3,d27                   @(1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    @sub       r12, r12, #6                 @(3)
    vmull.u8    q5,d5,d25                   @(2)mul_res = vmull_u8(src[0_1], coeffabs_1)@

    vld1.u32    {d18},[r12],r11             @(4)vector load pu1_src
    vmlsl.u8    q5,d4,d24                   @(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d19},[r12],r11             @(4)vector load pu1_src
    vmlal.u8    q5,d6,d26                   @(2)mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    vld1.u32    {d20},[r12],r11             @(4)vector load pu1_src
    vmlsl.u8    q5,d7,d27                   @(2)mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    vld1.u32    {d21},[r12],r2              @(4)vector load pu1_src
    vqrshrun.s16 d8,q4,#6                   @(1)right shift and saturating narrow result 1

    add         r9,r9,#8                    @(core loop)

    subs        r7,r7,#8                    @(prologue)decrement the wd loop
    beq         epilogue

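@ four rows are filtered per pass here, 8 outputs per row at a time.
@ the core loop is software-pipelined: the narrowing and stores for the
@ current 8-column block (stages (1)-(4), one per row) are interleaved
@ with the loads and multiply-accumulates for the next block (stages
@ (1_1)-(4_1)).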
core_loop:
    mov         r12,r9

    vld1.u32    {d0},[r12],r11              @(1_1)vector load pu1_src
    vmull.u8    q6,d15,d25                  @(3)mul_res = vmull_u8(src[0_1], coeffabs_1)@

    vld1.u32    {d1},[r12],r11              @(1_1)vector load pu1_src
    vmlsl.u8    q6,d14,d24                  @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d2},[r12],r11              @(1_1)vector load pu1_src
    vmlal.u8    q6,d16,d26                  @(3)mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    @vld1.u32  {d3},[r12],r2                @(1_1)vector load pu1_src
    vld1.u32    {d3},[r12],r8               @(1_1)vector load pu1_src
    vmlsl.u8    q6,d17,d27                  @(3)mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    @sub       r12, r12, #6                 @(1_1)

    vst1.8      {d8},[r4],r3                @(1)store the result pu1_dst
    vqrshrun.s16 d10,q5,#6                  @(2)right shift and saturating narrow result 2

    vld1.u32    {d4},[r12],r11              @(2_1)vector load pu1_src
    vmull.u8    q11,d19,d25                 @(4)mul_res = vmull_u8(src[0_1], coeffabs_1)@

    vld1.u32    {d5},[r12],r11              @(2_1)vector load pu1_src
    vmlsl.u8    q11,d18,d24                 @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d6},[r12],r11              @(2_1)vector load pu1_src
    vmlal.u8    q11,d20,d26                 @(4)mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    @vld1.u32  {d7},[r12],r2                @(2_1)vector load pu1_src
    vld1.u32    {d7},[r12],r8               @(2_1)vector load pu1_src
    vmlsl.u8    q11,d21,d27                 @(4)mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    @sub       r12, r12, #6                 @(2_1)

    vst1.8      {d10},[r4],r3               @(2)store the result pu1_dst
    vqrshrun.s16 d12,q6,#6                  @(3)right shift and saturating narrow result 1

    vld1.u32    {d14},[r12],r11             @(3_1)vector load pu1_src
    vmull.u8    q4,d1,d25                   @(1_1)mul_res = vmull_u8(src[0_1], coeffabs_1)@

    vld1.u32    {d15},[r12],r11             @(3_1)vector load pu1_src
    vmlsl.u8    q4,d0,d24                   @(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d16},[r12],r11             @(3_1)vector load pu1_src
    vmlal.u8    q4,d2,d26                   @(1_1)mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    @vld1.u32  {d17},[r12],r2               @(3_1)vector load pu1_src
    vld1.u32    {d17},[r12],r8              @(3_1)vector load pu1_src
    vmlsl.u8    q4,d3,d27                   @(1_1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    @sub       r12, r12, #6                 @(3_1)

    vst1.8      {d12},[r4],r3               @(3)store the result pu1_dst
    vqrshrun.s16 d22,q11,#6                 @(4)right shift and saturating narrow result 2

    add         r9,r9,#8                    @(core loop)

    vmull.u8    q5,d5,d25                   @(2_1)mul_res = vmull_u8(src[0_1], coeffabs_1)@
    vld1.u32    {d18},[r12],r11             @(4_1)vector load pu1_src

    vld1.u32    {d19},[r12],r11             @(4_1)vector load pu1_src
    vmlsl.u8    q5,d4,d24                   @(2_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d20},[r12],r11             @(4_1)vector load pu1_src
    vmlal.u8    q5,d6,d26                   @(2_1)mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    vld1.u32    {d21},[r12],r2              @(4_1)vector load pu1_src
    vmlsl.u8    q5,d7,d27                   @(2_1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    add         r1,r1,#8                    @(core loop)

    subs        r7,r7,#8                    @(core loop)

    vst1.8      {d22}, [r4], r3             @(4)store the result pu1_dst
    vqrshrun.s16 d8,q4,#6                   @(1_1)right shift and saturating narrow result 1

    mov         r4, r1                      @(core loop)

    bgt         core_loop                   @loopback
epilogue:
    vmull.u8    q6,d15,d25                  @(3)mul_res = vmull_u8(src[0_1], coeffabs_1)@

    vmlsl.u8    q6,d14,d24                  @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vmlal.u8    q6,d16,d26                  @(3)mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    vmlsl.u8    q6,d17,d27                  @(3)mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    vst1.8      {d8},[r4],r3                @(1)store the result pu1_dst
    vqrshrun.s16 d10,q5,#6                  @(2)right shift and saturating narrow result 2

    vmull.u8    q11,d19,d25                 @(4)mul_res = vmull_u8(src[0_1], coeffabs_1)@
    vmlsl.u8    q11,d18,d24                 @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)@

    vmlal.u8    q11,d20,d26                 @(4)mul_res = vmlal_u8(src[0_2], coeffabs_2)@

    vmlsl.u8    q11,d21,d27                 @(4)mul_res = vmlsl_u8(src[0_3], coeffabs_3)@

    vst1.8      {d10},[r4],r3               @(2)store the result pu1_dst
    vqrshrun.s16 d12,q6,#6                  @(3)right shift and saturating narrow result 1

    vst1.8      {d12},[r4],r3               @(3)store the result pu1_dst

    add         r1,r1,#8                    @(epilogue)

    vqrshrun.s16 d22,q11,#6                 @(4)right shift and saturating narrow result 2


    vst1.8      {d22}, [r4], r3             @(4)store the result pu1_dst

    sub         r9,r9,r5                    @rewind src to the start of the row
    subs        r14,r14,#4                  @decrement the ht loop
    sub         r1,r1,r5                    @rewind dst to the start of the row
    add         r9,r9,r2,lsl #2             @advance src by four rows
    add         r1,r1,r3,lsl #2             @advance dst by four rows
    bgt         outer_loop_ht_4
    b           end_loops
outer_loop_4:
    add         r6,r1,r3                    @pu1_dst + dst_strd
    mov         r7,r5                       @2wd
    add         r4,r12,r2                   @pu1_src + src_strd

inner_loop_4:
    @vld1.u32  {d0,d1},[r12]                @vector load pu1_src

    vld1.u32    {d0},[r12],r11              @vector load pu1_src
    vld1.u32    {d1},[r12],r11              @vector load pu1_src
    vld1.u32    {d2},[r12],r11              @vector load pu1_src
    vld1.u32    {d3},[r12]                  @vector load pu1_src

    sub         r12,r12,#2                  @adjust the pointer for the next 4 outputs
    vld1.u32    {d4},[r4],r11               @vector load pu1_src
    vld1.u32    {d5},[r4],r11               @vector load pu1_src
    vld1.u32    {d6},[r4],r11               @vector load pu1_src
    vld1.u32    {d7},[r4]                   @vector load pu1_src
    @vext.u8   d2,d0,d1,#2                  @vector extract of src[0_2]
    @vext.u8   d4,d0,d1,#4                  @vector extract of src[0_4]
    @vld1.u32  {d12,d13},[r4]               @vector load pu1_src + src_strd
    @vext.u8   d6,d0,d1,#6                  @vector extract of src[0_6]

    sub         r4,r4,#2                    @adjust the pointer for the next 4 outputs
    @vext.u8   d14,d12,d13,#2               @vector extract of src[0_2]
    @vext.u8   d16,d12,d13,#4               @vector extract of src[0_4]
    @vext.u8   d18,d12,d13,#6               @vector extract of src[0_6]

    vzip.32     d0,d4                       @zip rows i and ii into a single register
    vzip.32     d1,d5
    vzip.32     d2,d6
    vzip.32     d3,d7

    vmull.u8    q4,d1,d25                   @one set of macs filters both rows at once
    vmlsl.u8    q4,d0,d24
    vmlal.u8    q4,d2,d26
    vmlsl.u8    q4,d3,d27

    vqrshrun.s16 d8,q4,#6                   @right shift and saturating narrow the result
    vst1.32     {d8[0]},[r1]!               @store the row i result from the lower half of the register
    subs        r7,r7,#4                    @decrement the wd by 4

    vst1.32     {d8[1]},[r6]!               @store the row ii result from the upper half of the register

    bgt         inner_loop_4

    sub         r12,r12,r5                  @rewind to the start of the row
    subs        r14,r14,#2                  @decrement the ht by 2
    sub         r1,r1,r5
    add         r12,r12,r2,lsl #1           @advance src by two rows
    add         r1,r1,r3,lsl #1             @advance dst by two rows
    bgt         outer_loop_4

end_loops:

    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @restore the registers and return (pc <- saved lr)