@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_horz_neon.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded in neon assembly and can be compiled using rvct
@*
@* @author
@*  yogeswaran rs / akshaya mukund
@*
@* @par list of functions:
@*  - ihevc_inter_pred_chroma_horz_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*    chroma inter prediction filter for horizontal input
@*
@* @par description:
@*    applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
@*    to the elements pointed to by 'pu1_src' and writes to the location
@*    pointed to by 'pu1_dst'. the output is downshifted by 6 and clipped to
@*    8 bits.
@*    assumptions: the function is optimized assuming the width is a multiple
@*    of 2, 4 or 8. if the width is 2, the height should be a multiple of 2.
@*    widths of 4 and 8 are optimized further.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
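@/**
@* for reference, a minimal c sketch of the filtering performed below.
@* chroma samples are interleaved cb/cr, so the taps of one component sit
@* two bytes apart and each row holds 2*wd output bytes. the CLIP_U8 macro
@* and the explicit rounding term are illustrative, not taken from this file:
@*
@*     for(row = 0; row < ht; row++)
@*     {
@*         for(col = 0; col < 2 * wd; col++)
@*         {
@*             word32 sum = 0;
@*             for(i = 0; i < 4; i++)
@*                 sum += pi1_coeff[i] * pu1_src[col + (i - 1) * 2];
@*             pu1_dst[col] = CLIP_U8((sum + 32) >> 6); /* round, shift, clip */
@*         }
@*         pu1_src += src_strd;
@*         pu1_dst += dst_strd;
@*     }
@*/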

@void ihevc_inter_pred_chroma_horz(uword8 *pu1_src,
@                                   uword8 *pu1_dst,
@                                   word32 src_strd,
@                                   word32 dst_strd,
@                                   word8 *pi1_coeff,
@                                   word32 ht,
@                                   word32 wd)
@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pu1_dst
@r2 =>  src_strd
@r3 =>  dst_strd
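@the remaining arguments are passed on the stack; after the ten registers
@pushed by the stmfd below (40 bytes), they are read back at:
@sp + #40 => pi1_coeff
@sp + #44 => ht
@sp + #48 => wd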

.text
.align 4

.globl ihevc_inter_pred_chroma_horz_a9q

.type ihevc_inter_pred_chroma_horz_a9q, %function

ihevc_inter_pred_chroma_horz_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r4,[sp,#40]                 @loads pi1_coeff
    ldr         r7,[sp,#44]                 @loads ht
    ldr         r10,[sp,#48]                @loads wd

    vld1.8      {d0},[r4]                   @coeff = vld1_s8(pi1_coeff)
    subs        r14,r7,#0                   @r14 = ht; exit if ht <= 0
    vabs.s8     d2,d0                       @vabs_s8(coeff)
    mov         r11,#2                      @2-byte step between the four tap loads
    ble         end_loops

    vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         r12,r0,#2                   @pu1_src - 2
    vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add         r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd
    vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)

    tst         r10,#3                      @checks if wd is a multiple of 4
    mov         r5,r10,lsl #1               @r5 = 2*wd, output bytes per row

    vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)

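@the branches below select one of four code paths:
@wd % 4 != 0            => outer_loop_4
@wd >= 8 and wd != 12   => outer_loop_16
@else, ht % 4 == 0      => outer_loop_ht_4, otherwise outer_loop_8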
    bne         outer_loop_4
    cmp         r10,#12
    beq         skip_16

    cmp         r10,#8
    bge         outer_loop_16
skip_16:
    tst         r7,#3                       @checks if ht is a multiple of 4

    sub         r9,r0,#2                    @pu1_src - 2
    beq         outer_loop_ht_4             @ht is a multiple of 4

    b           outer_loop_8

outer_loop_16:
    mov         r10,r5                      @2wd
    mul         r14,r14,r10                 @r14 = ht * 2wd, total number of output bytes

    rsb         r6,r3,#16                   @r6 = 16 - dst_strd

    add         r4,r12,r2
    mov         r9,#10                      @final load steps by 10, so the four loads advance 16 in all
    and         r0, r12, #31
    rsb         r8,r5,r3,lsl #1             @r8 = 2*dst_strd - 2wd
    pld         [r12, r2, lsl #1]           @prefetch two rows ahead

    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    pld         [r4, r2, lsl #1]
    vld1.u32    {q1},[r12],r11              @vector load pu1_src

    vld1.u32    {q2},[r12],r11              @vector load pu1_src

    vld1.u32    {q3},[r12],r9               @vector load pu1_src

    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vld1.u32    {q4},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vld1.u32    {q5},[r4],r11               @vector load pu1_src
    vmlal.u8    q15,d4,d26                  @mul_res = vmlal_u8(src[0_2], coeffabs_2)
    vld1.u32    {q6},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d6,d27                  @mul_res = vmlsl_u8(src[0_3], coeffabs_3)
    vld1.u32    {q7},[r4],r9                @vector load pu1_src
    vmull.u8    q14,d3,d25                  @same filter on the high halves of row 0

    vmlsl.u8    q14,d1,d24

    vmlal.u8    q14,d5,d26

    vmlsl.u8    q14,d7,d27

    cmp         r14,#32                     @exactly one 16x2 block left?
    beq         epilog_end
    sub         r14,#64                     @reserve the last two 16x2 blocks for epilog/epilog_end

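@inner_loop_16 is software pipelined: each pass stores the 16 results
@computed in the previous pass, issues the multiply-accumulates for the
@current pair of rows, and loads the sources for the next pass. r12 and
@r4 track the two rows, and 32 output bytes (16 columns x 2 rows)
@retire per pass.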
inner_loop_16:

@   pld         [r12, r2, lsl #1]
@   pld         [r4, r2, lsl #1]

    subs        r10,r10,#16                 @decrement the column counter

    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[1_1], coeffabs_1)

    addeq       r12,r12,r8                  @end of row pair: jump src to the next pair
    addeq       r4,r12,r2
    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[1_0], coeffabs_0)

    pld         [r12, r2, lsl #2]
    vqrshrun.s16 d30,q15,#6                 @round, shift by 6 and saturate row 0 (low)

    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    vqrshrun.s16 d31,q14,#6                 @round, shift by 6 and saturate row 0 (high)

    vld1.u32    {q1},[r12],r11              @vector load pu1_src
    vmlal.u8    q11,d12,d26                 @mul_res = vmlal_u8(src[1_2], coeffabs_2)

    vld1.u32    {q2},[r12],r11              @vector load pu1_src
    vmlsl.u8    q11,d14,d27                 @mul_res = vmlsl_u8(src[1_3], coeffabs_3)

    vld1.u32    {q3},[r12],r9               @vector load pu1_src
    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[1_1], coeffabs_1)

    pld         [r4, r2, lsl #2]
    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[1_0], coeffabs_0)

    vst1.16     {q15}, [r1],r3              @store the row 0 result to pu1_dst
    vmlal.u8    q10,d13,d26                 @mul_res = vmlal_u8(src[1_2], coeffabs_2)

    vld1.u32    {q4},[r4],r11               @vector load pu1_src
    vmlsl.u8    q10,d15,d27                 @mul_res = vmlsl_u8(src[1_3], coeffabs_3)

    vld1.u32    {q5},[r4],r11               @vector load pu1_src
    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32    {q6},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vld1.u32    {q7},[r4],r9                @vector load pu1_src
    vmlal.u8    q15,d4,d26                  @mul_res = vmlal_u8(src[0_2], coeffabs_2)

    cmp         r10,#0
    vqrshrun.s16 d22,q11,#6                 @round, shift by 6 and saturate row 1 (low)
    vqrshrun.s16 d23,q10,#6                 @round, shift by 6 and saturate row 1 (high)

    vmlsl.u8    q15,d6,d27                  @mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    moveq       r10,r5                      @2wd
    vmull.u8    q14,d3,d25                  @high halves of row 0

    vst1.16     {q11},[r1],r6               @store the row 1 result to pu1_dst
    vmlsl.u8    q14,d1,d24

    addeq       r1,r1,r8                    @end of row pair: jump dst to the next pair
    vmlal.u8    q14,d5,d26

    subs        r14,r14,#32                 @32 output bytes done, decrement the counter
    vmlsl.u8    q14,d7,d27

    bgt         inner_loop_16

    add         r14,r14,#64                 @restore the bytes reserved for the epilog
    cmp         r14,#32
    beq         epilog_end

epilog:
    vqrshrun.s16 d30,q15,#6                 @round, shift by 6 and saturate row 0 (low)
    vqrshrun.s16 d31,q14,#6                 @round, shift by 6 and saturate row 0 (high)

    vst1.16     {q15}, [r1],r3              @store the row 0 result to pu1_dst
    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[1_1], coeffabs_1)

    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[1_0], coeffabs_0)
    subs        r10,r10,#16                 @decrement the wd loop
    vmlal.u8    q11,d12,d26                 @mul_res = vmlal_u8(src[1_2], coeffabs_2)
    addeq       r12,r12,r8                  @end of row pair: jump src to the next pair
    vmlsl.u8    q11,d14,d27                 @mul_res = vmlsl_u8(src[1_3], coeffabs_3)
    moveq       r10,r5                      @2wd

    addeq       r4,r12,r2
    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[1_1], coeffabs_1)
    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[1_0], coeffabs_0)
    vld1.u32    {q1},[r12],r11              @vector load pu1_src
    vmlal.u8    q10,d13,d26                 @mul_res = vmlal_u8(src[1_2], coeffabs_2)
    vld1.u32    {q2},[r12],r11              @vector load pu1_src
    vmlsl.u8    q10,d15,d27                 @mul_res = vmlsl_u8(src[1_3], coeffabs_3)
    vld1.u32    {q3},[r12],r9               @vector load pu1_src
    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32    {q4},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vld1.u32    {q5},[r4],r11               @vector load pu1_src
    vmlal.u8    q15,d4,d26                  @mul_res = vmlal_u8(src[0_2], coeffabs_2)

    vmlsl.u8    q15,d6,d27                  @mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vld1.u32    {q6},[r4],r11               @vector load pu1_src
    vmull.u8    q14,d3,d25                  @high halves of row 0
    vld1.u32    {q7},[r4],r9                @vector load pu1_src
    vmlsl.u8    q14,d1,d24
    vqrshrun.s16 d22,q11,#6                 @round, shift by 6 and saturate row 1 (low)
    vqrshrun.s16 d23,q10,#6                 @round, shift by 6 and saturate row 1 (high)

    vst1.16     {q11},[r1],r6               @store the row 1 result to pu1_dst
    vmlal.u8    q14,d5,d26

    vmlsl.u8    q14,d7,d27
    addeq       r1,r1,r8                    @end of row pair: jump dst to the next pair

epilog_end:
    vqrshrun.s16 d30,q15,#6                 @round, shift by 6 and saturate row 0 (low)
    vqrshrun.s16 d31,q14,#6                 @round, shift by 6 and saturate row 0 (high)

    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[1_1], coeffabs_1)
    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[1_0], coeffabs_0)
    vmlal.u8    q11,d12,d26                 @mul_res = vmlal_u8(src[1_2], coeffabs_2)
    vmlsl.u8    q11,d14,d27                 @mul_res = vmlsl_u8(src[1_3], coeffabs_3)

    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[1_1], coeffabs_1)
    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[1_0], coeffabs_0)
    vmlal.u8    q10,d13,d26                 @mul_res = vmlal_u8(src[1_2], coeffabs_2)
    vmlsl.u8    q10,d15,d27                 @mul_res = vmlsl_u8(src[1_3], coeffabs_3)
    vqrshrun.s16 d22,q11,#6                 @round, shift by 6 and saturate row 1 (low)
    vqrshrun.s16 d23,q10,#6                 @round, shift by 6 and saturate row 1 (high)

    vst1.16     {q15}, [r1],r3              @store the row 0 result to pu1_dst

    vst1.16     {q11},[r1]                  @store the row 1 result to pu1_dst

    b           end_loops

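@outer_loop_8 processes two rows per pass (r12 and r4 = r12 + src_strd)
@and eight output bytes per inner iteration; the four overlapping
@d-register loads at 2-byte steps stand in for the vext-based extracts
@kept commented out below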
outer_loop_8:

    add         r6,r1,r3                    @pu1_dst + dst_strd
    mov         r7,r5                       @2wd as the column counter
    add         r4,r12,r2                   @pu1_src + src_strd

inner_loop_8:
    @vld1.u32  {d0,d1},[r12],r11            @vector load pu1_src
    vld1.u32    {d0},[r12],r11              @vector load pu1_src
    vld1.u32    {d1},[r12],r11              @vector load pu1_src
    vld1.u32    {d2},[r12],r11              @vector load pu1_src
    vld1.u32    {d3},[r12],r11              @vector load pu1_src

    @vext.u8   d2,d0,d1,#2                  @vector extract of src[0_2]
    vmull.u8    q4,d1,d25                   @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    @vext.u8   d4,d0,d1,#4                  @vector extract of src[0_4]
    @vext.u8   d6,d0,d1,#6                  @vector extract of src[0_6]
    vmlal.u8    q4,d2,d26                   @mul_res = vmlal_u8(src[0_2], coeffabs_2)
    vmlsl.u8    q4,d3,d27                   @mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vld1.u32    {d4},[r4],r11               @vector load pu1_src
    vld1.u32    {d5},[r4],r11               @vector load pu1_src
    vld1.u32    {d6},[r4],r11               @vector load pu1_src
    vld1.u32    {d7},[r4],r11               @vector load pu1_src
    @vld1.u32  {d12,d13},[r4],r11           @vector load pu1_src + src_strd
    @vext.u8   d14,d12,d13,#2               @vector extract of src[0_2]
    vmull.u8    q5,d5,d25                   @mul_res = vmull_u8(src[1_1], coeffabs_1)
    vmlsl.u8    q5,d4,d24                   @mul_res = vmlsl_u8(src[1_0], coeffabs_0)
    @vext.u8   d16,d12,d13,#4               @vector extract of src[0_4]
    @vext.u8   d18,d12,d13,#6               @vector extract of src[0_6]
    vqrshrun.s16 d8,q4,#6                   @rounding right shift and saturating narrow, result 1
    vmlal.u8    q5,d6,d26                   @mul_res = vmlal_u8(src[1_2], coeffabs_2)
    vmlsl.u8    q5,d7,d27                   @mul_res = vmlsl_u8(src[1_3], coeffabs_3)

    vst1.8      {d8},[r1]!                  @store the result to pu1_dst

    vqrshrun.s16 d10,q5,#6                  @rounding right shift and saturating narrow, result 2
    subs        r7,r7,#8                    @decrement the wd loop
    vst1.8      {d10},[r6]!                 @store the result to pu1_dst
    bgt         inner_loop_8

    sub         r12,r12,r5                  @rewind src by 2wd
    subs        r14,r14,#2                  @decrement the ht loop
    sub         r1,r1,r5                    @rewind dst by 2wd
    add         r12,r12,r2,lsl #1           @advance src by two rows
    add         r1,r1,r3,lsl #1             @advance dst by two rows
    bgt         outer_loop_8
    b           end_loops

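@outer_loop_ht_4 processes four rows at a time down a column of eight
@output bytes. the work is software pipelined: inner_loop_ht_4 primes
@rows 1-4, core_loop then overlaps the stores of one 8x4 column with the
@loads and multiplies of the next, and epilogue drains the final column.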
@handles the case where ht is a multiple of 4
outer_loop_ht_4:

    mov         r7,r5                       @2wd as the column counter

prologue_ht_4:

inner_loop_ht_4:

    mov         r12,r9
    mov         r4,r1

    sub         r8, r2, #6                  @src_strd - 6: the fourth load returns the pointer to the start of the next row

    vld1.u32    {d0},[r12],r11              @(1)vector load pu1_src
    vld1.u32    {d1},[r12],r11              @(1)vector load pu1_src
    vld1.u32    {d2},[r12],r11              @(1)vector load pu1_src
    @vld1.u32  {d3},[r12],r2                @(1)vector load pu1_src
    vld1.u32    {d3},[r12],r8               @(1)vector load pu1_src

    @sub       r12, r12, #6                 @(1)

    vld1.u32    {d4},[r12],r11              @(2)vector load pu1_src
    vld1.u32    {d5},[r12],r11              @(2)vector load pu1_src
    vld1.u32    {d6},[r12],r11              @(2)vector load pu1_src
    @vld1.u32  {d7},[r12],r2                @(2)vector load pu1_src
    vld1.u32    {d7},[r12],r8               @(2)vector load pu1_src

    @sub       r12, r12, #6                 @(2)

    vld1.u32    {d14},[r12],r11             @(3)vector load pu1_src
    vmull.u8    q4,d1,d25                   @(1)mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32    {d15},[r12],r11             @(3)vector load pu1_src
    vmlsl.u8    q4,d0,d24                   @(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vld1.u32    {d16},[r12],r11             @(3)vector load pu1_src
    vmlal.u8    q4,d2,d26                   @(1)mul_res = vmlal_u8(src[0_2], coeffabs_2)

    @vld1.u32  {d17},[r12],r2               @(3)vector load pu1_src
    vld1.u32    {d17},[r12],r8              @(3)vector load pu1_src
    vmlsl.u8    q4,d3,d27                   @(1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    @sub       r12, r12, #6                 @(3)
    vmull.u8    q5,d5,d25                   @(2)mul_res = vmull_u8(src[1_1], coeffabs_1)

    vld1.u32    {d18},[r12],r11             @(4)vector load pu1_src
    vmlsl.u8    q5,d4,d24                   @(2)mul_res = vmlsl_u8(src[1_0], coeffabs_0)

    vld1.u32    {d19},[r12],r11             @(4)vector load pu1_src
    vmlal.u8    q5,d6,d26                   @(2)mul_res = vmlal_u8(src[1_2], coeffabs_2)

    vld1.u32    {d20},[r12],r11             @(4)vector load pu1_src
    vmlsl.u8    q5,d7,d27                   @(2)mul_res = vmlsl_u8(src[1_3], coeffabs_3)

    vld1.u32    {d21},[r12],r2              @(4)vector load pu1_src
    vqrshrun.s16 d8,q4,#6                   @(1)rounding right shift and saturating narrow, result 1

    add         r9,r9,#8                    @(core loop)

    subs        r7,r7,#8                    @(prologue)decrement the wd loop
    beq         epilogue

core_loop:
    mov         r12,r9

    vld1.u32    {d0},[r12],r11              @(1_1)vector load pu1_src
    vmull.u8    q6,d15,d25                  @(3)mul_res = vmull_u8(src[2_1], coeffabs_1)

    vld1.u32    {d1},[r12],r11              @(1_1)vector load pu1_src
    vmlsl.u8    q6,d14,d24                  @(3)mul_res = vmlsl_u8(src[2_0], coeffabs_0)

    vld1.u32    {d2},[r12],r11              @(1_1)vector load pu1_src
    vmlal.u8    q6,d16,d26                  @(3)mul_res = vmlal_u8(src[2_2], coeffabs_2)

    @vld1.u32  {d3},[r12],r2                @(1_1)vector load pu1_src
    vld1.u32    {d3},[r12],r8               @(1_1)vector load pu1_src
    vmlsl.u8    q6,d17,d27                  @(3)mul_res = vmlsl_u8(src[2_3], coeffabs_3)

    @sub       r12, r12, #6                 @(1_1)

    vst1.8      {d8},[r4],r3                @(1)store the result to pu1_dst
    vqrshrun.s16 d10,q5,#6                  @(2)rounding right shift and saturating narrow, result 2

    vld1.u32    {d4},[r12],r11              @(2_1)vector load pu1_src
    vmull.u8    q11,d19,d25                 @(4)mul_res = vmull_u8(src[3_1], coeffabs_1)

    vld1.u32    {d5},[r12],r11              @(2_1)vector load pu1_src
    vmlsl.u8    q11,d18,d24                 @(4)mul_res = vmlsl_u8(src[3_0], coeffabs_0)

    vld1.u32    {d6},[r12],r11              @(2_1)vector load pu1_src
    vmlal.u8    q11,d20,d26                 @(4)mul_res = vmlal_u8(src[3_2], coeffabs_2)

    @vld1.u32  {d7},[r12],r2                @(2_1)vector load pu1_src
    vld1.u32    {d7},[r12],r8               @(2_1)vector load pu1_src
    vmlsl.u8    q11,d21,d27                 @(4)mul_res = vmlsl_u8(src[3_3], coeffabs_3)

    @sub       r12, r12, #6                 @(2_1)

    vst1.8      {d10},[r4],r3               @(2)store the result to pu1_dst
    vqrshrun.s16 d12,q6,#6                  @(3)rounding right shift and saturating narrow, result 1

    vld1.u32    {d14},[r12],r11             @(3_1)vector load pu1_src
    vmull.u8    q4,d1,d25                   @(1_1)mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32    {d15},[r12],r11             @(3_1)vector load pu1_src
    vmlsl.u8    q4,d0,d24                   @(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vld1.u32    {d16},[r12],r11             @(3_1)vector load pu1_src
    vmlal.u8    q4,d2,d26                   @(1_1)mul_res = vmlal_u8(src[0_2], coeffabs_2)

    @vld1.u32  {d17},[r12],r2               @(3_1)vector load pu1_src
    vld1.u32    {d17},[r12],r8              @(3_1)vector load pu1_src
    vmlsl.u8    q4,d3,d27                   @(1_1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    @sub       r12, r12, #6                 @(3_1)

    vst1.8      {d12},[r4],r3               @(3)store the result to pu1_dst
    vqrshrun.s16 d22,q11,#6                 @(4)rounding right shift and saturating narrow, result 2

    add         r9,r9,#8                    @(core loop)

    vmull.u8    q5,d5,d25                   @(2_1)mul_res = vmull_u8(src[1_1], coeffabs_1)
    vld1.u32    {d18},[r12],r11             @(4_1)vector load pu1_src

    vld1.u32    {d19},[r12],r11             @(4_1)vector load pu1_src
    vmlsl.u8    q5,d4,d24                   @(2_1)mul_res = vmlsl_u8(src[1_0], coeffabs_0)

    vld1.u32    {d20},[r12],r11             @(4_1)vector load pu1_src
    vmlal.u8    q5,d6,d26                   @(2_1)mul_res = vmlal_u8(src[1_2], coeffabs_2)

    vld1.u32    {d21},[r12],r2              @(4_1)vector load pu1_src
    vmlsl.u8    q5,d7,d27                   @(2_1)mul_res = vmlsl_u8(src[1_3], coeffabs_3)

    add         r1,r1,#8                    @(core loop)

    subs        r7,r7,#8                    @(core loop)

    vst1.8      {d22}, [r4], r3             @(4)store the result to pu1_dst
    vqrshrun.s16 d8,q4,#6                   @(1_1)rounding right shift and saturating narrow, result 1

    mov         r4, r1                      @(core loop)

    bgt         core_loop                   @loop back

epilogue:
    vmull.u8    q6,d15,d25                  @(3)mul_res = vmull_u8(src[2_1], coeffabs_1)

    vmlsl.u8    q6,d14,d24                  @(3)mul_res = vmlsl_u8(src[2_0], coeffabs_0)

    vmlal.u8    q6,d16,d26                  @(3)mul_res = vmlal_u8(src[2_2], coeffabs_2)

    vmlsl.u8    q6,d17,d27                  @(3)mul_res = vmlsl_u8(src[2_3], coeffabs_3)

    vst1.8      {d8},[r4],r3                @(1)store the result to pu1_dst
    vqrshrun.s16 d10,q5,#6                  @(2)rounding right shift and saturating narrow, result 2

    vmull.u8    q11,d19,d25                 @(4)mul_res = vmull_u8(src[3_1], coeffabs_1)
    vmlsl.u8    q11,d18,d24                 @(4)mul_res = vmlsl_u8(src[3_0], coeffabs_0)

    vmlal.u8    q11,d20,d26                 @(4)mul_res = vmlal_u8(src[3_2], coeffabs_2)

    vmlsl.u8    q11,d21,d27                 @(4)mul_res = vmlsl_u8(src[3_3], coeffabs_3)

    vst1.8      {d10},[r4],r3               @(2)store the result to pu1_dst
    vqrshrun.s16 d12,q6,#6                  @(3)rounding right shift and saturating narrow, result 1

    vst1.8      {d12},[r4],r3               @(3)store the result to pu1_dst

    add         r1,r1,#8                    @(epilogue)

    vqrshrun.s16 d22,q11,#6                 @(4)rounding right shift and saturating narrow, result 2

    vst1.8      {d22}, [r4], r3             @(4)store the result to pu1_dst

    sub         r9,r9,r5                    @rewind src by 2wd
    subs        r14,r14,#4                  @decrement the ht loop
    sub         r1,r1,r5                    @rewind dst by 2wd
    add         r9,r9,r2,lsl #2             @advance src by four rows
    add         r1,r1,r3,lsl #2             @advance dst by four rows
    bgt         outer_loop_ht_4
    b           end_loops

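@outer_loop_4 zips 32-bit chunks of two adjacent rows into shared
@d registers, so each 8-lane multiply-accumulate below produces four
@output bytes for each of the two rows at once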
outer_loop_4:
    add         r6,r1,r3                    @pu1_dst + dst_strd
    mov         r7,r5                       @2wd as the column counter
    add         r4,r12,r2                   @pu1_src + src_strd

inner_loop_4:
    @vld1.u32  {d0,d1},[r12]                @vector load pu1_src

    vld1.u32    {d0},[r12],r11              @vector load pu1_src
    vld1.u32    {d1},[r12],r11              @vector load pu1_src
    vld1.u32    {d2},[r12],r11              @vector load pu1_src
    vld1.u32    {d3},[r12]                  @vector load pu1_src

    sub         r12,r12,#2                  @step back by 2 so the pointer advances 4 in all (next 4 columns)
    vld1.u32    {d4},[r4],r11               @vector load pu1_src
    vld1.u32    {d5},[r4],r11               @vector load pu1_src
    vld1.u32    {d6},[r4],r11               @vector load pu1_src
    vld1.u32    {d7},[r4]                   @vector load pu1_src
    @vext.u8   d2,d0,d1,#2                  @vector extract of src[0_2]
    @vext.u8   d4,d0,d1,#4                  @vector extract of src[0_4]
    @vld1.u32  {d12,d13},[r4]               @vector load pu1_src + src_strd
    @vext.u8   d6,d0,d1,#6                  @vector extract of src[0_6]

    sub         r4,r4,#2                    @step back by 2 so the pointer advances 4 in all (next 4 columns)
    @vext.u8   d14,d12,d13,#2               @vector extract of src[0_2]
    @vext.u8   d16,d12,d13,#4               @vector extract of src[0_4]
    @vext.u8   d18,d12,d13,#6               @vector extract of src[0_6]

    vzip.32     d0,d4                       @zip the i and ii iterations into a single register
    vzip.32     d1,d5
    vzip.32     d2,d6
    vzip.32     d3,d7

    vmull.u8    q4,d1,d25                   @filter both iterations at the same time
    vmlsl.u8    q4,d0,d24
    vmlal.u8    q4,d2,d26
    vmlsl.u8    q4,d3,d27

    vqrshrun.s16 d8,q4,#6                   @rounding right shift and saturating narrow
    vst1.32     {d8[0]},[r1]!               @store the i iteration result from the lower half of the register
    subs        r7,r7,#4                    @decrement the wd by 4

    vst1.32     {d8[1]},[r6]!               @store the ii iteration result from the upper half of the register

    bgt         inner_loop_4

    sub         r12,r12,r5                  @rewind src by 2wd
    subs        r14,r14,#2                  @decrement the ht by 2
    sub         r1,r1,r5                    @rewind dst by 2wd
    add         r12,r12,r2,lsl #1           @advance src by two rows
    add         r1,r1,r3,lsl #1             @advance dst by two rows
    bgt         outer_loop_4

end_loops:

    ldmfd       sp!,{r4-r12,r15}            @restore the registers from the stack and return