@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_horz_neon.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded using neon intrinsics and can be compiled using
@*  rvct
@*
@* @author
@*  yogeswaran rs / akshaya mukund
@*
@* @par list of functions:
@*
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*       chroma inter prediction filter to store horizontal 16bit output
@*
@* @par description:
@*    applies a horizontal filter with coefficients pointed to  by 'pi1_coeff'
@*    to the elements pointed by 'pu1_src' and  writes to the location pointed
@*    by 'pu1_dst'  no downshifting or clipping is done and the output is  used
@*    as an input for vertical filtering or weighted  prediction
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@void ihevc_inter_pred_chroma_horz_w16out(uword8 *pu1_src,
@                                          word16 *pi2_dst,
@                                          word32 src_strd,
@                                          word32 dst_strd,
@                                          word8 *pi1_coeff,
@                                          word32 ht,
@                                          word32 wd)
@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pi2_dst
@r2 =>  src_strd
@r3 =>  dst_strd
.equ    coeff_offset,   104
.equ    ht_offset,      108
.equ    wd_offset,      112

.text
.align 4


.globl ihevc_inter_pred_chroma_horz_w16out_a9q

.type ihevc_inter_pred_chroma_horz_w16out_a9q, %function

@-----------------------------------------------------------------------------
@ ihevc_inter_pred_chroma_horz_w16out_a9q
@ ABI: AAPCS (ARMv7-A, NEON). 4-tap chroma horizontal filter, 16-bit output
@ (no downshift/clip).
@ In:  r0 = pu1_src, r1 = pi2_dst, r2 = src_strd, r3 = dst_strd
@      stack: pi1_coeff (sp+coeff_offset), ht (sp+ht_offset), wd (sp+wd_offset)
@      (offsets account for the stmfd/vpush below: 10 words + 8 dwords = 104)
@ Working registers after the prologue:
@      r5 = 2*wd, r11 = 2 (interleaved-load step), r12/r4/r9 = src cursors,
@      r14 = ht counter, r7 = ht residue (ht & 1)
@      d24..d27 = abs(coeff[0..3]) broadcast
@ Clobbers: r0-r12, r14, q0-q7, q10, q11, q14, q15 (d8-d15 saved/restored)
@-----------------------------------------------------------------------------
ihevc_inter_pred_chroma_horz_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8 - d15}

    ldr         r4,[sp,#coeff_offset]       @loads pi1_coeff
    ldr         r6,[sp,#ht_offset]          @loads ht
    ldr         r10,[sp,#wd_offset]         @loads wd

    vld1.8      {d0},[r4]                   @coeff = vld1_s8(pi1_coeff)
    subs        r14,r6,#0                   @checks for ht == 0
    vabs.s8     d2,d0                       @vabs_s8(coeff)

@******* added
    mov         r11, #2                     @post-increment step used by the strided loads
@******* added ends

    ble         end_loops

    vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         r12,r0,#2                   @pu1_src - 2
    vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add         r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd
    vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)

    tst         r10,#3                      @checks wd for multiples of 4
    mov         r5,r10,lsl #1               @2wd

    vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)

    and         r7,r14,#1                   @added              @calculating ht_residue ht_residue = (ht & 1)
    sub         r14,r14,r7                  @added              @decrement height by ht_residue(residue value is calculated outside)

    bne         outer_loop_4                @ this branching happens when the width is 2 or 6

    cmp         r10,#12
    beq         skip_16

    cmp         r10,#8
    bge         outer_loop_16

skip_16:
    tst         r6,#3

@******* removal
    @mov       r11,#8
@******* removal ends

    sub         r9,r0,#2
    beq         outer_loop_ht_4             @this branching happens when the height is a multiple of 4


@    cmp        r10,#12
@    beq    outer_loop_8
@    cmp        r10,#16
@    bge    outer_loop_16
    b           outer_loop_8


@ 16-wide path, software-pipelined two rows at a time.
outer_loop_16:
    add         r4,r12,r2

    and         r0, r12, #31
    pld         [r12, r2, lsl #1]

    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    mov         r10,r5                      @2wd
    mul         r14,r14,r10                 @r14 = ht * 2wd, counted down in units of 32
    vld1.u32    {q1},[r12],r11              @vector load pu1_src
    pld         [r4, r2, lsl #1]
    mov         r9,#10                      @final load step: advance to next 16-col group
    vld1.u32    {q2},[r12],r11              @vector load pu1_src
    rsb         r6,r3,#8
    sub         r8,r3,#8
    vld1.u32    {q3},[r12],r9               @vector load pu1_src

    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    vld1.u32    {q4},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    vld1.u32    {q5},[r4],r11               @vector load pu1_src
    vmlal.u8    q15,d4,d26                  @mul_res = vmlal_u8(src[0_0], coeffabs_0)@
    vld1.u32    {q6},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d6,d27                  @mul_res = vmlsl_u8(src[0_1], coeffabs_1)@
    vld1.u32    {q7},[r4],r9                @vector load pu1_src
    vmull.u8    q14,d3,d25
    lsl         r6,#1                       @dst post-store steps are in 16-bit units
    rsb         r3,r5,r3,lsl #1             @r3 = 2*dst_strd - 2wd (row-advance for dst)
    vmlsl.u8    q14,d1,d24
    lsl         r8,#1
    rsb         r7,r5,r2,lsl #1             @r7 = 2*src_strd - 2wd (row-advance for src)
    vmlal.u8    q14,d5,d26

    vmlsl.u8    q14,d7,d27
    cmp         r14,#32
    beq         epilog_end                  @only one iteration's worth of data: skip main loop
    sub         r14,#64

inner_loop_16:

    @ and           r7, r12, #31                    @decrement the wd loop
    @ cmp           r7, r0
    pld         [r12, r2, lsl #2]
    pld         [r4, r2, lsl #2]

    subs        r10,r10,#16

    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@

@    addeq      r12,r12,r2,lsl #1
@    subeq      r12,r12,r5
    addeq       r12,r12,r7                  @row exhausted: advance src cursors to next row pair
    addeq       r4,r12,r2

    vst1.16     {q15}, [r1]!
    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@

    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    vmlal.u8    q11,d12,d26                 @mul_res = vmlal_u8(src[0_0], coeffabs_0)@

    vld1.u32    {q1},[r12],r11              @vector load pu1_src
    vmlsl.u8    q11,d14,d27                 @mul_res = vmlsl_u8(src[0_1], coeffabs_1)@

    vld1.u32    {q2},[r12],r11              @vector load pu1_src
    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@

    vst1.16     {q14}, [r1],r8
    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@

    vld1.u32    {q3},[r12],r9               @vector load pu1_src
    vmlal.u8    q10,d13,d26                 @mul_res = vmlal_u8(src[0_0], coeffabs_0)@

    vld1.u32    {q4},[r4],r11               @vector load pu1_src
    vmlsl.u8    q10,d15,d27                 @mul_res = vmlsl_u8(src[0_1], coeffabs_1)@

    vld1.u32    {q5},[r4],r11               @vector load pu1_src
    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@

    vld1.u32    {q6},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@

    vld1.u32    {q7},[r4],r9                @vector load pu1_src
    vmlal.u8    q15,d4,d26                  @mul_res = vmlal_u8(src[0_0], coeffabs_0)@

    vst1.16     {q11},[r1]!                 @store the result pu1_dst
    vmlsl.u8    q15,d6,d27                  @mul_res = vmlsl_u8(src[0_1], coeffabs_1)@

    moveq       r10,r5                      @2wd
    vmull.u8    q14,d3,d25

    vmlsl.u8    q14,d1,d24
    vst1.16     {q10},[r1],r6               @store the result pu1_dst

    addeq       r1,r1,r3,lsl #1             @row exhausted: advance dst to next row pair
    vmlal.u8    q14,d5,d26

    subs        r14,r14,#32                 @decrement the ht loop
    vmlsl.u8    q14,d7,d27

@    mov            r0, r7
    bgt         inner_loop_16

    add         r14,r14,#64
    cmp         r14,#32
    beq         epilog_end

@ Drain stage 1 of the pipeline: results for the second-to-last iteration.
epilog:

    vst1.16     {q15}, [r1]!
    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    vst1.16     {q14}, [r1],r8

    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    subs        r10,r10,#16                 @decrement the wd loop
    vmlal.u8    q11,d12,d26                 @mul_res = vmlal_u8(src[0_0], coeffabs_0)@
@    addeq      r12,r12,r2,lsl #1
    addeq       r12,r12,r7
    vmlsl.u8    q11,d14,d27                 @mul_res = vmlsl_u8(src[0_1], coeffabs_1)@
    @ subeq     r12,r12,r5
    moveq       r10,r5                      @2wd
    addeq       r4,r12,r2
    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    vld1.u32    {q0},[r12],r11              @vector load pu1_src
    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    vld1.u32    {q1},[r12],r11              @vector load pu1_src
    vmlal.u8    q10,d13,d26                 @mul_res = vmlal_u8(src[0_0], coeffabs_0)@
    vld1.u32    {q2},[r12],r11              @vector load pu1_src
    vmlsl.u8    q10,d15,d27                 @mul_res = vmlsl_u8(src[0_1], coeffabs_1)@
    vld1.u32    {q3},[r12],r9               @vector load pu1_src
    vmull.u8    q15,d2,d25                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@

    vld1.u32    {q4},[r4],r11               @vector load pu1_src
    vmlsl.u8    q15,d0,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    vld1.u32    {q5},[r4],r11               @vector load pu1_src
    vmlal.u8    q15,d4,d26                  @mul_res = vmlal_u8(src[0_0], coeffabs_0)@

    vmlsl.u8    q15,d6,d27                  @mul_res = vmlsl_u8(src[0_1], coeffabs_1)@

    vld1.u32    {q6},[r4],r11               @vector load pu1_src
    vmull.u8    q14,d3,d25
    vld1.u32    {q7},[r4],r9                @vector load pu1_src
    vmlsl.u8    q14,d1,d24
    vst1.16     {q11},[r1]!                 @store the result pu1_dst
    vmlal.u8    q14,d5,d26
    vst1.16     {q10},[r1],r6               @store the result pu1_dst
    vmlsl.u8    q14,d7,d27
    addeq       r1,r1,r3,lsl #1

@ Drain the final pipeline stage and store all pending results.
epilog_end:

    vmull.u8    q11,d10,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    vmlsl.u8    q11,d8,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    vmlal.u8    q11,d12,d26                 @mul_res = vmlal_u8(src[0_0], coeffabs_0)@
    vmlsl.u8    q11,d14,d27                 @mul_res = vmlsl_u8(src[0_1], coeffabs_1)@

    vmull.u8    q10,d11,d25                 @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    vmlsl.u8    q10,d9,d24                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    vmlal.u8    q10,d13,d26                 @mul_res = vmlal_u8(src[0_0], coeffabs_0)@
    vmlsl.u8    q10,d15,d27                 @mul_res = vmlsl_u8(src[0_1], coeffabs_1)@

    vst1.16     {q15}, [r1]!
    vst1.16     {q14}, [r1],r8
    vst1.16     {q11},[r1]!                 @store the result pu1_dst
    vst1.16     {q10},[r1],r6               @store the result pu1_dst

    ldr         r6,[sp,#ht_offset]          @loads ht

    and         r7,r6,#1                    @ht residue: one leftover row if ht is odd

    cmp         r7,#0
    mov         r10,r5
    addne       r12,r12,r2,lsl #1
    subne       r12,r12,r5
    addne       r1,r1,r3,lsl #1

    bgt         loop_residue_4

    b           end_loops


@ 8-wide path, two rows per iteration.
outer_loop_8:

    add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
    mov         r10,r5                      @2wd
    add         r4,r12,r2                   @pu1_src + src_strd

inner_loop_8:
    @vld1.u32  {d0,d1},[r12],r11               @vector load pu1_src
    vld1.u32    {d0},[r12],r11              @vector load pu1_src
    vld1.u32    {d1},[r12],r11              @vector load pu1_src
    vld1.u32    {d2},[r12],r11              @vector load pu1_src
    vld1.u32    {d3},[r12],r11              @vector load pu1_src


    @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    vmull.u8    q4,d1,d25                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
    @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
    vmlal.u8    q4,d2,d26                   @mul_res = vmlal_u8(src[0_0], coeffabs_0)@
    vmlsl.u8    q4,d3,d27                   @mul_res = vmlsl_u8(src[0_1], coeffabs_1)@

    @vld1.u32  {d12,d13},[r4],r11              @vector load pu1_src + src_strd
    vld1.u32    {d4},[r4],r11               @vector load pu1_src
    vld1.u32    {d5},[r4],r11               @vector load pu1_src
    vld1.u32    {d6},[r4],r11               @vector load pu1_src
    vld1.u32    {d7},[r4],r11               @vector load pu1_src
    @vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
    vmull.u8    q5,d5,d25                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    vmlsl.u8    q5,d4,d24                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    @vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
    @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
    vmlal.u8    q5,d6,d26                   @mul_res = vmlal_u8(src[0_0], coeffabs_0)@
    vmlsl.u8    q5,d7,d27                   @mul_res = vmlsl_u8(src[0_1], coeffabs_1)@

    vst1.16     {d8, d9}, [r1]!

    subs        r10,r10,#8                  @decrement the wd loop
    vst1.16     {d10, d11},[r6]!            @store the result pu1_dst
    bgt         inner_loop_8

    sub         r12,r12,r5
    subs        r14,r14,#2                  @decrement the ht loop
    sub         r1,r1,r5,lsl #1
    add         r12,r12,r2,lsl #1
    add         r1,r1,r3,lsl #2
    bgt         outer_loop_8

    cmp         r7,#0
    mov         r10,r5
    bgt         loop_residue_4

    b           end_loops


@height if 4 comes
@ 4-rows-at-a-time path (ht multiple of 4), software pipelined across rows.
outer_loop_ht_4:

    mov         r10,r5

prologue_ht_4:
    mov         r8,r3,lsl #1

inner_loop_ht_4:

    mov         r12,r9
    mov         r4,r1

    sub         r0, r2, #6                  @ not sure if r0 needs to be preserved

    vld1.u32    {d0},[r12],r11              @(1)vector load pu1_src
    vld1.u32    {d1},[r12],r11              @(1)vector load pu1_src
    vld1.u32    {d2},[r12],r11              @(1)vector load pu1_src
    vld1.u32    {d3},[r12],r0               @(1)vector load pu1_src

    vld1.u32    {d4},[r12],r11              @(2)vector load pu1_src
    vld1.u32    {d5},[r12],r11              @(2)vector load pu1_src
    vld1.u32    {d6},[r12],r11              @(2)vector load pu1_src
    vld1.u32    {d7},[r12],r0               @(2)vector load pu1_src

    vld1.u32    {d14},[r12],r11             @(3)vector load pu1_src
    vmull.u8    q4,d1,d25                   @(1)mul_res = vmull_u8(src[0_3], coeffabs_3)@

    vld1.u32    {d15},[r12],r11             @(3)vector load pu1_src
    vmlsl.u8    q4,d0,d24                   @(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@

    vld1.u32    {d16},[r12],r11             @(3)vector load pu1_src
    vmlal.u8    q4,d2,d26                   @(1)mul_res = vmlal_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d17},[r12],r0              @(3)vector load pu1_src
    vmlsl.u8    q4,d3,d27                   @(1)mul_res = vmlsl_u8(src[0_1], coeffabs_1)@

    vld1.u32    {d18},[r12],r11             @(4)vector load pu1_src
    vmull.u8    q5,d5,d25                   @(2)mul_res = vmull_u8(src[0_3], coeffabs_3)@

    vld1.u32    {d19},[r12],r11             @(4)vector load pu1_src
    vmlsl.u8    q5,d4,d24                   @(2)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@

    vld1.u32    {d20},[r12],r11             @(4)vector load pu1_src
    vmlal.u8    q5,d6,d26                   @(2)mul_res = vmlal_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d21},[r12],r2              @(4)vector load pu1_src
    vmlsl.u8    q5,d7,d27                   @(2)mul_res = vmlsl_u8(src[0_1], coeffabs_1)@

    add         r9,r9,#8                    @(core loop)

    subs        r10,r10,#8                  @(prologue)decrement the wd loop
    beq         epilogue

core_loop:
    vst1.16     {d8, d9},[r4],r8            @(1)store the result pu1_dst
    mov         r12,r9

    vld1.u32    {d0},[r12],r11              @(1_1)vector load pu1_src
    vmull.u8    q6,d15,d25                  @(3)mul_res = vmull_u8(src[0_3], coeffabs_3)@

    vld1.u32    {d1},[r12],r11              @(1_1)vector load pu1_src
    vmlsl.u8    q6,d14,d24                  @(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@

    vld1.u32    {d2},[r12],r11              @(1_1)vector load pu1_src
    vmlal.u8    q6,d16,d26                  @(3)mul_res = vmlal_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d3},[r12],r0               @(1_1)vector load pu1_src
    vmlsl.u8    q6,d17,d27                  @(3)mul_res = vmlsl_u8(src[0_1], coeffabs_1)@

    vst1.16     {d10, d11},[r4],r8          @(2)store the result pu1_dst
    add         r9,r9,#8                    @(core loop)

    vld1.u32    {d4},[r12],r11              @(2_1)vector load pu1_src
    vmull.u8    q11,d19,d25                 @(4)mul_res = vmull_u8(src[0_3], coeffabs_3)@

    vld1.u32    {d5},[r12],r11              @(2_1)vector load pu1_src
    vmlsl.u8    q11,d18,d24                 @(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@

    vld1.u32    {d6},[r12],r11              @(2_1)vector load pu1_src
    vmlal.u8    q11,d20,d26                 @(4)mul_res = vmlal_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d7},[r12],r0               @(2_1)vector load pu1_src
    vmlsl.u8    q11,d21,d27                 @(4)mul_res = vmlsl_u8(src[0_1], coeffabs_1)@

    vst1.16     {d12, d13},[r4],r8          @(3)store the result pu1_dst
    add         r1,r1,#16                   @(core loop)

    vld1.u32    {d14},[r12],r11             @(3_1)vector load pu1_src
    vmull.u8    q4,d1,d25                   @(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)@

    vld1.u32    {d15},[r12],r11             @(3_1)vector load pu1_src
    vmlsl.u8    q4,d0,d24                   @(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@

    vld1.u32    {d16},[r12],r11             @(3_1)vector load pu1_src
    vmlal.u8    q4,d2,d26                   @(1_1)mul_res = vmlal_u8(src[0_0], coeffabs_0)@

    vld1.u32    {d17},[r12],r0              @(3_1)vector load pu1_src
    vmlsl.u8    q4,d3,d27                   @(1_1)mul_res = vmlsl_u8(src[0_1], coeffabs_1)@

    vst1.16     {d22, d23}, [r4], r8        @(4)store the result pu1_dst
    subs        r10,r10,#8                  @(core loop)

    vmull.u8    q5,d5,d25                   @(2_1)mul_res = vmull_u8(src[0_3], coeffabs_3)@
    vld1.u32    {d18},[r12],r11             @(4_1)vector load pu1_src

    vld1.u32    {d19},[r12],r11             @(4_1)vector load pu1_src
    vmlsl.u8    q5,d4,d24                   @(2_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@

    vld1.u32    {d20},[r12],r11             @(4_1)vector load pu1_src
    vmlal.u8    q5,d6,d26                   @(2_1)mul_res = vmlal_u8(src[0_0], coeffabs_0)@

    mov         r4, r1                      @(core loop)

    vld1.u32    {d21},[r12],r0              @(4_1)vector load pu1_src
    vmlsl.u8    q5,d7,d27                   @(2_1)mul_res = vmlsl_u8(src[0_1], coeffabs_1)@

    bgt         core_loop                   @loopback

epilogue:
    vmull.u8    q6,d15,d25                  @(3)mul_res = vmull_u8(src[0_3], coeffabs_3)@

    vmlsl.u8    q6,d14,d24                  @(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@

    vmlal.u8    q6,d16,d26                  @(3)mul_res = vmlal_u8(src[0_0], coeffabs_0)@

    vmlsl.u8    q6,d17,d27                  @(3)mul_res = vmlsl_u8(src[0_1], coeffabs_1)@

    vst1.16     {d8, d9},[r4], r8           @(1)store the result pu1_dst

    vmull.u8    q11,d19,d25                 @(4)mul_res = vmull_u8(src[0_3], coeffabs_3)@
    vmlsl.u8    q11,d18,d24                 @(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)@

    vmlal.u8    q11,d20,d26                 @(4)mul_res = vmlal_u8(src[0_0], coeffabs_0)@

    vmlsl.u8    q11,d21,d27                 @(4)mul_res = vmlsl_u8(src[0_1], coeffabs_1)@

    vst1.16     {d10, d11},[r4], r8         @(2)store the result pu1_dst

    vst1.16     {d12, d13},[r4], r8         @(3)store the result pu1_dst

    add         r1,r1,#16                   @(core loop)

    vst1.16     {d22, d23},[r4], r8         @(4)store the result pu1_dst

    sub         r9,r9,r5
    subs        r14,r14,#4                  @decrement the ht loop
    sub         r1,r1,r5,lsl #1
    add         r9,r9,r2,lsl #2
    add         r1,r1,r3,lsl #3
    bgt         outer_loop_ht_4

    cmp         r7,#0
    mov         r10,r5
    movgt       r12,r9
    movgt       r4,r1
    bgt         loop_residue_4

    b           end_loops

@ 4-wide path (wd == 2 or 6), two rows per iteration zipped together.
outer_loop_4:
    add         r6,r1,r3,lsl #1             @pu1_dst + dst_strd
    mov         r10,r5
    add         r4,r12,r2                   @pu1_src + src_strd

inner_loop_4:
    @vld1.u32  {d0,d1},[r12]                   @vector load pu1_src
    vld1.u32    {d0},[r12],r11              @vector load pu1_src
    vld1.u32    {d1},[r12],r11              @vector load pu1_src
    vld1.u32    {d2},[r12],r11              @vector load pu1_src
    vld1.u32    {d3},[r12]                  @vector load pu1_src

@**** removal
    @add       r12,r12,#4                      @increment the input pointer
@**** removal ends
@**** addn
    sub         r12,r12,#2                  @increment the input pointer
@**** addn ends
    vld1.u32    {d4},[r4],r11               @vector load pu1_src
    vld1.u32    {d5},[r4],r11               @vector load pu1_src
    vld1.u32    {d6},[r4],r11               @vector load pu1_src
    vld1.u32    {d7},[r4]                   @vector load pu1_src
    @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
    @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
    @vld1.u32  {d12,d13},[r4]                  @vector load pu1_src + src_strd
    @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]

    @add       r4,r4,#4                        @increment the input pointer
    sub         r4,r4,#2
    @vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
    @vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
    @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]

@**** removal
    @vzip.32   d0,d12                          @vector zip the i iteration and ii interation in single register
    @vzip.32   d2,d14
    @vzip.32   d4,d16
    @vzip.32   d6,d18
@**** removal ends
@**** addn
    vzip.32     d0,d4                       @vector zip the i iteration and ii iteration in single register
    vzip.32     d1,d5
    vzip.32     d2,d6
    vzip.32     d3,d7
@**** addn ends

    vmull.u8    q4,d1,d25                   @arithmetic operations for ii iteration in the same time
    vmlsl.u8    q4,d0,d24
    vmlal.u8    q4,d2,d26
    vmlsl.u8    q4,d3,d27

    vst1.32     {d8},[r1]!                  @store the i iteration result which is in upper part of the register
    subs        r10,r10,#4                  @decrement the wd by 4

    vst1.32     {d9},[r6]!                  @store the ii iteration result which is in lower part of the register

    bgt         inner_loop_4

    sub         r12,r12,r5
    subs        r14,r14,#2                  @decrement the ht by 2
    sub         r1,r1,r5,lsl #1
    add         r12,r12,r2,lsl #1
    add         r1,r1,r3,lsl #2
    bgt         outer_loop_4

    cmp         r7,#0
    mov         r10,r5
    beq         end_loops

@ Residual single row when ht is odd.
loop_residue_4:

    mov         r10,r5                      @2wd

loop_residue:

    @vld1.u32  {d0,d1},[r12]                   @vector load pu1_src
    vld1.u32    {d0},[r12],r11              @vector load pu1_src
    vld1.u32    {d1},[r12],r11              @vector load pu1_src
    vld1.u32    {d2},[r12],r11              @vector load pu1_src
    vld1.u32    {d3},[r12]                  @vector load pu1_src
    @vext.u8       d2,d0,d1,#2             @vector extract of src[0_2]
    @vmull.u8      q4,d2,d25               @mul_res = vmull_u8(src[0_3], coeffabs_3)@
    @vmlsl.u8      q4,d0,d24               @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
    @vext.u8       d4,d0,d1,#4             @vector extract of src[0_4]
    @add           r12,r12,#4              @pu1_src + 4
    sub         r12, r12, #2
    @vext.u8       d6,d0,d1,#6             @vector extract of src[0_6]
    @vmlal.u8      q4,d4,d26               @mul_res = vmlal_u8(src[0_0], coeffabs_0)@
    @vmlsl.u8      q4,d6,d27               @mul_res = vmlsl_u8(src[0_1], coeffabs_1)@
    vmull.u8    q4,d1,d25
    vmlsl.u8    q4,d0,d24
    vmlal.u8    q4,d2,d26
    vmlsl.u8    q4,d3,d27

    vst1.64     {d8 },[r1]                  @store the result pu1_dst
    subs        r10,r10,#4                  @decrement the wd loop
    add         r1,r1,#8                    @pi2_dst + 8

    bgt         loop_residue                @loop again

    @inner loop ends
    @add           r8,r3,lsl #1            @2*dst_strd
    @sub           r8,r8,r5,lsl #1         @2*dst_strd - 2wd
    @sub           r9,r2,r5                @src_strd - 2wd
    @subs          r7,r7,#1                @decrement the ht loop
    @add           r12,r12,r9              @pu1_src + src_strd
    @add           r1,r1,r8                @pu1_dst + 2*dst_strd
    @bgt           outer_loop_residue_4    @loop again
    @b                 end_loops               @jumps to end

end_loops:

    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp (r15 pop returns)