• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*!
2 * \copy
3 *     Copyright (c)  2013, Cisco Systems
4 *     All rights reserved.
5 *
6 *     Redistribution and use in source and binary forms, with or without
7 *     modification, are permitted provided that the following conditions
8 *     are met:
9 *
10 *        * Redistributions of source code must retain the above copyright
11 *          notice, this list of conditions and the following disclaimer.
12 *
13 *        * Redistributions in binary form must reproduce the above copyright
14 *          notice, this list of conditions and the following disclaimer in
15 *          the documentation and/or other materials provided with the
16 *          distribution.
17 *
18 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 *     POSSIBILITY OF SUCH DAMAGE.
30 *
31 */
32
33#ifdef HAVE_NEON
34#include "arm_arch_common_macro.S"
35
36
// DyadicBilinearDownsampler_neon
// Halves a byte image in both directions. Each loop pass reads 32 bytes from
// two adjacent source rows and writes 16 output bytes, each built from a 2x2
// source block with rounding halving adds (horizontal first, then vertical).
// Register use, as read from the code below:
//   r0 = destination pointer      r1 = destination stride
//   r2 = source pointer           r3 = source stride
//   [sp, #24] / [sp, #28] (after the push) = src_width / src_height
// q4-q7 are not touched. No return instruction is visible in this block;
// presumably WELS_ASM_FUNC_END (from arm_arch_common_macro.S) emits it - confirm.
37WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
38    stmdb   sp!, {r4-r8, lr}
39
40    //Fetch the stack arguments; they sit above the 24 bytes just pushed
41    ldr  r4, [sp,   #24]    //src_width
42    ldr  r5, [sp,   #28]    //src_height
43
44    //Initialize the registers
45    mov r6, r2              //r6 = start of the current source row pair
46    mov r8, r0              //r8 = start of the current destination row
47    mov lr, #0              //lr = source bytes consumed in the current row
48    lsr r5, #1              //r5 = number of output rows (src_height / 2)
49
50    //Save the 16 bytes at dst + dst_stride * rows; they are written back at the
    //end. Presumably this protects data just past the last output row when the
    //width is not aligned to the 32-byte step - confirm against the caller.
51    mla  r7, r1, r5, r0
52    vld1.32 {q15}, [r7]
53
54    add r7, r2, r3          //r7 = second source row of the pair
55    //Main loop: one pass handles 32 source columns of the current row pair
56comp_ds_bilinear_loop0:
57
58    vld1.8 {q0,q1}, [r2]!   //32 bytes of the upper row
59    vld1.8 {q2,q3}, [r7]!   //32 bytes of the lower row
60    vuzp.8 q0, q1           //q0 = even-indexed bytes, q1 = odd-indexed bytes
61    vuzp.8 q2, q3
62    vrhadd.u8 q0, q0, q1    //rounded average of each horizontal pair, upper row
63    vrhadd.u8 q2, q2, q3    //same for the lower row
64    vrhadd.u8 q0, q0, q2    //rounded average of the two rows
65    vst1.32 {q0},   [r0]!
66    add lr, #32
67
    //When lr >= src_width (carry set) step to the next row pair; every
    //instruction below except the final branch only runs in that case.
68    cmp lr, r4
69    movcs   lr, #0
70    addcs   r6, r6, r3, lsl #1  //source advances two rows
71    movcs   r2, r6
72    addcs   r7, r2, r3
73    addcs   r8, r1              //destination advances one row
74    movcs   r0, r8
75    subscs r5, #1               //Z is set only once the last row is finished
76    bne comp_ds_bilinear_loop0  //mid-row: Z is clear from the cmp above (lr < width)
77
78    //Restore the saved tail; r0 now equals dst + dst_stride * rows
79    vst1.32 {q15}, [r0]
80
81    ldmia   sp!, {r4-r8,lr}
82WELS_ASM_FUNC_END
83
84
// comp_ds_bilinear_w_x8_neon
// Halves a byte image in both directions, 8 source columns (4 output bytes)
// per inner-loop pass.
// Register use, as read from the code below:
//   r0 = destination pointer      r1 = destination stride
//   r2 = source pointer           r3 = source stride
//   [sp, #20] / [sp, #24] (after the push) = src_width / src_height
// Both loops test their counter after the body, so a width below 8 or a height
// below 2 would wrap the counter. NOTE(review): callers presumably guarantee a
// width that is a multiple of 8 - confirm.
85WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
86    stmdb   sp!, {r4-r7, lr}
87
88    //Fetch the stack arguments; they sit above the 20 bytes just pushed
89    ldr  r4, [sp,   #20]    //src_width
90    ldr  r5, [sp,   #24]    //src_height
91
92    //Per-row pointer corrections applied after each finished row
93    sub lr, r3, r4              //lr = src_stride - src_width
94    sub r1, r1, r4, lsr #1      //r1 = dst_stride - src_width / 2
95
96    lsr r5, #1                  //r5 = number of output rows
97
98    //Outer loop: one pass per output row
99comp_ds_bilinear_w_x8_loop0:
100
101    lsr r6, r4, #3          //r6 = inner passes per row (src_width / 8)
102    add r7, r2, r3          //r7 = second source row of the pair
103    //Inner loop: 8 source columns per pass
104comp_ds_bilinear_w_x8_loop1:
105
106    vld1.8 {d0}, [r2]!      //8 bytes of the upper row
107    vld1.8 {d1}, [r7]!      //8 bytes of the lower row
108    vpaddl.u8   q0, q0      //sum adjacent bytes: d0 = upper-row sums, d1 = lower-row sums
109    vrshr.u16   q0, #1      //rounded halving of every horizontal sum
110    vrhadd.u16 d0, d1       //rounded average of the two rows
111
112    vmovn.u16   d0, q0      //narrow to bytes; the low 4 bytes hold the results
113    vst1.32 {d0[0]}, [r0]!  //store those 4 output bytes
114    subs r6, #1
115    bne comp_ds_bilinear_w_x8_loop1
116
117    add r2, r7, lr          //r7 ended one row below r2, so this skips to the next row pair
118    add r0, r1              //move to the start of the next destination row
119    subs r5, #1
120    bne comp_ds_bilinear_w_x8_loop0
121
122    ldmia   sp!, {r4-r7,lr}
123WELS_ASM_FUNC_END
124
125
// comp_ds_bilinear_w_x16_neon
// Halves a byte image in both directions, 16 source columns (8 output bytes)
// per inner-loop pass.
// Register use, as read from the code below:
//   r0 = destination pointer      r1 = destination stride
//   r2 = source pointer           r3 = source stride
//   [sp, #20] / [sp, #24] (after the push) = src_width / src_height
// Both loops test their counter after the body. NOTE(review): callers
// presumably guarantee a width that is a non-zero multiple of 16 - confirm.
126WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
127    stmdb   sp!, {r4-r7, lr}
128
129    //Fetch the stack arguments; they sit above the 20 bytes just pushed
130    ldr  r4, [sp,   #20]    //src_width
131    ldr  r5, [sp,   #24]    //src_height
132
133    //Per-row pointer corrections applied after each finished row
134    sub lr, r3, r4              //lr = src_stride - src_width
135    sub r1, r1, r4, lsr #1      //r1 = dst_stride - src_width / 2
136
137    lsr r5, #1                  //r5 = number of output rows
138
139    //Outer loop: one pass per output row
140comp_ds_bilinear_w_x16_loop0:
141
142    lsr r6, r4, #4          //r6 = inner passes per row (src_width / 16)
143    add r7, r2, r3          //r7 = second source row of the pair
144    //Inner loop: 16 source columns per pass
145comp_ds_bilinear_w_x16_loop1:
146
147    vld1.8 {q0}, [r2]!      //16 bytes of the upper row
148    vld1.8 {q1}, [r7]!      //16 bytes of the lower row
149    vpaddl.u8   q0, q0      //sum adjacent bytes into 8 halfwords per row
150    vpaddl.u8   q1, q1
151    vrshr.u16   q0, #1      //rounded halving of every horizontal sum
152    vrshr.u16   q1, #1
153    vrhadd.u16 q0, q1       //rounded average of the two rows
154
155    vmovn.u16   d0, q0      //narrow the 8 results to bytes
156    vst1.32 {d0},   [r0]!   //store 8 output bytes
157    subs r6, #1
158    bne comp_ds_bilinear_w_x16_loop1
159
160    add r2, r7, lr          //r7 ended one row below r2, so this skips to the next row pair
161    add r0, r1              //move to the start of the next destination row
162    subs r5, #1
163    bne comp_ds_bilinear_w_x16_loop0
164
165    ldmia   sp!, {r4-r7,lr}
166WELS_ASM_FUNC_END
167
168
// DyadicBilinearDownsamplerWidthx32_neon
// Halves a byte image in both directions, 32 source columns (16 output bytes)
// per inner-loop pass.
// Register use, as read from the code below:
//   r0 = destination pointer      r1 = destination stride
//   r2 = source pointer           r3 = source stride
//   [sp, #20] / [sp, #24] (after the push) = src_width / src_height
// Both loops test their counter after the body. NOTE(review): callers
// presumably guarantee a width that is a non-zero multiple of 32 - confirm.
169WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
170    stmdb   sp!, {r4-r7, lr}
171
172    //Fetch the stack arguments; they sit above the 20 bytes just pushed
173    ldr  r4, [sp,   #20]    //src_width
174    ldr  r5, [sp,   #24]    //src_height
175
176    //Per-row pointer corrections applied after each finished row
177    sub lr, r3, r4              //lr = src_stride - src_width
178    sub r1, r1, r4, lsr #1      //r1 = dst_stride - src_width / 2
179
180    lsr r5, #1                  //r5 = number of output rows
181
182    //Outer loop: one pass per output row
183comp_ds_bilinear_w_x32_loop0:
184
185    lsr r6, r4, #5          //r6 = inner passes per row (src_width / 32)
186    add r7, r2, r3          //r7 = second source row of the pair
187    //Inner loop: 32 source columns per pass
188comp_ds_bilinear_w_x32_loop1:
189
190    vld1.8 {q0,q1}, [r2]!   //32 bytes of the upper row
191    vld1.8 {q2,q3}, [r7]!   //32 bytes of the lower row
192    vuzp.8 q0, q1           //q0 = even-indexed bytes, q1 = odd-indexed bytes
193    vuzp.8 q2, q3
194    vrhadd.u8 q0, q0, q1    //rounded average of each horizontal pair, upper row
195    vrhadd.u8 q2, q2, q3    //same for the lower row
196    vrhadd.u8 q0, q0, q2    //rounded average of the two rows
197    vst1.32 {q0},   [r0]!   //store 16 output bytes
198    subs r6, #1
199    bne comp_ds_bilinear_w_x32_loop1
200
201    add r2, r7, lr          //r7 ended one row below r2, so this skips to the next row pair
202    add r0, r1              //move to the start of the next destination row
203    subs r5, #1
204    bne comp_ds_bilinear_w_x32_loop0
205
206    ldmia   sp!, {r4-r7,lr}
207WELS_ASM_FUNC_END
208
209
// GeneralBilinearAccurateDownsampler_neon
// Arbitrary-ratio bilinear resampler that walks the source with 15-bit
// fractional fixed-point positions (position >> 15 is the integer index).
// Register use, as read from the code below:
//   r0 = destination pointer      r1 = destination stride
//   r2 = destination width        r3 = destination height
//   [sp, #40..#52] (after the push) = src pointer, src stride, scaleX, scaleY
// All rows but the last and all columns but the last are interpolated from a
// 2x2 source block; the last column of each row and the whole last row are
// plain copies of the nearest source byte.
// Lane lists in the comments below are written from the highest lane to lane 0.
// NOTE(review): the row and column loops test their counters after the body,
// so a destination width or height below 2 would wrap the counter - confirm
// that callers exclude it.
210WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon
211    stmdb sp!, {r4-r12, lr}
212
213    //Fetch the stack arguments; they sit above the 40 bytes just pushed
214    ldr r4, [sp, #40] //the addr of src
215    ldr r5, [sp, #44] //the value of src_stride
216    ldr r6, [sp, #48] //the value of scaleX
217    ldr r7, [sp, #52] //the value of scaleY
218
219    mov     r10, #32768
220    sub     r10, #1             // r10 = 0x7fff, mask for the 15 fractional bits
221    and     r8, r6, r10         // r8 = uinc (scaleX & 0x7fff, i.e. scaleX mod 32768)
222    mov     r11, #-1
223    mul     r11, r8         // r11 = -uinc
224
225    vdup.s16 d2, r8
226    vdup.s16 d0, r11
227    vzip.s16 d0, d2         // d0 = uinc -uinc uinc -uinc
228
229    and     r9, r7, r10         // r9 = vinc (scaleY & 0x7fff, i.e. scaleY mod 32768)
230    mov     r11, #-1
231    mul     r11, r9         // r11 = -vinc
232
233    vdup.s16 d2, r9
234    vdup.s16 d3, r11
235    vext.8   d5, d3, d2, #4     // d5 = vinc vinc -vinc -vinc
236
237    mov      r11, #0x40000000
238    mov      r12, #0x4000
239    sub      r12, #1
240    add      r11, r12           // r11 = 0x40003fff (16384 in the high half, 16383 in the low half)
241    vdup.s32 d1, r11;           //init u  16384 16383 16384 16383
242
243    mov      r11, #16384
244    vdup.s16 d16, r11
245    sub      r11, #1
246    vdup.s16 d17, r11
247    vext.8   d7, d17, d16, #4       //init v  16384 16384 16383 16383
248
249    veor    q14,     q14        // clear q14 so the byte-lane loads below leave zero-extended values
250    sub     r1,     r2          // stride - width
251    mov     r8,     #16384      // yInverse: vertical source position, starts at 0.5 in 15-bit fixed point
252    sub     r3,     #1          // interpolate height - 1 rows; the last row is copied at LAST_ROW
253
254_HEIGHT:
255    ldr     r4, [sp, #40]           //the addr of src
256    mov     r11,    r8
257    lsr     r11,    #15             // integer source row
258    mul     r11,    r5
259    add     r11,    r4                  // get current row address
260    mov     r12,    r11
261    add     r12,    r5                  // r12 = address of the source row below it
262
263    mov     r9,     #16384              // xInverse: horizontal source position, restarts at 0.5
264    sub     r10, r2, #1                 // interpolate width - 1 pixels; the last one is copied at WIDTH_END
265    vmov.s16 d6, d1                     // reset the horizontal weights for the new row
266
267_WIDTH:
268    mov     lr,     r9
269    lsr     lr,     #15                 // integer source column
270    add     r4,     r11,lr
271    vld2.8  {d28[0],d29[0]},    [r4]        //q14: 0000000b0000000a;
272    add     r4,     r12,lr
273    vld2.8  {d28[4],d29[4]},    [r4]        //q14: 000d000b000c000a;
274    vzip.32     d28, d29                    //q14: 000d000c000b000a;
275
276    vmull.u16   q13, d6, d7         //q13: init u  *  init  v
277    vmull.u32   q12, d26,d28        //64-bit products of weights 0,1 with a,b
278    vmlal.u32   q12, d27,d29        //accumulate products of weights 2,3 with c,d
279    vqadd.u64   d24, d24,d25        //add the two halves into one weighted total
280    vrshr.u64   d24, #30            //rounded shift removes the two 15-bit weight scales
281
282    vst1.8  {d24[0]},   [r0]!       //store the low byte as the output pixel
283    add     r9, r6                  //advance the horizontal position by scaleX
284    vadd.u16    d6, d0              // inc u
285    vshl.u16    d6, #1              // shifting left then right by one clears bit 15,
286    vshr.u16    d6, #1              // so each weight wraps modulo 32768
287    subs    r10, #1
288    bne     _WIDTH
289
    //Last pixel of the row: copy the nearest byte of the upper row
290WIDTH_END:
291    lsr     r9,     #15
292    add     r4,r11,r9
293    vld1.8  {d24[0]},   [r4]
294    vst1.8  {d24[0]},   [r0]
295    add     r0,     #1
296    add     r8,     r7              //advance the vertical position by scaleY
297    add     r0,     r1              //skip stride - width to reach the next destination row
298    vadd.s16    d7, d5              // inc v
299    vshl.u16    d7, #1              // clear bit 15 of each lane, as for the u weights
300    vshr.u16    d7, #1
301    subs    r3,     #1
302    bne     _HEIGHT
303
    //Last destination row: no interpolation, copy the nearest byte of one source row
304LAST_ROW:
305    ldr     r4, [sp, #40]           //the addr of src
306    lsr     r8, #15
307    mul     r8, r5
308    add     r4, r8                  // get current row address
309    mov     r9,     #16384          // horizontal position restarts at 0.5
310
311_LAST_ROW_WIDTH:
312    mov     r11,    r9
313    lsr     r11,    #15
314
315    add     r3,     r4,r11          // r3 is reused as a scratch address; the row counter is done
316    vld1.8  {d0[0]},    [r3]
317    vst1.8  {d0[0]},    [r0]
318    add     r0,     #1
319    add     r9,     r6
320    subs    r2,     #1              // r2 (destination width) is the column counter here
321    bne     _LAST_ROW_WIDTH
322
323    ldmia sp!, {r4-r12, lr}
324WELS_ASM_FUNC_END
325
// DyadicBilinearOneThirdDownsampler_neon
// 3:1 downsample in both directions. Each loop pass reads 48 bytes from the
// first two rows of a three-row group and writes 16 output bytes. Only the
// first two bytes of every column triple and the first two rows of every row
// triple contribute; the third column and third row are skipped.
// Register use, as read from the code below:
//   r0 = destination pointer      r1 = destination stride
//   r2 = source pointer           r3 = source stride
//   [sp, #24] = src_width; [sp, #28] = row count (see note at the load)
326WELS_ASM_FUNC_BEGIN DyadicBilinearOneThirdDownsampler_neon
327    stmdb sp!, {r4-r8, lr}
328
329    //Fetch the stack arguments; they sit above the 24 bytes just pushed
330    ldr  r4, [sp, #24]  //src_width
331    ldr  r5, [sp, #28]  //src_height
    //NOTE(review): r5 is used unscaled as the number of output rows (one
    //destination row per decrement), so the caller presumably passes the
    //destination height here despite the comment above - confirm.
332
333    //Initialize the registers
334    mov r6, r2          //r6 = start of the current source row group
335    mov r8, r0          //r8 = start of the current destination row
336    mov lr, #0          //lr = source bytes consumed in the current row
337
338    //Save the 16 bytes at dst + dst_stride * rows; they are written back at the end
339    mla  r7, r1, r5, r0
340    vld1.32 {q15}, [r7]
341
342    add r7, r2, r3      //r7 = second source row of the group
343    //Main loop: one pass handles 48 source columns of the current row group
344comp_ds_bilinear_onethird_loop0:
345
    //vld3.8 splits 24 bytes into three registers holding bytes 0,3,6,.. /
    //1,4,7,.. / 2,5,8,..; the third register of each load is never read
346    vld3.8 {d0, d1, d2}, [r2]!
347    vld3.8 {d3, d4, d5}, [r2]!
348    vld3.8 {d16, d17, d18}, [r7]!
349    vld3.8 {d19, d20, d21}, [r7]!
350
351    vaddl.u8 q11, d0, d1        //widening sum of the first two bytes of each triple, upper row
352    vaddl.u8 q12, d3, d4
353    vaddl.u8 q13, d16, d17      //same for the lower row
354    vaddl.u8 q14, d19, d20
355    vrshr.u16 q11, #1           //rounded halving of every horizontal sum
356    vrshr.u16 q12, #1
357    vrshr.u16 q13, #1
358    vrshr.u16 q14, #1
359
360    vrhadd.u16 q11, q13         //rounded average of the two rows
361    vrhadd.u16 q12, q14
362
363    vmovn.u16 d0, q11           //narrow the 16 results to bytes
364    vmovn.u16 d1, q12
365    vst1.8 {q0}, [r0]!
366
    //When lr >= src_width (carry set) step to the next row group; every
    //instruction below except the final branch only runs in that case.
367    add lr, #48
368    cmp lr, r4
369    movcs lr, #0
370    addcs r6, r6, r3, lsl #1    //source advances three rows in total:
371    addcs r6, r6, r3            //two strides here plus one more
372    movcs r2, r6
373    addcs r7, r2, r3
374    addcs r8, r1                //destination advances one row
375    movcs r0, r8
376    subscs r5, #1               //Z is set only once the last row is finished
377    bne comp_ds_bilinear_onethird_loop0
378
379    //Restore the saved tail; r0 now equals dst + dst_stride * rows
380    vst1.32 {q15}, [r0]
381
382    ldmia sp!, {r4-r8,lr}
383WELS_ASM_FUNC_END
384
// DyadicBilinearQuarterDownsampler_neon
// 4:1 downsample in both directions. Each loop pass reads 64 bytes from the
// first two rows of a four-row group and writes 16 output bytes. Only the
// first two bytes of every four columns and the first two rows of every four
// rows contribute; the rest are skipped.
// Register use, as read from the code below:
//   r0 = destination pointer      r1 = destination stride
//   r2 = source pointer           r3 = source stride
//   [sp, #24] / [sp, #28] (after the push) = src_width / src_height
385WELS_ASM_FUNC_BEGIN DyadicBilinearQuarterDownsampler_neon
386    stmdb sp!, {r4-r8, lr}
387
388    //Fetch the stack arguments; they sit above the 24 bytes just pushed
389    ldr  r4, [sp, #24]  //src_width
390    ldr  r5, [sp, #28]  //src_height
391
392    //Initialize the registers
393    mov r6, r2          //r6 = start of the current source row group
394    mov r8, r0          //r8 = start of the current destination row
395    mov lr, #0          //lr = source bytes consumed in the current row
396    lsr r5, #2          //r5 = number of output rows (src_height / 4)
397
398    //Save the 16 bytes at dst + dst_stride * rows; they are written back at the end
399    mla  r7, r1, r5, r0
400    vld1.32 {q15}, [r7]
401
402    add r7, r2, r3      //r7 = second source row of the group
403    //Main loop: one pass handles 64 source columns of the current row group
404comp_ds_bilinear_quarter_loop0:
405
    //vld2.16 splits 32 bytes by halfword: the first register gets the
    //even-indexed halfwords (bytes 0,1,4,5,..); the second register of each
    //load (q1, q3, q9, q11) is never read
406    vld2.16 {q0, q1}, [r2]!
407    vld2.16 {q2, q3}, [r2]!
408    vld2.16 {q8, q9}, [r7]!
409    vld2.16 {q10, q11}, [r7]!
410
411    vpaddl.u8 q0, q0            //sum the two bytes of each kept halfword, upper row
412    vpaddl.u8 q2, q2
413    vpaddl.u8 q8, q8            //same for the lower row
414    vpaddl.u8 q10, q10
415    vrshr.u16 q0, #1            //rounded halving of every horizontal sum
416    vrshr.u16 q2, #1
417    vrshr.u16 q8, #1
418    vrshr.u16 q10, #1
419
420    vrhadd.u16 q0, q8           //rounded average of the two rows
421    vrhadd.u16 q2, q10
422    vmovn.u16 d0, q0            //narrow the 16 results to bytes
423    vmovn.u16 d1, q2
424    vst1.8 {q0}, [r0]!
425
    //When lr >= src_width (carry set) step to the next row group; every
    //instruction below except the final branch only runs in that case.
426    add lr, #64
427    cmp lr, r4
428    movcs lr, #0
429    addcs r6, r6, r3, lsl #2    //source advances four rows
430    movcs r2, r6
431    addcs r7, r2, r3
432    addcs r8, r1                //destination advances one row
433    movcs r0, r8
434    subscs r5, #1               //Z is set only once the last row is finished
435    bne comp_ds_bilinear_quarter_loop0
436
437    //Restore the saved tail; r0 now equals dst + dst_stride * rows
438    vst1.32 {q15}, [r0]
439
440    ldmia sp!, {r4-r8,lr}
441WELS_ASM_FUNC_END
442
443#endif
444