// (removed: HTML code-viewer navigation chrome captured during extraction)
/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"
36WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearDownsampler_AArch64_neon
37
38    //Initialize the register
39    mov x6, x2
40    mov x8, x0
41    mov w9, #0
42    lsr w5, w5, #1
43
44    //Save the tailer   for the unasigned   size
45    smaddl  x7, w1, w5, x0
46    ld1 {v4.16b}, [x7]
47
48    add x7, x2, w3, sxtw
49    //processing a colume   data
50comp_ds_bilinear_loop0:
51
52    ld1     {v0.16b, v1.16b}, [x2], #32
53    ld1     {v2.16b, v3.16b}, [x7], #32
54    uzp1    v4.16b, v0.16b, v1.16b
55    uzp2    v5.16b, v0.16b, v1.16b
56    uzp1    v6.16b, v2.16b, v3.16b
57    uzp2    v7.16b, v2.16b, v3.16b
58    urhadd  v0.16b, v4.16b, v5.16b
59    urhadd  v1.16b, v6.16b, v7.16b
60    urhadd  v2.16b, v0.16b, v1.16b
61    st1     {v2.16b}, [x0], #16
62    add     w9, w9, #32
63
64    cmp     w9, w4
65    b.cc    comp_ds_bilinear_loop0
66
67    mov     w9, #0
68    add     x6, x6, w3, sxtw #1
69    mov     x2, x6
70    add     x7, x2, w3, sxtw
71    add     x8, x8, w1, sxtw
72    mov     x0, x8
73    sub     w5, w5, #1
74
75    cbnz    w5, comp_ds_bilinear_loop0
76
77    //restore   the tailer for the unasigned size
78    st1     {v4.16b}, [x0]
79
80WELS_ASM_AARCH64_FUNC_END
82WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_AArch64_neon
83    sub     w9, w3, w4
84    sub     w1, w1, w4, lsr #1
85    lsr     w5, w5, #1
86
87    //processing a colume   data
88comp_ds_bilinear_w_x32_loop0:
89
90    lsr     w6, w4, #5
91    add     x7, x2, w3, sxtw
92    //processing a line data
93comp_ds_bilinear_w_x32_loop1:
94
95    ld1     {v0.16b, v1.16b}, [x2], #32
96    ld1     {v2.16b, v3.16b}, [x7], #32
97    uzp1    v4.16b, v0.16b, v1.16b
98    uzp2    v5.16b, v0.16b, v1.16b
99    uzp1    v6.16b, v2.16b, v3.16b
100    uzp2    v7.16b, v2.16b, v3.16b
101    urhadd  v0.16b, v4.16b, v5.16b
102    urhadd  v1.16b, v6.16b, v7.16b
103    urhadd  v2.16b, v0.16b, v1.16b
104    st1     {v2.16b}, [x0], #16
105
106    sub     w6, w6, #1
107    cbnz    w6, comp_ds_bilinear_w_x32_loop1
108
109    add     x2, x7, w9, sxtw
110    add     x0, x0, w1, sxtw
111    sub     w5, w5, #1
112    cbnz    w5, comp_ds_bilinear_w_x32_loop0
113WELS_ASM_AARCH64_FUNC_END
115WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearOneThirdDownsampler_AArch64_neon
116
117    //Initialize the register
118    mov x6, x2
119    mov x8, x0
120    mov w9, #0
121
122    //Save the tailer   for the unasigned   size
123    smaddl  x7, w1, w5, x0
124    ld1 {v16.16b}, [x7]
125
126    add x7, x2, w3, sxtw
127    //processing a colume   data
128comp_ds_bilinear_onethird_loop0:
129
130    ld3     {v0.16b, v1.16b, v2.16b}, [x2], #48
131    ld3     {v4.16b, v5.16b, v6.16b}, [x7], #48
132
133    uaddl   v2.8h, v0.8b, v1.8b
134    uaddl2  v3.8h, v0.16b, v1.16b
135    uaddl   v6.8h, v4.8b, v5.8b
136    uaddl2  v7.8h, v4.16b, v5.16b
137    urshr   v2.8h, v2.8h, #1
138    urshr   v3.8h, v3.8h, #1
139    urshr   v6.8h, v6.8h, #1
140    urshr   v7.8h, v7.8h, #1
141
142    urhadd  v0.8h, v2.8h, v6.8h
143    urhadd  v1.8h, v3.8h, v7.8h
144    xtn     v0.8b, v0.8h
145    xtn     v1.8b, v1.8h
146    st1     {v0.8b,v1.8b}, [x0], #16
147
148    add     w9, w9, #48
149
150    cmp     w9, w4
151    b.cc    comp_ds_bilinear_onethird_loop0
152
153    mov     w9, #0
154    add     x6, x6, w3, sxtw #1
155    add     x6, x6, w3, sxtw
156    mov     x2, x6
157    add     x7, x2, w3, sxtw
158    add     x8, x8, w1, sxtw
159    mov     x0, x8
160    sub     w5, w5, #1
161
162    cbnz    w5, comp_ds_bilinear_onethird_loop0
163
164    //restore   the tailer for the unasigned size
165    st1     {v16.16b}, [x0]
166WELS_ASM_AARCH64_FUNC_END
167//void DyadicBilinearQuarterDownsampler_AArch64_neon(uint8_t* pDst, const int32_t kiDstStride,
168//uint8_t* pSrc, const int32_t kiSrcStride,
169//const int32_t kiSrcWidth, const int32_t kiHeight);
170
171WELS_ASM_AARCH64_FUNC_BEGIN DyadicBilinearQuarterDownsampler_AArch64_neon
172    //Initialize the register
173    mov x6, x2
174    mov x8, x0
175    mov w9, #0
176    lsr w5, w5, #2
177
178    //Save the tailer   for the unasigned   size
179    smaddl  x7, w1, w5, x0
180    ld1 {v16.16b}, [x7]
181
182    add x7, x2, w3, sxtw
183    //processing a colume   data
184comp_ds_bilinear_quarter_loop0:
185
186    ld2     {v0.8h, v1.8h}, [x2], #32
187    ld2     {v2.8h, v3.8h}, [x2], #32
188    ld2     {v4.8h, v5.8h}, [x7], #32
189    ld2     {v6.8h, v7.8h}, [x7], #32
190
191    uaddlp  v0.8h, v0.16b
192    uaddlp  v1.8h, v2.16b
193    uaddlp  v4.8h, v4.16b
194    uaddlp  v5.8h, v6.16b
195    urshr   v0.8h, v0.8h, #1
196    urshr   v1.8h, v1.8h, #1
197    urshr   v4.8h, v4.8h, #1
198    urshr   v5.8h, v5.8h, #1
199
200    urhadd  v0.8h, v0.8h, v4.8h
201    urhadd  v1.8h, v1.8h, v5.8h
202    xtn     v0.8b, v0.8h
203    xtn     v1.8b, v1.8h
204    st1     {v0.8b,v1.8b}, [x0], #16
205
206    add     w9, w9, #64
207
208    cmp     w9, w4
209    b.cc    comp_ds_bilinear_quarter_loop0
210
211    mov     w9, #0
212    add     x6, x6, w3, sxtw #2
213    mov     x2, x6
214    add     x7, x2, w3, sxtw
215    add     x8, x8, w1, sxtw
216    mov     x0, x8
217    sub     w5, w5, #1
218
219    cbnz    w5, comp_ds_bilinear_quarter_loop0
220
221    //restore   the tailer for the unasigned size
222    st1     {v16.16b}, [x0]
223WELS_ASM_AARCH64_FUNC_END
225//void GeneralBilinearAccurateDownsampler_AArch64_neon (uint8_t* pDst, const int32_t kiDstStride,
226//    const int32_t kiDstWidth, const int32_t kiDstHeight,
227//   uint8_t* pSrc, const int32_t kiSrcStride, const uint32_t kuiScaleX, const uint32_t kuiScaleY);
228WELS_ASM_AARCH64_FUNC_BEGIN GeneralBilinearAccurateDownsampler_AArch64_neon
229    mov     w10, #32767
230    and     w8, w6, w10
231    mov     w11, #-1
232    mul     w12, w11, w8
233
234    dup     v2.4h, w8
235    dup     v0.4h, w12
236    zip1    v0.4h, v0.4h, v2.4h     // uinc -uinc uinc -uinc
237
238    and     w9, w7, w10
239    mul     w12, w11, w9
240
241    dup     v2.4h, w9
242    dup     v5.4h, w12
243    ins     v5.s[1], v2.s[0]        // vinc vinc -vinc -vinc
244
245    mov     w11, #0x40000000
246    mov     w12, #0x3FFF
247    add     w11, w11, w12
248    dup     v1.2s, w11              //init u  16384 16383 16384 16383
249
250    mov     w8, #16384
251    dup     v7.4h, w8
252    sub     w11, w8, #1
253    dup     v2.4h, w11
254    ins     v7.s[0], v2.s[0]        //init v  16384 16384 16383 16383
255
256    eor     v26.16b, v26.16b, v26.16b
257    eor     v27.16b, v27.16b, v27.16b
258    SIGN_EXTENSION x1, w1
259    SIGN_EXTENSION x2, w2
260    SIGN_EXTENSION x3, w3
261    SIGN_EXTENSION x5, w5
262    SIGN_EXTENSION x6, w6
263    SIGN_EXTENSION x7, w7
264
265    sub     x1, x1, x2
266    sub     x3, x3, #1
267
268_HEIGHT:
269    lsr     w11, w8, #15
270    mul     w11, w11, w5
271    add     x15, x4, w11, sxtw
272    add     x12, x15, w5, sxtw
273
274    mov     x9, #16384
275    sub     x10, x2, #1
276    orr     v6.8b, v1.8b, v1.8b
277
278_WIDTH:
279    lsr     x13, x9, #15
280    add     x14, x15, x13
281    ld2     {v26.b, v27.b}[0], [x14]  //q14: 0000000b0000000a;
282    add     x14, x12, x13
283    ld2     {v26.b, v27.b}[4], [x14]  //q14: 000d000b000c000a;
284    zip1    v28.2s, v26.2s, v27.2s
285    zip2    v29.2s, v26.2s, v27.2s
286
287    umull   v20.4s, v6.4h, v7.4h
288    umull   v21.2d, v28.2s, v20.2s
289    ins     v20.d[0], v20.d[1]
290    umlal   v21.2d, v29.2s, v20.2s
291
292    addp    d21, v21.2d
293    urshr   d21, d21, #30
294
295    st1     {v21.b}[0], [x0], #1
296    add     x9, x9, x6
297    add     v6.4h, v6.4h, v0.4h
298    shl     v6.4h, v6.4h, #1
299    ushr    v6.4h, v6.4h, #1
300    sub     x10, x10, #1
301    cbnz    x10, _WIDTH
302
303WIDTH_END:
304    lsr     x9, x9, #15
305    add     x14, x15, x9
306    ld1     {v21.b}[0], [x14]
307    st1     {v21.b}[0], [x0], #1
308    add     w8, w8, w7
309    add     x0, x0, x1
310    add     v7.4h, v7.4h, v5.4h
311    shl     v7.4h, v7.4h, #1
312    ushr    v7.4h, v7.4h, #1
313    sub     x3, x3, #1
314    cbnz    x3, _HEIGHT
315
316LAST_ROW:
317    lsr     w8, w8, #15
318    mul     w8, w8, w5
319    add     x4, x4, w8, sxtw
320    mov     x9, #16384
321
322_LAST_ROW_WIDTH:
323    mov     x11, x9
324    lsr     x11, x11, #15
325    add     x3, x4, x11
326    ld1     {v21.b}[0], [x3]
327    st1     {v21.b}[0], [x0], #1
328    add     x9, x9, x6
329    sub     x2, x2, #1
330    cbnz    x2, _LAST_ROW_WIDTH
331
332WELS_ASM_AARCH64_FUNC_END

#endif