• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*!
2 * \copy
3 *     Copyright (c)  2013, Cisco Systems
4 *     All rights reserved.
5 *
6 *     Redistribution and use in source and binary forms, with or without
7 *     modification, are permitted provided that the following conditions
8 *     are met:
9 *
10 *        * Redistributions of source code must retain the above copyright
11 *          notice, this list of conditions and the following disclaimer.
12 *
13 *        * Redistributions in binary form must reproduce the above copyright
14 *          notice, this list of conditions and the following disclaimer in
15 *          the documentation and/or other materials provided with the
16 *          distribution.
17 *
18 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 *     POSSIBILITY OF SUCH DAMAGE.
30 *
31 */
32
33#ifdef HAVE_NEON_AARCH64
34#include "arm_arch64_common_macro.S"
//int32_t SumOf8x8SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
//
// Returns the sum of the 64 bytes of the 8x8 block whose top-left byte is at
// pRef and whose rows are kiRefStride bytes apart.
//   In:  x0 = pRef, w1 = kiRefStride (widened to x1 by SIGN_EXTENSION)
//   Out: w0 = sum (at most 64 * 255, upper half of x0 is zero)
//   Scratch: v0, v1
WELS_ASM_AARCH64_FUNC_BEGIN SumOf8x8SingleBlock_AArch64_neon
    SIGN_EXTENSION x1,w1

    // Rows 0 and 1 seed the eight 16-bit partial sums in v0.
    ld1     {v0.d}[0], [x0], x1
    ld1     {v0.d}[1], [x0], x1
    uaddlp  v0.8h, v0.16b

    // Rows 2-5: two rows per vector, pairwise-added into the partial sums.
.rept 2
    ld1     {v1.d}[0], [x0], x1
    ld1     {v1.d}[1], [x0], x1
    uadalp  v0.8h, v1.16b
.endr

    // Rows 6 and 7; the pointer is not advanced after the last row.
    ld1     {v1.d}[0], [x0], x1
    ld1     {v1.d}[1], [x0]
    uadalp  v0.8h, v1.16b

    // Fold the eight partial sums into one 32-bit scalar. The scalar write
    // clears the rest of v0, so moving s0 gives a zero-extended result.
    uaddlv  s0, v0.8h
    fmov    w0, s0
WELS_ASM_AARCH64_FUNC_END
53
//int32_t SumOf16x16SingleBlock_AArch64_neon (uint8_t* pRef, const int32_t kiRefStride);
//
// Returns the sum of the 256 bytes of the 16x16 block whose top-left byte is
// at pRef and whose rows are kiRefStride bytes apart.
//   In:  x0 = pRef, w1 = kiRefStride (widened to x1 by SIGN_EXTENSION)
//   Out: w0 = sum (at most 256 * 255, upper half of x0 is zero)
//   Scratch: v0, v1
WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16SingleBlock_AArch64_neon
    SIGN_EXTENSION x1,w1

    // v0.8h holds eight running 16-bit partial sums. Each lane receives two
    // bytes per row, so 16 rows stay below 16 * 2 * 255 and cannot overflow.
    movi    v0.16b, #0
.rept 16
    ld1     {v1.16b}, [x0], x1
    uadalp  v0.8h, v1.16b
.endr

    // Fold the partial sums. The scalar write clears the rest of v0, so
    // moving s0 gives a zero-extended result.
    uaddlv  s0, v0.8h
    fmov    w0, s0
WELS_ASM_AARCH64_FUNC_END
66
//void SumOf8x8BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
//                                        const int32_t kiRefStride,
//                                        uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
//
// For every position (x, y) with 0 <= x < kiWidth and 0 <= y < kiHeight:
//   sum = byte sum of the 8x8 block at pRefPicture + y * kiRefStride + x
//   pFeatureOfBlock[y * kiWidth + x] = sum
//   pTimesOfFeatureValue[sum] += 1
// Row 0 is summed directly. Every later row derives its sum from the value
// stored one feature row above: the 8-byte picture row that left the window is
// subtracted and the 8-byte picture row that entered it is added.
// Returns without touching memory when kiWidth or kiHeight is not positive.
//
// Register roles after setup:
//   x2 = rows left                  x3 = kiRefStride
//   x4 = one past the end of the current feature row
//   x5 = pTimesOfFeatureValue
//   x6 = kiWidth                    x7 = columns left in the current row
//   x8 = start of the current picture row + kiWidth
// Scratch: x0, x1, x9-x12, v0-v3
WELS_ASM_AARCH64_FUNC_BEGIN SumOf8x8BlockOfFrame_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x2,w2
    SIGN_EXTENSION x3,w3

    // The loops below test their counters only after the first pass, so an
    // empty frame has to be rejected up front.
    cmp     x1, #0
    b.le    _SumOf8x8BlockOfFrame_AArch64_neon_end
    cmp     x2, #0
    b.le    _SumOf8x8BlockOfFrame_AArch64_neon_end

    mov     x6, x1
    add     x8, x0, x6
    add     x4, x4, x6, lsl #1

    // ---- Row 0: full 8x8 sum for every column ----
    mov     x7, x6
_width_loop8x8_1:
    sub     x0, x8, x7                  // top-left byte of this block
    ld1     {v0.d}[0], [x0], x3
    ld1     {v0.d}[1], [x0], x3
    ld1     {v1.d}[0], [x0], x3
    ld1     {v1.d}[1], [x0], x3
    ld1     {v2.d}[0], [x0], x3
    ld1     {v2.d}[1], [x0], x3
    ld1     {v3.d}[0], [x0], x3
    ld1     {v3.d}[1], [x0]
    uaddlp  v0.8h, v0.16b
    uadalp  v0.8h, v1.16b
    uadalp  v0.8h, v2.16b
    uadalp  v0.8h, v3.16b
    uaddlv  s0, v0.8h                   // s0 = sum, rest of v0 cleared

    sub     x1, x4, x7, lsl #1
    st1     {v0.h}[0], [x1]             // sum -> pFeatureOfBlock[i]
    fmov    w0, s0                      // x0 = zero-extended sum
    ldr     w9, [x5, x0, lsl #2]
    add     w9, w9, #1
    str     w9, [x5, x0, lsl #2]        // pTimesOfFeatureValue[sum]++
    subs    x7, x7, #1
    b.ne    _width_loop8x8_1

    add     x8, x8, x3
    add     x4, x4, x6, lsl #1
    subs    x2, x2, #1
    b.eq    _SumOf8x8BlockOfFrame_AArch64_neon_end

    // ---- Rows 1 and later: update the sum from the feature row above ----
_height_loop8x8:
    mov     x7, x6
_width_loop8x8_2:
    sub     x0, x8, x7                  // top-left byte of this block
    sub     x1, x4, x7, lsl #1          // feature slot of this block
    sub     x9, x1, x6, lsl #1          // same column, one feature row above
    ldrh    w10, [x9]                   // sum of the block one row above

    sub     x11, x0, x3                 // picture row that left the window
    ld1     {v1.8b}, [x11]
    add     x0, x11, x3, lsl #3         // picture row that entered: 8 rows down
    ld1     {v0.8b}, [x0]

    uaddlv  h0, v0.8b
    uaddlv  h1, v1.8b
    umov    w11, v0.h[0]                // bytes entering
    umov    w12, v1.h[0]                // bytes leaving

    sub     w10, w10, w12
    add     w0, w10, w11                // writing w0 also clears the top of x0
    strh    w0, [x1]                    // sum -> pFeatureOfBlock[i]
    ldr     w9, [x5, x0, lsl #2]
    add     w9, w9, #1
    str     w9, [x5, x0, lsl #2]        // pTimesOfFeatureValue[sum]++
    subs    x7, x7, #1
    b.ne    _width_loop8x8_2

    add     x8, x8, x3
    add     x4, x4, x6, lsl #1
    subs    x2, x2, #1
    b.ne    _height_loop8x8
_SumOf8x8BlockOfFrame_AArch64_neon_end:
WELS_ASM_AARCH64_FUNC_END
153
//void SumOf16x16BlockOfFrame_AArch64_neon (uint8_t* pRefPicture, const int32_t kiWidth, const int32_t kiHeight,
//                                          const int32_t kiRefStride,
//                                          uint16_t* pFeatureOfBlock, uint32_t pTimesOfFeatureValue[]);
//
// For every position (x, y) with 0 <= x < kiWidth and 0 <= y < kiHeight:
//   sum = byte sum of the 16x16 block at pRefPicture + y * kiRefStride + x
//   pFeatureOfBlock[y * kiWidth + x] = sum
//   pTimesOfFeatureValue[sum] += 1
// Row 0 is summed directly. Every later row derives its sum from the value
// stored one feature row above: the 16-byte picture row that left the window
// is subtracted and the 16-byte picture row that entered it is added.
// Returns without touching memory when kiWidth or kiHeight is not positive.
//
// Register roles after setup:
//   x2 = rows left                  x3 = kiRefStride
//   x4 = one past the end of the current feature row
//   x5 = pTimesOfFeatureValue
//   x6 = kiWidth                    x7 = columns left in the current row
//   x8 = start of the current picture row + kiWidth
// Scratch: x0, x1, x9-x12, v0, v1
WELS_ASM_AARCH64_FUNC_BEGIN SumOf16x16BlockOfFrame_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x2,w2
    SIGN_EXTENSION x3,w3

    // The loops below test their counters only after the first pass, so an
    // empty frame has to be rejected up front.
    cmp     x1, #0
    b.le    _SumOf16x16BlockOfFrame_AArch64_neon_end
    cmp     x2, #0
    b.le    _SumOf16x16BlockOfFrame_AArch64_neon_end

    mov     x6, x1
    add     x8, x0, x6
    add     x4, x4, x6, lsl #1

    // ---- Row 0: full 16x16 sum for every column ----
    mov     x7, x6
_width_loop16x16_1:
    sub     x0, x8, x7                  // top-left byte of this block
    ld1     {v0.16b}, [x0], x3
    uaddlp  v0.8h, v0.16b
.rept 15
    ld1     {v1.16b}, [x0], x3
    uadalp  v0.8h, v1.16b
.endr
    uaddlv  s0, v0.8h                   // s0 = sum, rest of v0 cleared

    sub     x1, x4, x7, lsl #1
    st1     {v0.h}[0], [x1]             // sum -> pFeatureOfBlock[i]
    fmov    w0, s0                      // x0 = zero-extended sum
    ldr     w9, [x5, x0, lsl #2]
    add     w9, w9, #1
    str     w9, [x5, x0, lsl #2]        // pTimesOfFeatureValue[sum]++
    subs    x7, x7, #1
    b.ne    _width_loop16x16_1

    add     x8, x8, x3
    add     x4, x4, x6, lsl #1
    subs    x2, x2, #1
    b.eq    _SumOf16x16BlockOfFrame_AArch64_neon_end

    // ---- Rows 1 and later: update the sum from the feature row above ----
_height_loop16x16:
    mov     x7, x6
_width_loop16x16_2:
    sub     x0, x8, x7                  // top-left byte of this block
    sub     x1, x4, x7, lsl #1          // feature slot of this block
    sub     x9, x1, x6, lsl #1          // same column, one feature row above
    ldrh    w10, [x9]                   // sum of the block one row above

    sub     x11, x0, x3                 // picture row that left the window
    ld1     {v1.16b}, [x11]
    add     x0, x11, x3, lsl #4         // picture row that entered: 16 rows down
    ld1     {v0.16b}, [x0]

    uaddlv  h0, v0.16b
    uaddlv  h1, v1.16b
    umov    w11, v0.h[0]                // bytes entering
    umov    w12, v1.h[0]                // bytes leaving

    sub     w10, w10, w12
    add     w0, w10, w11                // writing w0 also clears the top of x0
    strh    w0, [x1]                    // sum -> pFeatureOfBlock[i]
    ldr     w9, [x5, x0, lsl #2]
    add     w9, w9, #1
    str     w9, [x5, x0, lsl #2]        // pTimesOfFeatureValue[sum]++
    subs    x7, x7, #1
    b.ne    _width_loop16x16_2

    add     x8, x8, x3
    add     x4, x4, x6, lsl #1
    subs    x2, x2, #1
    b.ne    _height_loop16x16
_SumOf16x16BlockOfFrame_AArch64_neon_end:
WELS_ASM_AARCH64_FUNC_END
231
// void InitializeHashforFeature_AArch64_neon (uint32_t* pTimesOfFeatureValue, uint16_t* pBuf, const int32_t kiListSize,
//                                             uint16_t** pLocationOfFeature, uint16_t** pFeatureValuePointerList);
//
// Walks the kiListSize counters in pTimesOfFeatureValue. For entry i the
// current buffer position (starting at pBuf) is stored into both
// pLocationOfFeature[i] and pFeatureValuePointerList[i], then the position is
// advanced by the low 32 bits of (pTimesOfFeatureValue[i] << 2) bytes.
// Entries are handled four at a time, followed by a scalar loop for the last
// kiListSize % 4 entries. Nothing is read or written when kiListSize is not
// positive.
//
// Register roles:
//   x0 = counter cursor            x1 = running buffer position
//   x2 = kiListSize                x3 = pLocationOfFeature cursor
//   x4 = pFeatureValuePointerList cursor
//   x5 = groups / entries left
// Scratch: x7, x8, v0-v3
WELS_ASM_AARCH64_FUNC_BEGIN InitializeHashforFeature_AArch64_neon
    SIGN_EXTENSION x2,w2
    cmp     x2, #0
    b.le    _hash_assign_end

    // The vector loop tests its counter only after a full pass, so it must
    // not be entered when fewer than four entries exist.
    lsr     x5, x2, #2                  // whole groups of four entries
    cbz     x5, _hash_assign_tail

_hash_assign_loop_x4:
    ld1     {v0.16b}, [x0], #16         // four counters
    shl     v0.4s, v0.4s, #2            // byte advance contributed by each entry
    // The largest lane is zero exactly when all four lanes are zero, which a
    // wrapping lane sum cannot guarantee.
    umaxv   s1, v0.4s
    fmov    w7, s1
    cbz     w7, _hash_assign_with_copy_x4

    // General case: each entry gets the position reached so far.
    ins     v2.d[0], x1
    umov    w8, v0.s[0]
    add     x1, x1, x8
    ins     v2.d[1], x1
    umov    w8, v0.s[1]
    add     x1, x1, x8
    ins     v3.d[0], x1
    umov    w8, v0.s[2]
    add     x1, x1, x8
    ins     v3.d[1], x1
    umov    w8, v0.s[3]
    add     x1, x1, x8
    b       _hash_assign_store_x4

_hash_assign_with_copy_x4:
    // All four advances are zero: the four entries share one position.
    dup     v2.2d, x1
    dup     v3.2d, x1

_hash_assign_store_x4:
    st1     {v2.16b, v3.16b}, [x3], #32
    st1     {v2.16b, v3.16b}, [x4], #32
    subs    x5, x5, #1
    b.ne    _hash_assign_loop_x4

_hash_assign_tail:
    ands    x5, x2, #3                  // entries left over after the groups
    b.eq    _hash_assign_end

_hash_assign_loop_x4_rem:
    str     x1, [x3], #8
    str     x1, [x4], #8
    ldr     w8, [x0], #4
    lsl     w8, w8, #2                  // 32-bit shift, same as the vector path
    add     x1, x1, x8
    subs    x5, x5, #1
    b.ne    _hash_assign_loop_x4_rem

_hash_assign_end:
WELS_ASM_AARCH64_FUNC_END
285
// Constant tables read by FillQpelLocationByFeatureValue_AArch64_neon with
// 128-bit literal loads. Each table is eight halfwords (16 bytes) and only the
// low four lanes carry values.
//   mv_x_inc_x4    : amount added to the four x terms after each group of four columns
//   mv_y_inc_x4    : amount added to the four y terms after each row
//   mx_x_offset_x4 : x terms of the first four columns of a row
.p2align 4
mv_x_inc_x4:    .short 16, 16, 16, 16, 0, 0, 0, 0
mv_y_inc_x4:    .short  4,  4,  4,  4, 0, 0, 0, 0
mx_x_offset_x4: .short  0,  4,  8, 12, 0, 0, 0, 0
290
// void FillQpelLocationByFeatureValue_AArch64_neon (uint16_t* pFeatureOfBlock, const int32_t kiWidth,
//                                                   const int32_t kiHeight, uint16_t** pFeatureValuePointerList);
//
// Visits the kiWidth * kiHeight feature values in pFeatureOfBlock in row order.
// For the value f found at column x of row y:
//   p = pFeatureValuePointerList[f]
//   store the 32-bit word ((4 * y) << 16) | (4 * x) at p
//   pFeatureValuePointerList[f] = p + 4 bytes
// The x and y terms are kept in 16-bit lanes (presumably quarter-pel units,
// going by the function name - confirm with the caller).
// Columns are handled four at a time; a scalar loop covers the last
// kiWidth % 4 columns of each row. Nothing is read or written when kiWidth or
// kiHeight is not positive.
//
// Register roles after setup:
//   x0 = feature cursor             x2 = rows left
//   x3 = pFeatureValuePointerList   x7 = groups / columns left in this row
//   x8 = kiWidth / 4                x9 = kiWidth % 4
//   v2 = x terms of the next four columns (low four halfwords)
//   v3 = y term of the current row (low four halfwords)
//   v5, v6, v7 = mx_x_offset_x4, mv_y_inc_x4, mv_x_inc_x4
//   v16 = pFeatureValuePointerList in both 64-bit lanes
// Scratch: x4-x6, v0, v1, v17, v18
WELS_ASM_AARCH64_FUNC_BEGIN FillQpelLocationByFeatureValue_AArch64_neon
    ldr     q7, mv_x_inc_x4
    ldr     q6, mv_y_inc_x4
    ldr     q5, mx_x_offset_x4
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x2,w2

    // The loops below test their counters only after the first pass, so an
    // empty area has to be rejected up front.
    cmp     x1, #0
    b.le    _hash_fill_end
    cmp     x2, #0
    b.le    _hash_fill_end

    movi    v3.16b, #0                  // y term starts at zero
    dup     v16.2d, x3
    lsr     x8, x1, #2
    and     x9, x1, #3

_hash_height_loop:
    mov     v2.16b, v5.16b              // x terms restart at 0, 4, 8, 12
    mov     x7, x8
    cbz     x7, _hash_width_tail

_hash_width_loop:
    ld1     {v0.d}[0], [x0], #8         // four feature values
    ushll   v0.4s, v0.4h, #3            // times 8: byte offset of each list slot
    uaddw   v17.2d, v16.2d, v0.2s       // slot addresses for columns 0 and 1
    uaddw2  v18.2d, v16.2d, v0.4s       // slot addresses for columns 2 and 3
    zip1    v1.8h, v2.8h, v3.8h         // four words: x in the low half, y in the high half

    // The four columns are processed one after another on purpose: equal
    // feature values within a group share a slot and must see each update.
    umov    x4, v17.d[0]
    ldr     x5, [x4]
    umov    w6, v1.s[0]
    str     w6, [x5], #4
    str     x5, [x4]

    umov    x4, v17.d[1]
    ldr     x5, [x4]
    umov    w6, v1.s[1]
    str     w6, [x5], #4
    str     x5, [x4]

    umov    x4, v18.d[0]
    ldr     x5, [x4]
    umov    w6, v1.s[2]
    str     w6, [x5], #4
    str     x5, [x4]

    umov    x4, v18.d[1]
    ldr     x5, [x4]
    umov    w6, v1.s[3]
    str     w6, [x5], #4
    str     x5, [x4]

    add     v2.8h, v2.8h, v7.8h         // x terms of the next group
    subs    x7, x7, #1
    b.ne    _hash_width_loop

_hash_width_tail:
    cbz     x9, _hash_row_done
    // Lane 0 of v2 is the x term of the first leftover column.
    zip1    v1.8h, v2.8h, v3.8h
    umov    w6, v1.s[0]
    mov     x7, x9
_hash_width_tail_loop:
    ldrh    w4, [x0], #2                // one feature value, zero-extended
    add     x4, x3, x4, lsl #3          // address of its list slot
    ldr     x5, [x4]
    str     w6, [x5], #4
    str     x5, [x4]
    add     w6, w6, #4                  // x term of the next column
    subs    x7, x7, #1
    b.ne    _hash_width_tail_loop

_hash_row_done:
    add     v3.8h, v3.8h, v6.8h         // y term of the next row
    subs    x2, x2, #1
    b.ne    _hash_height_loop
_hash_fill_end:
WELS_ASM_AARCH64_FUNC_END
350#endif
351