/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

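// SAD (sum of absolute differences) and SATD (Hadamard-transformed SAD)
// sampling routines for AArch64 NEON (WelsSampleSad*, WelsSampleSadFour*,
// WelsSampleSatd*).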
#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"

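// Reduce the eight halfword partial sums in v2.8h to a single 32-bit SAD
// and move it into the return register w0.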
.macro CALC_AND_STORE_SAD
    saddlv  s2, v2.8h
    fmov    w0, s2
.endm

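// Reduce the four accumulators v28-v31 to 32-bit sums and store them as
// four consecutive words at the address in x4.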
.macro CALC_AND_STORE_SAD_FOUR
    saddlv  s28, v28.8h
    saddlv  s29, v29.8h
    saddlv  s30, v30.8h
    saddlv  s31, v31.8h
    st4     {v28.s, v29.s, v30.s, v31.s}[0], [x4]
.endm

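// LOAD_8X8_1/LOAD_16X8_1 load eight rows of the current block (base x0,
// stride x1) into v0-v7; LOAD_8X8_2/LOAD_16X8_2 load eight rows of a
// reference block (base \arg0, stride x3) into v16-v23.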
.macro LOAD_8X8_1
    ld1     {v0.8b}, [x0], x1
    ld1     {v1.8b}, [x0], x1
    ld1     {v2.8b}, [x0], x1
    ld1     {v3.8b}, [x0], x1
    ld1     {v4.8b}, [x0], x1
    ld1     {v5.8b}, [x0], x1
    ld1     {v6.8b}, [x0], x1
    ld1     {v7.8b}, [x0], x1
.endm

.macro LOAD_16X8_1
    ld1     {v0.16b}, [x0], x1
    ld1     {v1.16b}, [x0], x1
    ld1     {v2.16b}, [x0], x1
    ld1     {v3.16b}, [x0], x1
    ld1     {v4.16b}, [x0], x1
    ld1     {v5.16b}, [x0], x1
    ld1     {v6.16b}, [x0], x1
    ld1     {v7.16b}, [x0], x1
.endm

.macro LOAD_8X8_2 arg0
    ld1     {v16.8b}, [\arg0], x3
    ld1     {v17.8b}, [\arg0], x3
    ld1     {v18.8b}, [\arg0], x3
    ld1     {v19.8b}, [\arg0], x3
    ld1     {v20.8b}, [\arg0], x3
    ld1     {v21.8b}, [\arg0], x3
    ld1     {v22.8b}, [\arg0], x3
    ld1     {v23.8b}, [\arg0], x3
.endm

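// Accumulate the SAD of the 8x8 block (v0-v7 vs v16-v23) into \arg0.
// \arg1 is 'd' or 'a': the first instruction expands to uabdl (start a new
// accumulator) or uabal (add to an existing one).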
.macro CALC_ABS_8X8_1 arg0, arg1
    uab\arg1\()l    \arg0, v0.8b, v16.8b
    uabal   \arg0, v1.8b, v17.8b
    uabal   \arg0, v2.8b, v18.8b
    uabal   \arg0, v3.8b, v19.8b
    uabal   \arg0, v4.8b, v20.8b
    uabal   \arg0, v5.8b, v21.8b
    uabal   \arg0, v6.8b, v22.8b
    uabal   \arg0, v7.8b, v23.8b
.endm

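// Same as CALC_ABS_8X8_1, but compares v0-v7 against the reference rows two
// lines further down (v18-v25), accumulating into v29; used for the 'below'
// search position.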
.macro CALC_ABS_8X8_2 arg0
    uab\arg0\()l    v29.8h, v0.8b, v18.8b
    uabal   v29.8h, v1.8b, v19.8b
    uabal   v29.8h, v2.8b, v20.8b
    uabal   v29.8h, v3.8b, v21.8b
    uabal   v29.8h, v4.8b, v22.8b
    uabal   v29.8h, v5.8b, v23.8b
    uabal   v29.8h, v6.8b, v24.8b
    uabal   v29.8h, v7.8b, v25.8b
.endm

.macro LOAD_16X8_2 arg0
    ld1     {v16.16b}, [\arg0], x3
    ld1     {v17.16b}, [\arg0], x3
    ld1     {v18.16b}, [\arg0], x3
    ld1     {v19.16b}, [\arg0], x3
    ld1     {v20.16b}, [\arg0], x3
    ld1     {v21.16b}, [\arg0], x3
    ld1     {v22.16b}, [\arg0], x3
    ld1     {v23.16b}, [\arg0], x3
.endm

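// 16-pixel-wide versions of CALC_ABS_8X8_1/CALC_ABS_8X8_2: each row adds
// both the low (uabal) and high (uabal2) halves of the 16-byte vectors.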
.macro CALC_ABS_16X8_1 arg0, arg1
    uab\arg1\()l  \arg0, v0.8b, v16.8b
    uabal2  \arg0, v0.16b,v16.16b
    uabal   \arg0, v1.8b, v17.8b
    uabal2  \arg0, v1.16b,v17.16b
    uabal   \arg0, v2.8b, v18.8b
    uabal2  \arg0, v2.16b,v18.16b
    uabal   \arg0, v3.8b, v19.8b
    uabal2  \arg0, v3.16b,v19.16b
    uabal   \arg0, v4.8b, v20.8b
    uabal2  \arg0, v4.16b,v20.16b
    uabal   \arg0, v5.8b, v21.8b
    uabal2  \arg0, v5.16b,v21.16b
    uabal   \arg0, v6.8b, v22.8b
    uabal2  \arg0, v6.16b,v22.16b
    uabal   \arg0, v7.8b, v23.8b
    uabal2  \arg0, v7.16b,v23.16b
.endm

.macro CALC_ABS_16X8_2 arg0
    uab\arg0\()l  v29.8h, v0.8b, v18.8b
    uabal2  v29.8h, v0.16b,v18.16b
    uabal   v29.8h, v1.8b, v19.8b
    uabal2  v29.8h, v1.16b,v19.16b
    uabal   v29.8h, v2.8b, v20.8b
    uabal2  v29.8h, v2.16b,v20.16b
    uabal   v29.8h, v3.8b, v21.8b
    uabal2  v29.8h, v3.16b,v21.16b
    uabal   v29.8h, v4.8b, v22.8b
    uabal2  v29.8h, v4.16b,v22.16b
    uabal   v29.8h, v5.8b, v23.8b
    uabal2  v29.8h, v5.16b,v23.16b
    uabal   v29.8h, v6.8b, v24.8b
    uabal2  v29.8h, v6.16b,v24.16b
    uabal   v29.8h, v7.8b, v25.8b
    uabal2  v29.8h, v7.16b,v25.16b
.endm

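// WelsSampleSad{4x4,8x8,8x16,16x8,16x16}: SAD between a current block
// (x0, stride w1) and a reference block (x2, stride w3), returned in w0.
// A sketch of the assumed C equivalent (names are illustrative):
//   int32_t sad = 0;
//   for (int i = 0; i < height; i++, pSample1 += iStride1, pSample2 += iStride2)
//     for (int j = 0; j < width; j++)
//       sad += abs (pSample1[j] - pSample2[j]);
//   return sad;
// The strides arrive as 32-bit ints, hence the sxtw before they are used as
// 64-bit address offsets.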
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad4x4_AArch64_neon
    sxtw    x1, w1
    sxtw    x3, w3
    ld1     {v0.s}[0], [x0], x1
    ld1     {v1.s}[0], [x2], x3
    uabdl   v2.8h, v0.8b, v1.8b
.rept 3
    ld1     {v0.s}[0], [x0], x1
    ld1     {v1.s}[0], [x2], x3
    uabal   v2.8h, v0.8b, v1.8b
.endr
    saddlv  s2, v2.4h
    fmov    w0, s2
WELS_ASM_AARCH64_FUNC_END

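// The larger block sizes below follow the same pattern: one uabdl to start
// the accumulator, then uabal (and uabal2 for 16-wide rows) for the
// remaining rows, reduced by CALC_AND_STORE_SAD.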
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad8x8_AArch64_neon
    sxtw    x1, w1
    sxtw    x3, w3
    ld1     {v0.8b}, [x0], x1
    ld1     {v1.8b}, [x2], x3
    uabdl   v2.8h, v0.8b, v1.8b
.rept 7
    ld1     {v0.8b}, [x0], x1
    ld1     {v1.8b}, [x2], x3
    uabal   v2.8h, v0.8b, v1.8b
.endr
    CALC_AND_STORE_SAD
WELS_ASM_AARCH64_FUNC_END

WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad8x16_AArch64_neon
    sxtw    x1, w1
    sxtw    x3, w3
    ld1     {v0.8b}, [x0], x1
    ld1     {v1.8b}, [x2], x3
    uabdl   v2.8h, v0.8b, v1.8b
.rept 15
    ld1     {v0.8b}, [x0], x1
    ld1     {v1.8b}, [x2], x3
    uabal   v2.8h, v0.8b, v1.8b
.endr
    CALC_AND_STORE_SAD
WELS_ASM_AARCH64_FUNC_END

WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad16x8_AArch64_neon
    sxtw    x1, w1
    sxtw    x3, w3
    ld1     {v0.16b}, [x0], x1
    ld1     {v1.16b}, [x2], x3
    uabdl   v2.8h, v0.8b, v1.8b
    uabal2  v2.8h, v0.16b, v1.16b
.rept 7
    ld1     {v0.16b}, [x0], x1
    ld1     {v1.16b}, [x2], x3
    uabal   v2.8h, v0.8b, v1.8b
    uabal2  v2.8h, v0.16b, v1.16b
.endr
    CALC_AND_STORE_SAD
WELS_ASM_AARCH64_FUNC_END

WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSad16x16_AArch64_neon
    sxtw    x1, w1
    sxtw    x3, w3
    ld1     {v0.16b}, [x0], x1
    ld1     {v1.16b}, [x2], x3
    uabdl   v2.8h, v0.8b, v1.8b
    uabal2  v2.8h, v0.16b, v1.16b
.rept 15
    ld1     {v0.16b}, [x0], x1
    ld1     {v1.16b}, [x2], x3
    uabal   v2.8h, v0.8b, v1.8b
    uabal2  v2.8h, v0.16b, v1.16b
.endr
    CALC_AND_STORE_SAD
WELS_ASM_AARCH64_FUNC_END

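// WelsSampleSadFour*: SADs of the current block (x0, stride w1) against four
// reference candidates around x2 - one line up, one line down, one pixel
// left, one pixel right - written as four 32-bit values to x4.
// A sketch of the assumed behaviour (names are illustrative):
//   pSad[0] = Sad (pSrc, iStrideSrc, pRef - iStrideRef, iStrideRef);
//   pSad[1] = Sad (pSrc, iStrideSrc, pRef + iStrideRef, iStrideRef);
//   pSad[2] = Sad (pSrc, iStrideSrc, pRef - 1,          iStrideRef);
//   pSad[3] = Sad (pSrc, iStrideSrc, pRef + 1,          iStrideRef);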
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSadFour4x4_AArch64_neon
    sxtw    x1, w1
    sxtw    x3, w3
    ld1     {v0.s}[0], [x0], x1
    ld1     {v0.s}[1], [x0], x1
    ld1     {v1.s}[0], [x0], x1
    ld1     {v1.s}[1], [x0]
    sub     x0, x2, x3
    ld1     {v2.s}[0], [x0], x3
    ld1     {v2.s}[1], [x0], x3
    ld1     {v3.s}[0], [x0], x3
    ld1     {v3.s}[1], [x0], x3
    ld1     {v4.s}[0], [x0], x3
    ld1     {v4.s}[1], [x0], x3

    uabdl   v28.8h, v0.8b, v2.8b
    uabal   v28.8h, v1.8b, v3.8b

    uabdl   v29.8h, v0.8b, v3.8b
    uabal   v29.8h, v1.8b, v4.8b

    sub     x0, x2, #1
    ld1     {v2.s}[0], [x0], x3
    ld1     {v2.s}[1], [x0], x3
    ld1     {v3.s}[0], [x0], x3
    ld1     {v3.s}[1], [x0]
    uabdl   v30.8h, v0.8b, v2.8b
    uabal   v30.8h, v1.8b, v3.8b

    add     x0, x2, #1
    ld1     {v2.s}[0], [x0], x3
    ld1     {v2.s}[1], [x0], x3
    ld1     {v3.s}[0], [x0], x3
    ld1     {v3.s}[1], [x0]
    uabdl   v31.8h, v0.8b, v2.8b
    uabal   v31.8h, v1.8b, v3.8b

    CALC_AND_STORE_SAD_FOUR
WELS_ASM_AARCH64_FUNC_END

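// 8x8 version: the 'd' suffix passed to CALC_ABS_8X8_1/_2 makes the first
// row use uabdl, so each of the four accumulators starts fresh here.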
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSadFour8x8_AArch64_neon
    sxtw    x1, w1
    sxtw    x3, w3
    LOAD_8X8_1
    sub     x0, x2, x3
    LOAD_8X8_2 x0
    ld1     {v24.8b}, [x0], x3
    ld1     {v25.8b}, [x0]

    CALC_ABS_8X8_1 v28.8h, d
    CALC_ABS_8X8_2 d

    sub     x0, x2, #1
    LOAD_8X8_2 x0
    CALC_ABS_8X8_1 v30.8h, d

    add     x0, x2, #1
    LOAD_8X8_2 x0
    CALC_ABS_8X8_1 v31.8h, d

    CALC_AND_STORE_SAD_FOUR
WELS_ASM_AARCH64_FUNC_END

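// 8x16 version: two 8-row passes. The first pass starts the accumulators
// with the 'd' (uabdl) form, the second adds to them with the 'a' (uabal)
// form; x5/x6/x7 keep the three reference base pointers across the passes.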
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSadFour8x16_AArch64_neon
    sxtw    x1, w1
    sxtw    x3, w3
    LOAD_8X8_1
    sub     x5, x2, x3
    LOAD_8X8_2 x5
    ld1     {v24.8b}, [x5], x3
    ld1     {v25.8b}, [x5], x3

    CALC_ABS_8X8_1 v28.8h, d
    CALC_ABS_8X8_2 d

    sub     x6, x2, #1
    LOAD_8X8_2 x6
    CALC_ABS_8X8_1 v30.8h, d

    add     x7, x2, #1
    LOAD_8X8_2 x7
    CALC_ABS_8X8_1 v31.8h, d

    LOAD_8X8_1
    sub     x5, x5, x3
    sub     x5, x5, x3
    LOAD_8X8_2 x5
    ld1     {v24.8b}, [x5], x3
    ld1     {v25.8b}, [x5]

    CALC_ABS_8X8_1 v28.8h, a
    CALC_ABS_8X8_2 a

    LOAD_8X8_2 x6
    CALC_ABS_8X8_1 v30.8h, a

    LOAD_8X8_2 x7
    CALC_ABS_8X8_1 v31.8h, a

    CALC_AND_STORE_SAD_FOUR
WELS_ASM_AARCH64_FUNC_END

WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSadFour16x8_AArch64_neon
    sxtw    x1, w1
    sxtw    x3, w3
    LOAD_16X8_1
    sub     x0, x2, x3
    LOAD_16X8_2 x0
    ld1     {v24.16b}, [x0], x3
    ld1     {v25.16b}, [x0]

    CALC_ABS_16X8_1 v28.8h, d
    CALC_ABS_16X8_2 d

    sub     x0, x2, #1
    LOAD_16X8_2 x0
    CALC_ABS_16X8_1 v30.8h, d

    add     x0, x2, #1
    LOAD_16X8_2 x0
    CALC_ABS_16X8_1 v31.8h, d

    CALC_AND_STORE_SAD_FOUR
WELS_ASM_AARCH64_FUNC_END

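// 16x16 version: same two-pass scheme as SadFour8x16, built on the 16-wide
// macros.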
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSadFour16x16_AArch64_neon
    sxtw    x1, w1
    sxtw    x3, w3

    LOAD_16X8_1
    sub     x5, x2, x3
    LOAD_16X8_2 x5
    ld1     {v24.16b}, [x5], x3
    ld1     {v25.16b}, [x5], x3

    CALC_ABS_16X8_1 v28.8h, d
    CALC_ABS_16X8_2 d

    sub     x6, x2, #1
    LOAD_16X8_2 x6
    CALC_ABS_16X8_1 v30.8h, d

    add     x7, x2, #1
    LOAD_16X8_2 x7
    CALC_ABS_16X8_1 v31.8h, d

    LOAD_16X8_1
    sub     x5, x5, x3
    sub     x5, x5, x3
    LOAD_16X8_2 x5
    ld1     {v24.16b}, [x5], x3
    ld1     {v25.16b}, [x5]

    CALC_ABS_16X8_1 v28.8h, a
    CALC_ABS_16X8_2 a

    LOAD_16X8_2 x6
    CALC_ABS_16X8_1 v30.8h, a

    LOAD_16X8_2 x7
    CALC_ABS_16X8_1 v31.8h, a

    CALC_AND_STORE_SAD_FOUR
WELS_ASM_AARCH64_FUNC_END

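// WelsSampleSatd4x4: 4x4 SATD. A 4-point Hadamard transform is applied
// vertically and horizontally to the difference block; the sum of absolute
// transform coefficients is rounded as (sum + 1) >> 1 and returned in w0.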
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSatd4x4_AArch64_neon
    sxtw    x1, w1
    sxtw    x3, w3
    ld1     {v0.s}[0], [x0], x1
    ld1     {v0.s}[1], [x0], x1
    ld1     {v1.s}[0], [x0], x1
    ld1     {v1.s}[1], [x0]

    ld1     {v2.s}[0], [x2], x3
    ld1     {v2.s}[1], [x2], x3
    ld1     {v3.s}[0], [x2], x3
    ld1     {v3.s}[1], [x2]
    usubl   v4.8h, v0.8b, v2.8b //{0,1,2,3,4,5,6,7}
    usubl   v5.8h, v1.8b, v3.8b //{8,9,10,11,12,13,14,15}

    //Do the vertical transform
    add     v6.8h, v4.8h, v5.8h //{0,4,8,12,1,5,9,13}
    sub     v7.8h, v4.8h, v5.8h //{2,6,10,14,3,7,11,15}
    mov     x4,      v6.d[1]
    mov     v6.d[1], v7.d[0]
    ins     v7.d[0], x4
    add     v4.8h, v6.8h, v7.8h
    sub     v5.8h, v6.8h, v7.8h

    //Do the horizontal transform
    trn1    v6.4s, v4.4s, v5.4s
    trn2    v7.4s, v4.4s, v5.4s
    add     v4.8h, v6.8h, v7.8h
    sub     v5.8h, v6.8h, v7.8h
    trn1    v6.8h, v4.8h, v5.8h
    trn2    v7.8h, v4.8h, v5.8h
    add     v4.8h, v6.8h, v7.8h
    abs     v4.8h, v4.8h
    saba    v4.8h, v6.8h, v7.8h
    uaddlv  s4, v4.8h
    fmov    w0, s4
    add     w0, w0, #1
    lsr     w0, w0, #1

WELS_ASM_AARCH64_FUNC_END

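// SATD of an 8-wide, 4-row tile of differences. After the vertical and the
// first horizontal butterfly stage, the final stage is folded into smax
// using |a + b| + |a - b| == 2 * max(|a|, |b|), which also absorbs the usual
// /2 normalisation; partial sums are left in v0 and v1 for the caller to
// accumulate.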
.macro SATD_8x4
    ld1     {v0.8b}, [x0], x1
    ld1     {v1.8b}, [x2], x3
    ld1     {v2.8b}, [x0], x1
    usubl   v16.8h,  v0.8b, v1.8b

    ld1     {v3.8b}, [x2], x3
    usubl   v17.8h,  v2.8b, v3.8b
    ld1     {v4.8b}, [x0], x1
    ld1     {v5.8b}, [x2], x3

    add     v25.8h,  v16.8h, v17.8h
    usubl   v18.8h,  v4.8b,  v5.8b

    ld1     {v6.8b}, [x0], x1
    ld1     {v7.8b}, [x2], x3

    usubl   v19.8h,  v6.8b,  v7.8b
    sub     v26.8h,  v16.8h, v17.8h

    add     v27.8h,  v18.8h, v19.8h
    sub     v28.8h,  v18.8h, v19.8h

    add     v0.8h,  v25.8h, v27.8h
    sub     v1.8h,  v25.8h, v27.8h

    add     v2.8h,  v26.8h, v28.8h
    sub     v3.8h,  v26.8h, v28.8h

    trn1    v4.8h, v0.8h, v1.8h
    trn2    v5.8h, v0.8h, v1.8h
    trn1    v6.8h, v2.8h, v3.8h
    trn2    v7.8h, v2.8h, v3.8h

    add     v16.8h, v4.8h, v5.8h
    sabd    v17.8h, v4.8h, v5.8h
    abs     v16.8h, v16.8h
    add     v18.8h, v6.8h, v7.8h
    sabd    v19.8h, v6.8h, v7.8h
    abs     v18.8h, v18.8h

    trn1    v4.4s, v16.4s, v17.4s
    trn2    v5.4s, v16.4s, v17.4s
    trn1    v6.4s, v18.4s, v19.4s
    trn2    v7.4s, v18.4s, v19.4s

    smax    v0.8h, v4.8h, v5.8h
    smax    v1.8h, v6.8h, v7.8h
.endm

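// 16-wide version of SATD_8x4: processes a 16x4 tile per invocation and
// leaves its partial sums in v0 and v2.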
.macro SATD_16x4
    ld1     {v0.16b}, [x0], x1
    ld1     {v1.16b}, [x2], x3
    ld1     {v2.16b}, [x0], x1
    usubl   v16.8h,  v0.8b, v1.8b
    usubl2  v24.8h,  v0.16b, v1.16b

    ld1     {v3.16b}, [x2], x3
    usubl   v17.8h,  v2.8b, v3.8b
    usubl2  v25.8h,  v2.16b, v3.16b

    ld1     {v4.16b}, [x0], x1
    ld1     {v5.16b}, [x2], x3
    usubl   v18.8h,  v4.8b, v5.8b
    usubl2  v26.8h,  v4.16b, v5.16b

    ld1     {v6.16b}, [x0], x1
    ld1     {v7.16b}, [x2], x3
    usubl   v19.8h,  v6.8b, v7.8b
    usubl2  v27.8h,  v6.16b, v7.16b

    add     v0.8h,  v16.8h, v17.8h
    sub     v1.8h,  v16.8h, v17.8h
    add     v2.8h,  v18.8h, v19.8h
    sub     v3.8h,  v18.8h, v19.8h

    add     v4.8h,  v24.8h, v25.8h
    sub     v5.8h,  v24.8h, v25.8h
    add     v6.8h,  v26.8h, v27.8h
    sub     v7.8h,  v26.8h, v27.8h

    add     v16.8h,  v0.8h, v2.8h
    sub     v18.8h,  v0.8h, v2.8h
    add     v17.8h,  v4.8h, v6.8h
    sub     v19.8h,  v4.8h, v6.8h

    add     v0.8h,  v1.8h, v3.8h
    sub     v2.8h,  v1.8h, v3.8h
    add     v1.8h,  v5.8h, v7.8h
    sub     v3.8h,  v5.8h, v7.8h

    trn1    v4.8h, v16.8h, v18.8h
    trn2    v6.8h, v16.8h, v18.8h
    trn1    v5.8h, v17.8h, v19.8h
    trn2    v7.8h, v17.8h, v19.8h

    add     v16.8h, v4.8h, v6.8h
    sabd    v18.8h, v4.8h, v6.8h
    add     v17.8h, v5.8h, v7.8h
    sabd    v19.8h, v5.8h, v7.8h
    abs     v16.8h, v16.8h
    abs     v17.8h, v17.8h

    trn1    v4.8h, v0.8h, v2.8h
    trn2    v6.8h, v0.8h, v2.8h
    trn1    v5.8h, v1.8h, v3.8h
    trn2    v7.8h, v1.8h, v3.8h

    add     v0.8h, v4.8h, v6.8h
    sabd    v2.8h, v4.8h, v6.8h
    add     v1.8h, v5.8h, v7.8h
    sabd    v3.8h, v5.8h, v7.8h
    abs     v0.8h, v0.8h
    abs     v1.8h, v1.8h

    trn1    v4.4s, v16.4s, v18.4s
    trn2    v6.4s, v16.4s, v18.4s
    trn1    v5.4s, v17.4s, v19.4s
    trn2    v7.4s, v17.4s, v19.4s

    trn1    v16.4s, v0.4s, v2.4s
    trn2    v18.4s, v0.4s, v2.4s
    trn1    v17.4s, v1.4s, v3.4s
    trn2    v19.4s, v1.4s, v3.4s

    smax    v0.8h, v4.8h, v6.8h
    smax    v1.8h, v5.8h, v7.8h
    smax    v2.8h, v16.8h, v18.8h
    smax    v3.8h, v17.8h, v19.8h
    add     v0.8h, v0.8h, v1.8h
    add     v2.8h, v2.8h, v3.8h
.endm

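// The WelsSampleSatd* functions below run SATD_16x4/SATD_8x4 over the block
// four rows at a time, accumulate the partial sums in v31, and reduce with
// uaddlv into the return register w0.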
WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSatd16x16_AArch64_neon
    sxtw    x1, w1
    sxtw    x3, w3
    SATD_16x4
    add     v31.8h, v0.8h, v2.8h
.rept 3
    SATD_16x4
    add     v31.8h, v31.8h, v0.8h
    add     v31.8h, v31.8h, v2.8h
.endr
    uaddlv  s4, v31.8h
    fmov    w0, s4
WELS_ASM_AARCH64_FUNC_END

WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSatd16x8_AArch64_neon
    sxtw    x1, w1
    sxtw    x3, w3
    SATD_16x4
    add     v31.8h, v0.8h, v2.8h

    SATD_16x4
    add     v31.8h, v31.8h, v0.8h
    add     v31.8h, v31.8h, v2.8h

    uaddlv  s4, v31.8h
    fmov    w0, s4
WELS_ASM_AARCH64_FUNC_END

WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSatd8x16_AArch64_neon
    sxtw    x1, w1
    sxtw    x3, w3
    SATD_8x4
    add     v31.8h, v0.8h, v1.8h
.rept 3
    SATD_8x4
    add     v31.8h, v31.8h, v0.8h
    add     v31.8h, v31.8h, v1.8h
.endr
    uaddlv  s4, v31.8h
    fmov    w0, s4
WELS_ASM_AARCH64_FUNC_END

WELS_ASM_AARCH64_FUNC_BEGIN WelsSampleSatd8x8_AArch64_neon
    sxtw    x1, w1
    sxtw    x3, w3
    SATD_8x4
    add     v31.8h, v0.8h, v1.8h

    SATD_8x4
    add     v31.8h, v31.8h, v0.8h
    add     v31.8h, v31.8h, v1.8h
    uaddlv  s4, v31.8h
    fmov    w0, s4
WELS_ASM_AARCH64_FUNC_END
#endif
