/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
//  {   //  input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8\() \arg9\()

    saddl       \arg4\().4s, \arg0\().4h, \arg2\().4h          //int32 e[i][0] = src[0] + src[2];
    ssubl       \arg5\().4s, \arg0\().4h, \arg2\().4h          //int32 e[i][1] = src[0] - src[2];
    sshr        \arg8\().4h, \arg1\().4h, #1                   //src[1] >> 1
    sshr        \arg9\().4h, \arg3\().4h, #1                   //src[3] >> 1
    ssubl       \arg6\().4s, \arg8\().4h, \arg3\().4h          //int32 e[i][2] = (src[1]>>1) - src[3];
    saddl       \arg7\().4s, \arg1\().4h, \arg9\().4h          //int32 e[i][3] = src[1] + (src[3]>>1);
//  }
.endm
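
// Rough scalar view of the step above (illustrative C, not part of the
// build): after the ld4 de-interleave in the caller, arg0-arg3 hold
// columns 0-3 with lane i holding row i, so this computes, for all four
// rows at once,
//
//     e[i][0] = src[i][0] + src[i][2];
//     e[i][1] = src[i][0] - src[i][2];
//     e[i][2] = (src[i][1] >> 1) - src[i][3];
//     e[i][3] = src[i][1] + (src[i][3] >> 1);
//
// widening the int16 inputs to int32.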

.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// shared by both the row and the column transform
//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
    add       \arg0\().4s, \arg4\().4s, \arg7\().4s          //int32 f[i][0] = e[i][0] + e[i][3];
    add       \arg1\().4s, \arg5\().4s, \arg6\().4s          //int32 f[i][1] = e[i][1] + e[i][2];
    sub       \arg2\().4s, \arg5\().4s, \arg6\().4s          //int32 f[i][2] = e[i][1] - e[i][2];
    sub       \arg3\().4s, \arg4\().4s, \arg7\().4s          //int32 f[i][3] = e[i][0] - e[i][3];
//  }
.endm
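
// Scalar view (illustrative): the same butterfly serves both passes,
//
//     f[0] = e[0] + e[3];
//     f[1] = e[1] + e[2];
//     f[2] = e[1] - e[2];
//     f[3] = e[0] - e[3];
//
// applied lane-wise, so one invocation finishes four rows (or columns).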

.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
    add        \arg4\().4s, \arg0\().4s, \arg2\().4s          //int32 e[0][j] = f[0][j] + f[2][j];
    sub        \arg5\().4s, \arg0\().4s, \arg2\().4s          //int32 e[1][j] = f[0][j] - f[2][j];
    sshr       \arg6\().4s, \arg1\().4s, #1                   //f[1][j] >> 1
    sshr       \arg7\().4s, \arg3\().4s, #1                   //f[3][j] >> 1
    sub        \arg6\().4s, \arg6\().4s, \arg3\().4s          //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
    add        \arg7\().4s, \arg1\().4s, \arg7\().4s          //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
//  }
.endm
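
// Scalar view (illustrative): after the transpose in the caller, register k
// holds row k of f with one column per lane, so this computes, for every
// column j at once,
//
//     e[0][j] = f[0][j] + f[2][j];
//     e[1][j] = f[0][j] - f[2][j];
//     e[2][j] = (f[1][j] >> 1) - f[3][j];
//     e[3][j] = f[1][j] + (f[3][j] >> 1);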

//  uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_AARCH64_FUNC_BEGIN IdctResAddPred_AArch64_neon
    SIGN_EXTENSION x1, w1
    ld4        {v0.4h, v1.4h, v2.4h, v3.4h}, [x2]      // de-interleaved load: v0-v3 = columns 0-3 (cost 3 cycles!)
    ROW_TRANSFORM_1_STEP        v0, v1, v2, v3, v16, v17, v18, v19, v4, v5
    TRANSFORM_4BYTES        v0, v1, v2, v3, v16, v17, v18, v19
    // transpose the 4x4 matrix of 32-bit elements
    trn1        v16.4s, v0.4s, v1.4s //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]
    trn2        v17.4s, v0.4s, v1.4s //[0 1 2 3]+[4 5 6 7]-->[1 5 3 7]
    trn1        v18.4s, v2.4s, v3.4s //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]
    trn2        v19.4s, v2.4s, v3.4s //[8 9 10 11]+[12 13 14 15]-->[9 13 11 15]
    trn1        v0.2d, v16.2d, v18.2d //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]
    trn2        v2.2d, v16.2d, v18.2d //[0 4 2 6]+[8 12 10 14]-->[2 6 10 14]
    trn1        v1.2d, v17.2d, v19.2d //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]
    trn2        v3.2d, v17.2d, v19.2d //[1 5 3 7]+[9 13 11 15]-->[3 7 11 15]
    COL_TRANSFORM_1_STEP        v0, v1, v2, v3, v16, v17, v18, v19

    TRANSFORM_4BYTES        v0, v1, v2, v3, v16, v17, v18, v19
    // load the four 4-byte prediction rows; the saturating narrow below
    // clamps the sums into [0, 255], replacing the C code's clip_table lookup
    mov         x2, x0                      // keep the pred base for the stores
    ld1     {v16.s}[0], [x0], x1            // pred row 0
    ld1     {v16.s}[1], [x0], x1            // pred row 1
    ld1     {v17.s}[0], [x0], x1            // pred row 2
    ld1     {v17.s}[1], [x0]                // pred row 3

    rshrn     v0.4h, v0.4s, #6              // residual rows 0-1: (g + 32) >> 6
    rshrn2    v0.8h, v1.4s, #6
    rshrn     v1.4h, v2.4s, #6              // residual rows 2-3
    rshrn2    v1.8h, v3.4s, #6

    uxtl      v2.8h, v16.8b                 // widen pred bytes to 16 bits
    uxtl      v3.8h, v17.8b
    add       v2.8h, v2.8h, v0.8h           // pred + residual
    add       v3.8h, v3.8h, v1.8h

    sqxtun    v0.8b, v2.8h                  // saturate to [0, 255] and narrow
    sqxtun    v1.8b, v3.8h

    st1     {v0.s}[0], [x2], x1             // store row 0
    st1     {v0.s}[1], [x2], x1             // store row 1
    st1     {v1.s}[0], [x2], x1             // store row 2
    st1     {v1.s}[1], [x2]                 // store row 3
WELS_ASM_AARCH64_FUNC_END
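
/*
 * For reference, a scalar C sketch of the routine above: the H.264 4x4
 * inverse integer transform applied to the residual, added to the
 * prediction, with saturation. Illustrative only -- it is not part of this
 * file's build, and the function name is made up here.
 *
 * void IdctResAddPred_ref (uint8_t* pPred, const int32_t kiStride, int16_t* pRs) {
 *   int32_t e[4], f[4][4], g[4], i, j;
 *   for (i = 0; i < 4; i++) {                       // row (horizontal) pass
 *     e[0] = pRs[i * 4 + 0] + pRs[i * 4 + 2];
 *     e[1] = pRs[i * 4 + 0] - pRs[i * 4 + 2];
 *     e[2] = (pRs[i * 4 + 1] >> 1) - pRs[i * 4 + 3];
 *     e[3] = pRs[i * 4 + 1] + (pRs[i * 4 + 3] >> 1);
 *     f[i][0] = e[0] + e[3];
 *     f[i][1] = e[1] + e[2];
 *     f[i][2] = e[1] - e[2];
 *     f[i][3] = e[0] - e[3];
 *   }
 *   for (j = 0; j < 4; j++) {                       // column (vertical) pass
 *     e[0] = f[0][j] + f[2][j];
 *     e[1] = f[0][j] - f[2][j];
 *     e[2] = (f[1][j] >> 1) - f[3][j];
 *     e[3] = f[1][j] + (f[3][j] >> 1);
 *     g[0] = e[0] + e[3];
 *     g[1] = e[1] + e[2];
 *     g[2] = e[1] - e[2];
 *     g[3] = e[0] - e[3];
 *     for (i = 0; i < 4; i++) {                     // round, add pred, clamp
 *       int32_t v = pPred[i * kiStride + j] + ((g[i] + 32) >> 6);
 *       pPred[i * kiStride + j] = (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
 *     }
 *   }
 * }
 */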

WELS_ASM_AARCH64_FUNC_BEGIN WelsBlockZero16x16_AArch64_neon
    eor v0.16b, v0.16b, v0.16b      // v0 = 0
    eor v1.16b, v1.16b, v1.16b      // v1 = 0
    SIGN_EXTENSION x1, w1
    lsl x1, x1, 1                   // stride: int16_t units -> bytes
.rept 16
    st1 {v0.16b, v1.16b}, [x0], x1  // zero one 16-element (32-byte) row
.endr
WELS_ASM_AARCH64_FUNC_END
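
// Rough C equivalent of the routine above (illustrative). The stride comes
// in as int16_t units, hence the lsl #1 converting it to bytes:
//
//     for (i = 0; i < 16; i++)
//       memset (pBlock + i * iStride, 0, 16 * sizeof (int16_t));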

WELS_ASM_AARCH64_FUNC_BEGIN WelsBlockZero8x8_AArch64_neon
    eor v0.16b, v0.16b, v0.16b      // v0 = 0
    SIGN_EXTENSION x1, w1
    lsl x1, x1, 1                   // stride: int16_t units -> bytes
.rept 8
    st1 {v0.16b}, [x0], x1          // zero one 8-element (16-byte) row
.endr
WELS_ASM_AARCH64_FUNC_END
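
// Same pattern as the 16x16 case above (illustrative C):
//
//     for (i = 0; i < 8; i++)
//       memset (pBlock + i * iStride, 0, 8 * sizeof (int16_t));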
#endif