/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"

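// Counts the zero coefficients in two quadwords of int16 levels: cmeq sets a
// lane to all-ones where the coefficient is zero, uzp1 packs the low bytes of
// the two compare results, ushr #7 turns each 0xFF into 1, and addv sums the
// 16 bytes into the B register \arg2. A per-lane sketch (illustrative names):
//   zeros = 0; for (i = 0; i < 16; i++) zeros += (coef[i] == 0);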
.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2
//  {   //  input: coef_0 (\arg0\()), coef_1 (\arg1\()); output: \arg2\() = count of zero coefficients
    cmeq    \arg0\().8h, \arg0\().8h, #0
    cmeq    \arg1\().8h, \arg1\().8h, #0
    uzp1    \arg0\().16b, \arg0\().16b, \arg1\().16b
    ushr    \arg0\().16b, \arg0\().16b, #7
    addv    \arg2\(), \arg0\().16b
//  }
.endm

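// Scalar sketch of the quantization the next three macros implement per lane
// (a hedged reconstruction from the instructions; variable names are illustrative):
//   level = ((abs(coef) + ff) * mf) >> 16;
//   out   = (coef > 0) ? level : -level;
// The sign is restored branch-free: cmgt builds a mask of lanes with coef > 0,
// bif copies the level into a zeroed register only where coef <= 0, and the
// final sub computes level - 2*level (i.e. -level) exactly in those lanes.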
.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5
// if coef <= 0, negate the quantized level; else keep it
//  {   //  input:  coef, ff (dst), mf
    eor     \arg3\().16b, \arg3\().16b, \arg3\().16b   // init to 0, and keep 0
    saba    \arg1\().8h, \arg0\().8h, \arg3\().8h      // ff + abs(coef - 0)
    smull   \arg4\().4s, \arg1\().4h, \arg2\().4h
    smull2  \arg5\().4s, \arg1\().8h, \arg2\().8h
    shrn    \arg1\().4h, \arg4\().4s, #16
    shrn2   \arg1\().8h, \arg5\().4s, #16

    cmgt    \arg4\().8h, \arg0\().8h, #0      // lanes with coef > 0 become all ones
    bif     \arg3\().16b, \arg1\().16b, \arg4\().16b   // copy level where coef <= 0; keep 0 elsewhere
    shl     \arg3\().8h, \arg3\().8h, #1
    sub     \arg1\().8h, \arg1\().8h, \arg3\().8h      // if coef > 0, level -= 0; else level -= 2*level
//  }
.endm

.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6
// same as NEWQUANT_COEF_EACH_16BITS, but also returns abs(level) in \arg6\()
//  {   //  input:  coef, ff (dst), mf
    eor     \arg3\().16b, \arg3\().16b, \arg3\().16b   // init to 0, and keep 0
    saba    \arg1\().8h, \arg0\().8h, \arg3\().8h      // ff + abs(coef - 0)
    smull   \arg4\().4s, \arg1\().4h, \arg2\().4h
    smull2  \arg5\().4s, \arg1\().8h, \arg2\().8h
    shrn    \arg1\().4h, \arg4\().4s, #16
    shrn2   \arg1\().8h, \arg5\().4s, #16

    cmgt    \arg4\().8h, \arg0\().8h, #0      // lanes with coef > 0 become all ones
    bif     \arg3\().16b, \arg1\().16b, \arg4\().16b   // copy level where coef <= 0; keep 0 elsewhere
    shl     \arg3\().8h, \arg3\().8h, #1
    mov     \arg6\().16b, \arg1\().16b        // save abs(level) before the sign is restored
    sub     \arg1\().8h, \arg1\().8h, \arg3\().8h      // if coef > 0, level -= 0; else level -= 2*level
//  }
.endm

.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
// same quantization as above, for one dualword (4 coefficients); \arg3\() must be zero on entry
//  {   //  input:  coef, ff (dst), mf
    saba    \arg1\().8h, \arg0\().8h, \arg3\().8h      // ff + abs(coef - 0)
    smull   \arg4\().4s, \arg1\().4h, \arg2\().4h
    shrn    \arg1\().4h, \arg4\().4s, #16

    cmgt    \arg4\().8h, \arg0\().8h, #0      // lanes with coef > 0 become all ones
    bif     \arg3\().16b, \arg1\().16b, \arg4\().16b   // copy level where coef <= 0; keep 0 elsewhere
    shl     \arg3\().8h, \arg3\().8h, #1
    sub     \arg1\().8h, \arg1\().8h, \arg3\().8h      // if coef > 0, level -= 0; else level -= 2*level
//  }
.endm

.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4, arg5
//  {   //  input: coef_0, coef_1, coef_2, coef_3; output: \arg4\() = max of coef_0/1, \arg5\() = max of coef_2/3
    umax    \arg0\().8h, \arg0\().8h, \arg1\().8h
    umaxv   \arg4\(), \arg0\().8h
    umax    \arg2\().8h, \arg2\().8h, \arg3\().8h
    umaxv   \arg5\(), \arg2\().8h
//  }
.endm

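// One butterfly stage of the 2x2 Hadamard transform over the four DC values
// rs[0], rs[16], rs[32], rs[48] packed in the low four halfwords of \arg0.
// The callers below apply it twice; with a=rs[0], b=rs[16], c=rs[32], d=rs[48]
// that yields the full 2x2 Hadamard {a+b+c+d, a-b+c-d, a+b-c-d, a-b-c+d}.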
.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
//  {   //  input: src_d[0][16][32][48], dst_d[0][16][32][48], working
    sshr  \arg1\().2d, \arg0\().2d, #32
    add   \arg2\().4h, \arg0\().4h, \arg1\().4h      // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
    sub   \arg1\().4h, \arg0\().4h, \arg1\().4h      // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
    zip1  \arg1\().4h, \arg2\().4h, \arg1\().4h
//  }
.endm


.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
//  {   //  input:  coef, dst_d, working_d (all 0x01)
    cmeq    \arg0\().4h, \arg0\().4h, #0
    and     \arg0\().8b, \arg0\().8b, \arg2\().8b
    addv    \arg1\(), \arg0\().4h
//  }
.endm

.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
//  {   //  input: each src_d[0]~[3](dst), working_q0, working_q1
    uzp2  \arg1\().4s, \arg0\().4s, \arg0\().4s
    uzp1  \arg0\().4s, \arg0\().4s, \arg0\().4s
    add   \arg2\().8h, \arg0\().8h, \arg1\().8h      // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
    sub   \arg1\().8h, \arg0\().8h, \arg1\().8h      // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
    zip1  \arg2\().8h, \arg2\().8h, \arg1\().8h      // [0] = rs[0] + rs[2]; [1] = rs[0] - rs[2]; ... [2]; [3]

    uzp2  \arg1\().4s, \arg2\().4s, \arg2\().4s
    uzp1  \arg0\().4s, \arg2\().4s, \arg2\().4s
    add   \arg2\().8h, \arg0\().8h, \arg1\().8h      // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];
    sub   \arg1\().8h, \arg0\().8h, \arg1\().8h      // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];
    rev32 \arg1\().4h, \arg1\().4h             // [0] = rs[1] - rs[3];[1] = rs[0] - rs[2];[2] = rs[5] - rs[7];[3] = rs[4] - rs[6];
    zip1  \arg0\().4s, \arg2\().4s, \arg1\().4s
//  }
.endm

.macro MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2 arg0, arg1, arg2, arg3
//  {   //  input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
    uzp1 \arg2\().4s, \arg0\().4s, \arg1\().4s   //[0 1 4 5]+[8 9 12 13]
    uzp2 \arg3\().4s, \arg0\().4s, \arg1\().4s   //[2 3 6 7]+[10 11 14 15]

    uzp1 \arg0\().8h, \arg2\().8h, \arg3\().8h   //[0 4 8 12]+[2 6 10 14]
    uzp2 \arg2\().8h, \arg2\().8h, \arg3\().8h   //[1 5 9 13]+[3 7 11 15]
    zip2 \arg1\().2d, \arg0\().2d, \arg2\().2d   //[2 6 10 14]+[3 7 11 15]
    zip1 \arg0\().2d, \arg0\().2d, \arg2\().2d   //[0 4 8 12]+[1 5 9 13]
//  }
.endm

.macro MATRIX_TRANSFORM_EACH_16BITS_OUT4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
//  {   //  input & output: src_d[0]~[3];[0 4 8 12],[1 5 9 13],[2 6 10 14],[3 7 11 15]
    trn1 \arg4\().8h, \arg0\().8h, \arg1\().8h
    trn2 \arg5\().8h, \arg0\().8h, \arg1\().8h
    trn1 \arg6\().8h, \arg2\().8h, \arg3\().8h
    trn2 \arg7\().8h, \arg2\().8h, \arg3\().8h

    trn1 \arg0\().4s, \arg4\().4s, \arg6\().4s
    trn2 \arg2\().4s, \arg4\().4s, \arg6\().4s
    trn1 \arg1\().4s, \arg5\().4s, \arg7\().4s
    trn2 \arg3\().4s, \arg5\().4s, \arg7\().4s
//  }
.endm

.macro MATRIX_TRANSFORM_EACH_16BITS_4x4_OUT2 arg0, arg1, arg2, arg3
//  {   //  input & output: src_d[0]~[3];[0 1 2 3],[4 5 6 7],[8 9 10 11],[12 13 14 15]
    mov  \arg0\().d[1], \arg1\().d[0]  //[0 1 2 3]+[4 5 6 7]
    mov  \arg2\().d[1], \arg3\().d[0]  //[8 9 10 11]+[12 13 14 15]
    uzp1 \arg1\().4s, \arg0\().4s, \arg2\().4s   //[0 1 4 5]+[8 9 12 13]
    uzp2 \arg3\().4s, \arg0\().4s, \arg2\().4s   //[2 3 6 7]+[10 11 14 15]

    uzp1 \arg0\().8h, \arg1\().8h, \arg3\().8h   //[0 4 8 12]+[2 6 10 14]
    uzp2 \arg2\().8h, \arg1\().8h, \arg3\().8h   //[1 5 9 13]+[3 7 11 15]
    zip2 \arg1\().2d, \arg0\().2d, \arg2\().2d   //[2 6 10 14]+[3 7 11 15]
    zip1 \arg0\().2d, \arg0\().2d, \arg2\().2d   //[0 4 8 12]+[1 5 9 13]
//  }
.endm

.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5
//  {   //  input: dst_q0, dst_q1, src1*, src1_stride, src2*, src2_stride
    ld1   {\arg0\().s}[0], [\arg2\()], \arg3\()
    ld1   {\arg0\().s}[1], [\arg2\()], \arg3\()
    ld1   {\arg0\().s}[2], [\arg2\()], \arg3\()
    ld1   {\arg0\().s}[3], [\arg2\()]

    ld1   {\arg1\().s}[0], [\arg4\()], \arg5\()
    ld1   {\arg1\().s}[1], [\arg4\()], \arg5\()
    ld1   {\arg1\().s}[2], [\arg4\()], \arg5\()
    ld1   {\arg1\().s}[3], [\arg4\()]
//  }
.endm

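// The forward 4x4 core transform of H.264, applied row-wise (the callers run
// it again after a transpose for the vertical pass). Each output row is a
// butterfly of the inputs, equivalent to multiplying by the matrix
//   [ 1  1  1  1 ]
//   [ 2  1 -1 -2 ]
//   [ 1 -1 -1  1 ]
//   [ 1 -2  2 -1 ]
// as the per-instruction comments below spell out.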
.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
//  {   //  input: src_d[0]~[3], working: [4]~[7]
    add     \arg4\().8h, \arg0\().8h, \arg3\().8h   //int16 s[0] = data[i] + data[i3];
    sub     \arg7\().8h, \arg0\().8h, \arg3\().8h   //int16 s[3] = data[i] - data[i3];
    add     \arg5\().8h, \arg1\().8h, \arg2\().8h   //int16 s[1] = data[i1] + data[i2];
    sub     \arg6\().8h, \arg1\().8h, \arg2\().8h   //int16 s[2] = data[i1] - data[i2];

    add     \arg0\().8h, \arg4\().8h, \arg5\().8h   //int16 dct[i ] = s[0] + s[1];
    sub     \arg2\().8h, \arg4\().8h, \arg5\().8h   //int16 dct[i2] = s[0] - s[1];
    shl     \arg1\().8h, \arg7\().8h, #1
    shl     \arg3\().8h, \arg6\().8h, #1
    add     \arg1\().8h, \arg1\().8h, \arg6\().8h   //int16 dct[i1] = (s[3] << 1) + s[2];
    sub     \arg3\().8h, \arg7\().8h, \arg3\().8h   //int16 dct[i3] = s[3] - (s[2] << 1);
//  }
.endm

.macro LOAD_8x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
//  {   //  input: \arg0\()~\arg7\(), src1*, src2*; untouched x2:src1_stride & x4:src2_stride
    ld1   {\arg0\().d}[0], [\arg8\()], x2
    ld1   {\arg1\().d}[0], [\arg8\()], x2
    ld1   {\arg2\().d}[0], [\arg8\()], x2
    ld1   {\arg3\().d}[0], [\arg8\()], x2

    ld1   {\arg4\().d}[0], [\arg9\()], x4
    ld1   {\arg5\().d}[0], [\arg9\()], x4
    ld1   {\arg6\().d}[0], [\arg9\()], x4
    ld1   {\arg7\().d}[0], [\arg9\()], x4
//  }
.endm

.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
//  {   //  input: src_d[0]~[3], output: e_d[0]~[3];
    add   \arg4\().8h, \arg0\().8h, \arg2\().8h          //int16 e[i][0] = src[0] + src[2];
    sub   \arg5\().8h, \arg0\().8h, \arg2\().8h          //int16 e[i][1] = src[0] - src[2];
    sshr  \arg6\().8h, \arg1\().8h, #1
    sshr  \arg7\().8h, \arg3\().8h, #1
    sub   \arg6\().8h, \arg6\().8h, \arg3\().8h          //int16 e[i][2] = (src[1]>>1) - src[3];
    add   \arg7\().8h, \arg1\().8h, \arg7\().8h          //int16 e[i][3] = src[1] + (src[3]>>1);
//  }
.endm

.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// both row & col transform used
//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
    add   \arg0\().8h, \arg4\().8h, \arg7\().8h          //int16 f[i][0] = e[i][0] + e[i][3];
    add   \arg1\().8h, \arg5\().8h, \arg6\().8h          //int16 f[i][1] = e[i][1] + e[i][2];
    sub   \arg2\().8h, \arg5\().8h, \arg6\().8h          //int16 f[i][2] = e[i][1] - e[i][2];
    sub   \arg3\().8h, \arg4\().8h, \arg7\().8h          //int16 f[i][3] = e[i][0] - e[i][3];
//  }
.endm

.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
//  {   //  input: src_d[0]~[3], output: e_q[0]~[3];
    saddl   \arg4\().4s, \arg0\().4h, \arg2\().4h          //int32 e[i][0] = src[0] + src[2];
    ssubl   \arg5\().4s, \arg0\().4h, \arg2\().4h          //int32 e[i][1] = src[0] - src[2];
    ssubl   \arg6\().4s, \arg1\().4h, \arg3\().4h          //int32 e[i][2] = src[1] - src[3];
    saddl   \arg7\().4s, \arg1\().4h, \arg3\().4h          //int32 e[i][3] = src[1] + src[3];
//  }
.endm

.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
    add     \arg4\().4s, \arg0\().4s, \arg2\().4s          //int32 e[0][j] = f[0][j] + f[2][j];
    sub     \arg5\().4s, \arg0\().4s, \arg2\().4s          //int32 e[1][j] = f[0][j] - f[2][j];
    sub     \arg6\().4s, \arg1\().4s, \arg3\().4s          //int32 e[2][j] = f[1][j] - f[3][j];
    add     \arg7\().4s, \arg1\().4s, \arg3\().4s          //int32 e[3][j] = f[1][j] + f[3][j];
//  }
.endm

.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// both row & col transform used
//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
    add     \arg0\().4s, \arg4\().4s, \arg7\().4s          //int32 f[i][0] = e[i][0] + e[i][3];
    add     \arg1\().4s, \arg5\().4s, \arg6\().4s          //int32 f[i][1] = e[i][1] + e[i][2];
    sub     \arg2\().4s, \arg5\().4s, \arg6\().4s          //int32 f[i][2] = e[i][1] - e[i][2];
    sub     \arg3\().4s, \arg4\().4s, \arg7\().4s          //int32 f[i][3] = e[i][0] - e[i][3];
//  }
.endm

.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4
//  {   //  input: pred_d[0](output), dct_q0/1, working_q0/1;
    uxtl      \arg3\().8h, \arg0\().8b
    uxtl2     \arg4\().8h, \arg0\().16b
    add       \arg3\().8h, \arg3\().8h, \arg1\().8h
    add       \arg4\().8h, \arg4\().8h, \arg2\().8h
    sqxtun    \arg0\().8b, \arg3\().8h
    sqxtun2   \arg0\().16b, \arg4\().8h
//  }
.endm

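//int32_t WelsGetNoneZeroCount_AArch64_neon (int16_t* pLevel);
// (assumed prototype, in the style of the DCT comments below: x0 points at 16
// coefficients; w0 returns the number of non-zero coefficients)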
WELS_ASM_AARCH64_FUNC_BEGIN WelsGetNoneZeroCount_AArch64_neon
    ld1     {v0.8h, v1.8h}, [x0]
    ZERO_COUNT_IN_2_QUARWORD    v0, v1, b0
    mov     x0, v0.d[0]
    mov     x1, #16
    subs    x0, x1, x0      // 16 - zero_count = non-zero count
WELS_ASM_AARCH64_FUNC_END

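//void WelsQuant4x4_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
// (assumed prototype: x0 = 16 coefficients quantized in place, x1 = rounding
// offsets ff, x2 = multipliers mf)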
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuant4x4_AArch64_neon
    ld1     {v2.8h}, [x1]
    ld1     {v0.8h, v1.8h}, [x0]
    ld1     {v3.8h}, [x2]
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS   v0, v2, v3, v5, v6, v7
    st1     {v2.8h}, [x0], #16
    NEWQUANT_COEF_EACH_16BITS   v1, v4, v3, v5, v6, v7
    st1     {v4.8h}, [x0], #16
WELS_ASM_AARCH64_FUNC_END

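//void WelsQuant4x4Dc_AArch64_neon (int16_t* pDct, int16_t iFF, int16_t iMF);
// (assumed prototype: like WelsQuant4x4, but scalar ff/mf are broadcast to all
// lanes since every DC coefficient shares one quantizer)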
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuant4x4Dc_AArch64_neon
    ld1     {v0.8h, v1.8h}, [x0]
    dup     v2.8h, w1      // even ff range [0, 768]
    dup     v3.8h, w2
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS   v0, v2, v3, v5, v6, v7
    st1     {v2.8h}, [x0], #16
    NEWQUANT_COEF_EACH_16BITS   v1, v4, v3, v5, v6, v7
    st1     {v4.8h}, [x0], #16
WELS_ASM_AARCH64_FUNC_END

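//void WelsQuantFour4x4_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);
// (assumed prototype: quantizes four consecutive 4x4 blocks, 128 bytes at x0, in place)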
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuantFour4x4_AArch64_neon
    ld1     {v2.8h}, [x1]
    ld1     {v3.8h}, [x2]
    mov     x1, x0

.rept 4
    ld1     {v0.8h, v1.8h}, [x0], #32
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS   v0, v4, v3, v5, v6, v7
    st1     {v4.8h}, [x1], #16
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS   v1, v4, v3, v5, v6, v7
    st1     {v4.8h}, [x1], #16
.endr
WELS_ASM_AARCH64_FUNC_END

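//void WelsQuantFour4x4Max_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);
// (assumed prototype: as WelsQuantFour4x4, additionally writing the maximum
// absolute level of each of the four blocks to pMax[0..3])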
WELS_ASM_AARCH64_FUNC_BEGIN WelsQuantFour4x4Max_AArch64_neon
    ld1     {v2.8h}, [x1]
    ld1     {v3.8h}, [x2]
    mov     x1, x0

    ld1     {v0.8h, v1.8h}, [x0], #32
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX   v0, v4, v3, v5, v6, v7, v16
    st1     {v4.8h}, [x1], #16
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX   v1, v4, v3, v5, v6, v7, v17
    st1     {v4.8h}, [x1], #16   // abs levels of the 1st 16 coefficients in v16 & v17

    ld1     {v0.8h, v1.8h}, [x0], #32
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX   v0, v4, v3, v5, v6, v7, v18
    st1     {v4.8h}, [x1], #16
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX   v1, v4, v3, v5, v6, v7, v19
    st1     {v4.8h}, [x1], #16   // abs levels of the 2nd 16 coefficients in v18 & v19

    SELECT_MAX_IN_ABS_COEF  v16, v17, v18, v19, h20, h21

    ld1     {v0.8h, v1.8h}, [x0], #32
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX   v0, v4, v3, v5, v6, v7, v16
    st1     {v4.8h}, [x1], #16
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX   v1, v4, v3, v5, v6, v7, v17
    st1     {v4.8h}, [x1], #16   // abs levels of the 3rd 16 coefficients in v16 & v17

    ld1     {v0.8h, v1.8h}, [x0], #32
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX   v0, v4, v3, v5, v6, v7, v18
    st1     {v4.8h}, [x1], #16
    mov     v4.16b, v2.16b
    NEWQUANT_COEF_EACH_16BITS_MAX   v1, v4, v3, v5, v6, v7, v19
    st1     {v4.8h}, [x1], #16   // abs levels of the 4th 16 coefficients in v18 & v19

    SELECT_MAX_IN_ABS_COEF  v16, v17, v18, v19, h22, h23

    st4     {v20.h, v21.h, v22.h, v23.h}[0], [x3]   // store the four per-block maxima
WELS_ASM_AARCH64_FUNC_END

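//void WelsDequant4x4_AArch64_neon (int16_t* pDct, const uint16_t* pMF);
// (assumed prototype: dequantization is a plain per-coefficient multiply by
// the scale table at x1, in place)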
WELS_ASM_AARCH64_FUNC_BEGIN WelsDequant4x4_AArch64_neon
    ld1    {v0.8h, v1.8h}, [x0]
    ld1    {v2.8h}, [x1]
    mul    v3.8h, v0.8h, v2.8h
    mul    v4.8h, v1.8h, v2.8h
    st1    {v3.8h, v4.8h}, [x0]
WELS_ASM_AARCH64_FUNC_END

WELS_ASM_AARCH64_FUNC_BEGIN WelsDequantFour4x4_AArch64_neon
    ld1    {v2.8h}, [x1]
    mov    x1, x0
.rept 4
    ld1   {v0.8h, v1.8h}, [x0], #32
    mul   v3.8h, v0.8h, v2.8h
    mul   v4.8h, v1.8h, v2.8h
    st1   {v3.8h, v4.8h}, [x1], #32
.endr
WELS_ASM_AARCH64_FUNC_END

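//int32_t WelsHadamardQuant2x2SkipKernel_AArch64_neon (int16_t* pRs, int16_t iThreshold);
// (assumed prototype: 2x2 Hadamard of the four DC values rs[0], rs[16],
// rs[32], rs[48] (32 bytes apart at x0); returns non-zero iff any abs(dct)
// exceeds the threshold in w1)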
WELS_ASM_AARCH64_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_AArch64_neon
    dup   v4.8h, w1
    mov   x1, #32
    ld1   {v0.h}[0], [x0], x1       //rs[0]
    ld1   {v0.h}[1], [x0], x1       //rs[16]
    ld1   {v0.h}[2], [x0], x1       //rs[32]
    ld1   {v0.h}[3], [x0], x1       //rs[48]

    HDM_QUANT_2x2_TOTAL_16BITS  v0, v1, v2      // output v1

    HDM_QUANT_2x2_TOTAL_16BITS  v1, v0, v2      // output v0

    abs   v1.4h, v0.4h
    cmhi  v0.4h, v1.4h, v4.4h         // abs(dct[i]) > threshold
    mov   w0, v0.s[0]
    mov   w1, v0.s[1]
    orr   w0, w0, w1
WELS_ASM_AARCH64_FUNC_END

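//int32_t WelsHadamardQuant2x2_AArch64_neon (int16_t* pRs, int16_t iFF, int16_t iMF, int16_t* pDct, int16_t* pBlock);
// (assumed prototype: Hadamard-transforms and quantizes the four DC values,
// zeroes them in the residual at x0, stores the levels to x3 and x4, and
// returns the count of non-zero levels)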
WELS_ASM_AARCH64_FUNC_BEGIN WelsHadamardQuant2x2_AArch64_neon

    dup   v1.8h, w1 //ff
    dup   v2.8h, w2 //mf
    eor   v3.16b, v3.16b, v3.16b

    mov   x1, #32
    mov   x2, x0
    ld1   {v0.h}[0], [x0], x1       //rs[0]
    st1   {v3.h}[0], [x2], x1       //rs[0]=0
    ld1   {v0.h}[1], [x0], x1       //rs[16]
    st1   {v3.h}[1], [x2], x1       //rs[16]=0
    ld1   {v0.h}[2], [x0], x1       //rs[32]
    st1   {v3.h}[2], [x2], x1       //rs[32]=0
    ld1   {v0.h}[3], [x0], x1       //rs[48]
    st1   {v3.h}[3], [x2], x1       //rs[48]=0


    HDM_QUANT_2x2_TOTAL_16BITS  v0, v4, v5      // output v4

    HDM_QUANT_2x2_TOTAL_16BITS  v4, v0, v5      // output v0

    QUANT_DUALWORD_COEF_EACH_16BITS v0, v1, v2, v3, v4

    st1    {v1.d}[0], [x3]        // store to dct
    st1    {v1.d}[0], [x4]        // store to block

    movi v3.8h, #1, lsl #0        // working_d of 0x0001 lanes for DC_ZERO_COUNT_IN_DUALWORD

    movi v0.16b, #255

    DC_ZERO_COUNT_IN_DUALWORD   v1, h0, v3

    mov     x0, v0.d[0]
    mov     x1, #4
    subs    x0, x1, x0      // 4 - zero_count = non-zero count
WELS_ASM_AARCH64_FUNC_END

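//void WelsDequantIHadamard4x4_AArch64_neon (int16_t* pRes, const uint16_t kuiMF);
// (assumed prototype: inverse 4x4 Hadamard of the DC block at x0, scaled by
// the scalar multiplier in w1, in place)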
WELS_ASM_AARCH64_FUNC_BEGIN WelsDequantIHadamard4x4_AArch64_neon
    ld1    {v0.8h, v1.8h}, [x0]
    dup    v4.8h, w1

    IHDM_4x4_TOTAL_16BITS   v0, v2, v3
    IHDM_4x4_TOTAL_16BITS   v1, v2, v3

    MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2    v0, v1, v2, v3

    IHDM_4x4_TOTAL_16BITS   v0, v2, v3
    mul   v0.8h, v0.8h, v4.8h

    IHDM_4x4_TOTAL_16BITS   v1, v2, v3
    mul   v1.8h, v1.8h, v4.8h

    MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2    v0, v1, v2, v3
    st1    {v0.16b, v1.16b}, [x0]
WELS_ASM_AARCH64_FUNC_END

//void WelsDctT4_AArch64_neon (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
WELS_ASM_AARCH64_FUNC_BEGIN WelsDctT4_AArch64_neon
    SIGN_EXTENSION x2, w2
    SIGN_EXTENSION x4, w4
    LOAD_4x4_DATA_FOR_DCT   v0, v1, x1, x2, x3, x4
    usubl  v2.8h, v0.8b, v1.8b
    usubl2 v4.8h, v0.16b, v1.16b
    uzp1  v3.8h, v2.8h, v4.8h
    uzp2  v5.8h, v2.8h, v4.8h
    uzp2  v2.8h, v3.8h, v5.8h // s[2, 6, 10, 14] [3, 7, 11, 15]
    uzp1  v0.8h, v3.8h, v5.8h // s[0, 4, 8, 12] [1, 5, 9, 13]
    mov    v3.d[0], v2.d[1]   // s[3, 7, 11, 15]
    mov    v1.d[0], v0.d[1]   // s[1, 5, 9, 13]

    // horizontal transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS      v0, v1, v2, v3, v4, v5, v6, v7
    // transform element
    MATRIX_TRANSFORM_EACH_16BITS_OUT4   v0, v1, v2, v3, v4, v5, v6, v7
    // vertical transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS      v0, v1, v2, v3, v4, v5, v6, v7

    st4       {v0.d, v1.d, v2.d, v3.d}[0], [x0]
WELS_ASM_AARCH64_FUNC_END

//void WelsDctFourT4_AArch64_neon (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);
WELS_ASM_AARCH64_FUNC_BEGIN WelsDctFourT4_AArch64_neon
    SIGN_EXTENSION x2, w2
    SIGN_EXTENSION x4, w4
.rept 2
    LOAD_8x4_DATA_FOR_DCT   v0, v1, v2, v3, v4, v5, v6, v7, x1, x3
    usubl    v0.8h, v0.8b, v4.8b
    usubl    v1.8h, v1.8b, v5.8b
    usubl    v2.8h, v2.8b, v6.8b
    usubl    v3.8h, v3.8b, v7.8b

    MATRIX_TRANSFORM_EACH_16BITS_OUT4   v0, v1, v2, v3, v4, v5, v6, v7

    // horizontal transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS      v0, v1, v2, v3, v4, v5, v6, v7

    // transform element
    MATRIX_TRANSFORM_EACH_16BITS_OUT4   v0, v1, v2, v3, v4, v5, v6, v7

    // vertical transform
    DCT_ROW_TRANSFORM_TOTAL_16BITS      v0, v1, v2, v3, v4, v5, v6, v7

    uzp1    v4.2d, v0.2d, v1.2d
    uzp2    v6.2d, v0.2d, v1.2d
    uzp1    v5.2d, v2.2d, v3.2d
    uzp2    v7.2d, v2.2d, v3.2d
    st1     {v4.16b, v5.16b}, [x0], #32
    st1     {v6.16b, v7.16b}, [x0], #32
.endr
WELS_ASM_AARCH64_FUNC_END

//void WelsIDctT4Rec_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctT4Rec_AArch64_neon
    SIGN_EXTENSION x1, w1
    SIGN_EXTENSION x3, w3
    ld1     {v16.s}[0], [x2], x3
    ld1     {v16.s}[1], [x2], x3
    ld1     {v16.s}[2], [x2], x3
    ld1     {v16.s}[3], [x2], x3                    // Pred
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x4]      // dct coeff

    ROW_TRANSFORM_1_STEP_TOTAL_16BITS   v0, v1, v2, v3, v4, v5, v6, v7

    TRANSFORM_TOTAL_16BITS              v0, v1, v2, v3, v4, v5, v6, v7

    MATRIX_TRANSFORM_EACH_16BITS_OUT4   v0, v1, v2, v3, v4, v5, v6, v7

    ROW_TRANSFORM_1_STEP_TOTAL_16BITS   v0, v1, v2, v3, v4, v5, v6, v7

    TRANSFORM_TOTAL_16BITS              v0, v1, v2, v3, v4, v5, v6, v7
    ins     v0.d[1], v1.d[0]
    ins     v2.d[1], v3.d[0]
    srshr   v0.8h, v0.8h, #6
    srshr   v2.8h, v2.8h, #6
    // after the rounding shift by 6, add the prediction and clip into [0, 255]
    uxtl    v1.8h, v16.8b
    add     v0.8h, v0.8h, v1.8h
    sqxtun  v1.8b, v0.8h
    st1     {v1.s}[0], [x0], x1
    st1     {v1.s}[1], [x0], x1

    uxtl2   v1.8h, v16.16b
    add     v2.8h, v2.8h, v1.8h
    sqxtun  v1.8b, v2.8h
    st1     {v1.s}[0], [x0], x1
    st1     {v1.s}[1], [x0], x1
WELS_ASM_AARCH64_FUNC_END

//void WelsIDctFourT4Rec_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);
WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctFourT4Rec_AArch64_neon
    SIGN_EXTENSION x1, w1
    SIGN_EXTENSION x3, w3
.rept 2
    ld1     {v16.d}[0], [x2], x3
    ld1     {v16.d}[1], [x2], x3
    ld1     {v17.d}[0], [x2], x3
    ld1     {v17.d}[1], [x2], x3                        // Pred
    ld4     {v0.8h, v1.8h, v2.8h, v3.8h}, [x4], #64     // dct coeff

    ROW_TRANSFORM_1_STEP_TOTAL_16BITS   v0, v1, v2, v3, v4, v5, v6, v7

    TRANSFORM_TOTAL_16BITS              v0, v1, v2, v3, v4, v5, v6, v7

    MATRIX_TRANSFORM_EACH_16BITS_OUT4   v0, v1, v2, v3, v4, v5, v6, v7

    ROW_TRANSFORM_1_STEP_TOTAL_16BITS   v0, v1, v2, v3, v4, v5, v6, v7

    TRANSFORM_TOTAL_16BITS              v0, v1, v2, v3, v4, v5, v6, v7
    srshr   v0.8h, v0.8h, #6
    srshr   v1.8h, v1.8h, #6
    srshr   v2.8h, v2.8h, #6
    srshr   v3.8h, v3.8h, #6

    // after the rounding shift by 6, add the prediction and clip into [0, 255]
    uxtl    v4.8h, v16.8b
    add     v0.8h, v0.8h, v4.8h
    sqxtun  v0.8b, v0.8h
    st1     {v0.d}[0], [x0], x1

    uxtl2   v5.8h, v16.16b
    add     v1.8h, v1.8h, v5.8h
    sqxtun  v1.8b, v1.8h
    st1     {v1.d}[0], [x0], x1

    uxtl    v6.8h, v17.8b
    add     v2.8h, v2.8h, v6.8h
    sqxtun  v2.8b, v2.8h
    st1     {v2.d}[0], [x0], x1

    uxtl2   v7.8h, v17.16b
    add     v3.8h, v3.8h, v7.8h
    sqxtun  v3.8b, v3.8h
    st1     {v3.d}[0], [x0], x1
.endr
WELS_ASM_AARCH64_FUNC_END

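//void WelsHadamardT4Dc_AArch64_neon (int16_t* pLumaDc, int16_t* pDct);
// (assumed prototype: gathers the 16 luma DC coefficients, 32 bytes apart at
// x1, applies the 4x4 Hadamard transform with a rounding shift by 1, and
// stores the 16 results to x0)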
WELS_ASM_AARCH64_FUNC_BEGIN WelsHadamardT4Dc_AArch64_neon

    mov     x2, #32
    ld1     {v0.h}[0], [x1], x2
    ld1     {v1.h}[0], [x1], x2
    ld1     {v0.h}[1], [x1], x2
    ld1     {v1.h}[1], [x1], x2

    ld1     {v2.h}[0], [x1], x2
    ld1     {v3.h}[0], [x1], x2
    ld1     {v2.h}[1], [x1], x2
    ld1     {v3.h}[1], [x1], x2

    ld1     {v0.h}[2], [x1], x2
    ld1     {v1.h}[2], [x1], x2
    ld1     {v0.h}[3], [x1], x2
    ld1     {v1.h}[3], [x1], x2

    ld1     {v2.h}[2], [x1], x2
    ld1     {v3.h}[2], [x1], x2
    ld1     {v2.h}[3], [x1], x2
    ld1     {v3.h}[3], [x1], x2 // v0[0 4 08 12],v1[1 5 09 13],v2[2 6 10 14],v3[3 7 11 15]

    ROW_TRANSFORM_0_STEP    v0, v1, v3, v2, v4, v7, v6, v5
    TRANSFORM_4BYTES        v0, v1, v3, v2, v4, v7, v6, v5

    // transform element 32bits
    uzp1    v4.4s, v0.4s, v1.4s // 0 2 4 6
    uzp2    v5.4s, v0.4s, v1.4s // 1 3 5 7
    uzp1    v6.4s, v2.4s, v3.4s // 8 10 12 14
    uzp2    v7.4s, v2.4s, v3.4s // 9 11 13 15

    uzp1    v0.4s, v4.4s, v6.4s // 0 4  8 12
    uzp2    v2.4s, v4.4s, v6.4s // 2 6 10 14
    uzp1    v1.4s, v5.4s, v7.4s // 1 5  9 13
    uzp2    v3.4s, v5.4s, v7.4s // 3 7 11 15

    COL_TRANSFORM_0_STEP    v0, v1, v3, v2, v4, v7, v6, v5
    TRANSFORM_4BYTES        v0, v1, v3, v2, v4, v7, v6, v5
    sqrshrn   v4.4h, v0.4s, #1
    sqrshrn2  v4.8h, v1.4s, #1
    sqrshrn   v5.4h, v2.4s, #1
    sqrshrn2  v5.8h, v3.4s, #1
    st1       {v4.16b, v5.16b}, [x0]  // store
WELS_ASM_AARCH64_FUNC_END

//void WelsIDctRecI16x16Dc_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDctDc);
WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctRecI16x16Dc_AArch64_neon
    SIGN_EXTENSION x1, w1
    SIGN_EXTENSION x3, w3
    ld1       {v16.16b, v17.16b}, [x4]
    srshr     v16.8h, v16.8h, #6
    srshr     v17.8h, v17.8h, #6

    dup       v0.8h, v16.h[0]
    dup       v1.8h, v16.h[1]
    ins       v0.d[1], v1.d[0]
    dup       v1.8h, v16.h[2]
    dup       v2.8h, v16.h[3]
    ins       v1.d[1], v2.d[0]

.rept 4
    ld1       {v3.16b}, [x2], x3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   v3, v0, v1, v4, v5
    st1       {v3.16b}, [x0], x1
.endr

    dup       v0.8h, v16.h[4]
    dup       v1.8h, v16.h[5]
    ins       v0.d[1], v1.d[0]
    dup       v1.8h, v16.h[6]
    dup       v2.8h, v16.h[7]
    ins       v1.d[1], v2.d[0]

.rept 4
    ld1       {v3.16b}, [x2], x3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   v3, v0, v1, v4, v5
    st1       {v3.16b}, [x0], x1
.endr

    dup       v0.8h, v17.h[0]
    dup       v1.8h, v17.h[1]
    ins       v0.d[1], v1.d[0]
    dup       v1.8h, v17.h[2]
    dup       v2.8h, v17.h[3]
    ins       v1.d[1], v2.d[0]

.rept 4
    ld1       {v3.16b}, [x2], x3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   v3, v0, v1, v4, v5
    st1       {v3.16b}, [x0], x1
.endr

    dup       v0.8h, v17.h[4]
    dup       v1.8h, v17.h[5]
    ins       v0.d[1], v1.d[0]
    dup       v1.8h, v17.h[6]
    dup       v2.8h, v17.h[7]
    ins       v1.d[1], v2.d[0]

.rept 4
    ld1       {v3.16b}, [x2], x3
    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   v3, v0, v1, v4, v5
    st1       {v3.16b}, [x0], x1
.endr
WELS_ASM_AARCH64_FUNC_END
#endif