// AArch64 NEON deblocking-filter assembly (GNU as syntax).
1/*!
2 * \copy
3 *     Copyright (c)  2013, Cisco Systems
4 *     All rights reserved.
5 *
6 *     Redistribution and use in source and binary forms, with or without
7 *     modification, are permitted provided that the following conditions
8 *     are met:
9 *
10 *        * Redistributions of source code must retain the above copyright
11 *          notice, this list of conditions and the following disclaimer.
12 *
13 *        * Redistributions in binary form must reproduce the above copyright
14 *          notice, this list of conditions and the following disclaimer in
15 *          the documentation and/or other materials provided with the
16 *          distribution.
17 *
18 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 *     POSSIBILITY OF SUCH DAMAGE.
30 *
31 */
32
#ifdef HAVE_NEON_AARCH64

#include "arm_arch64_common_macro.S"
36
37.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
38    uabd    \arg6\().16b, \arg1\().16b, \arg2\().16b
39    cmhi    \arg6\().16b, \arg4\().16b, \arg6\().16b
40
41    uabd    \arg4\().16b, \arg0\().16b, \arg1\().16b
42    cmhi    \arg4\().16b, \arg5\().16b, \arg4\().16b
43    and     \arg6\().16b, \arg6\().16b, \arg4\().16b
44
45    uabd    \arg4\().16b, \arg3\().16b, \arg2\().16b
46    cmhi    \arg4\().16b, \arg5\().16b, \arg4\().16b
47    and     \arg6\().16b, \arg6\().16b, \arg4\().16b
48.endm
49
50.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
51    //v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
52    urhadd    \arg8\().16b, \arg2\().16b, \arg3\().16b
53    uhadd   \arg8\().16b, \arg0\().16b, \arg8\().16b
54    usubl   \arg9\().8h, \arg8\().8b, \arg1\().8b
55    sqxtn   \arg9\().8b, \arg9\().8h
56    usubl2  \arg8\().8h, \arg8\().16b, \arg1\().16b
57    sqxtn2  \arg9\().16b, \arg8\().8h
58    smax    \arg8\().16b, \arg9\().16b, \arg5\().16b
59    //
60    smin  \arg8\().16b, \arg8\().16b, \arg6\().16b
61    uabd  \arg9\().16b, \arg0\().16b, \arg2\().16b
62    cmhi  \arg9\().16b, \arg4\().16b, \arg9\().16b
63    and     \arg8\().16b, \arg8\().16b, \arg9\().16b
64    and     \arg8\().16b, \arg8\().16b, \arg7\().16b
65    add     \arg8\().16b, \arg1\().16b, \arg8\().16b
66    abs     \arg9\().16b, \arg9\().16b
67.endm
68
69.macro DIFF_LUMA_LT4_P0_Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6
70    usubl \arg5\().8h, \arg0\().8b, \arg3\().8b
71    usubl \arg6\().8h, \arg2\().8b, \arg1\().8b
72    shl     \arg6\().8h, \arg6\().8h, #2
73    add     \arg5\().8h, \arg5\().8h, \arg6\().8h
74    sqrshrn  \arg4\().8b, \arg5\().8h, #3
75.endm
76
77.macro DIFF_LUMA_LT4_P0_Q0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6
78    usubl2    \arg5\().8h, \arg0\().16b, \arg3\().16b
79    usubl2    \arg6\().8h, \arg2\().16b, \arg1\().16b
80    shl     \arg6\().8h, \arg6\().8h, #2
81    add     \arg5\().8h, \arg5\().8h, \arg6\().8h
82    sqrshrn2  \arg4\().16b, \arg5\().8h, #3
83.endm
84
85.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
86    cmge  \arg1\().16b, \arg0\().16b, #0
87    and     \arg1\().16b, \arg0\().16b, \arg1\().16b
88    sub     \arg0\().16b, \arg1\().16b, \arg0\().16b
89.endm
90
91.macro DIFF_LUMA_EQ4_P2P1P0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
92    uaddl \arg8\().8h, \arg1\().8b, \arg2\().8b
93    uaddl \arg9\().8h, \arg3\().8b, \arg4\().8b
94    add   \arg9\().8h, \arg9\().8h, \arg8\().8h
95
96    uaddl \arg8\().8h, \arg0\().8b, \arg1\().8b
97    shl   \arg8\().8h, \arg8\().8h, #1
98    add   \arg8\().8h, \arg9\().8h, \arg8\().8h
99
100    rshrn \arg0\().8b, \arg9\().8h, #2
101    rshrn \arg7\().8b, \arg8\().8h, #3
102    shl     \arg9\().8h, \arg9\().8h, #1
103    usubl   \arg8\().8h, \arg5\().8b, \arg1\().8b
104    add     \arg9\().8h, \arg8\().8h, \arg9\().8h
105
106    uaddl \arg8\().8h, \arg2\().8b, \arg5\().8b
107    uaddw \arg8\().8h, \arg8\().8h, \arg2\().8b
108    uaddw \arg8\().8h, \arg8\().8h, \arg3\().8b
109
110    rshrn \arg9\().8b, \arg9\().8h, #3
111    rshrn \arg8\().8b, \arg8\().8h, #2
112    bsl       \arg6\().8b, \arg9\().8b, \arg8\().8b
113.endm
114
115.macro DIFF_LUMA_EQ4_P2P1P0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
116    uaddl2 \arg8\().8h, \arg1\().16b, \arg2\().16b
117    uaddl2 \arg9\().8h, \arg3\().16b, \arg4\().16b
118    add   \arg9\().8h, \arg9\().8h, \arg8\().8h
119
120    uaddl2 \arg8\().8h, \arg0\().16b, \arg1\().16b
121    shl   \arg8\().8h, \arg8\().8h, #1
122    add   \arg8\().8h, \arg9\().8h, \arg8\().8h
123
124    rshrn2    \arg0\().16b, \arg9\().8h, #2
125    rshrn2    \arg7\().16b, \arg8\().8h, #3
126    shl     \arg9\().8h, \arg9\().8h, #1
127    usubl2   \arg8\().8h, \arg5\().16b, \arg1\().16b
128    add     \arg9\().8h, \arg8\().8h, \arg9\().8h
129
130    uaddl2    \arg8\().8h, \arg2\().16b, \arg5\().16b
131    uaddw2    \arg8\().8h, \arg8\().8h, \arg2\().16b
132    uaddw2    \arg8\().8h, \arg8\().8h, \arg3\().16b
133
134    rshrn2    \arg9\().16b, \arg9\().8h, #3
135    rshrn2    \arg8\().16b, \arg8\().8h, #2
136    bsl       \arg6\().16b, \arg9\().16b, \arg8\().16b
137.endm
138
139
140.macro DIFF_CHROMA_EQ4_P0Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
141    uaddl \arg4\().8h, \arg0\().8b, \arg3\().8b
142    shl   \arg4\().8h, \arg4\().8h, #1
143    usubl \arg5\().8h, \arg1\().8b, \arg3\().8b
144    add   \arg5\().8h, \arg5\().8h, \arg4\().8h
145    rshrn \arg6\().8b, \arg5\().8h, #2
146    usubl \arg5\().8h, \arg2\().8b, \arg0\().8b
147    add   \arg5\().8h, \arg5\().8h, \arg4\().8h
148    rshrn \arg7\().8b, \arg5\().8h, #2
149.endm
150
151.macro DIFF_CHROMA_EQ4_P0Q0_2  arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
152    uaddl2 \arg4\().8h, \arg0\().16b, \arg3\().16b
153    shl   \arg4\().8h, \arg4\().8h, #1
154    usubl2 \arg5\().8h, \arg1\().16b, \arg3\().16b
155    add   \arg5\().8h, \arg5\().8h, \arg4\().8h
156    rshrn2 \arg6\().16b, \arg5\().8h, #2
157    usubl2 \arg5\().8h, \arg2\().16b, \arg0\().16b
158    add   \arg5\().8h, \arg5\().8h, \arg4\().8h
159    rshrn2 \arg7\().16b, \arg5\().8h, #2
160.endm
161
162.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
163    mov   \arg3\().16b, \arg2\().16b
164    bsl   \arg3\().16b, \arg0\().16b, \arg1\().16b
165.endm
166
167.macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
168    ld3   {\arg0\().b, \arg1\().b, \arg2\().b} [\arg6], [x2], x1
169    ld3   {\arg3\().b, \arg4\().b, \arg5\().b} [\arg6], [x0], x1
170.endm
171
172.macro LOAD_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
173    ld4   {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg8], [x3], x1
174    ld4   {\arg4\().b, \arg5\().b, \arg6\().b, \arg7\().b} [\arg8], [x0], x1
175.endm
176
177.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
178    st4   {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg4], [x0], x1
179    st4   {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg5], [x2], x1
180.endm
181
182.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
183    st3   {\arg0\().b, \arg1\().b, \arg2\().b} [\arg6], [x3], x1
184    st3   {\arg3\().b, \arg4\().b, \arg5\().b} [\arg6], [x0], x1
185.endm
186
187.macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
188    ld4   {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg5], [\arg4], x2
189.endm
190
191.macro STORE_CHROMA_DATA_2 arg0, arg1, arg2, arg3
192    st2   {\arg0\().b, \arg1\().b} [\arg3], [\arg2], x2
193.endm
194
195.macro ZERO_JUMP_END arg0, arg1, arg2, arg3
196    mov \arg1, \arg0\().d[0]
197    mov \arg2, \arg0\().d[1]
198    orr \arg1, \arg1, \arg2
199    cbz \arg1, \arg3
200.endm
201
202.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
203    ld1 {v0.16b}, [\arg0]
204    //Arrange the input data --- TOP
205    ands     x6, \arg1, #2
206    cbz      x6, bs_nzc_check_jump0
207    sub      x6, \arg0, \arg2, lsl #4
208    sub      x6, x6, \arg2, lsl #3
209    add      x6, x6, #12
210    ld1      {v1.s} [3], [x6]
211
212bs_nzc_check_jump0:
213    ext      v1.16b, v1.16b, v0.16b, #12
214    add      \arg3\().16b, v0.16b, v1.16b
215
216    // Arrange the input data --- LEFT
217    ands     x6, \arg1, #1
218    cbz      x6, bs_nzc_check_jump1
219
220    sub      x6, \arg0, #21
221    add      x7, x6, #4
222    ld1      {v1.b} [12], [x6]
223    add      x6, x7, #4
224    ld1      {v1.b} [13], [x7]
225    add      x7, x6, #4
226    ld1      {v1.b} [14], [x6]
227    ld1      {v1.b} [15], [x7]
228
229bs_nzc_check_jump1:
230    ins      v2.d[0], v0.d[1]
231    zip1     v0.16b, v0.16b, v2.16b
232    ins      v2.d[0], v0.d[1]
233    zip1     v0.16b, v0.16b, v2.16b
234    ext      v1.16b, v1.16b, v0.16b, #12
235    add      \arg4\().16b, v0.16b, v1.16b
236.endm
237
238.macro BS_COMPARE_MV arg0, arg1, arg2, arg3, arg4, arg5
239    //in: \arg0,\arg1(const),\arg2(const),\arg3(const),\arg4(const); out:\arg5
240    mov   w6, #4
241    sabd  v20.8h, \arg0\().8h, \arg1\().8h
242    sabd  v21.8h, \arg1\().8h, \arg2\().8h
243    dup   \arg0\().8h, w6
244    sabd  v22.8h, \arg2\().8h, \arg3\().8h
245    sabd  v23.8h, \arg3\().8h, \arg4\().8h
246
247    cmge  v20.8h, v20.8h, \arg0\().8h
248    cmge  v21.8h, v21.8h, \arg0\().8h
249    cmge  v22.8h, v22.8h, \arg0\().8h
250    cmge  v23.8h, v23.8h, \arg0\().8h
251
252    addp v20.8h, v20.8h, v21.8h
253    addp v21.8h, v22.8h, v23.8h
254
255    addhn  \arg5\().8b, v20.8h, v20.8h
256    addhn2  \arg5\().16b, v21.8h, v21.8h
257.endm
258
259.macro BS_MV_CHECK arg0, arg1, arg2, arg3, arg4, arg5, arg6
260    ldp q0, q1, [\arg0], #32
261    ldp q2, q3, [\arg0]
262    sub \arg0, \arg0, #32
263    // Arrenge the input data --- TOP
264    ands     x6, \arg1, #2
265    cbz     x6, bs_mv_check_jump0
266    sub      x6, \arg0, \arg2, lsl #6
267    add      x6, x6, #48
268    ld1      {v4.16b}, [x6]
269bs_mv_check_jump0:
270    BS_COMPARE_MV  v4, v0, v1, v2, v3, \arg3
271    // Arrange the input data --- LEFT
272    ands     x6, \arg1, #1
273    cbz      x6, bs_mv_check_jump1
274    sub      x6, \arg0, #52
275    add      x7, x6, #16
276    ld1      {v4.s} [0], [x6]
277    add      x6, x7, #16
278    ld1      {v4.s} [1], [x7]
279    add      x7, x6, #16
280    ld1      {v4.s} [2], [x6]
281    ld1      {v4.s} [3], [x7]
282bs_mv_check_jump1:
283    zip1  \arg5\().4s, v0.4s, v2.4s
284    zip2  \arg6\().4s, v0.4s, v2.4s
285    zip1  v0.4s, v1.4s, v3.4s
286    zip2  v2.4s, v1.4s, v3.4s
287    zip2  v1.4s, \arg5\().4s, v0.4s
288    zip1  v0.4s, \arg5\().4s, v0.4s
289    zip2  v3.4s, \arg6\().4s, v2.4s
290    zip1  v2.4s, \arg6\().4s, v2.4s
291    BS_COMPARE_MV  v4, v0, v1, v2, v3, \arg4
292.endm
293
294WELS_ASM_AARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon
295    mov w1, #1
296    dup v3.8b, w1
297    ld1 {v0.8b, v1.8b, v2.8b}, [x0]
298    umin  v0.8b, v0.8b, v3.8b
299    umin  v1.8b, v1.8b, v3.8b
300    umin  v2.8b, v2.8b, v3.8b
301    st1 {v0.8b, v1.8b, v2.8b}, [x0]
302WELS_ASM_AARCH64_FUNC_END
303
304
305WELS_ASM_AARCH64_FUNC_BEGIN DeblockLumaLt4V_AArch64_neon //uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc
306    dup v16.16b, w2 //alpha
307    dup v17.16b, w3 //beta
308    SIGN_EXTENSION x1,w1
309    add x2, x1, x1, lsl #1
310    sub x2, x0, x2
311    movi v23.16b, #128
312    ld1 {v0.16b}, [x2], x1
313    ld1 {v1.16b}, [x2], x1
314    ld1 {v2.16b}, [x2]
315    ld1 {v3.16b}, [x0], x1
316    ld1 {v4.16b}, [x0], x1
317    ld1 {v5.16b}, [x0]
318    sub   x2, x2, x1
319    ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x4]
320    trn1 v18.2s, v18.2s, v19.2s
321    trn1 v20.2s, v20.2s, v21.2s
322    trn1 v6.2d, v18.2d, v20.2d // iTc0: 0000, 1111, 2222, 3333
323    cmge v7.16b, v6.16b, #0 // iTc0 Flag
324
325    MASK_MATRIX   v1, v2, v3, v4, v16, v17, v18
326    and   v7.16b, v7.16b, v18.16b // need filter flag
327
328    ZERO_JUMP_END v7, x3, x4, DeblockLumaLt4V_AArch64_neon_end
329
330    eor   v18.16b, v18.16b, v18.16b
331    sub v18.16b, v18.16b, v6.16b // -iTc0: 0000, 1111, 2222, 3333
332
333    DIFF_LUMA_LT4_P1_Q1   v0, v1, v2, v3, v17, v18, v6, v7, v19, v20
334    st1   {v19.16b}, [x2], x1
335
336    DIFF_LUMA_LT4_P1_Q1   v5, v4, v3, v2, v17, v18, v6, v7, v21, v22
337
338    abs   v20.16b, v20.16b
339    abs   v22.16b, v22.16b
340    add   v6.16b, v6.16b, v20.16b
341    add   v6.16b, v6.16b, v22.16b
342    eor   v18.16b, v18.16b, v18.16b
343    sub   v18.16b, v18.16b, v6.16b
344
345    DIFF_LUMA_LT4_P0_Q0_1 v1, v2, v3, v4, v19, v20, v22
346    DIFF_LUMA_LT4_P0_Q0_2 v1, v2, v3, v4, v19, v20, v22
347
348    smax  v19.16b, v19.16b, v18.16b
349    smin  v19.16b, v19.16b, v6.16b
350    and     v19.16b, v19.16b, v7.16b
351
352    EXTRACT_DELTA_INTO_TWO_PART   v19, v20
353    uqadd v2.16b, v2.16b, v20.16b
354    uqsub v2.16b, v2.16b, v19.16b
355    st1     {v2.16b}, [x2], x1
356    uqsub v3.16b, v3.16b, v20.16b
357    uqadd v3.16b, v3.16b, v19.16b
358    st1     {v3.16b}, [x2], x1
359    st1     {v21.16b}, [x2]
360DeblockLumaLt4V_AArch64_neon_end:
361WELS_ASM_AARCH64_FUNC_END
362
363
364WELS_ASM_AARCH64_FUNC_BEGIN DeblockLumaEq4V_AArch64_neon
365    dup     v16.16b, w2 //alpha
366    dup     v17.16b, w3 //beta
367    SIGN_EXTENSION x1,w1
368    sub     x3, x0, x1, lsl #2
369    ld1     {v0.16b}, [x3], x1
370    ld1     {v4.16b}, [x0], x1
371    ld1     {v1.16b}, [x3], x1
372    ld1     {v5.16b}, [x0], x1
373    ld1     {v2.16b}, [x3], x1
374    ld1     {v6.16b}, [x0], x1
375    ld1     {v3.16b}, [x3]
376    ld1     {v7.16b}, [x0]
377
378    sub     x3, x3, x1, lsl #1
379    MASK_MATRIX   v2, v3, v4, v5, v16, v17, v18
380    lsr       w2, w2, #2
381    add       w2, w2, #2
382    dup     v16.16b, w2 //((alpha >> 2) + 2)
383    uabd  v19.16b, v3.16b, v4.16b
384    cmhi  v20.16b, v16.16b, v19.16b //iDetaP0Q0 < ((iAlpha >> 2) + 2)
385
386    uabd  v21.16b, v1.16b, v3.16b
387    cmhi  v21.16b, v17.16b, v21.16b //bDetaP2P0
388    and     v21.16b, v21.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaP2P0
389
390    uabd  v22.16b, v6.16b, v4.16b
391    cmhi  v22.16b, v17.16b, v22.16b //bDetaQ2Q0
392    and     v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaQ2Q0
393    and     v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0&&(iDetaP0Q0 < ((iAlpha >> 2) + 2))
394
395    mov v23.16b, v21.16b
396    mov v24.16b, v21.16b
397
398    mov v25.16b, v0.16b
399    DIFF_LUMA_EQ4_P2P1P0_1        v0, v1, v2, v3, v4, v5, v23, v19, v17, v16
400    DIFF_LUMA_EQ4_P2P1P0_2        v25, v1, v2, v3, v4, v5, v24, v19, v17, v16
401    ins v0.d[1], v25.d[1]
402    ins v23.d[1], v24.d[1]
403    and   v21.16b, v20.16b, v21.16b
404    DIFF_LUMA_EQ4_MASK    v19, v1, v21, v17
405    st1   {v17.16b}, [x3], x1
406    DIFF_LUMA_EQ4_MASK    v0, v2, v21, v17
407    st1   {v17.16b}, [x3], x1
408    DIFF_LUMA_EQ4_MASK    v23, v3, v18, v17
409    st1   {v17.16b}, [x3], x1
410
411
412    mov v23.16b, v22.16b
413    mov v24.16b, v22.16b
414    mov v25.16b, v7.16b
415    DIFF_LUMA_EQ4_P2P1P0_1        v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
416    DIFF_LUMA_EQ4_P2P1P0_2        v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
417    ins v7.d[1], v25.d[1]
418    ins v23.d[1], v24.d[1]
419    and   v22.16b, v20.16b, v22.16b
420    DIFF_LUMA_EQ4_MASK    v23, v4, v18, v17
421    st1   {v17.16b}, [x3], x1
422    DIFF_LUMA_EQ4_MASK    v7, v5, v22, v17
423    st1   {v17.16b}, [x3], x1
424    DIFF_LUMA_EQ4_MASK    v19, v6, v22, v17
425    st1   {v17.16b}, [x3], x1
426DeblockLumaEq4V_AArch64_neon_end:
427WELS_ASM_AARCH64_FUNC_END
428
429
430WELS_ASM_AARCH64_FUNC_BEGIN DeblockLumaLt4H_AArch64_neon //uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc
431    dup v16.16b, w2 //alpha
432    dup v17.16b, w3 //beta
433    sub x2, x0, #3
434    movi v23.16b, #128
435    SIGN_EXTENSION x1,w1
436    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 0
437    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 1
438    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 2
439    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 3
440    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 4
441    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 5
442    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 6
443    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 7
444
445    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 8
446    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 9
447    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 10
448    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 11
449    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 12
450    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 13
451    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 14
452    LOAD_LUMA_DATA_3      v0, v1, v2, v3, v4, v5, 15
453
454    sub x0, x0, x1, lsl #4
455
456    ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x4]
457    trn1 v18.2s, v18.2s, v19.2s
458    trn1 v20.2s, v20.2s, v21.2s
459    trn1 v6.2d, v18.2d, v20.2d // iTc0: 0000, 1111, 2222, 3333
460    cmge v7.16b, v6.16b, #0 // iTc0 Flag
461
462    MASK_MATRIX   v1, v2, v3, v4, v16, v17, v18
463    and   v7.16b, v7.16b, v18.16b // need filter flag
464
465    ZERO_JUMP_END v7, x3, x4, DeblockLumaLt4H_AArch64_neon_end
466
467    eor   v18.16b, v18.16b, v18.16b
468    sub v18.16b, v18.16b, v6.16b // -iTc0: 0000, 1111, 2222, 3333
469
470    DIFF_LUMA_LT4_P1_Q1   v0, v1, v2, v3, v17, v18, v6, v7, v19, v20 //Use Tmp v23,v24
471    mov v25.16b, v19.16b
472
473    DIFF_LUMA_LT4_P1_Q1   v5, v4, v3, v2, v17, v18, v6, v7, v21, v22 //Use Tmp v23,v24
474
475    abs   v20.16b, v20.16b
476    abs   v22.16b, v22.16b
477    add   v6.16b, v6.16b, v20.16b
478    add   v6.16b, v6.16b, v22.16b
479    eor   v18.16b, v18.16b, v18.16b
480    sub   v18.16b, v18.16b, v6.16b
481
482    DIFF_LUMA_LT4_P0_Q0_1 v1, v2, v3, v4, v19, v20, v22
483    DIFF_LUMA_LT4_P0_Q0_2 v1, v2, v3, v4, v19, v20, v22
484
485    smax  v19.16b, v19.16b, v18.16b
486    smin  v19.16b, v19.16b, v6.16b
487    and     v19.16b, v19.16b, v7.16b
488
489    EXTRACT_DELTA_INTO_TWO_PART   v19, v20
490    uqadd v2.16b, v2.16b, v20.16b
491    uqsub v2.16b, v2.16b, v19.16b
492    mov v26.16b, v2.16b
493    uqsub v3.16b, v3.16b, v20.16b
494    uqadd v3.16b, v3.16b, v19.16b
495    mov v27.16b, v3.16b
496    mov v28.16b, v21.16b
497
498    sub   x0, x0, #2
499    add   x2, x0, x1
500    lsl   x1, x1, #1
501
502    STORE_LUMA_DATA_4     v25, v26, v27, v28, 0, 1
503    STORE_LUMA_DATA_4     v25, v26, v27, v28, 2, 3
504    STORE_LUMA_DATA_4     v25, v26, v27, v28, 4, 5
505    STORE_LUMA_DATA_4     v25, v26, v27, v28, 6, 7
506
507    STORE_LUMA_DATA_4     v25, v26, v27, v28, 8, 9
508    STORE_LUMA_DATA_4     v25, v26, v27, v28, 10, 11
509    STORE_LUMA_DATA_4     v25, v26, v27, v28, 12, 13
510    STORE_LUMA_DATA_4     v25, v26, v27, v28, 14, 15
511DeblockLumaLt4H_AArch64_neon_end:
512WELS_ASM_AARCH64_FUNC_END
513
514
515WELS_ASM_AARCH64_FUNC_BEGIN DeblockLumaEq4H_AArch64_neon
516    dup     v16.16b, w2 //alpha
517    dup     v17.16b, w3 //beta
518    sub     x3, x0, #4
519    SIGN_EXTENSION x1,w1
520    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 0
521    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 1
522    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 2
523    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 3
524    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 4
525    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 5
526    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 6
527    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 7
528
529    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 8
530    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 9
531    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 10
532    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 11
533    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 12
534    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 13
535    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 14
536    LOAD_LUMA_DATA_4      v0, v1, v2, v3, v4, v5, v6, v7, 15
537
538    sub x0, x0, x1, lsl #4
539    sub x3, x0, #3
540    MASK_MATRIX   v2, v3, v4, v5, v16, v17, v18
541
542    ZERO_JUMP_END v18, x4, x5, DeblockLumaEq4H_AArch64_neon_end
543
544    lsr       w2, w2, #2
545    add       w2, w2, #2
546    dup     v16.16b, w2 //((alpha >> 2) + 2)
547    uabd  v19.16b, v3.16b, v4.16b
548    cmhi  v20.16b, v16.16b, v19.16b //iDetaP0Q0 < ((iAlpha >> 2) + 2)
549
550    uabd  v21.16b, v1.16b, v3.16b
551    cmhi  v21.16b, v17.16b, v21.16b //bDetaP2P0
552    and     v21.16b, v21.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaP2P0
553
554    uabd  v22.16b, v6.16b, v4.16b
555    cmhi  v22.16b, v17.16b, v22.16b //bDetaQ2Q0
556    and     v22.16b, v22.16b, v20.16b //(iDetaP0Q0 < ((iAlpha >> 2) + 2))&&bDetaQ2Q0
557    and     v20.16b, v20.16b, v18.16b //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0&&(iDetaP0Q0 < ((iAlpha >> 2) + 2))
558
559    mov v23.16b, v21.16b
560    mov v24.16b, v21.16b
561
562    mov v25.16b, v0.16b
563    DIFF_LUMA_EQ4_P2P1P0_1        v0, v1, v2, v3, v4, v5, v23, v19, v17, v16
564    DIFF_LUMA_EQ4_P2P1P0_2        v25, v1, v2, v3, v4, v5, v24, v19, v17, v16
565    ins v0.d[1], v25.d[1]
566    ins v23.d[1], v24.d[1]
567    and   v21.16b, v20.16b, v21.16b
568    DIFF_LUMA_EQ4_MASK    v19, v1, v21, v17
569    mov v26.16b, v17.16b
570    DIFF_LUMA_EQ4_MASK    v0, v2, v21, v17
571    mov v27.16b, v17.16b
572    DIFF_LUMA_EQ4_MASK    v23, v3, v18, v17
573    mov v28.16b, v17.16b
574
575
576    mov v23.16b, v22.16b
577    mov v24.16b, v22.16b
578    mov v25.16b, v7.16b
579    DIFF_LUMA_EQ4_P2P1P0_1        v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
580    DIFF_LUMA_EQ4_P2P1P0_2        v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
581    ins v7.d[1], v25.d[1]
582    ins v23.d[1], v24.d[1]
583    and   v22.16b, v20.16b, v22.16b
584    DIFF_LUMA_EQ4_MASK    v23, v4, v18, v17
585    mov v29.16b, v17.16b
586    DIFF_LUMA_EQ4_MASK    v7, v5, v22, v17
587    mov v30.16b, v17.16b
588    DIFF_LUMA_EQ4_MASK    v19, v6, v22, v17
589    mov v31.16b, v17.16b
590
591    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 0
592    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 1
593    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 2
594    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 3
595    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 4
596    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 5
597    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 6
598    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 7
599    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 8
600    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 9
601    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 10
602    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 11
603    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 12
604    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 13
605    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 14
606    STORE_LUMA_DATA_3     v26, v27, v28, v29, v30, v31, 15
607DeblockLumaEq4H_AArch64_neon_end:
608WELS_ASM_AARCH64_FUNC_END
609
610
611WELS_ASM_AARCH64_FUNC_BEGIN DeblockChromaLt4V_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc
612    dup v16.16b, w3 //alpha
613    dup v17.16b, w4 //beta
614    lsl x3, x2, #1
615    sub x6, x0, x3 //pPixCb-2*Stride
616    sub x7, x1, x3 //pPixCr-2*Stride
617
618    ld1 {v0.d} [0], [x6], x2
619    ld1 {v1.d} [0], [x6]
620    ld1 {v2.d} [0], [x0], x2
621    ld1 {v3.d} [0], [x0]
622    ld1 {v0.d} [1], [x7], x2
623    ld1 {v1.d} [1], [x7]
624    ld1 {v2.d} [1], [x1], x2
625    ld1 {v3.d} [1], [x1]
626
627    ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x5]
628    trn1 v18.4h, v18.4h, v19.4h //0011,0011,
629    trn1 v20.4h, v20.4h, v21.4h //2233,2233
630    zip1 v6.4s, v18.4s, v20.4s //iTc0: 0011,2233,0011,2233
631    cmgt v7.16b, v6.16b, #0 // iTc0 Flag
632
633    MASK_MATRIX   v0, v1, v2, v3, v16, v17, v18
634    and   v7.16b, v7.16b, v18.16b // need filter flag
635
636    ZERO_JUMP_END v7, x4, x5, DeblockChromaLt4V_AArch64_neon_end
637
638    eor   v18.16b, v18.16b, v18.16b
639    sub v18.16b, v18.16b, v6.16b //-iTc0: 0011,2233,0011,2233
640
641    DIFF_LUMA_LT4_P0_Q0_1 v0, v1, v2, v3, v19, v20, v22
642    DIFF_LUMA_LT4_P0_Q0_2 v0, v1, v2, v3, v19, v20, v22
643
644    smax  v19.16b, v19.16b, v18.16b
645    smin  v19.16b, v19.16b, v6.16b
646    and     v19.16b, v19.16b, v7.16b
647
648    EXTRACT_DELTA_INTO_TWO_PART   v19, v20
649    uqadd v1.16b, v1.16b, v20.16b
650    uqsub v1.16b, v1.16b, v19.16b
651    st1     {v1.d} [0], [x6], x2
652    st1     {v1.d} [1], [x7], x2
653    uqsub v2.16b, v2.16b, v20.16b
654    uqadd v2.16b, v2.16b, v19.16b
655    st1     {v2.d} [0], [x6]
656    st1     {v2.d} [1], [x7]
657DeblockChromaLt4V_AArch64_neon_end:
658WELS_ASM_AARCH64_FUNC_END
659
660WELS_ASM_AARCH64_FUNC_BEGIN DeblockChromaLt4H_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc
661    dup v16.16b, w3 //alpha
662    dup v17.16b, w4 //beta
663    sub x6, x0, #2 //pPixCb-2
664    sub x7, x1, #2 //pPixCr-2
665
666    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 0
667    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 1
668    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 2
669    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 3
670    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 4
671    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 5
672    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 6
673    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 7
674
675    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 8
676    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 9
677    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 10
678    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 11
679    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 12
680    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 13
681    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 14
682    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 15
683
684    sub x0, x0, #1
685    sub x1, x1, #1
686
687    ld4r {v18.8b, v19.8b, v20.8b, v21.8b}, [x5]
688    trn1 v18.4h, v18.4h, v19.4h //0011,0011,
689    trn1 v20.4h, v20.4h, v21.4h //2233,2233
690    zip1 v6.4s, v18.4s, v20.4s //iTc0: 0011,2233,0011,2233
691    cmgt v7.16b, v6.16b, #0 // iTc0 Flag
692
693    MASK_MATRIX   v0, v1, v2, v3, v16, v17, v18
694    and   v7.16b, v7.16b, v18.16b // need filter flag
695
696    ZERO_JUMP_END v7, x4, x5, DeblockChromaLt4H_AArch64_neon_end
697    eor   v18.16b, v18.16b, v18.16b
698    sub v18.16b, v18.16b, v6.16b //-iTc0: 0011,2233,0011,2233
699
700    DIFF_LUMA_LT4_P0_Q0_1 v0, v1, v2, v3, v19, v20, v22
701    DIFF_LUMA_LT4_P0_Q0_2 v0, v1, v2, v3, v19, v20, v22
702
703    smax  v19.16b, v19.16b, v18.16b
704    smin  v19.16b, v19.16b, v6.16b
705    and     v19.16b, v19.16b, v7.16b
706
707    EXTRACT_DELTA_INTO_TWO_PART   v19, v20
708    uqadd v1.16b, v1.16b, v20.16b
709    uqsub v1.16b, v1.16b, v19.16b
710    uqsub v2.16b, v2.16b, v20.16b
711    uqadd v2.16b, v2.16b, v19.16b
712
713    STORE_CHROMA_DATA_2 v1, v2, x0, 0
714    STORE_CHROMA_DATA_2 v1, v2, x0, 1
715    STORE_CHROMA_DATA_2 v1, v2, x0, 2
716    STORE_CHROMA_DATA_2 v1, v2, x0, 3
717    STORE_CHROMA_DATA_2 v1, v2, x0, 4
718    STORE_CHROMA_DATA_2 v1, v2, x0, 5
719    STORE_CHROMA_DATA_2 v1, v2, x0, 6
720    STORE_CHROMA_DATA_2 v1, v2, x0, 7
721
722    STORE_CHROMA_DATA_2 v1, v2, x1, 8
723    STORE_CHROMA_DATA_2 v1, v2, x1, 9
724    STORE_CHROMA_DATA_2 v1, v2, x1, 10
725    STORE_CHROMA_DATA_2 v1, v2, x1, 11
726    STORE_CHROMA_DATA_2 v1, v2, x1, 12
727    STORE_CHROMA_DATA_2 v1, v2, x1, 13
728    STORE_CHROMA_DATA_2 v1, v2, x1, 14
729    STORE_CHROMA_DATA_2 v1, v2, x1, 15
730DeblockChromaLt4H_AArch64_neon_end:
731WELS_ASM_AARCH64_FUNC_END
732
733WELS_ASM_AARCH64_FUNC_BEGIN DeblockChromaEq4V_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta
734    dup v16.16b, w3 //alpha
735    dup v17.16b, w4 //beta
736    lsl x3, x2, #1
737    sub x6, x0, x3 //pPixCb-2*Stride
738    sub x7, x1, x3 //pPixCr-2*Stride
739
740    ld1 {v0.d} [0], [x6], x2
741    ld1 {v1.d} [0], [x6]
742    ld1 {v2.d} [0], [x0], x2
743    ld1 {v3.d} [0], [x0]
744    ld1 {v0.d} [1], [x7], x2
745    ld1 {v1.d} [1], [x7]
746    ld1 {v2.d} [1], [x1], x2
747    ld1 {v3.d} [1], [x1]
748
749    MASK_MATRIX   v0, v1, v2, v3, v16, v17, v7
750
751    ZERO_JUMP_END v7, x3, x4, DeblockChromaEq4V_AArch64_neon_end
752
753    DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21
754    DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21
755
756    mov v6.16b, v7.16b
757    bsl v6.16b, v20.16b, v1.16b
758    bsl v7.16b, v21.16b, v2.16b
759
760    st1     {v6.d} [0], [x6], x2
761    st1     {v6.d} [1], [x7], x2
762
763    st1     {v7.d} [0], [x6]
764    st1     {v7.d} [1], [x7]
765DeblockChromaEq4V_AArch64_neon_end:
766WELS_ASM_AARCH64_FUNC_END
767
768WELS_ASM_AARCH64_FUNC_BEGIN DeblockChromaEq4H_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta
769    dup v16.16b, w3 //alpha
770    dup v17.16b, w4 //beta
771
772    sub x6, x0, #2 //pPixCb-2
773    sub x7, x1, #2 //pPixCr-2
774
775    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 0
776    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 1
777    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 2
778    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 3
779    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 4
780    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 5
781    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 6
782    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x6, 7
783
784    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 8
785    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 9
786    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 10
787    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 11
788    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 12
789    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 13
790    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 14
791    LOAD_CHROMA_DATA_4        v0, v1, v2, v3, x7, 15
792    sub x0, x0, #1
793    sub x1, x1, #1
794
795    MASK_MATRIX   v0, v1, v2, v3, v16, v17, v7
796
797    ZERO_JUMP_END v7, x3, x4, DeblockChromaEq4H_AArch64_neon_end
798
799    DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21
800    DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21
801
802    mov v6.16b, v7.16b
803    bsl v6.16b, v20.16b, v1.16b
804    bsl v7.16b, v21.16b, v2.16b
805
806    STORE_CHROMA_DATA_2 v6, v7, x0, 0
807    STORE_CHROMA_DATA_2 v6, v7, x0, 1
808    STORE_CHROMA_DATA_2 v6, v7, x0, 2
809    STORE_CHROMA_DATA_2 v6, v7, x0, 3
810    STORE_CHROMA_DATA_2 v6, v7, x0, 4
811    STORE_CHROMA_DATA_2 v6, v7, x0, 5
812    STORE_CHROMA_DATA_2 v6, v7, x0, 6
813    STORE_CHROMA_DATA_2 v6, v7, x0, 7
814
815    STORE_CHROMA_DATA_2 v6, v7, x1, 8
816    STORE_CHROMA_DATA_2 v6, v7, x1, 9
817    STORE_CHROMA_DATA_2 v6, v7, x1, 10
818    STORE_CHROMA_DATA_2 v6, v7, x1, 11
819    STORE_CHROMA_DATA_2 v6, v7, x1, 12
820    STORE_CHROMA_DATA_2 v6, v7, x1, 13
821    STORE_CHROMA_DATA_2 v6, v7, x1, 14
822    STORE_CHROMA_DATA_2 v6, v7, x1, 15
823DeblockChromaEq4H_AArch64_neon_end:
824WELS_ASM_AARCH64_FUNC_END
825
826
827WELS_ASM_AARCH64_FUNC_BEGIN DeblockingBSCalcEnc_AArch64_neon
828    // Checking the nzc status
829    BS_NZC_CHECK x0, x2, x3, v16, v17 //v16,v17 save the nzc status
830    // For checking bS[I] = 2
831    movi     v0.16b, #0
832    cmgt     v16.16b, v16.16b, v0.16b
833    cmgt     v17.16b, v17.16b, v0.16b
834    movi     v0.16b, #2
835
836    and  v16.16b, v16.16b, v0.16b //v16 save the nzc check result all the time --- for dir is top
837    and  v17.16b, v17.16b, v0.16b //v17 save the nzc check result all the time --- for dir is left
838
839    // Checking the mv status
840    BS_MV_CHECK x1, x2, x3, v18, v19, v5 , v6 //v18, v19 save the mv status
841    // For checking bS[I] = 1
842    movi   v0.16b, #1
843    and  v18.16b, v18.16b, v0.16b //v18 save the nzc check result all the time --- for dir is top
844    and  v19.16b, v19.16b, v0.16b //v19 save the nzc check result all the time --- for dir is left
845    // Check bS[I] is '1' or '2'
846    umax v1.16b, v18.16b, v16.16b
847    umax v0.16b, v19.16b, v17.16b
848    st1 {v0.16b, v1.16b}, [x4]
849WELS_ASM_AARCH64_FUNC_END
850
851
#endif
853