• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*!
2 * \copy
3 *     Copyright (c)  2013, Cisco Systems
4 *     All rights reserved.
5 *
6 *     Redistribution and use in source and binary forms, with or without
7 *     modification, are permitted provided that the following conditions
8 *     are met:
9 *
10 *        * Redistributions of source code must retain the above copyright
11 *          notice, this list of conditions and the following disclaimer.
12 *
13 *        * Redistributions in binary form must reproduce the above copyright
14 *          notice, this list of conditions and the following disclaimer in
15 *          the documentation and/or other materials provided with the
16 *          distribution.
17 *
18 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 *     POSSIBILITY OF SUCH DAMAGE.
30 *
31 */
32
33#ifdef HAVE_NEON_AARCH64
34#include "arm_arch64_common_macro.S"
// Filter coefficient table: eight 16-bit lanes {0, 1, -5, 20, 0, 0, 0, 0}.
// NOTE(review): no reader of this table is visible in this part of the file; it is
// presumably the multiplier operand of the single-tag filter macros - confirm.
.align 4
filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
37
.macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// 6-tap filter over the low 8 bytes of every source register.
//   in      : arg0..arg5 = src[-2], src[-1], src[0], src[1], src[2], src[3]
//             arg7 = 16-bit multiplier applied to (src[0]+src[1])
//             arg8 = 16-bit multiplier applied to (src[-1]+src[2])
//   out     : arg6.8b = rounded (sum >> 5), saturated to unsigned 8 bit
//   clobbers: v18, v19 (do not pass them as operands)
    uaddl       v18.8h, \arg0\().8b, \arg5\().8b        // acc  = src[-2] + src[3]
    uaddl       v19.8h, \arg2\().8b, \arg3\().8b        // tmp  = src[0] + src[1]
    mla         v18.8h, v19.8h, \arg7\().8h             // acc += arg7 * tmp
    uaddl       v19.8h, \arg1\().8b, \arg4\().8b        // tmp  = src[-1] + src[2]
    mls         v18.8h, v19.8h, \arg8\().8h             // acc -= arg8 * tmp
    sqrshrun    \arg6\().8b, v18.8h, #5                 // narrow with rounding and saturation
.endm
48
.macro FILTER_6TAG_8BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// 6-tap filter over the high 8 bytes of every source register.
//   in      : arg0..arg5 = src[-2], src[-1], src[0], src[1], src[2], src[3]
//             arg7 = 16-bit multiplier applied to (src[0]+src[1])
//             arg8 = 16-bit multiplier applied to (src[-1]+src[2])
//   out     : upper 8 bytes of arg6 = rounded (sum >> 5), saturated to unsigned 8 bit
//   clobbers: v18, v19 (do not pass them as operands)
    uaddl2      v18.8h, \arg0\().16b, \arg5\().16b      // acc  = src[-2] + src[3]
    uaddl2      v19.8h, \arg2\().16b, \arg3\().16b      // tmp  = src[0] + src[1]
    mla         v18.8h, v19.8h, \arg7\().8h             // acc += arg7 * tmp
    uaddl2      v19.8h, \arg1\().16b, \arg4\().16b      // tmp  = src[-1] + src[2]
    mls         v18.8h, v19.8h, \arg8\().8h             // acc -= arg8 * tmp
    sqrshrun2   \arg6\().16b, v18.8h, #5                // narrow into the upper half of arg6
.endm
59
.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// 6-tap filter over the low 8 bytes, then rounded average of the result with src[0] (arg2).
//   in      : arg0..arg5 = src[-2], src[-1], src[0], src[1], src[2], src[3]
//             arg7 / arg8 = 16-bit multipliers for (src[0]+src[1]) / (src[-1]+src[2])
//   out     : arg6.8b
//   clobbers: v18, v19 (do not pass them as operands)
    uaddl       v18.8h, \arg0\().8b, \arg5\().8b        // acc  = src[-2] + src[3]
    uaddl       v19.8h, \arg2\().8b, \arg3\().8b        // tmp  = src[0] + src[1]
    mla         v18.8h, v19.8h, \arg7\().8h             // acc += arg7 * tmp
    uaddl       v19.8h, \arg1\().8b, \arg4\().8b        // tmp  = src[-1] + src[2]
    mls         v18.8h, v19.8h, \arg8\().8h             // acc -= arg8 * tmp
    sqrshrun    \arg6\().8b, v18.8h, #5                 // filtered value, 8 bit
    uaddl       v19.8h, \arg2\().8b, \arg6\().8b        // src[0] + filtered
    rshrn       \arg6\().8b, v19.8h, #1                 // (src[0] + filtered + 1) >> 1
.endm
72
.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// 6-tap filter over the high 8 bytes, then rounded average of the result with src[0] (arg2).
//   in      : arg0..arg5 = src[-2], src[-1], src[0], src[1], src[2], src[3]
//             arg7 / arg8 = 16-bit multipliers for (src[0]+src[1]) / (src[-1]+src[2])
//   out     : upper 8 bytes of arg6
//   clobbers: v18, v19 (do not pass them as operands)
    uaddl2      v18.8h, \arg0\().16b, \arg5\().16b      // acc  = src[-2] + src[3]
    uaddl2      v19.8h, \arg2\().16b, \arg3\().16b      // tmp  = src[0] + src[1]
    mla         v18.8h, v19.8h, \arg7\().8h             // acc += arg7 * tmp
    uaddl2      v19.8h, \arg1\().16b, \arg4\().16b      // tmp  = src[-1] + src[2]
    mls         v18.8h, v19.8h, \arg8\().8h             // acc -= arg8 * tmp
    sqrshrun2   \arg6\().16b, v18.8h, #5                // filtered value into the upper half
    uaddl2      v19.8h, \arg2\().16b, \arg6\().16b      // src[0] + filtered
    rshrn2      \arg6\().16b, v19.8h, #1                // (src[0] + filtered + 1) >> 1
.endm
85
.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// 6-tap filter over the low 8 bytes, then rounded average of the result with src[1] (arg3).
//   in      : arg0..arg5 = src[-2], src[-1], src[0], src[1], src[2], src[3]
//             arg7 / arg8 = 16-bit multipliers for (src[0]+src[1]) / (src[-1]+src[2])
//   out     : arg6.8b
//   clobbers: v18, v19 (do not pass them as operands)
    uaddl       v18.8h, \arg0\().8b, \arg5\().8b        // acc  = src[-2] + src[3]
    uaddl       v19.8h, \arg2\().8b, \arg3\().8b        // tmp  = src[0] + src[1]
    mla         v18.8h, v19.8h, \arg7\().8h             // acc += arg7 * tmp
    uaddl       v19.8h, \arg1\().8b, \arg4\().8b        // tmp  = src[-1] + src[2]
    mls         v18.8h, v19.8h, \arg8\().8h             // acc -= arg8 * tmp
    sqrshrun    \arg6\().8b, v18.8h, #5                 // filtered value, 8 bit
    uaddl       v19.8h, \arg3\().8b, \arg6\().8b        // src[1] + filtered
    rshrn       \arg6\().8b, v19.8h, #1                 // (src[1] + filtered + 1) >> 1
.endm
98
.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// 6-tap filter over the high 8 bytes, then rounded average of the result with src[1] (arg3).
//   in      : arg0..arg5 = src[-2], src[-1], src[0], src[1], src[2], src[3]
//             arg7 / arg8 = 16-bit multipliers for (src[0]+src[1]) / (src[-1]+src[2])
//   out     : upper 8 bytes of arg6
//   clobbers: v18, v19 (do not pass them as operands)
    uaddl2      v18.8h, \arg0\().16b, \arg5\().16b      // acc  = src[-2] + src[3]
    uaddl2      v19.8h, \arg2\().16b, \arg3\().16b      // tmp  = src[0] + src[1]
    mla         v18.8h, v19.8h, \arg7\().8h             // acc += arg7 * tmp
    uaddl2      v19.8h, \arg1\().16b, \arg4\().16b      // tmp  = src[-1] + src[2]
    mls         v18.8h, v19.8h, \arg8\().8h             // acc -= arg8 * tmp
    sqrshrun2   \arg6\().16b, v18.8h, #5                // filtered value into the upper half
    uaddl2      v19.8h, \arg3\().16b, \arg6\().16b      // src[1] + filtered
    rshrn2      \arg6\().16b, v19.8h, #1                // (src[1] + filtered + 1) >> 1
.endm
111
.macro FILTER_6TAG_8BITS_TO_16BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// 6-tap filter over the low 8 bytes; the 16-bit sums are kept (no shift, no narrowing).
//   in      : arg0..arg5 = src[-2], src[-1], src[0], src[1], src[2], src[3]
//             arg7 / arg8 = 16-bit multipliers for (src[0]+src[1]) / (src[-1]+src[2])
//   out     : arg6.8h
//   clobbers: v31 (do not pass it as an operand)
    uaddl       \arg6\().8h, \arg0\().8b, \arg5\().8b   // dst  = src[-2] + src[3]
    uaddl       v31.8h, \arg2\().8b, \arg3\().8b        // tmp  = src[0] + src[1]
    mla         \arg6\().8h, v31.8h, \arg7\().8h        // dst += arg7 * tmp
    uaddl       v31.8h, \arg1\().8b, \arg4\().8b        // tmp  = src[-1] + src[2]
    mls         \arg6\().8h, v31.8h, \arg8\().8h        // dst -= arg8 * tmp
.endm
121
.macro FILTER_6TAG_8BITS_TO_16BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// 6-tap filter over the high 8 bytes; the 16-bit sums are kept (no shift, no narrowing).
//   in      : arg0..arg5 = src[-2], src[-1], src[0], src[1], src[2], src[3]
//             arg7 / arg8 = 16-bit multipliers for (src[0]+src[1]) / (src[-1]+src[2])
//   out     : arg6.8h
//   clobbers: v31 (do not pass it as an operand)
    uaddl2      \arg6\().8h, \arg0\().16b, \arg5\().16b // dst  = src[-2] + src[3]
    uaddl2      v31.8h, \arg2\().16b, \arg3\().16b      // tmp  = src[0] + src[1]
    mla         \arg6\().8h, v31.8h, \arg7\().8h        // dst += arg7 * tmp
    uaddl2      v31.8h, \arg1\().16b, \arg4\().16b      // tmp  = src[-1] + src[2]
    mls         \arg6\().8h, v31.8h, \arg8\().8h        // dst -= arg8 * tmp
.endm
131
.macro FILTER_3_IN_16BITS_TO_8BITS1 arg0, arg1, arg2, arg3
// Combines three 16-bit vectors a (arg0), b (arg1), c (arg2) into 8-bit results in arg3.8b.
// The value (a - 5*b + 20*c)/16 is built from adds and two arithmetic shifts by 2
// (each shift truncates), then narrowed with rounding by a further 6 bits.
// arg0 is used as the accumulator and is overwritten.
    sub         \arg0\().8h, \arg0\().8h, \arg1\().8h   // t = a - b
    sshr        \arg0\().8h, \arg0\().8h, #2            // t = (a - b) >> 2
    sub         \arg0\().8h, \arg0\().8h, \arg1\().8h   // t = t - b
    add         \arg0\().8h, \arg0\().8h, \arg2\().8h   // t = t + c
    sshr        \arg0\().8h, \arg0\().8h, #2            // t = t >> 2
    add         \arg0\().8h, \arg0\().8h, \arg2\().8h   // t = t + c
    sqrshrun    \arg3\().8b, \arg0\().8h, #6            // (t + 32) >> 6, saturated to u8
.endm
143
.macro FILTER_3_IN_16BITS_TO_8BITS2 arg0, arg1, arg2, arg3
// Combines three 16-bit vectors a (arg0), b (arg1), c (arg2) into 8-bit results that are
// written to the upper 8 bytes of arg3.
// The value (a - 5*b + 20*c)/16 is built from adds and two arithmetic shifts by 2
// (each shift truncates), then narrowed with rounding by a further 6 bits.
// arg0 is used as the accumulator and is overwritten.
    sub         \arg0\().8h, \arg0\().8h, \arg1\().8h   // t = a - b
    sshr        \arg0\().8h, \arg0\().8h, #2            // t = (a - b) >> 2
    sub         \arg0\().8h, \arg0\().8h, \arg1\().8h   // t = t - b
    add         \arg0\().8h, \arg0\().8h, \arg2\().8h   // t = t + c
    sshr        \arg0\().8h, \arg0\().8h, #2            // t = t >> 2
    add         \arg0\().8h, \arg0\().8h, \arg2\().8h   // t = t + c
    sqrshrun2   \arg3\().16b, \arg0\().8h, #6           // (t + 32) >> 6, saturated to u8
.endm
155
.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
// Builds the three symmetric tap sums from a run of 16-bit samples.
//   in : arg0 = lanes src[-2..5], arg1 = the lanes that follow (src[6..])
//   out: arg2 = a = src[-2] + src[3]
//        arg3 = b = src[-1] + src[2]
//        arg4 = c = src[0]  + src[1]
// arg2 and arg3 double as temporaries, so all five operands must be distinct registers.
    ext         \arg4\().16b, \arg0\().16b, \arg1\().16b, #4    // src[0]
    ext         \arg3\().16b, \arg0\().16b, \arg1\().16b, #6    // src[1]
    add         \arg4\().8h, \arg4\().8h, \arg3\().8h           // c

    ext         \arg3\().16b, \arg0\().16b, \arg1\().16b, #2    // src[-1]
    ext         \arg2\().16b, \arg0\().16b, \arg1\().16b, #8    // src[2]
    add         \arg3\().8h, \arg3\().8h, \arg2\().8h           // b

    ext         \arg2\().16b, \arg0\().16b, \arg1\().16b, #10   // src[3]
    add         \arg2\().8h, \arg2\().8h, \arg0\().8h           // a
.endm
170
.macro AVERAGE_TWO_8BITS1 arg0, arg1, arg2
// Rounded average of the low 8 bytes of arg1 and arg2.
//   out     : arg0.8b = (arg1 + arg2 + 1) >> 1
//   clobbers: v30
    uaddl       v30.8h, \arg2\().8b, \arg1\().8b        // widen and add
    rshrn       \arg0\().8b, v30.8h, #1                 // halve with rounding, narrow
.endm
177
.macro AVERAGE_TWO_8BITS2 arg0, arg1, arg2
// Rounded average of the high 8 bytes of arg1 and arg2.
//   out     : upper 8 bytes of arg0 = (arg1 + arg2 + 1) >> 1
//   clobbers: v30
    uaddl2      v30.8h, \arg2\().16b, \arg1\().16b      // widen and add
    rshrn2      \arg0\().16b, v30.8h, #1                // halve with rounding, narrow
.endm
184
.macro FILTER_SINGLE_TAG_8BITS arg0, arg1, arg2, arg3
// Filters one extra output sample (original note: used when width is 17 or 9).
//   arg0 = vector whose low 8 bytes hold the six taps plus one pad byte on each side
//   arg1 = 16-bit coefficients (low four lanes are used)
//   arg2 = working vector, overwritten
//   arg3 = scalar destination of the lane reduction, written exactly as passed
// NOTE(review): the final narrowing reads arg0, so arg3 is presumably the h view of
// the arg0 register - confirm at the call sites.
    rev64       \arg2\().8b, \arg0\().8b                // mirror the 8 bytes
    uaddl       \arg2\().8h, \arg0\().8b, \arg2\().8b   // sums of mirrored byte pairs
    mul         \arg2\().4h, \arg2\().4h, \arg1\().4h   // weight the low four sums
    addv        \arg3, \arg2\().4h                      // reduce the four products
    sqrshrun    \arg0\().8b, \arg0\().8h, #5            // round, >>5, saturate to u8
.endm
195
.macro UNPACK_FILTER_SINGLE_TAG_16BITS arg0, arg1, arg2, arg3, arg4, arg5
// Filters one extra output sample from 16-bit intermediate values.
//   arg0 = destination vector (result ends up in its lowest byte lanes)
//   arg1 = source, six taps in 16-bit lanes [0..5]
//   arg2 = 16-bit coefficients (low four lanes are used)
//   arg3, arg4 = working vectors, overwritten
//   arg5 = scalar destination of the long reduction, written exactly as passed
// NOTE(review): the narrowing steps read arg0, so arg5 is presumably the d view of
// the arg0 register - confirm at the call sites.
    ext         \arg3\().16b, \arg1\().16b, \arg1\().16b, #14   // rotate up by one 16-bit lane
    ext         \arg4\().16b, \arg3\().16b, \arg3\().16b, #8    // swap the 64-bit halves
    rev64       \arg4\().8h, \arg4\().8h                        // reverse lanes inside each half
    add         \arg3\().8h, \arg3\().8h, \arg4\().8h           // sums of mirrored lanes
    smull       \arg3\().4s, \arg3\().4h, \arg2\().4h           // weight the low four sums
    saddlv      \arg5, \arg3\().4s                              // reduce the four products
    sqrshrun    \arg0\().2s, \arg0\().2d, #10                   // round, >>10, saturate
    uqxtn       \arg0\().4h, \arg0\().4s                        // narrow 32 -> 16
    uqxtn       \arg0\().8b, \arg0\().8h                        // narrow 16 -> 8
.endm
210
.macro VEC4_LD1_8BITS_16ELEMENT arg0, arg1, arg2, arg3, arg4, arg5
// Loads four rows of 16 bytes into arg2..arg5.
//   arg0 = address register, post-incremented by arg1 after every row
//   arg1 = row stride register
    ld1         {\arg2\().16b}, [\arg0], \arg1          // row 0
    ld1         {\arg3\().16b}, [\arg0], \arg1          // row 1
    ld1         {\arg4\().16b}, [\arg0], \arg1          // row 2
    ld1         {\arg5\().16b}, [\arg0], \arg1          // row 3
.endm
219
.macro VEC4_ST1_8BITS_8ELEMENT arg0, arg1, arg2, arg3, arg4, arg5
// Stores the low 8 bytes of arg2..arg5 as four rows.
//   arg0 = address register, post-incremented by arg1 after every row
//   arg1 = row stride register
    st1         {\arg2\().8b}, [\arg0], \arg1           // row 0
    st1         {\arg3\().8b}, [\arg0], \arg1           // row 1
    st1         {\arg4\().8b}, [\arg0], \arg1           // row 2
    st1         {\arg5\().8b}, [\arg0], \arg1           // row 3
.endm
228
.macro VEC4_UADDL_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11
// Four widening unsigned adds on the low 8 bytes, executed in this order:
//   arg8  = arg0 + arg1      arg9  = arg2 + arg3
//   arg10 = arg4 + arg5      arg11 = arg6 + arg7
    uaddl       \arg8\().8h,  \arg0\().8b, \arg1\().8b
    uaddl       \arg9\().8h,  \arg2\().8b, \arg3\().8b
    uaddl       \arg10\().8h, \arg4\().8b, \arg5\().8b
    uaddl       \arg11\().8h, \arg6\().8b, \arg7\().8b
.endm
237
.macro VEC4_UADDL2_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11
// Four widening unsigned adds on the high 8 bytes, executed in this order:
//   arg8  = arg0 + arg1      arg9  = arg2 + arg3
//   arg10 = arg4 + arg5      arg11 = arg6 + arg7
// The upper-half form uaddl2 is required here: plain uaddl only accepts .8b sources,
// so the former "uaddl ... .16b" spelling could not be assembled once expanded.
    uaddl2      \arg8\().8h,  \arg0\().16b, \arg1\().16b
    uaddl2      \arg9\().8h,  \arg2\().16b, \arg3\().16b
    uaddl2      \arg10\().8h, \arg4\().16b, \arg5\().16b
    uaddl2      \arg11\().8h, \arg6\().16b, \arg7\().16b
.endm
246
.macro VEC4_MLS_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11
// Four 16-bit multiply-subtracts, executed in this order:
//   arg8  -= arg0 * arg1     arg9  -= arg2 * arg3
//   arg10 -= arg4 * arg5     arg11 -= arg6 * arg7
    mls         \arg8\().8h,  \arg0\().8h, \arg1\().8h
    mls         \arg9\().8h,  \arg2\().8h, \arg3\().8h
    mls         \arg10\().8h, \arg4\().8h, \arg5\().8h
    mls         \arg11\().8h, \arg6\().8h, \arg7\().8h
.endm
255
.macro VEC4_MLA_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11
// Four 16-bit multiply-accumulates, executed in this order:
//   arg8  += arg0 * arg1     arg9  += arg2 * arg3
//   arg10 += arg4 * arg5     arg11 += arg6 * arg7
    mla         \arg8\().8h,  \arg0\().8h, \arg1\().8h
    mla         \arg9\().8h,  \arg2\().8h, \arg3\().8h
    mla         \arg10\().8h, \arg4\().8h, \arg5\().8h
    mla         \arg11\().8h, \arg6\().8h, \arg7\().8h
.endm
264
.macro VEC4_SQRSHRUN_16BITS_SHIFT5 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// Narrows four 16-bit vectors (arg0..arg3) to 8 bits with rounding, a right shift by 5
// and unsigned saturation; results go to the low 8 bytes of arg4..arg7.
    sqrshrun    \arg4\().8b, \arg0\().8h, #5
    sqrshrun    \arg5\().8b, \arg1\().8h, #5
    sqrshrun    \arg6\().8b, \arg2\().8h, #5
    sqrshrun    \arg7\().8b, \arg3\().8h, #5
.endm
273
.macro VEC4_SQRSHRUN2_16BITS_SHIFT5 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// Narrows four 16-bit vectors (arg0..arg3) to 8 bits with rounding, a right shift by 5
// and unsigned saturation; results go to the upper 8 bytes of arg4..arg7.
    sqrshrun2   \arg4\().16b, \arg0\().8h, #5
    sqrshrun2   \arg5\().16b, \arg1\().8h, #5
    sqrshrun2   \arg6\().16b, \arg2\().8h, #5
    sqrshrun2   \arg7\().16b, \arg3\().8h, #5
.endm
282
.macro VEC4_RSHRN_16BITS_SHIFT1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// Halves four 16-bit vectors (arg0..arg3) with rounding and narrows them to 8 bits;
// results go to the low 8 bytes of arg4..arg7.
    rshrn       \arg4\().8b, \arg0\().8h, #1
    rshrn       \arg5\().8b, \arg1\().8h, #1
    rshrn       \arg6\().8b, \arg2\().8h, #1
    rshrn       \arg7\().8b, \arg3\().8h, #1
.endm
291
// void McHorVer20WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
//                                        int32_t iDstStride, int32_t iHeight);
// Horizontal 6-tap filter, 16 output bytes per row, one row per loop pass.
//   in      : x0 = pSrc, w1 = iSrcStride, x2 = pDst, w3 = iDstStride, w4 = iHeight
//   clobbers: x0-x4, v0-v7, v16-v20
// The row counter is decremented before it is tested, so iHeight == 0 is not handled.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20WidthEq16_AArch64_neon
    SIGN_EXTENSION x1, w1
    SIGN_EXTENSION x3, w3
    SIGN_EXTENSION x4, w4
    movi    v0.8h, #20                          // weight of src[0] + src[1]
    movi    v1.8h, #5                           // weight of src[-1] + src[2]
    sub     x0, x0, #2                          // x0 -> src[-2]
w16_h_mc_luma_loop:
    ld1     {v2.8b, v3.8b, v4.8b}, [x0], x1     // 24 bytes loaded, 21 (16+5) are used
    sub     x4, x4, #1                          // one more row consumed
    trn1    v2.2d, v2.2d, v3.2d                 // v2 = 16 bytes starting at src[-2]
    ext     v5.16b,  v2.16b, v4.16b, #1         // src[-1]
    ext     v6.16b,  v2.16b, v4.16b, #2         // src[0]
    ext     v7.16b,  v2.16b, v4.16b, #3         // src[1]
    ext     v16.16b, v2.16b, v4.16b, #4         // src[2]
    ext     v17.16b, v2.16b, v4.16b, #5         // src[3]

    FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1   // output bytes 0..7
    FILTER_6TAG_8BITS2 v2, v5, v6, v7, v16, v17, v20, v0, v1   // output bytes 8..15

    st1     {v20.16b}, [x2], x3                 // store 16 bytes, advance to next dst row
    cbnz    x4, w16_h_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END
317
// void McHorVer20WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
//                                       int32_t iDstStride, int32_t iHeight);
// Horizontal 6-tap filter, 8 output bytes per row, four rows per loop pass.
//   in      : x0 = pSrc, w1 = iSrcStride, x2 = pDst, w3 = iDstStride, w4 = iHeight
//   clobbers: x0-x4, v0-v7, v16-v31; d8/d9 are saved on the stack and restored
// The row counter drops by 4 per pass and the loop exits only when it reaches zero,
// so iHeight has to be a non-zero multiple of 4.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20WidthEq8_AArch64_neon
    stp     d8, d9, [sp, #-16]!                 // low halves of v8/v9 are callee-saved
    SIGN_EXTENSION x1, w1
    SIGN_EXTENSION x3, w3
    SIGN_EXTENSION x4, w4
    sub     x0, x0, #2                          // x0 -> src[-2]
    movi    v8.8h, #20                          // weight of src[0] + src[1]
    movi    v9.8h, #5                           // weight of src[-1] + src[2]
w8_h_mc_luma_loop:
    // v16/v20/v24/v28 = rows 0..3 starting at src[-2]; 13 (8+5) bytes of each are used
    VEC4_LD1_8BITS_16ELEMENT x0, x1, v16, v20, v24, v28
    sub     x4, x4, #4

    // src[3] of rows 0..3
    ext     v17.16b, v16.16b, v16.16b, #5
    ext     v21.16b, v20.16b, v20.16b, #5
    ext     v25.16b, v24.16b, v24.16b, #5
    ext     v29.16b, v28.16b, v28.16b, #5
    // src[-1] of rows 0..3
    ext     v18.16b, v16.16b, v16.16b, #1
    ext     v22.16b, v20.16b, v20.16b, #1
    ext     v26.16b, v24.16b, v24.16b, #1
    ext     v30.16b, v28.16b, v28.16b, #1
    // src[2] of rows 0..3
    ext     v19.16b, v16.16b, v16.16b, #4
    ext     v23.16b, v20.16b, v20.16b, #4
    ext     v27.16b, v24.16b, v24.16b, #4
    ext     v31.16b, v28.16b, v28.16b, #4

    // v0/v2/v4/v6 = src[-2] + src[3]
    VEC4_UADDL_8BITS v16, v17, v20, v21, v24, v25, v28, v29, v0, v2, v4, v6
    // v1/v3/v5/v7 = src[-1] + src[2]
    VEC4_UADDL_8BITS v18, v19, v22, v23, v26, v27, v30, v31, v1, v3, v5, v7
    // v0/v2/v4/v6 -= 5 * (src[-1] + src[2])
    VEC4_MLS_16BITS v1, v9, v3, v9, v5, v9, v7, v9, v0, v2, v4, v6

    // src[0] of rows 0..3 (v18/v22/v26/v30 are recycled)
    ext     v18.16b, v16.16b, v16.16b, #2
    ext     v22.16b, v20.16b, v20.16b, #2
    ext     v26.16b, v24.16b, v24.16b, #2
    ext     v30.16b, v28.16b, v28.16b, #2
    // src[1] of rows 0..3 (v19/v23/v27/v31 are recycled)
    ext     v19.16b, v16.16b, v16.16b, #3
    ext     v23.16b, v20.16b, v20.16b, #3
    ext     v27.16b, v24.16b, v24.16b, #3
    ext     v31.16b, v28.16b, v28.16b, #3

    // v1/v3/v5/v7 = src[0] + src[1]
    VEC4_UADDL_8BITS v18, v19, v22, v23, v26, v27, v30, v31, v1, v3, v5, v7
    // v0/v2/v4/v6 += 20 * (src[0] + src[1])
    VEC4_MLA_16BITS v1, v8, v3, v8, v5, v8, v7, v8, v0, v2, v4, v6

    // round, >>5, saturate: results in v1/v3/v5/v7
    VEC4_SQRSHRUN_16BITS_SHIFT5 v0, v2, v4, v6, v1, v3, v5, v7

    VEC4_ST1_8BITS_8ELEMENT x2, x3, v1, v3, v5, v7
    cbnz    x4, w8_h_mc_luma_loop

    ldp     d8, d9, [sp], #16
WELS_ASM_AARCH64_FUNC_END
375
// void McHorVer20WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
//                                       int32_t iDstStride, int32_t iHeight);
// Horizontal 6-tap filter, 4 output bytes per row. Two rows are packed side by side
// into one 8-byte vector, so every loop pass produces two rows.
//   in      : x0 = pSrc, w1 = iSrcStride, x2 = pDst, w3 = iDstStride, w4 = iHeight
//   clobbers: x0-x4, v0-v7, v16-v20
// The pass counter is iHeight >> 1 and is decremented before it is tested, so
// iHeight must be at least 2; an odd trailing row is not processed.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20WidthEq4_AArch64_neon
    SIGN_EXTENSION x1, w1
    SIGN_EXTENSION x3, w3
    SIGN_EXTENSION x4, w4
    movi    v0.8h, #20                          // weight of src[0] + src[1]
    movi    v1.8h, #5                           // weight of src[-1] + src[2]
    sub     x0, x0, #2                          // x0 -> src[-2]
    asr     x4, x4, #1                          // number of two-row passes
w4_h_mc_luma_loop:
    ld1     {v2.16b}, [x0], x1                  // row A from src[-2]; 9 (4+5) bytes are used
    ld1     {v3.16b}, [x0], x1                  // row B from src[-2]; 9 (4+5) bytes are used

    zip1    v4.4s, v2.4s, v3.4s                 // v4  = src[-2..1] as A:B (then src[2..5] as A:B)

    ext     v2.16b, v2.16b, v4.16b, #1          // row A advanced to src[-1]
    ext     v3.16b, v3.16b, v4.16b, #1          // row B advanced to src[-1]
    zip1    v5.4s, v2.4s, v3.4s                 // v5  = src[-1..2] as A:B

    ext     v2.16b, v2.16b, v4.16b, #1          // row A advanced to src[0]
    ext     v3.16b, v3.16b, v4.16b, #1          // row B advanced to src[0]
    zip1    v6.4s, v2.4s, v3.4s                 // v6  = src[0..3] as A:B

    ext     v2.16b, v2.16b, v4.16b, #1          // row A advanced to src[1]
    ext     v3.16b, v3.16b, v4.16b, #1          // row B advanced to src[1]
    zip1    v16.4s, v2.4s, v3.4s                // v16 = src[1..4] as A:B

    ext     v17.16b, v4.16b, v4.16b, #8         // v17 = src[2..5] as A:B
    ext     v7.16b, v5.16b, v4.16b, #8          // v7  = src[3..6] as A:B

    FILTER_6TAG_8BITS1 v4, v5, v6, v16, v17, v7, v20, v0, v1

    sub     x4, x4, #1
    st1     {v20.s}[0], [x2], x3                // 4 bytes of row A
    st1     {v20.s}[1], [x2], x3                // 4 bytes of row B
    cbnz    x4, w4_h_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END
415
// void McHorVer10WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
//                                        int32_t iDstStride, int32_t iHeight);
// Horizontal 6-tap filter whose result is averaged (rounding up) with src[0];
// 16 output bytes per row, one row per loop pass.
//   in      : x0 = pSrc, w1 = iSrcStride, x2 = pDst, w3 = iDstStride, w4 = iHeight
//   clobbers: x0-x4, v0-v7, v16-v20
// The row counter is decremented before it is tested, so iHeight == 0 is not handled.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer10WidthEq16_AArch64_neon
    SIGN_EXTENSION x1, w1
    SIGN_EXTENSION x3, w3
    SIGN_EXTENSION x4, w4
    movi    v0.8h, #20                          // weight of src[0] + src[1]
    movi    v1.8h, #5                           // weight of src[-1] + src[2]
    sub     x0, x0, #2                          // x0 -> src[-2]
w16_xy_10_mc_luma_loop:
    ld1     {v2.8b, v3.8b, v4.8b}, [x0], x1     // 24 bytes loaded, 21 (16+5) are used
    sub     x4, x4, #1                          // one more row consumed
    trn1    v2.2d, v2.2d, v3.2d                 // v2 = 16 bytes starting at src[-2]
    ext     v5.16b,  v2.16b, v4.16b, #1         // src[-1]
    ext     v6.16b,  v2.16b, v4.16b, #2         // src[0]
    ext     v7.16b,  v2.16b, v4.16b, #3         // src[1]
    ext     v16.16b, v2.16b, v4.16b, #4         // src[2]
    ext     v17.16b, v2.16b, v4.16b, #5         // src[3]

    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v5, v6, v7, v16, v17, v20, v0, v1   // bytes 0..7
    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v2, v5, v6, v7, v16, v17, v20, v0, v1   // bytes 8..15

    st1     {v20.16b}, [x2], x3                 // store 16 bytes, advance to next dst row
    cbnz    x4, w16_xy_10_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END
442
// void McHorVer10WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
//                                       int32_t iDstStride, int32_t iHeight);
// Horizontal 6-tap filter whose result is averaged (rounding up) with src[0];
// 8 output bytes per row, four rows per loop pass.
//   in      : x0 = pSrc, w1 = iSrcStride, x2 = pDst, w3 = iDstStride, w4 = iHeight
//   clobbers: x0-x4, v0-v7, v16-v31; d8/d9 are saved on the stack and restored
// The row counter drops by 4 per pass and the loop exits only when it reaches zero,
// so iHeight has to be a non-zero multiple of 4.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer10WidthEq8_AArch64_neon
    stp     d8, d9, [sp, #-16]!                 // low halves of v8/v9 are callee-saved
    SIGN_EXTENSION x1, w1
    SIGN_EXTENSION x3, w3
    SIGN_EXTENSION x4, w4
    sub     x0, x0, #2                          // x0 -> src[-2]
    movi    v8.8h, #20                          // weight of src[0] + src[1]
    movi    v9.8h, #5                           // weight of src[-1] + src[2]
w8_xy_10_mc_luma_loop:
    // v16/v20/v24/v28 = rows 0..3 starting at src[-2]; 13 (8+5) bytes of each are used
    VEC4_LD1_8BITS_16ELEMENT x0, x1, v16, v20, v24, v28
    sub     x4, x4, #4

    // src[3] of rows 0..3
    ext     v17.16b, v16.16b, v16.16b, #5
    ext     v21.16b, v20.16b, v20.16b, #5
    ext     v25.16b, v24.16b, v24.16b, #5
    ext     v29.16b, v28.16b, v28.16b, #5
    // src[-1] of rows 0..3
    ext     v18.16b, v16.16b, v16.16b, #1
    ext     v22.16b, v20.16b, v20.16b, #1
    ext     v26.16b, v24.16b, v24.16b, #1
    ext     v30.16b, v28.16b, v28.16b, #1
    // src[2] of rows 0..3
    ext     v19.16b, v16.16b, v16.16b, #4
    ext     v23.16b, v20.16b, v20.16b, #4
    ext     v27.16b, v24.16b, v24.16b, #4
    ext     v31.16b, v28.16b, v28.16b, #4

    // v0/v2/v4/v6 = src[-2] + src[3]
    VEC4_UADDL_8BITS v16, v17, v20, v21, v24, v25, v28, v29, v0, v2, v4, v6
    // v1/v3/v5/v7 = src[-1] + src[2]
    VEC4_UADDL_8BITS v18, v19, v22, v23, v26, v27, v30, v31, v1, v3, v5, v7
    // v0/v2/v4/v6 -= 5 * (src[-1] + src[2])
    VEC4_MLS_16BITS v1, v9, v3, v9, v5, v9, v7, v9, v0, v2, v4, v6

    // src[0] of rows 0..3 (v18/v22/v26/v30 are recycled)
    ext     v18.16b, v16.16b, v16.16b, #2
    ext     v22.16b, v20.16b, v20.16b, #2
    ext     v26.16b, v24.16b, v24.16b, #2
    ext     v30.16b, v28.16b, v28.16b, #2
    // src[1] of rows 0..3 (v19/v23/v27/v31 are recycled)
    ext     v19.16b, v16.16b, v16.16b, #3
    ext     v23.16b, v20.16b, v20.16b, #3
    ext     v27.16b, v24.16b, v24.16b, #3
    ext     v31.16b, v28.16b, v28.16b, #3

    // v1/v3/v5/v7 = src[0] + src[1]
    VEC4_UADDL_8BITS v18, v19, v22, v23, v26, v27, v30, v31, v1, v3, v5, v7
    // v0/v2/v4/v6 += 20 * (src[0] + src[1])
    VEC4_MLA_16BITS v1, v8, v3, v8, v5, v8, v7, v8, v0, v2, v4, v6
    // round, >>5, saturate: filtered bytes in v1/v3/v5/v7
    VEC4_SQRSHRUN_16BITS_SHIFT5 v0, v2, v4, v6, v1, v3, v5, v7

    // average the filtered bytes with src[0] (v18/v22/v26/v30)
    VEC4_UADDL_8BITS v1, v18, v3, v22, v5, v26, v7, v30, v0, v2, v4, v6
    VEC4_RSHRN_16BITS_SHIFT1 v0, v2, v4, v6, v1, v3, v5, v7

    VEC4_ST1_8BITS_8ELEMENT x2, x3, v1, v3, v5, v7
    cbnz    x4, w8_xy_10_mc_luma_loop

    ldp     d8, d9, [sp], #16
WELS_ASM_AARCH64_FUNC_END
503
// void McHorVer10WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
//                                       int32_t iDstStride, int32_t iHeight);
// Horizontal 6-tap filter whose result is averaged (rounding up) with src[0];
// 4 output bytes per row. Two rows are packed side by side into one 8-byte vector,
// so every loop pass produces two rows.
//   in      : x0 = pSrc, w1 = iSrcStride, x2 = pDst, w3 = iDstStride, w4 = iHeight
//   clobbers: x0-x4, v0-v7, v16-v20
// The pass counter is iHeight >> 1 and is decremented before it is tested, so
// iHeight must be at least 2; an odd trailing row is not processed.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer10WidthEq4_AArch64_neon
    SIGN_EXTENSION x1, w1
    SIGN_EXTENSION x3, w3
    SIGN_EXTENSION x4, w4
    movi    v0.8h, #20                          // weight of src[0] + src[1]
    movi    v1.8h, #5                           // weight of src[-1] + src[2]
    sub     x0, x0, #2                          // x0 -> src[-2]
    asr     x4, x4, #1                          // number of two-row passes
w4_xy_10_mc_luma_loop:
    ld1     {v2.16b}, [x0], x1                  // row A from src[-2]; 9 (4+5) bytes are used
    ld1     {v3.16b}, [x0], x1                  // row B from src[-2]; 9 (4+5) bytes are used

    zip1    v4.4s, v2.4s, v3.4s                 // v4  = src[-2..1] as A:B (then src[2..5] as A:B)

    ext     v2.16b, v2.16b, v4.16b, #1          // row A advanced to src[-1]
    ext     v3.16b, v3.16b, v4.16b, #1          // row B advanced to src[-1]
    zip1    v5.4s, v2.4s, v3.4s                 // v5  = src[-1..2] as A:B

    ext     v2.16b, v2.16b, v4.16b, #1          // row A advanced to src[0]
    ext     v3.16b, v3.16b, v4.16b, #1          // row B advanced to src[0]
    zip1    v6.4s, v2.4s, v3.4s                 // v6  = src[0..3] as A:B

    ext     v2.16b, v2.16b, v4.16b, #1          // row A advanced to src[1]
    ext     v3.16b, v3.16b, v4.16b, #1          // row B advanced to src[1]
    zip1    v16.4s, v2.4s, v3.4s                // v16 = src[1..4] as A:B

    ext     v17.16b, v4.16b, v4.16b, #8         // v17 = src[2..5] as A:B
    ext     v7.16b, v5.16b, v4.16b, #8          // v7  = src[3..6] as A:B

    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v4, v5, v6, v16, v17, v7, v20, v0, v1

    sub     x4, x4, #1
    st1     {v20.s}[0], [x2], x3                // 4 bytes of row A
    st1     {v20.s}[1], [x2], x3                // 4 bytes of row B
    cbnz    x4, w4_xy_10_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END
543
// void McHorVer30WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
//                                        int32_t iDstStride, int32_t iHeight);
// Horizontal 6-tap filter whose result is averaged (rounding up) with src[1];
// 16 output bytes per row, one row per loop pass.
//   in      : x0 = pSrc, w1 = iSrcStride, x2 = pDst, w3 = iDstStride, w4 = iHeight
//   clobbers: x0-x4, v0-v7, v16-v20
// The row counter is decremented before it is tested, so iHeight == 0 is not handled.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer30WidthEq16_AArch64_neon
    SIGN_EXTENSION x1, w1
    SIGN_EXTENSION x3, w3
    SIGN_EXTENSION x4, w4
    movi    v0.8h, #20                          // weight of src[0] + src[1]
    movi    v1.8h, #5                           // weight of src[-1] + src[2]
    sub     x0, x0, #2                          // x0 -> src[-2]
w16_xy_30_mc_luma_loop:
    ld1     {v2.8b, v3.8b, v4.8b}, [x0], x1     // 24 bytes loaded, 21 (16+5) are used
    sub     x4, x4, #1                          // one more row consumed
    trn1    v2.2d, v2.2d, v3.2d                 // v2 = 16 bytes starting at src[-2]
    ext     v5.16b,  v2.16b, v4.16b, #1         // src[-1]
    ext     v6.16b,  v2.16b, v4.16b, #2         // src[0]
    ext     v7.16b,  v2.16b, v4.16b, #3         // src[1]
    ext     v16.16b, v2.16b, v4.16b, #4         // src[2]
    ext     v17.16b, v2.16b, v4.16b, #5         // src[3]

    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v5, v6, v7, v16, v17, v20, v0, v1   // bytes 0..7
    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v2, v5, v6, v7, v16, v17, v20, v0, v1   // bytes 8..15

    st1     {v20.16b}, [x2], x3                 // store 16 bytes, advance to next dst row
    cbnz    x4, w16_xy_30_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END
570
// void McHorVer30WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
//                                       int32_t iDstStride, int32_t iHeight);
// Horizontal 6-tap filter whose result is averaged (rounding up) with src[1];
// 8 output bytes per row, four rows per loop pass.
//   in      : x0 = pSrc, w1 = iSrcStride, x2 = pDst, w3 = iDstStride, w4 = iHeight
//   clobbers: x0-x4, v0-v7, v16-v31; d8/d9 are saved on the stack and restored
// The row counter drops by 4 per pass and the loop exits only when it reaches zero,
// so iHeight has to be a non-zero multiple of 4.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer30WidthEq8_AArch64_neon
    stp     d8, d9, [sp, #-16]!                 // low halves of v8/v9 are callee-saved
    SIGN_EXTENSION x1, w1
    SIGN_EXTENSION x3, w3
    SIGN_EXTENSION x4, w4
    sub     x0, x0, #2                          // x0 -> src[-2]
    movi    v8.8h, #20                          // weight of src[0] + src[1]
    movi    v9.8h, #5                           // weight of src[-1] + src[2]
w8_xy_30_mc_luma_loop:
    // v16/v20/v24/v28 = rows 0..3 starting at src[-2]; 13 (8+5) bytes of each are used
    VEC4_LD1_8BITS_16ELEMENT x0, x1, v16, v20, v24, v28
    sub     x4, x4, #4

    // src[3] of rows 0..3
    ext     v17.16b, v16.16b, v16.16b, #5
    ext     v21.16b, v20.16b, v20.16b, #5
    ext     v25.16b, v24.16b, v24.16b, #5
    ext     v29.16b, v28.16b, v28.16b, #5
    // src[-1] of rows 0..3
    ext     v18.16b, v16.16b, v16.16b, #1
    ext     v22.16b, v20.16b, v20.16b, #1
    ext     v26.16b, v24.16b, v24.16b, #1
    ext     v30.16b, v28.16b, v28.16b, #1
    // src[2] of rows 0..3
    ext     v19.16b, v16.16b, v16.16b, #4
    ext     v23.16b, v20.16b, v20.16b, #4
    ext     v27.16b, v24.16b, v24.16b, #4
    ext     v31.16b, v28.16b, v28.16b, #4

    // v0/v2/v4/v6 = src[-2] + src[3]
    VEC4_UADDL_8BITS v16, v17, v20, v21, v24, v25, v28, v29, v0, v2, v4, v6
    // v1/v3/v5/v7 = src[-1] + src[2]
    VEC4_UADDL_8BITS v18, v19, v22, v23, v26, v27, v30, v31, v1, v3, v5, v7
    // v0/v2/v4/v6 -= 5 * (src[-1] + src[2])
    VEC4_MLS_16BITS v1, v9, v3, v9, v5, v9, v7, v9, v0, v2, v4, v6

    // src[0] of rows 0..3 (v18/v22/v26/v30 are recycled)
    ext     v18.16b, v16.16b, v16.16b, #2
    ext     v22.16b, v20.16b, v20.16b, #2
    ext     v26.16b, v24.16b, v24.16b, #2
    ext     v30.16b, v28.16b, v28.16b, #2
    // src[1] of rows 0..3 (v19/v23/v27/v31 are recycled)
    ext     v19.16b, v16.16b, v16.16b, #3
    ext     v23.16b, v20.16b, v20.16b, #3
    ext     v27.16b, v24.16b, v24.16b, #3
    ext     v31.16b, v28.16b, v28.16b, #3

    // v1/v3/v5/v7 = src[0] + src[1]
    VEC4_UADDL_8BITS v18, v19, v22, v23, v26, v27, v30, v31, v1, v3, v5, v7
    // v0/v2/v4/v6 += 20 * (src[0] + src[1])
    VEC4_MLA_16BITS v1, v8, v3, v8, v5, v8, v7, v8, v0, v2, v4, v6
    // round, >>5, saturate: filtered bytes in v1/v3/v5/v7
    VEC4_SQRSHRUN_16BITS_SHIFT5 v0, v2, v4, v6, v1, v3, v5, v7

    // average the filtered bytes with src[1] (v19/v23/v27/v31)
    VEC4_UADDL_8BITS v1, v19, v3, v23, v5, v27, v7, v31, v0, v2, v4, v6
    VEC4_RSHRN_16BITS_SHIFT1 v0, v2, v4, v6, v1, v3, v5, v7

    VEC4_ST1_8BITS_8ELEMENT x2, x3, v1, v3, v5, v7
    cbnz    x4, w8_xy_30_mc_luma_loop

    ldp     d8, d9, [sp], #16
WELS_ASM_AARCH64_FUNC_END
631
// void McHorVer30WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
//                                       int32_t iDstStride, int32_t iHeight);
// Horizontal 6-tap filter whose result is averaged (rounding up) with src[1];
// 4 output bytes per row. Two rows are packed side by side into one 8-byte vector,
// so every loop pass produces two rows.
//   in      : x0 = pSrc, w1 = iSrcStride, x2 = pDst, w3 = iDstStride, w4 = iHeight
//   clobbers: x0-x4, v0-v7, v16-v20
// The pass counter is iHeight >> 1 and is decremented before it is tested, so
// iHeight must be at least 2; an odd trailing row is not processed.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer30WidthEq4_AArch64_neon
    SIGN_EXTENSION x1, w1
    SIGN_EXTENSION x3, w3
    SIGN_EXTENSION x4, w4
    movi    v0.8h, #20                          // weight of src[0] + src[1]
    movi    v1.8h, #5                           // weight of src[-1] + src[2]
    sub     x0, x0, #2                          // x0 -> src[-2]
    asr     x4, x4, #1                          // number of two-row passes
w4_xy_30_mc_luma_loop:
    ld1     {v2.16b}, [x0], x1                  // row A from src[-2]; 9 (4+5) bytes are used
    ld1     {v3.16b}, [x0], x1                  // row B from src[-2]; 9 (4+5) bytes are used

    zip1    v4.4s, v2.4s, v3.4s                 // v4  = src[-2..1] as A:B (then src[2..5] as A:B)

    ext     v2.16b, v2.16b, v4.16b, #1          // row A advanced to src[-1]
    ext     v3.16b, v3.16b, v4.16b, #1          // row B advanced to src[-1]
    zip1    v5.4s, v2.4s, v3.4s                 // v5  = src[-1..2] as A:B

    ext     v2.16b, v2.16b, v4.16b, #1          // row A advanced to src[0]
    ext     v3.16b, v3.16b, v4.16b, #1          // row B advanced to src[0]
    zip1    v6.4s, v2.4s, v3.4s                 // v6  = src[0..3] as A:B

    ext     v2.16b, v2.16b, v4.16b, #1          // row A advanced to src[1]
    ext     v3.16b, v3.16b, v4.16b, #1          // row B advanced to src[1]
    zip1    v16.4s, v2.4s, v3.4s                // v16 = src[1..4] as A:B

    ext     v17.16b, v4.16b, v4.16b, #8         // v17 = src[2..5] as A:B
    ext     v7.16b, v5.16b, v4.16b, #8          // v7  = src[3..6] as A:B

    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v4, v5, v6, v16, v17, v7, v20, v0, v1

    sub     x4, x4, #1
    st1     {v20.s}[0], [x2], x3                // 4 bytes of row A
    st1     {v20.s}[1], [x2], x3                // 4 bytes of row B
    cbnz    x4, w4_xy_30_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END
671
// void McHorVer01WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst,
//                                        int32_t iDstStride, int32_t iHeight);
// Vertical 6-tap filter whose result is averaged (rounding up) with the row at
// offset 0; 16 output bytes per row. A six-row window is kept in v2..v7 and the
// register roles rotate from row to row, so each output row needs one new load.
//   in      : x0 = pSrc, w1 = iSrcStride, x2 = pDst, w3 = iDstStride, w4 = iHeight
//   clobbers: x0-x4, v0-v7, v18-v20
// Eight rows are produced per loop pass and the loop exits only when the counter
// reaches zero, so iHeight has to be a non-zero multiple of 8.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer01WidthEq16_AArch64_neon
    SIGN_EXTENSION x1, w1
    SIGN_EXTENSION x3, w3
    SIGN_EXTENSION x4, w4
    movi    v0.8h, #20                          // weight of row[0] + row[1]
    movi    v1.8h, #5                           // weight of row[-1] + row[2]
    sub     x0, x0, x1, lsl #1                  // x0 -> row -2

    // Prime the window with rows -2 .. +2 of the first output row.
    ld1     {v2.16b}, [x0], x1                  // row -2
    ld1     {v3.16b}, [x0], x1                  // row -1
    ld1     {v4.16b}, [x0], x1                  // row  0
    ld1     {v5.16b}, [x0], x1                  // row +1
    ld1     {v6.16b}, [x0], x1                  // row +2

w16_xy_01_mc_luma_loop:
    // Every load below fetches the row three strides below the output row it feeds.
    ld1     {v7.16b}, [x0], x1
    sub     x4, x4, #8                          // eight rows are produced per pass
    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
    st1     {v20.16b}, [x2], x3                 // output row 0

    ld1     {v2.16b}, [x0], x1
    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
    st1     {v20.16b}, [x2], x3                 // output row 1

    ld1     {v3.16b}, [x0], x1
    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v4, v5, v6, v7, v2, v3, v20, v0, v1
    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v4, v5, v6, v7, v2, v3, v20, v0, v1
    st1     {v20.16b}, [x2], x3                 // output row 2

    ld1     {v4.16b}, [x0], x1
    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v5, v6, v7, v2, v3, v4, v20, v0, v1
    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v5, v6, v7, v2, v3, v4, v20, v0, v1
    st1     {v20.16b}, [x2], x3                 // output row 3

    ld1     {v5.16b}, [x0], x1
    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v6, v7, v2, v3, v4, v5, v20, v0, v1
    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v6, v7, v2, v3, v4, v5, v20, v0, v1
    st1     {v20.16b}, [x2], x3                 // output row 4

    ld1     {v6.16b}, [x0], x1
    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v7, v2, v3, v4, v5, v6, v20, v0, v1
    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v7, v2, v3, v4, v5, v6, v20, v0, v1
    st1     {v20.16b}, [x2], x3                 // output row 5

    ld1     {v7.16b}, [x0], x1
    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
    st1     {v20.16b}, [x2], x3                 // output row 6

    ld1     {v2.16b}, [x0], x1
    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
    st1     {v20.16b}, [x2], x3                 // output row 7

    // Re-seat the five newest rows into v2..v6 so the next pass starts from the
    // same register layout as the first one. The order of these moves matters:
    // each source must be read before it is overwritten.
    mov     v3.16b, v5.16b
    mov     v5.16b, v7.16b
    mov     v7.16b, v2.16b
    mov     v2.16b, v4.16b
    mov     v4.16b, v6.16b
    mov     v6.16b, v7.16b
    cbnz    x4, w16_xy_01_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END
758
759//void McHorVer01WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
760//                                      int32_t iHeight);
// 8-byte-wide variant: filters down the rows and produces 4 output rows per
// loop pass.
//   x0 = pSrc (advanced by x1 per row load), x2 = pDst, x3 = iDstStride,
//   x4 = rows remaining, v30 = 20 and v31 = 5 in every 16-bit lane.
//   v16..v20 = rows -2..+2 relative to output row 0 of the pass,
//   v21..v24 = the four rows loaded at the top of each pass (+3..+6).
// SIGN_EXTENSION and the VEC4_* macros are defined outside this chunk; each
// VEC4_* call handles four rows at once, and the per-line comments describe
// the intended arithmetic. NOTE(review): rounding/saturation details of the
// SQRSHRUN/RSHRN helpers are inferred from their names - confirm.
// The loop leaves only when x4 reaches exactly 0 after subtracting 4, so
// iHeight must be a positive multiple of 4.
761WELS_ASM_AARCH64_FUNC_BEGIN McHorVer01WidthEq8_AArch64_neon
762    SIGN_EXTENSION x1,w1
763    SIGN_EXTENSION x3,w3
764    SIGN_EXTENSION x4,w4
765    sub x0, x0, x1, lsl #1 // x0 -= 2*x1: start two rows above the first output row
766    movi v30.8h, #20, lsl #0 // multiplier 20
767    movi v31.8h, #5, lsl #0 // multiplier 5
768
769    ld1 {v16.8b}, [x0], x1 // v16=src[-2*stride]
770    ld1 {v17.8b}, [x0], x1 // v17=src[-1*stride]
771    ld1 {v18.8b}, [x0], x1 // v18=src[0*stride]
772    ld1 {v19.8b}, [x0], x1 // v19=src[1*stride]
773    ld1 {v20.8b}, [x0], x1 // v20=src[2*stride]
774
775w8_xy_01_mc_luma_loop:
776    ld1 {v21.8b}, [x0], x1 // v21=src[3*stride]
777    ld1 {v22.8b}, [x0], x1 // v22=src[4*stride]
778    ld1 {v23.8b}, [x0], x1 // v23=src[5*stride]
779    ld1 {v24.8b}, [x0], x1 // v24=src[6*stride]
780
781    VEC4_UADDL_8BITS v16, v21, v17, v22, v18, v23, v19, v24, v0, v2, v4, v6 //v0/v2/v4/v6 =src[-2]+src[3]
782    VEC4_UADDL_8BITS v17, v20, v18, v21, v19, v22, v20, v23, v1, v3, v5, v7 //v1/v3/v5/v7 =src[-1]+src[2]
783    VEC4_MLS_16BITS v1, v31, v3, v31, v5, v31, v7, v31, v0, v2, v4, v6  //v0/v2/v4/v6 -=5*(src[-1]+src[2])
784    VEC4_UADDL_8BITS v18, v19, v19, v20, v20, v21, v21, v22, v1, v3, v5, v7 //v1/v3/v5/v7 =src[0]+src[1]
785    VEC4_MLA_16BITS v1, v30, v3, v30, v5, v30, v7, v30, v0, v2, v4, v6  //v0/v2/v4/v6 += 20*(src[0]+src[1])
786    VEC4_SQRSHRUN_16BITS_SHIFT5 v0, v2, v4, v6, v1, v3, v5, v7 // narrow the four sums into v1/v3/v5/v7
787
    // v18/v19/v20/v21 are the src[0] rows of output rows 0/1/2/3.
788    VEC4_UADDL_8BITS v1, v18, v3, v19, v5, v20, v7, v21, v0, v2, v4, v6 //v0/v2/v4/v6 = average with src[0]
789    VEC4_RSHRN_16BITS_SHIFT1 v0, v2, v4, v6, v1, v3, v5, v7
790
791    VEC4_ST1_8BITS_8ELEMENT x2, x3, v1, v3, v5, v7  //store 8bytes*4row
792
793    sub x4, x4, #4 // four rows done
    // Slide the window down four rows: rows +2..+6 become rows -2..+2.
794    mov v16.16b, v20.16b
795    mov v17.16b, v21.16b
796    mov v18.16b, v22.16b
797    mov v19.16b, v23.16b
798    mov v20.16b, v24.16b
799
800    cbnz x4, w8_xy_01_mc_luma_loop
801WELS_ASM_AARCH64_FUNC_END
802
803//void McHorVer01WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
804//                                      int32_t iHeight);
// 4-byte-wide variant: filters down the rows and produces 4 output rows per
// loop pass, two rows per filter call.
//   x0 = pSrc (advanced by x1 per row load), x2 = pDst, x3 = iDstStride,
//   x4 = rows remaining, v0 = 20 and v1 = 5 in every 16-bit lane.
// Each of v2..v7 packs two consecutive 4-byte rows: lane s[0] holds row k and
// lane s[1] holds row k+1. One filter call over the low 8 bytes therefore
// yields two output rows, stored from v20.s[0] and v20.s[1].
// SIGN_EXTENSION and FILTER_6TAG_8BITS1_AVERAGE_WITH_0 are defined outside
// this chunk. NOTE(review): by name the macro presumably 6-tap filters its
// six row arguments and averages with the src[0] argument (third) - confirm.
// The loop leaves only when x4 reaches exactly 0 after subtracting 4, so
// iHeight must be a positive multiple of 4.
805WELS_ASM_AARCH64_FUNC_BEGIN McHorVer01WidthEq4_AArch64_neon
806    SIGN_EXTENSION x1,w1
807    SIGN_EXTENSION x3,w3
808    SIGN_EXTENSION x4,w4
809    sub x0, x0, x1, lsl #1 // x0 -= 2*x1: start two rows above the first output row
810    movi v0.8h, #20, lsl #0 // multiplier 20
811    movi v1.8h, #5, lsl #0 // multiplier 5
812
813    //prfm pldl1strm, [x0]
814    //prfm pldl1strm, [x0, x1]
815    ld1 {v2.s}[0], [x0], x1 // v2=src[-2*stride]
816    //prfm pldl1strm, [x0, x1]
817    ld1 {v3.s}[0], [x0], x1 // v3=src[-1*stride]
818    mov v2.s[1], v3.s[0] // v2 = {row -2, row -1}
819    //prfm pldl1strm, [x0, x1]
820    ld1 {v4.s}[0], [x0], x1 // v4=src[0*stride]
821    mov v3.s[1], v4.s[0] // v3 = {row -1, row 0}
822    //prfm pldl1strm, [x0, x1]
823    ld1 {v5.s}[0], [x0], x1 // v5=src[1*stride]
824    mov v4.s[1], v5.s[0] // v4 = {row 0, row 1}
825    //prfm pldl1strm, [x0, x1]
826    ld1 {v6.s}[0], [x0], x1 // v6=src[2*stride]
827    mov v5.s[1], v6.s[0] // v5 = {row 1, row 2}
828
829w4_xy_01_mc_luma_loop:
830    //prfm pldl1strm, [x0, x1]
831    ld1 {v7.s}[0], [x0], x1 // v7=src[3*stride]
832    mov v6.s[1], v7.s[0] // v6 = {row 2, row 3}
833    //prfm pldl1strm, [x0, x1]
834    ld1 {v7.s}[1], [x0], x1 // v7=src[4*stride]
835    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
836    st1 {v20.s}[0], [x2], x3 //write 4Byte : 0 line
837    st1 {v20.s}[1], [x2], x3 //write 4Byte : 1 line
838    mov v2.s[0], v7.s[1] // v2.s[0] = row 4; v2 is reused for the next row pair
839
840    //prfm pldl1strm, [x0, x1]
841    ld1 {v2.s}[1], [x0], x1 // v2=src[5*stride]
842    //prfm pldl1strm, [x0, x1]
843    ld1 {v3.s}[1], [x0], x1 // v3.s[1]=src[6*stride]
844    mov v3.s[0], v2.s[1] // v3 = {row 5, row 6}
845    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v4, v5, v6, v7, v2, v3, v20, v0, v1
846    st1 {v20.s}[0], [x2], x3 //write 4Byte : 2 line
847    st1 {v20.s}[1], [x2], x3 //write 4Byte : 3 line
848    mov v4.s[0], v3.s[1] // keep row 6 in v4.s[0]; it becomes v6.s[0] below
849
    // Rotate the window down four rows (v21 is scratch). Afterwards
    // v2={2,3}, v3={3,4}, v4={4,5}, v5={5,6} and v6.s[0]=row 6, i.e. rows
    // -2..+2 of the next pass; v6.s[1] and v7 are reloaded at the loop top.
850    mov v21.8b, v6.8b
851    mov v6.8b, v4.8b
852    mov v4.8b, v2.8b
853    mov v2.8b, v21.8b
854    mov v21.8b, v3.8b
855    mov v3.8b, v7.8b
856    mov v7.8b, v5.8b
857    mov v5.8b, v21.8b
858
859    sub x4, x4, #4 // four rows done
860    cbnz x4, w4_xy_01_mc_luma_loop
861WELS_ASM_AARCH64_FUNC_END
862
863//void McHorVer03WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
864//                                       int32_t iHeight);
// Filters down the rows of a 16-byte-wide block, producing 8 output rows per
// loop pass.
//   x0 = pSrc (advanced by x1 per row load), x2 = pDst, x3 = iDstStride,
//   x4 = rows remaining, v0 = 20 and v1 = 5 in every 16-bit lane,
//   v2..v7 = sliding window of six consecutive source rows, v20 = output row.
// SIGN_EXTENSION and the FILTER_6TAG_*_AVERAGE_WITH_1 macros are defined
// outside this chunk. Each call passes six consecutive rows (oldest first),
// then the destination v20, then the two multipliers. NOTE(review): by name,
// the 8BITS1/8BITS2 pair presumably handles the low/high 8 bytes and averages
// the filtered value with the src[1] row (fourth argument) - confirm against
// the macro definitions.
// The loop leaves only when x4 reaches exactly 0 after subtracting 8, so
// iHeight must be a positive multiple of 8.
865WELS_ASM_AARCH64_FUNC_BEGIN McHorVer03WidthEq16_AArch64_neon
866    SIGN_EXTENSION x1,w1
867    SIGN_EXTENSION x3,w3
868    SIGN_EXTENSION x4,w4
869    sub x0, x0, x1, lsl #1 // x0 -= 2*x1: start two rows above the first output row
870    movi v0.8h, #20, lsl #0 // multiplier 20
871    movi v1.8h, #5, lsl #0 // multiplier 5
872
873    //prfm pldl1strm, [x0]
874    //prfm pldl1strm, [x0, x1]
875    ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride]
876    //prfm pldl1strm, [x0, x1]
877    ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride]
878    //prfm pldl1strm, [x0, x1]
879    ld1 {v4.16b}, [x0], x1 // v4=src[0*stride]
880    //prfm pldl1strm, [x0, x1]
881    ld1 {v5.16b}, [x0], x1 // v5=src[1*stride]
882    //prfm pldl1strm, [x0, x1]
883    ld1 {v6.16b}, [x0], x1 // v6=src[2*stride]
884
885
886w16_xy_03_mc_luma_loop:
    // On entry v2..v6 hold rows -2..+2 relative to the next output row.
    // Each step loads one new row into the register holding the oldest row,
    // so the six-row window rotates through v2..v7 without copies.
887    //prfm pldl1strm, [x0, x1]
888    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
889    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1
890    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1
891    st1 {v20.16b}, [x2], x3 //write 16Byte : 0 line
892
893
894    //prfm pldl1strm, [x0, x1]
895    ld1 {v2.16b}, [x0], x1 // v2=src[3*stride] relative to output line 1
896    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v3, v4, v5, v6, v7, v2, v20, v0, v1
897    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v3, v4, v5, v6, v7, v2, v20, v0, v1
898    st1 {v20.16b}, [x2], x3 //write 16Byte : 1 line
899
900
901    //prfm pldl1strm, [x0, x1]
902    ld1 {v3.16b}, [x0], x1 // v3=src[3*stride] relative to output line 2
903    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v4, v5, v6, v7, v2, v3, v20, v0, v1
904    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v4, v5, v6, v7, v2, v3, v20, v0, v1
905    st1 {v20.16b}, [x2], x3 //write 16Byte : 2 line
906
907
908    //prfm pldl1strm, [x0, x1]
909    ld1 {v4.16b}, [x0], x1 // v4=src[3*stride] relative to output line 3
910    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v5, v6, v7, v2, v3, v4, v20, v0, v1
911    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v5, v6, v7, v2, v3, v4, v20, v0, v1
912    st1 {v20.16b}, [x2], x3 //write 16Byte : 3 line
913
914
915    //prfm pldl1strm, [x0, x1]
916    ld1 {v5.16b}, [x0], x1 // v5=src[3*stride] relative to output line 4
917    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v6, v7, v2, v3, v4, v5, v20, v0, v1
918    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v6, v7, v2, v3, v4, v5, v20, v0, v1
919    st1 {v20.16b}, [x2], x3 //write 16Byte : 4 line
920
921
922    //prfm pldl1strm, [x0, x1]
923    ld1 {v6.16b}, [x0], x1 // v6=src[3*stride] relative to output line 5
924    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v7, v2, v3, v4, v5, v6, v20, v0, v1
925    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v7, v2, v3, v4, v5, v6, v20, v0, v1
926    st1 {v20.16b}, [x2], x3 //write 16Byte : 5 line
927
928    //prfm pldl1strm, [x0, x1]
929    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] relative to output line 6
930    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1
931    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1
932    st1 {v20.16b}, [x2], x3 //write 16Byte : 6 line
933
934    //prfm pldl1strm, [x0, x1]
935    ld1 {v2.16b}, [x0], x1 // v2=src[3*stride] relative to output line 7
936    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v3, v4, v5, v6, v7, v2, v20, v0, v1
937    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v3, v4, v5, v6, v7, v2, v20, v0, v1
938    st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line
939
    // Eight rows is not a multiple of the six-register window, so the window
    // ends up rotated: v3..v7 hold rows +5..+9 and v2 holds row +10 (relative
    // to this pass's line 0). Shuffle so v2..v6 again hold the five oldest
    // rows in order (+6..+10); v7 is used as scratch and reloaded next pass.
940    mov v3.16b, v5.16b
941    mov v5.16b, v7.16b
942    mov v7.16b, v2.16b
943    mov v2.16b, v4.16b
944    mov v4.16b, v6.16b
945    mov v6.16b, v7.16b
946    sub x4, x4, #8 // eight rows done
947    cbnz x4, w16_xy_03_mc_luma_loop
948WELS_ASM_AARCH64_FUNC_END
949
950//void McHorVer03WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
951//                                      int32_t iHeight);
// 8-byte-wide variant: filters down the rows and produces 4 output rows per
// loop pass, averaging each filtered row with the src[1] row.
//   x0 = pSrc (advanced by x1 per row load), x2 = pDst, x3 = iDstStride,
//   x4 = rows remaining, v30 = 20 and v31 = 5 in every 16-bit lane.
//   v16..v20 = rows -2..+2 relative to output row 0 of the pass,
//   v21..v24 = the four rows loaded at the top of each pass (+3..+6).
// SIGN_EXTENSION and the VEC4_* macros are defined outside this chunk; each
// VEC4_* call handles four rows at once, and the per-line comments describe
// the intended arithmetic. NOTE(review): rounding/saturation details of the
// SQRSHRUN/RSHRN helpers are inferred from their names - confirm.
// The loop leaves only when x4 reaches exactly 0 after subtracting 4, so
// iHeight must be a positive multiple of 4.
952WELS_ASM_AARCH64_FUNC_BEGIN McHorVer03WidthEq8_AArch64_neon
953    SIGN_EXTENSION x1,w1
954    SIGN_EXTENSION x3,w3
955    SIGN_EXTENSION x4,w4
956    sub x0, x0, x1, lsl #1 // x0 -= 2*x1: start two rows above the first output row
957    movi v30.8h, #20, lsl #0 // multiplier 20
958    movi v31.8h, #5, lsl #0 // multiplier 5
959
960    ld1 {v16.8b}, [x0], x1 // v16=src[-2*stride]
961    ld1 {v17.8b}, [x0], x1 // v17=src[-1*stride]
962    ld1 {v18.8b}, [x0], x1 // v18=src[0*stride]
963    ld1 {v19.8b}, [x0], x1 // v19=src[1*stride]
964    ld1 {v20.8b}, [x0], x1 // v20=src[2*stride]
965
966w8_xy_03_mc_luma_loop:
967    ld1 {v21.8b}, [x0], x1 // v21=src[3*stride]
968    ld1 {v22.8b}, [x0], x1 // v22=src[4*stride]
969    ld1 {v23.8b}, [x0], x1 // v23=src[5*stride]
970    ld1 {v24.8b}, [x0], x1 // v24=src[6*stride]
971
972    VEC4_UADDL_8BITS v16, v21, v17, v22, v18, v23, v19, v24, v0, v2, v4, v6 //v0/v2/v4/v6 =src[-2]+src[3]
973    VEC4_UADDL_8BITS v17, v20, v18, v21, v19, v22, v20, v23, v1, v3, v5, v7 //v1/v3/v5/v7 =src[-1]+src[2]
974    VEC4_MLS_16BITS v1, v31, v3, v31, v5, v31, v7, v31, v0, v2, v4, v6  //v0/v2/v4/v6 -=5*(src[-1]+src[2])
975    VEC4_UADDL_8BITS v18, v19, v19, v20, v20, v21, v21, v22, v1, v3, v5, v7 //v1/v3/v5/v7 =src[0]+src[1]
976    VEC4_MLA_16BITS v1, v30, v3, v30, v5, v30, v7, v30, v0, v2, v4, v6  //v0/v2/v4/v6 += 20*(src[0]+src[1])
977    VEC4_SQRSHRUN_16BITS_SHIFT5 v0, v2, v4, v6, v1, v3, v5, v7 // narrow the four sums into v1/v3/v5/v7
978
    // v19/v20/v21/v22 are the src[1] rows of output rows 0/1/2/3.
979    VEC4_UADDL_8BITS v1, v19, v3, v20, v5, v21, v7, v22, v0, v2, v4, v6 //v0/v2/v4/v6 = average with src[1]
980    VEC4_RSHRN_16BITS_SHIFT1 v0, v2, v4, v6, v1, v3, v5, v7
981
982    VEC4_ST1_8BITS_8ELEMENT x2, x3, v1, v3, v5, v7  //store 8bytes*4row
983
984    sub x4, x4, #4 // four rows done
    // Slide the window down four rows: rows +2..+6 become rows -2..+2.
985    mov v16.16b, v20.16b
986    mov v17.16b, v21.16b
987    mov v18.16b, v22.16b
988    mov v19.16b, v23.16b
989    mov v20.16b, v24.16b
990
991    cbnz x4, w8_xy_03_mc_luma_loop
992WELS_ASM_AARCH64_FUNC_END
993
994//void McHorVer03WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
995//                                      int32_t iHeight);
// 4-byte-wide variant: filters down the rows and produces 4 output rows per
// loop pass, two rows per filter call.
//   x0 = pSrc (advanced by x1 per row load), x2 = pDst, x3 = iDstStride,
//   x4 = rows remaining, v0 = 20 and v1 = 5 in every 16-bit lane.
// Each of v2..v7 packs two consecutive 4-byte rows: lane s[0] holds row k and
// lane s[1] holds row k+1. One filter call over the low 8 bytes therefore
// yields two output rows, stored from v20.s[0] and v20.s[1].
// SIGN_EXTENSION and FILTER_6TAG_8BITS1_AVERAGE_WITH_1 are defined outside
// this chunk. NOTE(review): by name the macro presumably 6-tap filters its
// six row arguments and averages with the src[1] argument (fourth) - confirm.
// The loop leaves only when x4 reaches exactly 0 after subtracting 4, so
// iHeight must be a positive multiple of 4.
996WELS_ASM_AARCH64_FUNC_BEGIN McHorVer03WidthEq4_AArch64_neon
997    SIGN_EXTENSION x1,w1
998    SIGN_EXTENSION x3,w3
999    SIGN_EXTENSION x4,w4
1000    sub x0, x0, x1, lsl #1 // x0 -= 2*x1: start two rows above the first output row
1001    movi v0.8h, #20, lsl #0 // multiplier 20
1002    movi v1.8h, #5, lsl #0 // multiplier 5
1003
1004    //prfm pldl1strm, [x0]
1005    //prfm pldl1strm, [x0, x1]
1006    ld1 {v2.s}[0], [x0], x1 // v2=src[-2*stride]
1007    //prfm pldl1strm, [x0, x1]
1008    ld1 {v3.s}[0], [x0], x1 // v3=src[-1*stride]
1009    mov v2.s[1], v3.s[0] // v2 = {row -2, row -1}
1010    //prfm pldl1strm, [x0, x1]
1011    ld1 {v4.s}[0], [x0], x1 // v4=src[0*stride]
1012    mov v3.s[1], v4.s[0] // v3 = {row -1, row 0}
1013    //prfm pldl1strm, [x0, x1]
1014    ld1 {v5.s}[0], [x0], x1 // v5=src[1*stride]
1015    mov v4.s[1], v5.s[0] // v4 = {row 0, row 1}
1016    //prfm pldl1strm, [x0, x1]
1017    ld1 {v6.s}[0], [x0], x1 // v6=src[2*stride]
1018    mov v5.s[1], v6.s[0] // v5 = {row 1, row 2}
1019
1020w4_xy_03_mc_luma_loop:
1021    //prfm pldl1strm, [x0, x1]
1022    ld1 {v7.s}[0], [x0], x1 // v7=src[3*stride]
1023    mov v6.s[1], v7.s[0] // v6 = {row 2, row 3}
1024    //prfm pldl1strm, [x0, x1]
1025    ld1 {v7.s}[1], [x0], x1 // v7=src[4*stride]
1026    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1
1027    st1 {v20.s}[0], [x2], x3 //write 4Byte : 0 line
1028    st1 {v20.s}[1], [x2], x3 //write 4Byte : 1 line
1029    mov v2.s[0], v7.s[1] // v2.s[0] = row 4; v2 is reused for the next row pair
1030
1031    //prfm pldl1strm, [x0, x1]
1032    ld1 {v2.s}[1], [x0], x1 // v2=src[5*stride]
1033    //prfm pldl1strm, [x0, x1]
1034    ld1 {v3.s}[1], [x0], x1 // v3.s[1]=src[6*stride]
1035    mov v3.s[0], v2.s[1] // v3 = {row 5, row 6}
1036    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v4, v5, v6, v7, v2, v3, v20, v0, v1
1037    st1 {v20.s}[0], [x2], x3 //write 4Byte : 2 line
1038    st1 {v20.s}[1], [x2], x3 //write 4Byte : 3 line
1039    mov v4.s[0], v3.s[1] // keep row 6 in v4.s[0]; it becomes v6.s[0] below
1040
    // Rotate the window down four rows (v21 is scratch). Afterwards
    // v2={2,3}, v3={3,4}, v4={4,5}, v5={5,6} and v6.s[0]=row 6, i.e. rows
    // -2..+2 of the next pass; v6.s[1] and v7 are reloaded at the loop top.
1041    mov v21.8b, v6.8b
1042    mov v6.8b, v4.8b
1043    mov v4.8b, v2.8b
1044    mov v2.8b, v21.8b
1045    mov v21.8b, v3.8b
1046    mov v3.8b, v7.8b
1047    mov v7.8b, v5.8b
1048    mov v5.8b, v21.8b
1049
1050    sub x4, x4, #4 // four rows done
1051    cbnz x4, w4_xy_03_mc_luma_loop
1052WELS_ASM_AARCH64_FUNC_END
1053
1054//void McHorVer02WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1055//                                       int32_t iHeight);
// Filters down the rows of a 16-byte-wide block, producing 8 output rows per
// loop pass. Unlike the 01/03 siblings it uses the plain FILTER_6TAG_8BITS1/2
// macros rather than the *_AVERAGE_WITH_* ones.
//   x0 = pSrc (advanced by x1 per row load), x2 = pDst, x3 = iDstStride,
//   x4 = rows remaining, v0 = 20 and v1 = 5 in every 16-bit lane,
//   v2..v7 = sliding window of six consecutive source rows, v20 = output row.
// SIGN_EXTENSION and the FILTER_6TAG_8BITS* macros are defined outside this
// chunk. Each call passes six consecutive rows (oldest first), then the
// destination v20, then the two multipliers. NOTE(review): the 8BITS1/8BITS2
// pair presumably handles the low/high 8 bytes of the row - confirm against
// the macro definitions.
// The loop leaves only when x4 reaches exactly 0 after subtracting 8, so
// iHeight must be a positive multiple of 8.
1056WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02WidthEq16_AArch64_neon
1057    SIGN_EXTENSION x1,w1
1058    SIGN_EXTENSION x3,w3
1059    SIGN_EXTENSION x4,w4
1060    sub x0, x0, x1, lsl #1 // x0 -= 2*x1: start two rows above the first output row
1061    movi v0.8h, #20, lsl #0 // multiplier 20
1062    movi v1.8h, #5, lsl #0 // multiplier 5
1063
1064    //prfm pldl1strm, [x0]
1065    //prfm pldl1strm, [x0, x1]
1066    ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride]
1067    //prfm pldl1strm, [x0, x1]
1068    ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride]
1069    //prfm pldl1strm, [x0, x1]
1070    ld1 {v4.16b}, [x0], x1 // v4=src[0*stride]
1071    //prfm pldl1strm, [x0, x1]
1072    ld1 {v5.16b}, [x0], x1 // v5=src[1*stride]
1073    //prfm pldl1strm, [x0, x1]
1074    ld1 {v6.16b}, [x0], x1 // v6=src[2*stride]
1075
1076
1077w16_xy_02_mc_luma_loop:
    // On entry v2..v6 hold rows -2..+2 relative to the next output row.
    // Each step loads one new row into the register holding the oldest row,
    // so the six-row window rotates through v2..v7 without copies.
1078    //prfm pldl1strm, [x0, x1]
1079    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
1080    FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
1081    FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1
1082    st1 {v20.16b}, [x2], x3 //write 16Byte : 0 line
1083
1084
1085    //prfm pldl1strm, [x0, x1]
1086    ld1 {v2.16b}, [x0], x1 // v2=src[3*stride] relative to output line 1
1087    FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
1088    FILTER_6TAG_8BITS2 v3, v4, v5, v6, v7, v2, v20, v0, v1
1089    st1 {v20.16b}, [x2], x3 //write 16Byte : 1 line
1090
1091
1092    //prfm pldl1strm, [x0, x1]
1093    ld1 {v3.16b}, [x0], x1 // v3=src[3*stride] relative to output line 2
1094    FILTER_6TAG_8BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
1095    FILTER_6TAG_8BITS2 v4, v5, v6, v7, v2, v3, v20, v0, v1
1096    st1 {v20.16b}, [x2], x3 //write 16Byte : 2 line
1097
1098
1099    //prfm pldl1strm, [x0, x1]
1100    ld1 {v4.16b}, [x0], x1 // v4=src[3*stride] relative to output line 3
1101    FILTER_6TAG_8BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
1102    FILTER_6TAG_8BITS2 v5, v6, v7, v2, v3, v4, v20, v0, v1
1103    st1 {v20.16b}, [x2], x3 //write 16Byte : 3 line
1104
1105
1106    //prfm pldl1strm, [x0, x1]
1107    ld1 {v5.16b}, [x0], x1 // v5=src[3*stride] relative to output line 4
1108    FILTER_6TAG_8BITS1 v6, v7, v2, v3, v4, v5, v20, v0, v1
1109    FILTER_6TAG_8BITS2 v6, v7, v2, v3, v4, v5, v20, v0, v1
1110    st1 {v20.16b}, [x2], x3 //write 16Byte : 4 line
1111
1112
1113    //prfm pldl1strm, [x0, x1]
1114    ld1 {v6.16b}, [x0], x1 // v6=src[3*stride] relative to output line 5
1115    FILTER_6TAG_8BITS1 v7, v2, v3, v4, v5, v6, v20, v0, v1
1116    FILTER_6TAG_8BITS2 v7, v2, v3, v4, v5, v6, v20, v0, v1
1117    st1 {v20.16b}, [x2], x3 //write 16Byte : 5 line
1118
1119    //prfm pldl1strm, [x0, x1]
1120    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] relative to output line 6
1121    FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
1122    FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1
1123    st1 {v20.16b}, [x2], x3 //write 16Byte : 6 line
1124
1125    //prfm pldl1strm, [x0, x1]
1126    ld1 {v2.16b}, [x0], x1 // v2=src[3*stride] relative to output line 7
1127    FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
1128    FILTER_6TAG_8BITS2 v3, v4, v5, v6, v7, v2, v20, v0, v1
1129    st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line
1130
    // Eight rows is not a multiple of the six-register window, so the window
    // ends up rotated: v3..v7 hold rows +5..+9 and v2 holds row +10 (relative
    // to this pass's line 0). Shuffle so v2..v6 again hold the five oldest
    // rows in order (+6..+10); v7 is used as scratch and reloaded next pass.
1131    mov v3.16b, v5.16b
1132    mov v5.16b, v7.16b
1133    mov v7.16b, v2.16b
1134    mov v2.16b, v4.16b
1135    mov v4.16b, v6.16b
1136    mov v6.16b, v7.16b
1137    sub x4, x4, #8 // eight rows done
1138    cbnz x4, w16_xy_02_mc_luma_loop
1139WELS_ASM_AARCH64_FUNC_END
1140
1141//void McHorVer02WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1142//                                      int32_t iHeight);
// 8-byte-wide variant: filters down the rows and produces 4 output rows per
// loop pass; the narrowed filter result is stored directly, with no extra
// averaging step.
//   x0 = pSrc (advanced by x1 per row load), x2 = pDst, x3 = iDstStride,
//   x4 = rows remaining, v30 = 20 and v31 = 5 in every 16-bit lane.
//   v16..v20 = rows -2..+2 relative to output row 0 of the pass,
//   v21..v24 = the four rows loaded at the top of each pass (+3..+6).
// SIGN_EXTENSION and the VEC4_* macros are defined outside this chunk; each
// VEC4_* call handles four rows at once, and the per-line comments describe
// the intended arithmetic. NOTE(review): rounding/saturation details of the
// SQRSHRUN helper are inferred from its name - confirm.
// The loop leaves only when x4 reaches exactly 0 after subtracting 4, so
// iHeight must be a positive multiple of 4.
1143WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02WidthEq8_AArch64_neon
1144    SIGN_EXTENSION x1,w1
1145    SIGN_EXTENSION x3,w3
1146    SIGN_EXTENSION x4,w4
1147    sub x0, x0, x1, lsl #1 // x0 -= 2*x1: start two rows above the first output row
1148    movi v30.8h, #20, lsl #0 // multiplier 20
1149    movi v31.8h, #5, lsl #0 // multiplier 5
1150
1151    ld1 {v16.8b}, [x0], x1 // v16=src[-2*stride]
1152    ld1 {v17.8b}, [x0], x1 // v17=src[-1*stride]
1153    ld1 {v18.8b}, [x0], x1 // v18=src[0*stride]
1154    ld1 {v19.8b}, [x0], x1 // v19=src[1*stride]
1155    ld1 {v20.8b}, [x0], x1 // v20=src[2*stride]
1156
1157w8_xy_02_mc_luma_loop:
1158    ld1 {v21.8b}, [x0], x1 // v21=src[3*stride]
1159    ld1 {v22.8b}, [x0], x1 // v22=src[4*stride]
1160    ld1 {v23.8b}, [x0], x1 // v23=src[5*stride]
1161    ld1 {v24.8b}, [x0], x1 // v24=src[6*stride]
1162
1163    VEC4_UADDL_8BITS v16, v21, v17, v22, v18, v23, v19, v24, v0, v2, v4, v6 //v0/v2/v4/v6 =src[-2]+src[3]
1164    VEC4_UADDL_8BITS v17, v20, v18, v21, v19, v22, v20, v23, v1, v3, v5, v7 //v1/v3/v5/v7 =src[-1]+src[2]
1165    VEC4_MLS_16BITS v1, v31, v3, v31, v5, v31, v7, v31, v0, v2, v4, v6  //v0/v2/v4/v6 -=5*(src[-1]+src[2])
1166    VEC4_UADDL_8BITS v18, v19, v19, v20, v20, v21, v21, v22, v1, v3, v5, v7 //v1/v3/v5/v7 =src[0]+src[1]
1167    VEC4_MLA_16BITS v1, v30, v3, v30, v5, v30, v7, v30, v0, v2, v4, v6  //v0/v2/v4/v6 += 20*(src[0]+src[1])
1168    VEC4_SQRSHRUN_16BITS_SHIFT5 v0, v2, v4, v6, v1, v3, v5, v7 // narrow the four sums into v1/v3/v5/v7
1169    VEC4_ST1_8BITS_8ELEMENT x2, x3, v1, v3, v5, v7  //store 8bytes*4row
1170
1171    sub x4, x4, #4 // four rows done
    // Slide the window down four rows: rows +2..+6 become rows -2..+2.
1172    mov v16.16b, v20.16b
1173    mov v17.16b, v21.16b
1174    mov v18.16b, v22.16b
1175    mov v19.16b, v23.16b
1176    mov v20.16b, v24.16b
1177
1178    cbnz x4, w8_xy_02_mc_luma_loop
1179WELS_ASM_AARCH64_FUNC_END
1180
1181//void McHorVer02WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1182//                                      int32_t iHeight);
// 4-byte-wide variant: filters down the rows and produces 4 output rows per
// loop pass, two rows per filter call.
//   x0 = pSrc (advanced by x1 per row load), x2 = pDst, x3 = iDstStride,
//   x4 = rows remaining, v0 = 20 and v1 = 5 in every 16-bit lane.
// Each of v2..v7 packs two consecutive 4-byte rows: lane s[0] holds row k and
// lane s[1] holds row k+1. One FILTER_6TAG_8BITS1 call over the low 8 bytes
// therefore yields two output rows, stored from v20.s[0] and v20.s[1].
// SIGN_EXTENSION is defined outside this chunk; FILTER_6TAG_8BITS1 is defined
// near the top of the file, where its comment lists v18/v19 as working
// registers.
// The loop leaves only when x4 reaches exactly 0 after subtracting 4, so
// iHeight must be a positive multiple of 4.
1183WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02WidthEq4_AArch64_neon
1184    SIGN_EXTENSION x1,w1
1185    SIGN_EXTENSION x3,w3
1186    SIGN_EXTENSION x4,w4
1187    sub x0, x0, x1, lsl #1 // x0 -= 2*x1: start two rows above the first output row
1188    movi v0.8h, #20, lsl #0 // multiplier 20
1189    movi v1.8h, #5, lsl #0 // multiplier 5
1190
1191    //prfm pldl1strm, [x0]
1192    //prfm pldl1strm, [x0, x1]
1193    ld1 {v2.s}[0], [x0], x1 // v2=src[-2*stride]
1194    //prfm pldl1strm, [x0, x1]
1195    ld1 {v3.s}[0], [x0], x1 // v3=src[-1*stride]
1196    mov v2.s[1], v3.s[0] // v2 = {row -2, row -1}
1197    //prfm pldl1strm, [x0, x1]
1198    ld1 {v4.s}[0], [x0], x1 // v4=src[0*stride]
1199    mov v3.s[1], v4.s[0] // v3 = {row -1, row 0}
1200    //prfm pldl1strm, [x0, x1]
1201    ld1 {v5.s}[0], [x0], x1 // v5=src[1*stride]
1202    mov v4.s[1], v5.s[0] // v4 = {row 0, row 1}
1203    //prfm pldl1strm, [x0, x1]
1204    ld1 {v6.s}[0], [x0], x1 // v6=src[2*stride]
1205    mov v5.s[1], v6.s[0] // v5 = {row 1, row 2}
1206
1207w4_xy_02_mc_luma_loop:
1208    //prfm pldl1strm, [x0, x1]
1209    ld1 {v7.s}[0], [x0], x1 // v7=src[3*stride]
1210    mov v6.s[1], v7.s[0] // v6 = {row 2, row 3}
1211    //prfm pldl1strm, [x0, x1]
1212    ld1 {v7.s}[1], [x0], x1 // v7=src[4*stride]
1213    FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
1214    st1 {v20.s}[0], [x2], x3 //write 4Byte : 0 line
1215    st1 {v20.s}[1], [x2], x3 //write 4Byte : 1 line
1216    mov v2.s[0], v7.s[1] // v2.s[0] = row 4; v2 is reused for the next row pair
1217
1218    //prfm pldl1strm, [x0, x1]
1219    ld1 {v2.s}[1], [x0], x1 // v2=src[5*stride]
1220    //prfm pldl1strm, [x0, x1]
1221    ld1 {v3.s}[1], [x0], x1 // v3.s[1]=src[6*stride]
1222    mov v3.s[0], v2.s[1] // v3 = {row 5, row 6}
1223    FILTER_6TAG_8BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
1224    st1 {v20.s}[0], [x2], x3 //write 4Byte : 2 line
1225    st1 {v20.s}[1], [x2], x3 //write 4Byte : 3 line
1226    mov v4.s[0], v3.s[1] // keep row 6 in v4.s[0]; it becomes v6.s[0] below
1227
    // Rotate the window down four rows (v21 is scratch). Afterwards
    // v2={2,3}, v3={3,4}, v4={4,5}, v5={5,6} and v6.s[0]=row 6, i.e. rows
    // -2..+2 of the next pass; v6.s[1] and v7 are reloaded at the loop top.
1228    mov v21.8b, v6.8b
1229    mov v6.8b, v4.8b
1230    mov v4.8b, v2.8b
1231    mov v2.8b, v21.8b
1232    mov v21.8b, v3.8b
1233    mov v3.8b, v7.8b
1234    mov v7.8b, v5.8b
1235    mov v5.8b, v21.8b
1236
1237    sub x4, x4, #4 // four rows done
1238    cbnz x4, w4_xy_02_mc_luma_loop
1239WELS_ASM_AARCH64_FUNC_END
1240
1241//void McHorVer22WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1242//                                       int32_t iHeight);
// 16-byte-wide variant that filters in both directions: each output row is
// first filtered down the rows into 16-bit intermediates, then filtered along
// the row. Produces 8 output rows per loop pass.
//   x0 = pSrc, moved 2 bytes left and 2 rows up before the first load, then
//        advanced by x1 per row load
//   x2 = pDst, x3 = iDstStride, x4 = rows remaining
//   v0 = 20 and v1 = 5 in every 16-bit lane
//   Each source row is loaded as three 8-byte registers (24 bytes). Six rows
//   are live at once in the triples v2-v4, v5-v7, v8-v10, v11-v13, v14-v16
//   and v17-v19; v20/v21/v22 take the vertical results, v23-v25 the unpacked
//   operands and v26 the output row.
// v8-v15 are used as row registers, so their low halves d8-d15 (callee-saved
// under AAPCS64) are pushed on entry and popped before returning.
// SIGN_EXTENSION, FILTER_6TAG_8BITS_TO_16BITS1, UNPACK_2_16BITS_TO_ABC and
// FILTER_3_IN_16BITS_TO_8BITS1/2 are defined outside this chunk.
// NOTE(review): their working registers are not visible here; v30 is used as
// scratch in the rotation below, so confirm none of them relies on it.
// The loop leaves only when x4 reaches exactly 0 after subtracting 8, so
// iHeight must be a positive multiple of 8.
1243WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22WidthEq16_AArch64_neon
1244    SIGN_EXTENSION x1,w1
1245    SIGN_EXTENSION x3,w3
1246    SIGN_EXTENSION x4,w4
1247    stp d8, d9, [sp,#-16]! // save callee-saved d8-d15 (sp stays 16-byte aligned)
1248    stp d10, d11, [sp,#-16]!
1249    stp d12, d13, [sp,#-16]!
1250    stp d14, d15, [sp,#-16]!
1251    sub x0, x0, #2 // two bytes to the left for the along-row filter
1252    sub x0, x0, x1, lsl #1 // two rows up for the down-row filter
1253    movi v0.8h, #20, lsl #0 // multiplier 20
1254    movi v1.8h, #5, lsl #0 // multiplier 5
1255
1256    //prfm pldl1strm, [x0]
1257    //prfm pldl1strm, [x0, x1]
1258    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2/v3/v4=src[-2*stride]
1259    //prfm pldl1strm, [x0, x1]
1260    ld1 {v5.8b, v6.8b, v7.8b}, [x0], x1 // v5/v6/v7=src[-1*stride]
1261    //prfm pldl1strm, [x0, x1]
1262    ld1 {v8.8b, v9.8b, v10.8b}, [x0], x1 // v8/v9/v10=src[0*stride]
1263    //prfm pldl1strm, [x0, x1]
1264    ld1 {v11.8b, v12.8b, v13.8b}, [x0], x1 // v11/v12/v13=src[1*stride]
1265    //prfm pldl1strm, [x0, x1]
1266    ld1 {v14.8b, v15.8b, v16.8b}, [x0], x1 // v14/v15/v16=src[2*stride]
1267
1268w16_hv_mc_luma_loop:
    // Each step loads one new row into the register triple holding the
    // oldest row, so the six-row window rotates without copies.
1269    //prfm pldl1strm, [x0, x1]
1270    ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17/v18/v19=src[3*stride]
1271    // vertical filtered into v20/v21
1272    FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
1273    FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
1274    // horizon filtered
1275    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
1276    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
1277    // vertical filtered into v21/v22
1278    FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
1279    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
1280    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
1281    st1 {v26.16b}, [x2], x3 //write 16Byte : 0 line
1282
1283    //prfm pldl1strm, [x0, x1]
1284    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2/v3/v4=src[3*stride] relative to output line 1
1285    // vertical filtered into v20/v21
1286    FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
1287    FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
1288    // horizon filtered
1289    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
1290    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
1291    // vertical filtered into v21/v22
1292    FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
1293    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
1294    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
1295    st1 {v26.16b}, [x2], x3 //write 16Byte : 1 line
1296
1297    //prfm pldl1strm, [x0, x1]
1298    ld1 {v5.8b, v6.8b, v7.8b}, [x0], x1 // v5/v6/v7=src[3*stride] relative to output line 2
1299    // vertical filtered into v20/v21
1300    FILTER_6TAG_8BITS_TO_16BITS1 v8, v11, v14, v17, v2, v5, v20, v0, v1
1301    FILTER_6TAG_8BITS_TO_16BITS1 v9, v12, v15, v18, v3, v6, v21, v0, v1
1302    // horizon filtered
1303    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
1304    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
1305    // vertical filtered into v21/v22
1306    FILTER_6TAG_8BITS_TO_16BITS1 v10, v13, v16, v19, v4, v7, v22, v0, v1
1307    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
1308    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
1309    st1 {v26.16b}, [x2], x3 //write 16Byte : 2 line
1310
1311    //prfm pldl1strm, [x0, x1]
1312    ld1 {v8.8b, v9.8b, v10.8b}, [x0], x1 // v8/v9/v10=src[3*stride] relative to output line 3
1313    // vertical filtered into v20/v21
1314    FILTER_6TAG_8BITS_TO_16BITS1 v11, v14, v17, v2, v5, v8, v20, v0, v1
1315    FILTER_6TAG_8BITS_TO_16BITS1 v12, v15, v18, v3, v6, v9, v21, v0, v1
1316    // horizon filtered
1317    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
1318    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
1319    // vertical filtered into v21/v22
1320    FILTER_6TAG_8BITS_TO_16BITS1 v13, v16, v19, v4, v7, v10, v22, v0, v1
1321    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
1322    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
1323    st1 {v26.16b}, [x2], x3 //write 16Byte : 3 line
1324
1325    //prfm pldl1strm, [x0, x1]
1326    ld1 {v11.8b, v12.8b, v13.8b}, [x0], x1 // v11/v12/v13=src[3*stride] relative to output line 4
1327    // vertical filtered into v20/v21
1328    FILTER_6TAG_8BITS_TO_16BITS1 v14, v17, v2, v5, v8, v11, v20, v0, v1
1329    FILTER_6TAG_8BITS_TO_16BITS1 v15, v18, v3, v6, v9, v12, v21, v0, v1
1330    // horizon filtered
1331    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
1332    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
1333    // vertical filtered into v21/v22
1334    FILTER_6TAG_8BITS_TO_16BITS1 v16, v19, v4, v7, v10, v13, v22, v0, v1
1335    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
1336    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
1337    st1 {v26.16b}, [x2], x3 //write 16Byte : 4 line
1338
1339    //prfm pldl1strm, [x0, x1]
1340    ld1 {v14.8b, v15.8b, v16.8b}, [x0], x1 // v14/v15/v16=src[3*stride] relative to output line 5
1341    // vertical filtered into v20/v21
1342    FILTER_6TAG_8BITS_TO_16BITS1 v17, v2, v5, v8, v11, v14, v20, v0, v1
1343    FILTER_6TAG_8BITS_TO_16BITS1 v18, v3, v6, v9, v12, v15, v21, v0, v1
1344    // horizon filtered
1345    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
1346    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
1347    // vertical filtered into v21/v22
1348    FILTER_6TAG_8BITS_TO_16BITS1 v19, v4, v7, v10, v13, v16, v22, v0, v1
1349    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
1350    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
1351    st1 {v26.16b}, [x2], x3 //write 16Byte : 5 line
1352
1353    //prfm pldl1strm, [x0, x1]
1354    ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17/v18/v19=src[3*stride] relative to output line 6
1355    // vertical filtered into v20/v21
1356    FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
1357    FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
1358    // horizon filtered
1359    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
1360    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
1361    // vertical filtered into v21/v22
1362    FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
1363    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
1364    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
1365    st1 {v26.16b}, [x2], x3 //write 16Byte : 6 line
1366
1367    //prfm pldl1strm, [x0, x1]
1368    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2/v3/v4=src[3*stride] relative to output line 7
1369    // vertical filtered into v20/v21
1370    FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
1371    FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
1372    // horizon filtered
1373    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
1374    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
1375    // vertical filtered into v21/v22
1376    FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
1377    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
1378    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
1379    st1 {v26.16b}, [x2], x3 //write 16Byte : 7 line
1380
    // After eight rows the window is rotated: the v5/v8/v11/v14/v17 columns
    // hold rows +5..+9 and the v2 column holds row +10 (relative to this
    // pass's line 0). Re-order each of the three register columns so that
    // v2/v5/v8/v11/v14 (and their +1/+2 partners) hold rows +6..+10 again.
    // v30 is scratch.
1381    mov v5.16b, v11.16b
1382    mov v11.16b, v17.16b
1383    mov v30.16b, v2.16b
1384    mov v2.16b, v8.16b
1385    mov v8.16b, v14.16b
1386    mov v14.16b, v30.16b
1387
1388    mov v6.16b, v12.16b
1389    mov v12.16b, v18.16b
1390    mov v30.16b, v3.16b
1391    mov v3.16b, v9.16b
1392    mov v9.16b, v15.16b
1393    mov v15.16b, v30.16b
1394
1395    mov v7.16b, v13.16b
1396    mov v13.16b, v19.16b
1397    mov v30.16b, v4.16b
1398    mov v4.16b, v10.16b
1399    mov v10.16b, v16.16b
1400    mov v16.16b, v30.16b
1401
1402    sub x4, x4, #8 // eight rows done
1403    cbnz x4, w16_hv_mc_luma_loop
1404
1405    ldp d14, d15, [sp], #16 // restore d8-d15 in reverse push order
1406    ldp d12, d13, [sp], #16
1407    ldp d10, d11, [sp], #16
1408    ldp d8, d9, [sp], #16
1409WELS_ASM_AARCH64_FUNC_END
1410
1411//void McHorVer22WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1412//                                      int32_t iHeight);
// 8-byte-wide variant that filters in both directions: each output row is
// first filtered down the rows into 16-bit intermediates (v20/v21), then
// filtered along the row into v26. Produces 4 output rows per loop pass.
//   x0 = pSrc, moved 2 bytes left and 2 rows up before the first load, then
//        advanced by x1 per row load (each load reads 16 bytes)
//   x2 = pDst, x3 = iDstStride, x4 = rows remaining
//   v0 = 20 and v1 = 5 in every 16-bit lane
//   v2..v7 = sliding window of six consecutive source rows
// SIGN_EXTENSION, FILTER_6TAG_8BITS_TO_16BITS1/2, UNPACK_2_16BITS_TO_ABC and
// FILTER_3_IN_16BITS_TO_8BITS1 are defined outside this chunk.
// NOTE(review): TO_16BITS1/TO_16BITS2 presumably process the low/high 8 bytes
// of each row; v30 is used as scratch in the rotation below, so confirm none
// of the macros relies on it.
// The loop leaves only when x4 reaches exactly 0 after subtracting 4, so
// iHeight must be a positive multiple of 4.
1413WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22WidthEq8_AArch64_neon
1414    SIGN_EXTENSION x1,w1
1415    SIGN_EXTENSION x3,w3
1416    SIGN_EXTENSION x4,w4
1417    sub x0, x0, #2 // two bytes to the left for the along-row filter
1418    sub x0, x0, x1, lsl #1 // two rows up for the down-row filter
1419    movi v0.8h, #20, lsl #0 // multiplier 20
1420    movi v1.8h, #5, lsl #0 // multiplier 5
1421
1422    //prfm pldl1strm, [x0]
1423    //prfm pldl1strm, [x0, x1]
1424    ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride]
1425    //prfm pldl1strm, [x0, x1]
1426    ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride]
1427    //prfm pldl1strm, [x0, x1]
1428    ld1 {v4.16b}, [x0], x1 // v4=src[0*stride]
1429    //prfm pldl1strm, [x0, x1]
1430    ld1 {v5.16b}, [x0], x1 // v5=src[1*stride]
1431    //prfm pldl1strm, [x0, x1]
1432    ld1 {v6.16b}, [x0], x1 // v6=src[2*stride]
1433
1434w8_hv_mc_luma_loop:
    // On entry v2..v6 hold rows -2..+2 relative to the next output row.
    // Each step loads one new row into the register holding the oldest row.
1435    //prfm pldl1strm, [x0, x1]
1436    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
1437    // vertical filtered into v20/v21
1438    FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
1439    FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
1440    // horizon filtered
1441    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
1442    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
1443    st1 {v26.8b}, [x2], x3 //write 8Byte : 0 line
1444
1445    //prfm pldl1strm, [x0, x1]
1446    ld1 {v2.16b}, [x0], x1 // v2=src[3*stride] relative to output line 1
1447    // vertical filtered into v20/v21
1448    FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
1449    FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v21, v0, v1
1450    // horizon filtered
1451    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
1452    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
1453    st1 {v26.8b}, [x2], x3 //write 8Byte : 1 line
1454
1455    //prfm pldl1strm, [x0, x1]
1456    ld1 {v3.16b}, [x0], x1 // v3=src[3*stride] relative to output line 2
1457    // vertical filtered into v20/v21
1458    FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
1459    FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1
1460    // horizon filtered
1461    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
1462    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
1463    st1 {v26.8b}, [x2], x3 //write 8Byte : 2 line
1464
1465    //prfm pldl1strm, [x0, x1]
1466    ld1 {v4.16b}, [x0], x1 // v4=src[3*stride] relative to output line 3
1467    // vertical filtered into v20/v21
1468    FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
1469    FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v21, v0, v1
1470    // horizon filtered
1471    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
1472    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
1473    st1 {v26.8b}, [x2], x3 //write 8Byte : 3 line
1474
1475
    // After four rows v5/v6/v7 hold rows +1..+3 and v2/v3/v4 rows +4..+6
    // (relative to this pass's line 0). Re-order so v2..v6 hold rows +2..+6,
    // i.e. rows -2..+2 of the next pass. v30 is scratch.
1476    mov v5.16b, v3.16b
1477    mov v3.16b, v7.16b
1478    mov v30.16b, v2.16b
1479    mov v2.16b, v6.16b
1480    mov v6.16b, v4.16b
1481    mov v4.16b, v30.16b
1482
1483    sub x4, x4, #4 // four rows done
1484    cbnz x4, w8_hv_mc_luma_loop
1485WELS_ASM_AARCH64_FUNC_END
1486//void McHorVer22WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1487//                                      int32_t iHeight);
//
// 4-pixel-wide luma interpolation: each output row is filtered vertically
// (FILTER_6TAG_8BITS_TO_16BITS*) and then horizontally (UNPACK_2_16BITS_TO_ABC +
// FILTER_3_IN_16BITS_TO_8BITS1). Those macros are defined outside this section;
// consult them for the exact arithmetic and the scratch registers they use.
// Arguments arrive as x0=pSrc, x1=iSrcStride, x2=pDst, x3=iDstStride, x4=iHeight.
// Four output rows are produced per iteration and the loop exits only when x4
// reaches exactly zero, so iHeight has to be a positive multiple of 4.
// Every source row is fetched as a full 16-byte vector starting at pSrc-2.
1488
1489WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22WidthEq4_AArch64_neon
1490    SIGN_EXTENSION x1,w1
1491    SIGN_EXTENSION x3,w3
1492    SIGN_EXTENSION x4,w4
1493    sub x0, x0, #2 // step 2 pixels to the left of pSrc
1494    sub x0, x0, x1, lsl #1 // and 2 rows above it
1495    movi v0.8h, #20, lsl #0 // v0 = 20 in every 16-bit lane (multiplier handed to the filter macros)
1496    movi v1.8h, #5, lsl #0 // v1 = 5 in every 16-bit lane (multiplier handed to the filter macros)
1497
1498    //prfm pldl1strm, [x0]
1499    //prfm pldl1strm, [x0, x1]
1500    ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride]
1501    //prfm pldl1strm, [x0, x1]
1502    ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride]
1503    //prfm pldl1strm, [x0, x1]
1504    ld1 {v4.16b}, [x0], x1 // v4=src[0*stride]
1505    //prfm pldl1strm, [x0, x1]
1506    ld1 {v5.16b}, [x0], x1 // v5=src[1*stride]
1507    //prfm pldl1strm, [x0, x1]
1508    ld1 {v6.16b}, [x0], x1 // v6=src[2*stride]
1509
1510w4_hv_mc_luma_loop:
1511    //prfm pldl1strm, [x0, x1]
1512    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
1513    // vertical filtered into v20/v21 1st line
1514    FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
1515    FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
1516    //prfm pldl1strm, [x0, x1]
1517    ld1 {v2.16b}, [x0], x1 // v2=src[4*stride] (v2 is free again: row -2 is no longer needed)
1518    // vertical filtered into v22/v23 2nd line
1519    FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v22, v0, v1
1520    FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v23, v0, v1
1521    // horizon filtered
1522    UNPACK_2_16BITS_TO_ABC  v20, v21, v24, v25, v26
1523    UNPACK_2_16BITS_TO_ABC  v22, v23, v28, v29, v30
1524    zip1 v24.2d, v24.2d, v28.2d // low 64 bits keep the 1st line, high 64 bits take the 2nd line
1525    zip1 v25.2d, v25.2d, v29.2d
1526    zip1 v26.2d, v26.2d, v30.2d
1527    FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0]
1528    st1 {v27.s}[0], [x2], x3 //write 4Byte : 0 line
1529    st1 {v27.s}[1], [x2], x3 //write 4Byte : 1 line
1530
1531
1532    //prfm pldl1strm, [x0, x1]
1533    ld1 {v3.16b}, [x0], x1 // v3=src[5*stride]
1534    // vertical filtered into v20/v21
1535    FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
1536    FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1
1537    //prfm pldl1strm, [x0, x1]
1538    ld1 {v4.16b}, [x0], x1 // v4=src[6*stride]
1539    FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v22, v0, v1
1540    FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v23, v0, v1
1541    // horizon filtered
1542    UNPACK_2_16BITS_TO_ABC  v20, v21, v24, v25, v26
1543    UNPACK_2_16BITS_TO_ABC  v22, v23, v28, v29, v30
1544    zip1 v24.2d, v24.2d, v28.2d
1545    zip1 v25.2d, v25.2d, v29.2d
1546    zip1 v26.2d, v26.2d, v30.2d
1547    FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0]
1548    st1 {v27.s}[0], [x2], x3 //write 4Byte : 2 line
1549    st1 {v27.s}[1], [x2], x3 //write 4Byte : 3 line
1550
    // Rotate registers so that v2..v6 once more hold the five most recently
    // loaded rows, oldest first (they currently sit in v6, v7, v2, v3, v4).
1551    mov v5.16b, v3.16b
1552    mov v3.16b, v7.16b
1553    mov v30.16b, v2.16b // v30 is only a temporary for the swap
1554    mov v2.16b, v6.16b
1555    mov v6.16b, v4.16b
1556    mov v4.16b, v30.16b
1557
1558    sub x4, x4, #4 // four rows written this iteration
1559    cbnz x4, w4_hv_mc_luma_loop
1560WELS_ASM_AARCH64_FUNC_END
1561//void McCopyWidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1562//                                   int32_t iHeight);
//
// Copies a 16-byte-wide block row by row from pSrc (x0, pitch x1) to pDst
// (x2, pitch x3). Two rows are moved per iteration and the loop exits only
// when x4 reaches exactly zero, so iHeight has to be a positive even number.
1563WELS_ASM_AARCH64_FUNC_BEGIN McCopyWidthEq16_AArch64_neon
1564    //prfm pldl1strm, [x0]
1565    SIGN_EXTENSION x1,w1
1566    SIGN_EXTENSION x3,w3
1567    SIGN_EXTENSION x4,w4
1568w16_copy_loop:
1569    //prfm pldl1strm, [x0, x1]
1570    ld1 {v0.16b}, [x0], x1  //read 16Byte : 0 line (x0 += iSrcStride)
1571    st1 {v0.16b}, [x2], x3 //write 16Byte : 0 line (x2 += iDstStride)
1572    //prfm pldl1strm, [x0, x1]
1573    ld1 {v1.16b}, [x0], x1  //read 16Byte : 1 line
1574    st1 {v1.16b}, [x2], x3 //write 16Byte : 1 line
1575
1576    sub x4, x4, #2 // two rows copied this iteration
1577    cbnz x4, w16_copy_loop
1578WELS_ASM_AARCH64_FUNC_END
1579//void McCopyWidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1580//                                  int32_t iHeight);
//
// Copies an 8-byte-wide block row by row from pSrc (x0, pitch x1) to pDst
// (x2, pitch x3). Two rows are moved per iteration and the loop exits only
// when x4 reaches exactly zero, so iHeight has to be a positive even number.
1581WELS_ASM_AARCH64_FUNC_BEGIN McCopyWidthEq8_AArch64_neon
1582    //prfm pldl1strm, [x0]
1583    SIGN_EXTENSION x1,w1
1584    SIGN_EXTENSION x3,w3
1585    SIGN_EXTENSION x4,w4
1586w8_copy_loop:
1587    //prfm pldl1strm, [x0, x1]
1588    ld1 {v0.8b}, [x0], x1  //read 8Byte : 0 line
1589    st1 {v0.8b}, [x2], x3 //write 8Byte : 0 line
1590    //prfm pldl1strm, [x0, x1]
1591    ld1 {v1.8b}, [x0], x1  //read 8Byte : 1 line
1592    st1 {v1.8b}, [x2], x3 //write 8Byte : 1 line
1593
1594    sub x4, x4, #2 // two rows copied this iteration
1595    cbnz x4, w8_copy_loop
1596WELS_ASM_AARCH64_FUNC_END
1597
// Copies a 4-byte-wide block row by row: source pointer in x0 with pitch x1,
// destination pointer in x2 with pitch x3, row count in x4.
// NOTE(review): no prototype comment exists for this routine; presumably it
// matches the WidthEq16/WidthEq8 copy prototypes - confirm against the C header.
// Two rows are moved per iteration and the loop exits only when x4 reaches
// exactly zero, so the row count has to be a positive even number.
1598WELS_ASM_AARCH64_FUNC_BEGIN McCopyWidthEq4_AArch64_neon
1599    //prfm pldl1strm, [x0]
1600    SIGN_EXTENSION x1,w1
1601    SIGN_EXTENSION x3,w3
1602    SIGN_EXTENSION x4,w4
1603w4_copy_loop:
1604    //prfm pldl1strm, [x0, x1]
1605    ld1 {v0.s}[0], [x0], x1  //read 4Byte : 0 line (only 32-bit lane 0 is touched)
1606    st1 {v0.s}[0], [x2], x3 //write 4Byte : 0 line
1607    //prfm pldl1strm, [x0, x1]
1608    ld1 {v1.s}[0], [x0], x1  //read 4Byte : 1 line
1609    st1 {v1.s}[0], [x2], x3 //write 4Byte : 1 line
1610
1611    sub x4, x4, #2 // two rows copied this iteration
1612    cbnz x4, w4_copy_loop
1613WELS_ASM_AARCH64_FUNC_END
1614
1615//void PixStrideAvgWidthEq16_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
1616//const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
//
// Writes, for a 16-byte-wide block, the per-byte combination of pSrcA and pSrcB
// produced by the AVERAGE_TWO_8BITS1/2 macros (defined outside this section;
// presumably a rounded average of the low / high 8 lanes - confirm there).
// Arguments: x0=pDst, x1=iDstStride, x2=pSrcA, x3=iSrcStrideA, x4=pSrcB,
// x5=iSrcStrideB, x6=iHeight.
// Four rows are handled per iteration and the loop exits only when x6 reaches
// exactly zero, so iHeight has to be a positive multiple of 4.
1617
1618WELS_ASM_AARCH64_FUNC_BEGIN PixStrideAvgWidthEq16_AArch64_neon
1619    SIGN_EXTENSION x1,w1
1620    SIGN_EXTENSION x3,w3
1621    SIGN_EXTENSION x5,w5
1622    SIGN_EXTENSION x6,w6
1623enc_w16_pix_avg_loop:
1624    ld1 {v0.16b}, [x2], x3  //read 16Byte : src0: 0 line
1625    ld1 {v1.16b}, [x4], x5  //read 16Byte : src1: 0 line
1626    ld1 {v2.16b}, [x2], x3  //read 16Byte : src0: 1 line
1627    ld1 {v3.16b}, [x4], x5  //read 16Byte : src1: 1 line
1628    ld1 {v4.16b}, [x2], x3  //read 16Byte : src0: 2 line
1629    ld1 {v5.16b}, [x4], x5  //read 16Byte : src1: 2 line
1630    ld1 {v6.16b}, [x2], x3  //read 16Byte : src0: 3 line
1631    ld1 {v7.16b}, [x4], x5  //read 16Byte : src1: 3 line
1632    AVERAGE_TWO_8BITS1  v16, v0, v1
1633    AVERAGE_TWO_8BITS2  v16, v0, v1
1634    st1 {v16.16b}, [x0], x1 //write 16Byte : 0 line
1635
1636
1637    AVERAGE_TWO_8BITS1  v16, v2, v3
1638    AVERAGE_TWO_8BITS2  v16, v2, v3
1639    st1 {v16.16b}, [x0], x1 //write 16Byte : 1 line
1640
1641
1642    AVERAGE_TWO_8BITS1  v16, v4, v5
1643    AVERAGE_TWO_8BITS2  v16, v4, v5
1644    st1 {v16.16b}, [x0], x1 //write 16Byte : 2 line
1645
1646    AVERAGE_TWO_8BITS1  v16, v6, v7
1647    AVERAGE_TWO_8BITS2  v16, v6, v7
1648    st1 {v16.16b}, [x0], x1 //write 16Byte : 3 line
1649
1650    sub x6, x6, #4 // four rows written this iteration
1651    cbnz x6, enc_w16_pix_avg_loop
1652WELS_ASM_AARCH64_FUNC_END
1653
1654//void PixStrideAvgWidthEq8_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
1655//                                        const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
//
// 8-byte-wide variant: each destination row is AVERAGE_TWO_8BITS1 applied to
// the matching rows of pSrcA and pSrcB (macro defined outside this section;
// presumably a rounded per-byte average - confirm there).
// Arguments: x0=pDst, x1=iDstStride, x2=pSrcA, x3=iSrcStrideA, x4=pSrcB,
// x5=iSrcStrideB, x6=iHeight.
// Four rows are handled per iteration and the loop exits only when x6 reaches
// exactly zero, so iHeight has to be a positive multiple of 4.
1656WELS_ASM_AARCH64_FUNC_BEGIN PixStrideAvgWidthEq8_AArch64_neon
1657    //prfm pldl1strm, [x2]
1658    //prfm pldl1strm, [x4]
1659    SIGN_EXTENSION x1,w1
1660    SIGN_EXTENSION x3,w3
1661    SIGN_EXTENSION x5,w5
1662    SIGN_EXTENSION x6,w6
1663enc_w8_pix_avg_loop:
1664    //prfm pldl1strm, [x2, x3]
1665    //prfm pldl1strm, [x4, x5]
1666    ld1 {v0.8b}, [x2], x3  //read 8Byte : src0: 0 line
1667    ld1 {v1.8b}, [x4], x5  //read 8Byte : src1: 0 line
1668    //prfm pldl1strm, [x2, x3]
1669    //prfm pldl1strm, [x4, x5]
1670    ld1 {v2.8b}, [x2], x3  //read 8Byte : src0: 1 line
1671    ld1 {v3.8b}, [x4], x5  //read 8Byte : src1: 1 line
1672    //prfm pldl1strm, [x2, x3]
1673    //prfm pldl1strm, [x4, x5]
1674    ld1 {v4.8b}, [x2], x3  //read 8Byte : src0: 2 line
1675    ld1 {v5.8b}, [x4], x5  //read 8Byte : src1: 2 line
1676    //prfm pldl1strm, [x2, x3]
1677    //prfm pldl1strm, [x4, x5]
1678    ld1 {v6.8b}, [x2], x3  //read 8Byte : src0: 3 line
1679    ld1 {v7.8b}, [x4], x5  //read 8Byte : src1: 3 line
1680    AVERAGE_TWO_8BITS1  v16, v0, v1
1681    st1 {v16.8b}, [x0], x1 //write 8Byte : 0 line
1682
1683    AVERAGE_TWO_8BITS1  v16, v2, v3
1684    st1 {v16.8b}, [x0], x1 //write 8Byte : 1 line
1685
1686
1687    AVERAGE_TWO_8BITS1  v16, v4, v5
1688    st1 {v16.8b}, [x0], x1 //write 8Byte : 2 line
1689
1690    AVERAGE_TWO_8BITS1  v16, v6, v7
1691    st1 {v16.8b}, [x0], x1 //write 8Byte : 3 line
1692
1693    sub x6, x6, #4 // four rows written this iteration
1694    cbnz x6, enc_w8_pix_avg_loop
1695WELS_ASM_AARCH64_FUNC_END
1696//void PixelAvgWidthEq16_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
1697//                                     const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
//
// Writes, for a 16-byte-wide block, the per-byte combination of pSrcA and pSrcB
// produced by the AVERAGE_TWO_8BITS1/2 macros (defined outside this section;
// presumably a rounded average of the low / high 8 lanes - confirm there).
// Arguments: x0=pDst, x1=iDstStride, x2=pSrcA, x3=iSrcAStride, x4=pSrcB,
// x5=iSrcBStride, x6=iHeight.
// Four rows are handled per iteration and the loop exits only when x6 reaches
// exactly zero, so iHeight has to be a positive multiple of 4.
1698WELS_ASM_AARCH64_FUNC_BEGIN PixelAvgWidthEq16_AArch64_neon
1699    //prfm pldl1strm, [x2]
1700    //prfm pldl1strm, [x4]
1701    SIGN_EXTENSION x1,w1
1702    SIGN_EXTENSION x3,w3
1703    SIGN_EXTENSION x5,w5
1704    SIGN_EXTENSION x6,w6
1705w16_pix_avg_loop:
1706    //prfm pldl1strm, [x2, x3]
1707    //prfm pldl1strm, [x4, x5]
1708    ld1 {v0.16b}, [x2], x3  //read 16Byte : src0: 0 line
1709    ld1 {v1.16b}, [x4], x5  //read 16Byte : src1: 0 line
1710    //prfm pldl1strm, [x2, x3]
1711    //prfm pldl1strm, [x4, x5]
1712    ld1 {v2.16b}, [x2], x3  //read 16Byte : src0: 1 line
1713    ld1 {v3.16b}, [x4], x5  //read 16Byte : src1: 1 line
1714    //prfm pldl1strm, [x2, x3]
1715    //prfm pldl1strm, [x4, x5]
1716    ld1 {v4.16b}, [x2], x3  //read 16Byte : src0: 2 line
1717    ld1 {v5.16b}, [x4], x5  //read 16Byte : src1: 2 line
1718    //prfm pldl1strm, [x2, x3]
1719    //prfm pldl1strm, [x4, x5]
1720    ld1 {v6.16b}, [x2], x3  //read 16Byte : src0: 3 line
1721    ld1 {v7.16b}, [x4], x5  //read 16Byte : src1: 3 line
1722    AVERAGE_TWO_8BITS1  v16, v0, v1
1723    AVERAGE_TWO_8BITS2  v16, v0, v1
1724    st1 {v16.16b}, [x0], x1 //write 16Byte : 0 line
1725
1726
1727    AVERAGE_TWO_8BITS1  v16, v2, v3
1728    AVERAGE_TWO_8BITS2  v16, v2, v3
1729    st1 {v16.16b}, [x0], x1 //write 16Byte : 1 line
1730
1731
1732    AVERAGE_TWO_8BITS1  v16, v4, v5
1733    AVERAGE_TWO_8BITS2  v16, v4, v5
1734    st1 {v16.16b}, [x0], x1 //write 16Byte : 2 line
1735
1736    AVERAGE_TWO_8BITS1  v16, v6, v7
1737    AVERAGE_TWO_8BITS2  v16, v6, v7
1738    st1 {v16.16b}, [x0], x1 //write 16Byte : 3 line
1739
1740    sub x6, x6, #4 // four rows written this iteration
1741    cbnz x6, w16_pix_avg_loop
1742WELS_ASM_AARCH64_FUNC_END
1743//void PixelAvgWidthEq8_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
1744//                                   const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
//
// 8-byte-wide variant: each destination row is AVERAGE_TWO_8BITS1 applied to
// the matching rows of pSrcA and pSrcB (macro defined outside this section;
// presumably a rounded per-byte average - confirm there).
// Arguments: x0=pDst, x1=iDstStride, x2=pSrcA, x3=iSrcAStride, x4=pSrcB,
// x5=iSrcBStride, x6=iHeight.
// Four rows are handled per iteration and the loop exits only when x6 reaches
// exactly zero, so iHeight has to be a positive multiple of 4.
1745WELS_ASM_AARCH64_FUNC_BEGIN PixelAvgWidthEq8_AArch64_neon
1746    //prfm pldl1strm, [x2]
1747    //prfm pldl1strm, [x4]
1748    SIGN_EXTENSION x1,w1
1749    SIGN_EXTENSION x3,w3
1750    SIGN_EXTENSION x5,w5
1751    SIGN_EXTENSION x6,w6
1752w8_pix_avg_loop:
1753    //prfm pldl1strm, [x2, x3]
1754    //prfm pldl1strm, [x4, x5]
1755    ld1 {v0.8b}, [x2], x3  //read 8Byte : src0: 0 line
1756    ld1 {v1.8b}, [x4], x5  //read 8Byte : src1: 0 line
1757    //prfm pldl1strm, [x2, x3]
1758    //prfm pldl1strm, [x4, x5]
1759    ld1 {v2.8b}, [x2], x3  //read 8Byte : src0: 1 line
1760    ld1 {v3.8b}, [x4], x5  //read 8Byte : src1: 1 line
1761    //prfm pldl1strm, [x2, x3]
1762    //prfm pldl1strm, [x4, x5]
1763    ld1 {v4.8b}, [x2], x3  //read 8Byte : src0: 2 line
1764    ld1 {v5.8b}, [x4], x5  //read 8Byte : src1: 2 line
1765    //prfm pldl1strm, [x2, x3]
1766    //prfm pldl1strm, [x4, x5]
1767    ld1 {v6.8b}, [x2], x3  //read 8Byte : src0: 3 line
1768    ld1 {v7.8b}, [x4], x5  //read 8Byte : src1: 3 line
1769    AVERAGE_TWO_8BITS1  v16, v0, v1
1770    st1 {v16.8b}, [x0], x1 //write 8Byte : 0 line
1771
1772    AVERAGE_TWO_8BITS1  v16, v2, v3
1773    st1 {v16.8b}, [x0], x1 //write 8Byte : 1 line
1774
1775
1776    AVERAGE_TWO_8BITS1  v16, v4, v5
1777    st1 {v16.8b}, [x0], x1 //write 8Byte : 2 line
1778
1779    AVERAGE_TWO_8BITS1  v16, v6, v7
1780    st1 {v16.8b}, [x0], x1 //write 8Byte : 3 line
1781
1782    sub x6, x6, #4 // four rows written this iteration
1783    cbnz x6, w8_pix_avg_loop
1784WELS_ASM_AARCH64_FUNC_END
1785
1786//void PixelAvgWidthEq4_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
1787//                                    const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
//
// 4-byte-wide variant. Two consecutive rows of each source are packed into
// 32-bit lanes 0 and 1 of one register so that a single AVERAGE_TWO_8BITS1
// (macro defined outside this section; presumably a rounded per-byte average -
// confirm there) covers both rows.
// Arguments: x0=pDst, x1=iDstStride, x2=pSrcA, x3=iSrcAStride, x4=pSrcB,
// x5=iSrcBStride, x6=iHeight.
// Two rows are handled per iteration and the loop exits only when x6 reaches
// exactly zero, so iHeight has to be a positive even number.
1788WELS_ASM_AARCH64_FUNC_BEGIN PixelAvgWidthEq4_AArch64_neon
1789    //prfm pldl1strm, [x2]
1790    //prfm pldl1strm, [x4]
1791    SIGN_EXTENSION x1,w1
1792    SIGN_EXTENSION x3,w3
1793    SIGN_EXTENSION x5,w5
1794    SIGN_EXTENSION x6,w6
1795w4_pix_avg_loop:
1796    //prfm pldl1strm, [x2, x3]
1797    //prfm pldl1strm, [x4, x5]
1798    ld1 {v0.s}[0], [x2], x3  //read 4Byte : src0: 0 line
1799    ld1 {v1.s}[0], [x4], x5  //read 4Byte : src1: 0 line
1800    //prfm pldl1strm, [x2, x3]
1801    //prfm pldl1strm, [x4, x5]
1802    ld1 {v0.s}[1], [x2], x3  //read 4Byte : src0: 1 line
1803    ld1 {v1.s}[1], [x4], x5  //read 4Byte : src1: 1 line
1804    AVERAGE_TWO_8BITS1  v2, v0, v1
1805    st1 {v2.s}[0], [x0], x1 //write 4Byte : 0 line
1806    st1 {v2.s}[1], [x0], x1 //write 4Byte : 1 line
1807
1808    sub x6, x6, #2 // two rows written this iteration
1809    cbnz x6, w4_pix_avg_loop
1810WELS_ASM_AARCH64_FUNC_END
1811//void McChromaWidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1812//                                    int32_t* pWeights, int32_t iHeight);
//
// 8-pixel-wide four-tap weighted interpolation. For every output pixel:
//   dst[x] = (A*src[x] + B*src[x+1] + C*src[x+stride] + D*src[x+stride+1] + 32) >> 6
// (unsigned 8x8->16 multiply-accumulate, then rounding narrowing shift by 6).
// Arguments: x0=pSrc, x1=iSrcStride, x2=pDst, x3=iDstStride, x4=pWeights,
// x5=iHeight.
// Although the prototype spells pWeights as int32_t*, the code reads exactly
// four consecutive BYTES at x4 (A, B, C, D) and broadcasts each to 8 lanes.
// Four rows are produced per iteration and the loop exits only when x5 reaches
// exactly zero, so iHeight has to be a positive multiple of 4.
// Each source row is fetched as a full 16-byte vector.
1813WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq8_AArch64_neon
1814    SIGN_EXTENSION x1,w1
1815    SIGN_EXTENSION x3,w3
1816    SIGN_EXTENSION x5,w5
1817    ld4r {v28.8b, v29.8b, v30.8b, v31.8b}, [x4] //load A/B/C/D
1818    ld1 {v16.16b}, [x0], x1  // src[x]
1819    ext v17.16b, v16.16b, v16.16b, #1  // src[x+1]
1820w8_mc_chroma_loop:
1821    ld1 {v18.16b}, [x0], x1  // src[x+stride]
1822    ext v19.16b, v18.16b, v18.16b, #1  // src[x+stride+1]
1823
1824    ld1 {v20.16b}, [x0], x1  // src[x+2*stride]
1825    ext v21.16b, v20.16b, v20.16b, #1  // src[x+2*stride+1]
1826
1827    ld1 {v22.16b}, [x0], x1  // src[x+3*stride]
1828    ext v23.16b, v22.16b, v22.16b, #1  // src[x+3*stride+1]
1829
1830    ld1 {v24.16b}, [x0], x1  // src[x+4*stride]
1831    ext v25.16b, v24.16b, v24.16b, #1  // src[x+4*stride+1]
1832
    // A * top-left pixel, one accumulator per output row (v0, v2, v4, v6)
1833    umull v0.8h, v16.8b, v28.8b
1834    umull v2.8h, v18.8b, v28.8b
1835    umull v4.8h, v20.8b, v28.8b
1836    umull v6.8h, v22.8b, v28.8b
1837
    // + B * top-right pixel
1838    umlal v0.8h, v17.8b, v29.8b
1839    umlal v2.8h, v19.8b, v29.8b
1840    umlal v4.8h, v21.8b, v29.8b
1841    umlal v6.8h, v23.8b, v29.8b
1842
    // + C * bottom-left pixel (the row below)
1843    umlal v0.8h, v18.8b, v30.8b
1844    umlal v2.8h, v20.8b, v30.8b
1845    umlal v4.8h, v22.8b, v30.8b
1846    umlal v6.8h, v24.8b, v30.8b
1847
    // + D * bottom-right pixel
1848    umlal v0.8h, v19.8b, v31.8b
1849    umlal v2.8h, v21.8b, v31.8b
1850    umlal v4.8h, v23.8b, v31.8b
1851    umlal v6.8h, v25.8b, v31.8b
1852
1853    rshrn v1.8b, v0.8h, #6 // (sum + 32) >> 6, narrowed to bytes
1854    st1 {v1.8b}, [x2], x3
1855
1856    rshrn v3.8b, v2.8h, #6
1857    st1 {v3.8b}, [x2], x3
1858
1859    rshrn v5.8b, v4.8h, #6
1860    st1 {v5.8b}, [x2], x3
1861
1862    rshrn v7.8b, v6.8h, #6
1863    st1 {v7.8b}, [x2], x3
1864
1865    mov v16.16b, v24.16b // last loaded row becomes the top row of the next iteration
1866    mov v17.16b, v25.16b
1867    sub x5, x5, #4 // four rows written this iteration
1868    cbnz x5, w8_mc_chroma_loop
1869WELS_ASM_AARCH64_FUNC_END
1870//void McChromaWidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1871//                                    int32_t* pWeights, int32_t iHeight);
//
// 4-pixel-wide four-tap weighted interpolation. For every output pixel:
//   dst[x] = (A*src[x] + B*src[x+1] + C*src[x+stride] + D*src[x+stride+1] + 32) >> 6
// Two output rows are packed side by side (4 bytes each) in one 8-byte vector
// so that a single multiply-accumulate chain produces both.
// Arguments: x0=pSrc, x1=iSrcStride, x2=pDst, x3=iDstStride, x4=pWeights,
// x5=iHeight.
// Although the prototype spells pWeights as int32_t*, the code reads exactly
// four consecutive BYTES at x4 (A, B, C, D) and broadcasts each to 8 lanes.
// The loop exits only when x5 reaches exactly zero, so iHeight has to be a
// positive even number. Each source row is fetched as 8 bytes.
1872WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq4_AArch64_neon
1873    SIGN_EXTENSION x1,w1
1874    SIGN_EXTENSION x3,w3
1875    SIGN_EXTENSION x5,w5
1876    ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] //load A/B/C/D
1877    ld1 {v0.8b}, [x0], x1  // src[x]
1878    ext v1.8b, v0.8b, v0.8b, #1  // src[x+1]
1879w4_mc_chroma_loop:
1880    ld1 {v2.8b}, [x0], x1  // src[x+stride]
1881    ext v3.8b, v2.8b, v2.8b, #1  // src[x+stride+1]
1882    ld1 {v18.8b}, [x0], x1  // src[x+2*stride]
1883    ext v19.8b, v18.8b, v18.8b, #1  // src[x+2*stride+1]
1884
    // Interleave 32-bit lanes so the low 8 bytes of each register hold
    // "4 pixels for output row 0 | 4 pixels for output row 1".
    // Order matters: v2/v3 are read by the first two zips before being rewritten.
1885    zip1 v0.4s, v0.4s, v2.4s
1886    zip1 v1.4s, v1.4s, v3.4s
1887    zip1 v2.4s, v2.4s, v18.4s
1888    zip1 v3.4s, v3.4s, v19.4s
1889
1890    umull v16.8h, v0.8b, v4.8b // A * top-left
1891    umlal v16.8h, v1.8b, v5.8b // + B * top-right
1892    umlal v16.8h, v2.8b, v6.8b // + C * bottom-left
1893    umlal v16.8h, v3.8b, v7.8b // + D * bottom-right
1894    rshrn v17.8b, v16.8h, #6 // (sum + 32) >> 6, narrowed to bytes
1895    st1 {v17.s}[0], [x2], x3 // output row 0
1896    st1 {v17.s}[1], [x2], x3 // output row 1
1897
1898    mov v0.8b, v18.8b // last loaded row becomes the top row of the next iteration
1899    mov v1.8b, v19.8b
1900    sub x5, x5, #2 // two rows written this iteration
1901    cbnz x5, w4_mc_chroma_loop
1902WELS_ASM_AARCH64_FUNC_END
1903
1904//void McHorVer20Width17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1905//                                    int32_t iHeight);// width+1
//
// Horizontal 6-tap luma filter producing 17 output bytes per row: the first 16
// through FILTER_6TAG_8BITS1/2 and the 17th through FILTER_SINGLE_TAG_8BITS
// with the filter_para table (.short 0, 1, -5, 20, 0, 0, 0, 0). Apart from
// FILTER_6TAG_8BITS1, whose head is at the top of this file, those macros are
// defined outside this section; consult them for arithmetic and scratch usage.
// Arguments: x0=pSrc, x1=iSrcStride, x2=pDst, x3=iDstStride, x4=iHeight.
// One row per iteration; the loop exits only when x4 reaches exactly zero, so
// iHeight has to be positive. Each source row is fetched as 32 bytes from pSrc-2.
1906WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width17_AArch64_neon
1907    SIGN_EXTENSION x1,w1
1908    SIGN_EXTENSION x3,w3
1909    SIGN_EXTENSION x4,w4
1910    sub x0, x0, #2 // start 2 pixels to the left of pSrc
1911    sub x3, x3, #16 // x3 = iDstStride-16: applied after the 1-byte store ...
1912    mov x5, #16 // ... x5 = 16: applied after the 16-byte store (net: +iDstStride per row)
1913    movi v0.8h, #20, lsl #0 // v0 = 20 in every 16-bit lane (multiplier handed to the filter macros)
1914    movi v1.8h, #5, lsl #0 // v1 = 5 in every 16-bit lane (multiplier handed to the filter macros)
1915    ldr q22, filter_para // coefficient table consumed by FILTER_SINGLE_TAG_8BITS
1916w17_h_mc_luma_loop:
1917    ld1 {v2.16b, v3.16b}, [x0], x1 //only use 22(17+5); v2=src[-2]
1918
1919    //prfm pldl1strm, [x0]
1920    ext v5.16b, v2.16b, v3.16b, #1    //v5=src[-1]
1921    ext v6.16b, v2.16b, v3.16b, #2    //v6=src[0]
1922    ext v7.16b, v2.16b, v3.16b, #3    //v7=src[1]
1923    ext v16.16b, v2.16b, v3.16b, #4   //v16=src[2]
1924    ext v17.16b, v2.16b, v3.16b, #5   //v17=src[3]
1925
1926    FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
1927    FILTER_6TAG_8BITS2 v2, v5, v6, v7, v16, v17, v20, v0, v1
1928    st1 {v20.16b}, [x2], x5 //write 16Byte
1929
1930    ext v21.8b, v3.8b, v3.8b, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
1931    FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
1932    st1 {v21.b}[0], [x2], x3 //write byte index 16 (the 17th output), then move to the next dst row
1933
1934    sub x4, x4, #1 // one row written this iteration
1935    cbnz x4, w17_h_mc_luma_loop
1936WELS_ASM_AARCH64_FUNC_END
1937
1938//void McHorVer20Width9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1939//                                    int32_t iHeight);// width+1
//
// Horizontal 6-tap luma filter producing 9 output bytes per row: the first 8
// through FILTER_6TAG_8BITS1 and the 9th through FILTER_SINGLE_TAG_8BITS with
// the filter_para table (.short 0, 1, -5, 20, 0, 0, 0, 0). FILTER_SINGLE_TAG_8BITS
// is defined outside this section; consult it for arithmetic and scratch usage.
// Arguments: x0=pSrc, x1=iSrcStride, x2=pDst, x3=iDstStride, x4=iHeight.
// One row per iteration; the loop exits only when x4 reaches exactly zero, so
// iHeight has to be positive. Each source row is fetched as 16 bytes from pSrc-2.
1940WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width9_AArch64_neon
1941    SIGN_EXTENSION x1,w1
1942    SIGN_EXTENSION x3,w3
1943    SIGN_EXTENSION x4,w4
1944    sub x0, x0, #2 // start 2 pixels to the left of pSrc
1945    sub x3, x3, #8 // x3 = iDstStride-8: applied after the 1-byte store ...
1946    mov x5, #8 // ... x5 = 8: applied after the 8-byte store (net: +iDstStride per row)
1947    movi v0.8h, #20, lsl #0 // v0 = 20 in every 16-bit lane (multiplier handed to the filter macros)
1948    movi v1.8h, #5, lsl #0 // v1 = 5 in every 16-bit lane (multiplier handed to the filter macros)
1949    ldr q22, filter_para // coefficient table consumed by FILTER_SINGLE_TAG_8BITS
1950w9_h_mc_luma_loop:
1951    ld1 {v2.16b}, [x0], x1 //only use 14(9+5); v2=src[-2]
1952    mov v3.d[0], v2.d[1] // v3 low half = bytes 8..15 of the row, input for the 9th pixel
1953    //prfm pldl1strm, [x0]
    // Only the low 8 lanes of v5..v17 are consumed (FILTER_6TAG_8BITS1 works on
    // .8b), and for shifts of 1..5 those lanes come entirely from v2. The second
    // ext operand is therefore v2 itself instead of a register this function
    // never writes.
1954    ext v5.16b, v2.16b, v2.16b, #1    //v5=src[-1]
1955    ext v6.16b, v2.16b, v2.16b, #2    //v6=src[0]
1956    ext v7.16b, v2.16b, v2.16b, #3    //v7=src[1]
1957    ext v16.16b, v2.16b, v2.16b, #4   //v16=src[2]
1958    ext v17.16b, v2.16b, v2.16b, #5   //v17=src[3]
1959
1960    FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
1961    st1 {v20.8b}, [x2], x5 //write 8Byte
1962
1963    ext v21.8b, v3.8b, v3.8b, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
1964    FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
1965    st1 {v21.b}[0], [x2], x3 //write 9th Byte, then move to the next dst row
1966
1967    sub x4, x4, #1 // one row written this iteration
1968    cbnz x4, w9_h_mc_luma_loop
1969WELS_ASM_AARCH64_FUNC_END
1970
1971//void McHorVer20Width5_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
1972//                                    int32_t iHeight);// width+1
//
// Horizontal 6-tap luma filter producing 5 output bytes per row, all taken
// from the low lanes of a single FILTER_6TAG_8BITS1 result.
// Arguments: x0=pSrc, x1=iSrcStride, x2=pDst, x3=iDstStride, x4=iHeight.
// One row per iteration; the loop exits only when x4 reaches exactly zero, so
// iHeight has to be positive. Each source row is fetched as 16 bytes from pSrc-2.
1973WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width5_AArch64_neon
1974    SIGN_EXTENSION x1,w1
1975    SIGN_EXTENSION x3,w3
1976    SIGN_EXTENSION x4,w4
1977    sub x0, x0, #2 // start 2 pixels to the left of pSrc
1978    sub x3, x3, #4 // x3 = iDstStride-4: applied after the 1-byte store ...
1979    mov x5, #4 // ... x5 = 4: applied after the 4-byte store (net: +iDstStride per row)
1980    movi v0.8h, #20, lsl #0 // v0 = 20 in every 16-bit lane (multiplier handed to the filter macro)
1981    movi v1.8h, #5, lsl #0 // v1 = 5 in every 16-bit lane (multiplier handed to the filter macro)
1982w5_h_mc_luma_loop:
1983    ld1 {v2.16b}, [x0], x1 //only use 10(5+5); v2=src[-2]
1984
    // Only the low 8 lanes of v5..v17 are consumed (FILTER_6TAG_8BITS1 works on
    // .8b), and for shifts of 1..5 those lanes come entirely from v2. The second
    // ext operand is therefore v2 itself instead of a register this function
    // never writes.
1985    ext v5.16b, v2.16b, v2.16b, #1    //v5=src[-1]
1986    ext v6.16b, v2.16b, v2.16b, #2    //v6=src[0]
1987    ext v7.16b, v2.16b, v2.16b, #3    //v7=src[1]
1988    ext v16.16b, v2.16b, v2.16b, #4   //v16=src[2]
1989    ext v17.16b, v2.16b, v2.16b, #5   //v17=src[3]
1990
1991    FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
1992    st1 {v20.s}[0], [x2], x5 //write 4Byte
1993    st1 {v20.b}[4], [x2], x3 //write 5th Byte, then move to the next dst row
1994
1995    sub x4, x4, #1 // one row written this iteration
1996    cbnz x4, w5_h_mc_luma_loop
1997WELS_ASM_AARCH64_FUNC_END
1998
1999//void McHorVer22Width17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
2000//                                     int32_t iHeight);
//
// 17-pixel-wide luma interpolation: each output row is filtered vertically
// (FILTER_6TAG_8BITS_TO_16BITS1) and then horizontally (UNPACK_2_16BITS_TO_ABC,
// FILTER_3_IN_16BITS_TO_8BITS1/2 for bytes 0..15 and
// UNPACK_FILTER_SINGLE_TAG_16BITS with filter_para for byte 16). Those macros
// are defined outside this section; consult them for the exact arithmetic and
// the scratch registers they use.
// Arguments: x0=pSrc, x1=iSrcStride, x2=pDst, x3=iDstStride, x4=iHeight.
// Each source row is loaded as three 8-byte vectors (24 bytes) starting at
// pSrc-2, two rows above the first output row.
// Row count: x4 is reduced by one up front, the loop emits 8 rows per pass and
// exits only when x4 reaches exactly zero, then one further row is emitted
// after the loop - so iHeight-1 has to be a positive multiple of 8.
// v8-v15 are used as row storage, hence d8-d15 are saved and restored here.
2001WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width17_AArch64_neon
2002    stp d8, d9, [sp,#-16]!
2003    stp d10, d11, [sp,#-16]!
2004    stp d12, d13, [sp,#-16]!
2005    stp d14, d15, [sp,#-16]!
2006    SIGN_EXTENSION x1,w1
2007    SIGN_EXTENSION x3,w3
2008    SIGN_EXTENSION x4,w4
2009    sub x0, x0, #2 // step 2 pixels to the left of pSrc
2010    sub x0, x0, x1, lsl #1 // and 2 rows above it
2011    movi v0.8h, #20, lsl #0 // v0 = 20 in every 16-bit lane (multiplier handed to the filter macros)
2012    movi v1.8h, #5, lsl #0 // v1 = 5 in every 16-bit lane (multiplier handed to the filter macros)
2013    sub x3, x3, #16 // x3 = iDstStride-16: applied after the 1-byte store ...
2014    mov x5, #16 // ... x5 = 16: applied after the 16-byte store (net: +iDstStride per row)
2015    ldr q29, filter_para // coefficient table consumed by UNPACK_FILTER_SINGLE_TAG_16BITS
2016
2017    sub x4, x4, #1 // the final row is produced after the loop
2018
2019    //prfm pldl1strm, [x0]
2020    //prfm pldl1strm, [x0, x1]
2021    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2=src[-2*stride]
2022    //prfm pldl1strm, [x0, x1]
2023    ld1 {v5.8b, v6.8b, v7.8b}, [x0], x1 // v5=src[-1*stride]
2024    //prfm pldl1strm, [x0, x1]
2025    ld1 {v8.8b, v9.8b, v10.8b}, [x0], x1 // v8=src[0*stride]
2026    //prfm pldl1strm, [x0, x1]
2027    ld1 {v11.8b, v12.8b, v13.8b}, [x0], x1 // v11=src[1*stride]
2028    //prfm pldl1strm, [x0, x1]
2029    ld1 {v14.8b, v15.8b, v16.8b}, [x0], x1 // v14=src[2*stride]
2030
2031w17_hv_mc_luma_loop:
2032    //prfm pldl1strm, [x0, x1]
2033    ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17=src[3*stride]
2034    // vertical filtered into v20/v21
2035    FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
2036    FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
2037    // horizon filtered
2038    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
2039    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
2040    // vertical filtered into v21/v22
2041    FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
2042    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
2043    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
2044    st1 {v26.16b}, [x2], x5 //write 0:15 Byte : 0 line
2045    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
2046    st1 {v26.b}[0], [x2], x3 //write 16th Byte : 0 line
2047
2048    //prfm pldl1strm, [x0, x1]
2049    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2=src[4*stride]
2050    // vertical filtered into v20/v21
2051    FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
2052    FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
2053    // horizon filtered
2054    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
2055    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
2056    // vertical filtered into v21/v22
2057    FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
2058    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
2059    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
2060    st1 {v26.16b}, [x2], x5 //write 0:15Byte : 1 line
2061    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
2062    st1 {v26.b}[0], [x2], x3 //write 16th Byte : 1 line
2063
2064    //prfm pldl1strm, [x0, x1]
2065    ld1 {v5.8b, v6.8b, v7.8b}, [x0], x1 // v5=src[5*stride]
2066    // vertical filtered into v20/v21
2067    FILTER_6TAG_8BITS_TO_16BITS1 v8, v11, v14, v17, v2, v5, v20, v0, v1
2068    FILTER_6TAG_8BITS_TO_16BITS1 v9, v12, v15, v18, v3, v6, v21, v0, v1
2069    // horizon filtered
2070    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
2071    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
2072    // vertical filtered into v21/v22
2073    FILTER_6TAG_8BITS_TO_16BITS1 v10, v13, v16, v19, v4, v7, v22, v0, v1
2074    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
2075    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
2076    st1 {v26.16b}, [x2], x5 //write 0:15Byte : 2 line
2077    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
2078    st1 {v26.b}[0], [x2], x3 //write 16th Byte : 2 line
2079
2080    //prfm pldl1strm, [x0, x1]
2081    ld1 {v8.8b, v9.8b, v10.8b}, [x0], x1 // v8=src[6*stride]
2082    // vertical filtered into v20/v21
2083    FILTER_6TAG_8BITS_TO_16BITS1 v11, v14, v17, v2, v5, v8, v20, v0, v1
2084    FILTER_6TAG_8BITS_TO_16BITS1 v12, v15, v18, v3, v6, v9, v21, v0, v1
2085    // horizon filtered
2086    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
2087    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
2088    // vertical filtered into v21/v22
2089    FILTER_6TAG_8BITS_TO_16BITS1 v13, v16, v19, v4, v7, v10, v22, v0, v1
2090    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
2091    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
2092    st1 {v26.16b}, [x2], x5 //write 0:15Byte : 3 line
2093    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
2094    st1 {v26.b}[0], [x2], x3 //write 16th Byte : 3 line
2095
2096    //prfm pldl1strm, [x0, x1]
2097    ld1 {v11.8b, v12.8b, v13.8b}, [x0], x1 // v11=src[7*stride]
2098    // vertical filtered into v20/v21
2099    FILTER_6TAG_8BITS_TO_16BITS1 v14, v17, v2, v5, v8, v11, v20, v0, v1
2100    FILTER_6TAG_8BITS_TO_16BITS1 v15, v18, v3, v6, v9, v12, v21, v0, v1
2101    // horizon filtered
2102    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
2103    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
2104    // vertical filtered into v21/v22
2105    FILTER_6TAG_8BITS_TO_16BITS1 v16, v19, v4, v7, v10, v13, v22, v0, v1
2106    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
2107    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
2108    st1 {v26.16b}, [x2], x5 //write 0:15Byte : 4 line
2109    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
2110    st1 {v26.b}[0], [x2], x3 //write 16th Byte : 4 line
2111
2112    //prfm pldl1strm, [x0, x1]
2113    ld1 {v14.8b, v15.8b, v16.8b}, [x0], x1 // v14=src[8*stride]
2114    // vertical filtered into v20/v21
2115    FILTER_6TAG_8BITS_TO_16BITS1 v17, v2, v5, v8, v11, v14, v20, v0, v1
2116    FILTER_6TAG_8BITS_TO_16BITS1 v18, v3, v6, v9, v12, v15, v21, v0, v1
2117    // horizon filtered
2118    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
2119    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
2120    // vertical filtered into v21/v22
2121    FILTER_6TAG_8BITS_TO_16BITS1 v19, v4, v7, v10, v13, v16, v22, v0, v1
2122    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
2123    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
2124    st1 {v26.16b}, [x2], x5 //write 0:15Byte : 5 line
2125    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
2126    st1 {v26.b}[0], [x2], x3 //write 16th Byte : 5 line
2127
2128    //prfm pldl1strm, [x0, x1]
2129    ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17=src[9*stride]
2130    // vertical filtered into v20/v21
2131    FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
2132    FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
2133    // horizon filtered
2134    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
2135    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
2136    // vertical filtered into v21/v22
2137    FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
2138    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
2139    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
2140    st1 {v26.16b}, [x2], x5 //write 0:15Byte : 6 line
2141    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
2142    st1 {v26.b}[0], [x2], x3 //write 16th Byte : 6 line
2143
2144    //prfm pldl1strm, [x0, x1]
2145    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2=src[10*stride]
2146    // vertical filtered into v20/v21
2147    FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
2148    FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
2149    // horizon filtered
2150    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
2151    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
2152    // vertical filtered into v21/v22
2153    FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
2154    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
2155    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
2156    st1 {v26.16b}, [x2], x5 //write 0:15Byte : 7 line
2157    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
2158    st1 {v26.b}[0], [x2], x3 //write 16th Byte : 7 line
2159
    // Rotate the first column of row registers so that v2, v5, v8, v11, v14 once
    // more hold the five most recently loaded rows, oldest first (they currently
    // sit in v8, v11, v14, v17, v2). v30 is only a temporary for the swap.
2160    mov v5.16b, v11.16b
2161    mov v11.16b, v17.16b
2162    mov v30.16b, v2.16b
2163    mov v2.16b, v8.16b
2164    mov v8.16b, v14.16b
2165    mov v14.16b, v30.16b
2166
    // Same rotation for the second column (v3, v6, v9, v12, v15).
2167    mov v6.16b, v12.16b
2168    mov v12.16b, v18.16b
2169    mov v30.16b, v3.16b
2170    mov v3.16b, v9.16b
2171    mov v9.16b, v15.16b
2172    mov v15.16b, v30.16b
2173
    // Same rotation for the third column (v4, v7, v10, v13, v16).
2174    mov v7.16b, v13.16b
2175    mov v13.16b, v19.16b
2176    mov v30.16b, v4.16b
2177    mov v4.16b, v10.16b
2178    mov v10.16b, v16.16b
2179    mov v16.16b, v30.16b
2180
2181    sub x4, x4, #8 // eight rows written this pass
2182    cbnz x4, w17_hv_mc_luma_loop
2183
    // Final row: same sequence as "0 line" inside the loop, executed once.
2184    //prfm pldl1strm, [x0, x1]
2185    ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17=src[3*stride]
2186    // vertical filtered into v20/v21
2187    FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
2188    FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
2189    // horizon filtered
2190    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
2191    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
2192    // vertical filtered into v21/v22
2193    FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
2194    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
2195    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
2196    st1 {v26.16b}, [x2], x5 //write 0:15 Byte : 0 line
2197    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
2198    st1 {v26.b}[0], [x2], x3 //write 16th Byte : 0 line
2199
    // Restore d8-d15 in the reverse order of the pushes at function entry.
2200    ldp d14, d15, [sp], #16
2201    ldp d12, d13, [sp], #16
2202    ldp d10, d11, [sp], #16
2203    ldp d8, d9, [sp], #16
2204WELS_ASM_AARCH64_FUNC_END
2205
//void McHorVer22Width9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                    int32_t iHeight);//width+1&&height+1
//
// Two-pass luma filter (vertical pass first, horizontal pass second) that writes
// 9 bytes per output row: 8 with one vector store, then 1 trailing byte.
//
// Arguments as they are used below:
//   x0 = pSrc        rewound by 2 bytes and by 2 rows before the first load;
//                    every row load reads 16 bytes and post-increments by x1
//   x1 = iSrcStride
//   x2 = pDst        advances by 8 + (iDstStride - 8) = iDstStride per row
//   x3 = iDstStride  rewritten to iDstStride - 8
//   x4 = iHeight     rewritten to iHeight - 1; each loop pass emits 4 rows and the
//                    loop leaves only when the counter is exactly 0, after which one
//                    more row is emitted. iHeight - 1 therefore has to be a positive
//                    multiple of 4 (for example iHeight = 9); any other value never
//                    satisfies the cbnz exit test.
//   x5               constant 8 (post-increment of the first store of a row)
// SIGN_EXTENSION is defined outside this section; presumably it widens the int32_t
// arguments to 64 bits -- confirm in arm_arch64_common_macro.S.
//
// Vector registers named in this function:
//   v0 = 20 per halfword, v1 = 5 per halfword (filter multipliers)
//   v2-v7   sliding window of six consecutive source rows
//   v20/v21 16-bit output of the vertical pass
//   v23-v28 scratch and packed result of the horizontal pass
//   v29     filter_para table (.short 0, 1, -5, 20, 0, 0, 0, 0)
//   v30     temporary used while rotating the row window
// The filter macros may touch additional scratch registers of their own.

.macro MC_HV_W9_ROW arg0, arg1, arg2, arg3, arg4, arg5
//  {   // input: rows src[-2], src[-1], src[0], src[1], src[2] (already loaded), register that receives row src[3]
//      // output: 9 bytes at [x2]; x0 moves down one source row, x2 moves down one destination row
    //prfm pldl1strm, [x0, x1]
    ld1     {\arg5\().16b}, [x0], x1                    // fetch 16 bytes of row src[3]
    // vertical pass, kept at 16 bits in v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS2 \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, v21, v0, v1
    // horizontal pass over the 16-bit intermediates
    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26     // result in the low 8 bytes of v26
    st1     {v26.8b}, [x2], x5                          // output bytes 0..7, x2 += 8
    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
    st1     {v26.b}[0], [x2], x3                        // output byte 8 (the ninth), x2 += iDstStride - 8
//  }
.endm

WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width9_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    sub     x0, x0, #2                  // start 2 columns to the left ...
    sub     x0, x0, x1, lsl #1          // ... and 2 rows above pSrc
    movi    v0.8h, #20, lsl #0
    movi    v1.8h, #5, lsl #0
    sub     x3, x3, #8                  // remainder of the row step after the 8-byte store
    mov     x5, #8
    ldr     q29, filter_para
    sub     x4, x4, #1                  // the last row is produced after the loop

    // prime the window with the five rows above and around the first output row
    //prfm pldl1strm, [x0]
    ld1     {v2.16b}, [x0], x1          // v2=src[-2*stride]
    ld1     {v3.16b}, [x0], x1          // v3=src[-1*stride]
    ld1     {v4.16b}, [x0], x1          // v4=src[0*stride]
    ld1     {v5.16b}, [x0], x1          // v5=src[1*stride]
    ld1     {v6.16b}, [x0], x1          // v6=src[2*stride]

w9_hv_mc_luma_loop:
    // four rows per pass; the window registers rotate by one position per row
    MC_HV_W9_ROW v2, v3, v4, v5, v6, v7     // line 0, loads src[3*stride] into v7
    MC_HV_W9_ROW v3, v4, v5, v6, v7, v2     // line 1, loads src[4*stride] into v2
    MC_HV_W9_ROW v4, v5, v6, v7, v2, v3     // line 2, loads src[5*stride] into v3
    MC_HV_W9_ROW v5, v6, v7, v2, v3, v4     // line 3, loads src[6*stride] into v4

    // put rows src[2..6*stride] back into v2..v6 for the next pass
    mov     v5.16b, v3.16b              // v5 = src[5*stride]
    mov     v3.16b, v7.16b              // v3 = src[3*stride]
    mov     v30.16b, v2.16b             // park src[4*stride]
    mov     v2.16b, v6.16b              // v2 = src[2*stride]
    mov     v6.16b, v4.16b              // v6 = src[6*stride]
    mov     v4.16b, v30.16b             // v4 = src[4*stride]

    sub     x4, x4, #4
    cbnz    x4, w9_hv_mc_luma_loop

    // the extra (height+1) row
    MC_HV_W9_ROW v2, v3, v4, v5, v6, v7
WELS_ASM_AARCH64_FUNC_END
2305
//void McHorVer22Width5_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                    int32_t iHeight);//width+1&&height+1
//
// Two-pass luma filter (vertical pass first, horizontal pass second) that writes
// 5 bytes per output row: one 4-byte lane store, then byte lane 4 of the same
// result register.
//
// Arguments as they are used below:
//   x0 = pSrc        rewound by 2 bytes and by 2 rows before the first load;
//                    every row load reads 16 bytes and post-increments by x1
//   x1 = iSrcStride
//   x2 = pDst        advances by 4 + (iDstStride - 4) = iDstStride per row
//   x3 = iDstStride  rewritten to iDstStride - 4
//   x4 = iHeight     rewritten to iHeight - 1; each loop pass emits 4 rows and the
//                    loop leaves only when the counter is exactly 0, after which one
//                    more row is emitted. iHeight - 1 therefore has to be a positive
//                    multiple of 4 (for example iHeight = 5 or 9); any other value
//                    never satisfies the cbnz exit test.
//   x5               constant 4 (post-increment of the first store of a row)
// SIGN_EXTENSION is defined outside this section; presumably it widens the int32_t
// arguments to 64 bits -- confirm in arm_arch64_common_macro.S.
//
// Vector registers named in this function:
//   v0 = 20 per halfword, v1 = 5 per halfword (filter multipliers)
//   v2-v7   sliding window of six consecutive source rows
//   v20/v21 16-bit output of the vertical pass
//   v23-v26 scratch and packed result of the horizontal pass
//   v30     temporary used while rotating the row window
// The filter macros may touch additional scratch registers of their own.
//
// filter_para is not loaded into v29 here: all five output bytes of a row come out
// of FILTER_3_IN_16BITS_TO_8BITS1 (lanes 0..4 of v26), UNPACK_FILTER_SINGLE_TAG_16BITS
// is never invoked, and no macro used below is passed v29.

.macro MC_HV_W5_ROW arg0, arg1, arg2, arg3, arg4, arg5
//  {   // input: rows src[-2], src[-1], src[0], src[1], src[2] (already loaded), register that receives row src[3]
//      // output: 5 bytes at [x2]; x0 moves down one source row, x2 moves down one destination row
    //prfm pldl1strm, [x0, x1]
    ld1     {\arg5\().16b}, [x0], x1                    // fetch 16 bytes of row src[3]
    // vertical pass, kept at 16 bits in v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS2 \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, v21, v0, v1
    // horizontal pass over the 16-bit intermediates
    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26     // result in the low 8 bytes of v26
    st1     {v26.s}[0], [x2], x5                        // output bytes 0..3, x2 += 4
    st1     {v26.b}[4], [x2], x3                        // output byte 4 (the fifth), x2 += iDstStride - 4
//  }
.endm

WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width5_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    sub     x0, x0, #2                  // start 2 columns to the left ...
    sub     x0, x0, x1, lsl #1          // ... and 2 rows above pSrc
    movi    v0.8h, #20, lsl #0
    movi    v1.8h, #5, lsl #0
    sub     x3, x3, #4                  // remainder of the row step after the 4-byte store
    mov     x5, #4
    sub     x4, x4, #1                  // the last row is produced after the loop

    // prime the window with the five rows above and around the first output row
    //prfm pldl1strm, [x0]
    ld1     {v2.16b}, [x0], x1          // v2=src[-2*stride]
    ld1     {v3.16b}, [x0], x1          // v3=src[-1*stride]
    ld1     {v4.16b}, [x0], x1          // v4=src[0*stride]
    ld1     {v5.16b}, [x0], x1          // v5=src[1*stride]
    ld1     {v6.16b}, [x0], x1          // v6=src[2*stride]

w5_hv_mc_luma_loop:
    // four rows per pass; the window registers rotate by one position per row
    MC_HV_W5_ROW v2, v3, v4, v5, v6, v7     // line 0, loads src[3*stride] into v7
    MC_HV_W5_ROW v3, v4, v5, v6, v7, v2     // line 1, loads src[4*stride] into v2
    MC_HV_W5_ROW v4, v5, v6, v7, v2, v3     // line 2, loads src[5*stride] into v3
    MC_HV_W5_ROW v5, v6, v7, v2, v3, v4     // line 3, loads src[6*stride] into v4

    // put rows src[2..6*stride] back into v2..v6 for the next pass
    mov     v5.16b, v3.16b              // v5 = src[5*stride]
    mov     v3.16b, v7.16b              // v3 = src[3*stride]
    mov     v30.16b, v2.16b             // park src[4*stride]
    mov     v2.16b, v6.16b              // v2 = src[2*stride]
    mov     v6.16b, v4.16b              // v6 = src[6*stride]
    mov     v4.16b, v30.16b             // v4 = src[4*stride]

    sub     x4, x4, #4
    cbnz    x4, w5_hv_mc_luma_loop

    // the extra (height+1) row
    MC_HV_W5_ROW v2, v3, v4, v5, v6, v7
WELS_ASM_AARCH64_FUNC_END
2400
//void McHorVer02Height17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                      int32_t iHeight);// height+1
//
// Vertical 6-row luma filter that writes 16 bytes per output row.
//
// Arguments as they are used below:
//   x0 = pSrc        rewound by 2 rows before the first load; every row load reads
//                    16 bytes and post-increments by x1
//   x1 = iSrcStride
//   x2 = pDst        post-incremented by x3 after every row store
//   x3 = iDstStride
//   x4 = iHeight     rewritten to iHeight - 1; each loop pass emits 8 rows and the
//                    loop leaves only when the counter is exactly 0, after which one
//                    more row is emitted. iHeight - 1 therefore has to be a positive
//                    multiple of 8 (for example iHeight = 17); any other value never
//                    satisfies the cbnz exit test.
// SIGN_EXTENSION is defined outside this section; presumably it widens the int32_t
// arguments to 64 bits -- confirm in arm_arch64_common_macro.S.
//
// Vector registers named in this function:
//   v0 = 20 per halfword, v1 = 5 per halfword (filter multipliers)
//   v2-v7   sliding window of six consecutive source rows (v7 doubles as the
//           temporary while the window is rotated)
//   v20     filtered row: FILTER_6TAG_8BITS1 and FILTER_6TAG_8BITS2 both target it
//           before all 16 bytes are stored
// FILTER_6TAG_8BITS1 additionally works in v18/v19; FILTER_6TAG_8BITS2 is defined
// outside this section and may use scratch registers of its own.

.macro MC_VER02_H17_ROW arg0, arg1, arg2, arg3, arg4, arg5
//  {   // input: rows src[-2], src[-1], src[0], src[1], src[2] (already loaded), register that receives row src[3]
//      // output: 16 bytes at [x2]; x0 moves down one source row, x2 moves down one destination row
    //prfm pldl1strm, [x0, x1]
    ld1     {\arg5\().16b}, [x0], x1                    // fetch 16 bytes of row src[3]
    FILTER_6TAG_8BITS1 \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, v20, v0, v1
    FILTER_6TAG_8BITS2 \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, v20, v0, v1
    st1     {v20.16b}, [x2], x3                         // write 16 bytes, x2 += iDstStride
//  }
.endm

WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02Height17_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    sub     x0, x0, x1, lsl #1          // start 2 rows above pSrc
    movi    v0.8h, #20, lsl #0
    movi    v1.8h, #5, lsl #0
    sub     x4, x4, #1                  // the last row is produced after the loop

    // prime the window with the five rows above and around the first output row
    //prfm pldl1strm, [x0]
    ld1     {v2.16b}, [x0], x1          // v2=src[-2*stride]
    ld1     {v3.16b}, [x0], x1          // v3=src[-1*stride]
    ld1     {v4.16b}, [x0], x1          // v4=src[0*stride]
    ld1     {v5.16b}, [x0], x1          // v5=src[1*stride]
    ld1     {v6.16b}, [x0], x1          // v6=src[2*stride]

w17_v_mc_luma_loop:
    // eight rows per pass; the window registers rotate by one position per row
    MC_VER02_H17_ROW v2, v3, v4, v5, v6, v7     // line 0, loads src[3*stride] into v7
    MC_VER02_H17_ROW v3, v4, v5, v6, v7, v2     // line 1, loads src[4*stride] into v2
    MC_VER02_H17_ROW v4, v5, v6, v7, v2, v3     // line 2, loads src[5*stride] into v3
    MC_VER02_H17_ROW v5, v6, v7, v2, v3, v4     // line 3, loads src[6*stride] into v4
    MC_VER02_H17_ROW v6, v7, v2, v3, v4, v5     // line 4, loads src[7*stride] into v5
    MC_VER02_H17_ROW v7, v2, v3, v4, v5, v6     // line 5, loads src[8*stride] into v6
    MC_VER02_H17_ROW v2, v3, v4, v5, v6, v7     // line 6, loads src[9*stride] into v7
    MC_VER02_H17_ROW v3, v4, v5, v6, v7, v2     // line 7, loads src[10*stride] into v2

    // put rows src[6..10*stride] back into v2..v6 for the next pass
    mov     v3.16b, v5.16b              // v3 = src[7*stride]
    mov     v5.16b, v7.16b              // v5 = src[9*stride]
    mov     v7.16b, v2.16b              // park src[10*stride]
    mov     v2.16b, v4.16b              // v2 = src[6*stride]
    mov     v4.16b, v6.16b              // v4 = src[8*stride]
    mov     v6.16b, v7.16b              // v6 = src[10*stride]
    sub     x4, x4, #8
    cbnz    x4, w17_v_mc_luma_loop

    // the extra (height+1) row
    MC_VER02_H17_ROW v2, v3, v4, v5, v6, v7
WELS_ASM_AARCH64_FUNC_END
//void McHorVer02Height9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                     int32_t iHeight);// height+1
//
// Vertical 6-row luma filter that writes 8 bytes per output row.
//
// Arguments as they are used below:
//   x0 = pSrc        rewound by 2 rows before the first load; every row load reads
//                    8 bytes and post-increments by x1
//   x1 = iSrcStride
//   x2 = pDst        post-incremented by x3 after every row store
//   x3 = iDstStride
//   x4 = iHeight     rewritten to iHeight - 1; each loop pass emits 4 rows and the
//                    loop leaves only when the counter is exactly 0, after which one
//                    more row is emitted. iHeight - 1 therefore has to be a positive
//                    multiple of 4 (for example iHeight = 9); any other value never
//                    satisfies the cbnz exit test.
// SIGN_EXTENSION is defined outside this section; presumably it widens the int32_t
// arguments to 64 bits -- confirm in arm_arch64_common_macro.S.
//
// Vector registers named in this function:
//   v0 = 20 per halfword, v1 = 5 per halfword (filter multipliers)
//   v2-v7   sliding window of six consecutive source rows (v7 doubles as the
//           temporary while the window is rotated)
//   v20     filtered row (low 8 bytes are stored)
// FILTER_6TAG_8BITS1 additionally works in v18/v19.

.macro MC_VER02_H9_ROW arg0, arg1, arg2, arg3, arg4, arg5
//  {   // input: rows src[-2], src[-1], src[0], src[1], src[2] (already loaded), register that receives row src[3]
//      // output: 8 bytes at [x2]; x0 moves down one source row, x2 moves down one destination row
    //prfm pldl1strm, [x0, x1]
    ld1     {\arg5\().8b}, [x0], x1                     // fetch 8 bytes of row src[3]
    FILTER_6TAG_8BITS1 \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, v20, v0, v1
    st1     {v20.8b}, [x2], x3                          // write 8 bytes, x2 += iDstStride
//  }
.endm

WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02Height9_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    sub     x0, x0, x1, lsl #1          // start 2 rows above pSrc
    movi    v0.8h, #20, lsl #0
    movi    v1.8h, #5, lsl #0
    sub     x4, x4, #1                  // the last row is produced after the loop

    // prime the window with the five rows above and around the first output row
    //prfm pldl1strm, [x0]
    ld1     {v2.8b}, [x0], x1           // v2=src[-2*stride]
    ld1     {v3.8b}, [x0], x1           // v3=src[-1*stride]
    ld1     {v4.8b}, [x0], x1           // v4=src[0*stride]
    ld1     {v5.8b}, [x0], x1           // v5=src[1*stride]
    ld1     {v6.8b}, [x0], x1           // v6=src[2*stride]

w9_v_mc_luma_loop:
    // four rows per pass; the window registers rotate by one position per row
    MC_VER02_H9_ROW v2, v3, v4, v5, v6, v7      // line 0, loads src[3*stride] into v7
    MC_VER02_H9_ROW v3, v4, v5, v6, v7, v2      // line 1, loads src[4*stride] into v2
    MC_VER02_H9_ROW v4, v5, v6, v7, v2, v3      // line 2, loads src[5*stride] into v3
    MC_VER02_H9_ROW v5, v6, v7, v2, v3, v4      // line 3, loads src[6*stride] into v4

    // put rows src[2..6*stride] back into v2..v6 for the next pass
    mov     v5.16b, v3.16b              // v5 = src[5*stride]
    mov     v3.16b, v7.16b              // v3 = src[3*stride]
    mov     v7.16b, v2.16b              // park src[4*stride]
    mov     v2.16b, v6.16b              // v2 = src[2*stride]
    mov     v6.16b, v4.16b              // v6 = src[6*stride]
    mov     v4.16b, v7.16b              // v4 = src[4*stride]
    sub     x4, x4, #4
    cbnz    x4, w9_v_mc_luma_loop

    // the extra (height+1) row
    MC_VER02_H9_ROW v2, v3, v4, v5, v6, v7
WELS_ASM_AARCH64_FUNC_END
2552
//void McHorVer02Height5_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                     int32_t iHeight);// height+1
//
// Vertical 6-row luma filter that writes 4 bytes per output row.
//
// Arguments as they are used below:
//   x0 = pSrc        rewound by 2 rows before the first load; every row load reads
//                    8 bytes (only 4 filtered bytes are stored) and post-increments
//                    by x1
//   x1 = iSrcStride
//   x2 = pDst        post-incremented by x3 after every row store
//   x3 = iDstStride
//   x4 = iHeight     rewritten to iHeight - 1; each loop pass emits 4 rows and the
//                    loop leaves only when the counter is exactly 0, after which one
//                    more row is emitted. iHeight - 1 therefore has to be a positive
//                    multiple of 4 (for example iHeight = 5 or 9); any other value
//                    never satisfies the cbnz exit test.
// SIGN_EXTENSION is defined outside this section; presumably it widens the int32_t
// arguments to 64 bits -- confirm in arm_arch64_common_macro.S.
//
// Vector registers named in this function:
//   v0 = 20 per halfword, v1 = 5 per halfword (filter multipliers)
//   v2-v7   sliding window of six consecutive source rows (v7 doubles as the
//           temporary while the window is rotated)
//   v20     filtered row (32-bit lane 0 is stored)
// FILTER_6TAG_8BITS1 additionally works in v18/v19.

.macro MC_VER02_H5_ROW arg0, arg1, arg2, arg3, arg4, arg5
//  {   // input: rows src[-2], src[-1], src[0], src[1], src[2] (already loaded), register that receives row src[3]
//      // output: 4 bytes at [x2]; x0 moves down one source row, x2 moves down one destination row
    //prfm pldl1strm, [x0, x1]
    ld1     {\arg5\().8b}, [x0], x1                     // fetch 8 bytes of row src[3]
    FILTER_6TAG_8BITS1 \arg0, \arg1, \arg2, \arg3, \arg4, \arg5, v20, v0, v1
    st1     {v20.s}[0], [x2], x3                        // write 4 bytes, x2 += iDstStride
//  }
.endm

WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02Height5_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    sub     x0, x0, x1, lsl #1          // start 2 rows above pSrc
    movi    v0.8h, #20, lsl #0
    movi    v1.8h, #5, lsl #0
    sub     x4, x4, #1                  // the last row is produced after the loop

    // prime the window with the five rows above and around the first output row
    //prfm pldl1strm, [x0]
    ld1     {v2.8b}, [x0], x1           // v2=src[-2*stride]
    ld1     {v3.8b}, [x0], x1           // v3=src[-1*stride]
    ld1     {v4.8b}, [x0], x1           // v4=src[0*stride]
    ld1     {v5.8b}, [x0], x1           // v5=src[1*stride]
    ld1     {v6.8b}, [x0], x1           // v6=src[2*stride]

w5_v_mc_luma_loop:
    // four rows per pass; the window registers rotate by one position per row
    MC_VER02_H5_ROW v2, v3, v4, v5, v6, v7      // line 0, loads src[3*stride] into v7
    MC_VER02_H5_ROW v3, v4, v5, v6, v7, v2      // line 1, loads src[4*stride] into v2
    MC_VER02_H5_ROW v4, v5, v6, v7, v2, v3      // line 2, loads src[5*stride] into v3
    MC_VER02_H5_ROW v5, v6, v7, v2, v3, v4      // line 3, loads src[6*stride] into v4

    // put rows src[2..6*stride] back into v2..v6 for the next pass
    mov     v5.16b, v3.16b              // v5 = src[5*stride]
    mov     v3.16b, v7.16b              // v3 = src[3*stride]
    mov     v7.16b, v2.16b              // park src[4*stride]
    mov     v2.16b, v6.16b              // v2 = src[2*stride]
    mov     v6.16b, v4.16b              // v6 = src[6*stride]
    mov     v4.16b, v7.16b              // v4 = src[4*stride]
    sub     x4, x4, #4
    cbnz    x4, w5_v_mc_luma_loop

    // the extra (height+1) row
    MC_VER02_H5_ROW v2, v3, v4, v5, v6, v7

WELS_ASM_AARCH64_FUNC_END
2612
2613#endif
2614
2615