/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"
.align 4
// Coefficient table (1, -5, 20 in lanes 1..3); not referenced by the code visible in this part of the file.
filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0

// 6-tap filter src[-2]+src[3] + a*(src[0]+src[1]) - b*(src[-1]+src[2]) on the LOW 8 bytes,
// then rounding shift right by 5 with unsigned saturation into the low half of dst.
// Callers in this file pass a=20 and b=5. Clobbers v18, v19.
.macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
    uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
    uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
    mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
    uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
    mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
    sqrshrun \arg6\().8b, v18.8h, #5
// }
.endm

// Same as FILTER_6TAG_8BITS1 but for the HIGH 8 bytes; the result goes to the upper half of dst.
// Clobbers v18, v19.
.macro FILTER_6TAG_8BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
    uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
    mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
    uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
    mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
    sqrshrun2 \arg6\().16b, v18.8h, #5
// }
.endm

// FILTER_6TAG_8BITS1 followed by a rounded average of the filtered bytes with src[0] (\arg2).
// Low 8 bytes only. Clobbers v18, v19.
.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
    uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
    uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
    mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
    uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
    mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
    sqrshrun \arg6\().8b, v18.8h, #5
    uaddl v19.8h, \arg2\().8b, \arg6\().8b //src[0] + filtered
    rshrn \arg6\().8b, v19.8h, #1 //(+1)>>1
// }
.endm

// High-8-byte counterpart of FILTER_6TAG_8BITS1_AVERAGE_WITH_0. Clobbers v18, v19.
.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
    uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
    mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
    uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
    mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
    sqrshrun2 \arg6\().16b, v18.8h, #5
    uaddl2 v19.8h, \arg2\().16b, \arg6\().16b //src[0] + filtered
    rshrn2 \arg6\().16b, v19.8h, #1 //(+1)>>1
// }
.endm

// FILTER_6TAG_8BITS1 followed by a rounded average of the filtered bytes with src[1] (\arg3).
// Low 8 bytes only. Clobbers v18, v19.
.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
    uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
    uaddl v19.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
    mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
    uaddl v19.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
    mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
    sqrshrun \arg6\().8b, v18.8h, #5
    uaddl v19.8h, \arg3\().8b, \arg6\().8b //src[1] + filtered
    rshrn \arg6\().8b, v19.8h, #1 //(+1)>>1
// }
.endm

// High-8-byte counterpart of FILTER_6TAG_8BITS1_AVERAGE_WITH_1. Clobbers v18, v19.
.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
    uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
    mla v18.8h, v19.8h, \arg7\().8h //v18 += 20*(src[0]+src[1]), 2 cycles
    uaddl2 v19.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
    mls v18.8h, v19.8h, \arg8\().8h //v18 -= 5*(src[-1]+src[2]), 2 cycles
    sqrshrun2 \arg6\().16b, v18.8h, #5
    uaddl2 v19.8h, \arg3\().16b, \arg6\().16b //src[1] + filtered
    rshrn2 \arg6\().16b, v19.8h, #1 //(+1)>>1
// }
.endm

// 6-tap filter on the low 8 bytes that keeps the unshifted 16-bit sums in dst_q (\arg6).
// Clobbers v31.
.macro FILTER_6TAG_8BITS_TO_16BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
    uaddl \arg6\().8h, \arg0\().8b, \arg5\().8b //dst_q=src[-2]+src[3]
    uaddl v31.8h, \arg2\().8b, \arg3\().8b //src[0]+src[1]
    mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles
    uaddl v31.8h, \arg1\().8b, \arg4\().8b //src[-1]+src[2]
    mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
// }
.endm

// High-8-byte counterpart of FILTER_6TAG_8BITS_TO_16BITS1. Clobbers v31.
.macro FILTER_6TAG_8BITS_TO_16BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
    uaddl2 \arg6\().8h, \arg0\().16b, \arg5\().16b //dst_q=src[-2]+src[3]
    uaddl2 v31.8h, \arg2\().16b, \arg3\().16b //src[0]+src[1]
    mla \arg6\().8h, v31.8h, \arg7\().8h //dst_q += 20*(src[0]+src[1]), 2 cycles
    uaddl2 v31.8h, \arg1\().16b, \arg4\().16b //src[-1]+src[2]
    mls \arg6\().8h, v31.8h, \arg8\().8h //dst_q -= 5*(src[-1]+src[2]), 2 cycles
// }
.endm

// Combines three 16-bit sums a, b, c into (a-5*b+20*c)/16 using shifts (no multiply),
// then rounding shift right by 6 with unsigned saturation into the low half of dst.
// \arg0 (a) is overwritten as the working register.
.macro FILTER_3_IN_16BITS_TO_8BITS1 arg0, arg1, arg2, arg3
// {   // input:a, b, c, dst_d;
    sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b
    sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4
    sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b
    add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c
    sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4
    add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    sqrshrun \arg3\().8b, \arg0\().8h, #6 //(+32)>>6
// }
.endm

// Same as FILTER_3_IN_16BITS_TO_8BITS1 but the narrowed result goes to the upper half of dst.
.macro FILTER_3_IN_16BITS_TO_8BITS2 arg0, arg1, arg2, arg3
// {   // input:a, b, c, dst_d;
    sub \arg0\().8h, \arg0\().8h, \arg1\().8h //a-b
    sshr \arg0\().8h, \arg0\().8h, #2 //(a-b)/4
    sub \arg0\().8h, \arg0\().8h, \arg1\().8h //(a-b)/4-b
    add \arg0\().8h, \arg0\().8h, \arg2\().8h //(a-b)/4-b+c
    sshr \arg0\().8h, \arg0\().8h, #2 //((a-b)/4-b+c)/4
    add \arg0\().8h, \arg0\().8h, \arg2\().8h //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    sqrshrun2 \arg3\().16b, \arg0\().8h, #6 //(+32)>>6
// }
.endm

// From two consecutive vectors of 16-bit samples, builds the three pair sums
// a=src[-2]+src[3], b=src[-1]+src[2], c=src[0]+src[1] (byte offsets 0..10 step 2).
.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
// {   // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
    ext \arg4\().16b, \arg0\().16b, \arg1\().16b, #4 //src[0]
    ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #6 //src[1]
    add \arg4\().8h, \arg4\().8h, \arg3\().8h //c=src[0]+src[1]

    ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #2 //src[-1]
    ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #8 //src[2]
    add \arg3\().8h, \arg3\().8h, \arg2\().8h //b=src[-1]+src[2]

    ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #10 //src[3]
    add \arg2\().8h, \arg2\().8h, \arg0\().8h //a=src[-2]+src[3]
// }
.endm

// Rounded average (A+B+1)>>1 of the low 8 bytes of two vectors. Clobbers v30.
.macro AVERAGE_TWO_8BITS1 arg0, arg1, arg2
// {   // input:dst_d, src_d A and B; working: v30
    uaddl v30.8h, \arg2\().8b, \arg1\().8b
    rshrn \arg0\().8b, v30.8h, #1
// }
.endm

// Rounded average (A+B+1)>>1 of the high 8 bytes of two vectors; writes the upper half of dst.
// Clobbers v30.
.macro AVERAGE_TWO_8BITS2 arg0, arg1, arg2
// {   // input:dst_d, src_d A and B; working: v30
    uaddl2 v30.8h, \arg2\().16b, \arg1\().16b
    rshrn2 \arg0\().16b, v30.8h, #1
// }
.endm

// Filters a single output sample from 6 packed source bytes: adds the vector to its
// byte-reversed copy, multiplies the 4 low pair sums by the coefficients in \arg1 and
// reduces them with addv into the scalar register \arg3.
// NOTE(review): the final sqrshrun reads \arg0 as .8h, so callers presumably pass \arg3
// as the scalar (h) view of \arg0 - confirm against the call sites.
.macro FILTER_SINGLE_TAG_8BITS arg0, arg1, arg2, arg3
// when width=17/9, used
// {   // input: src_d{Y[0][1][2][3][4][5]X},
    rev64 \arg2\().8b, \arg0\().8b // X[5][4][3][2][1][0]O
    uaddl \arg2\().8h, \arg0\().8b, \arg2\().8b // each 16bits, *[50][41][32][23][14][05]*
    mul \arg2\().4h, \arg2\().4h, \arg1\().4h // 0+1*[50]-5*[41]+20[32]
    addv \arg3, \arg2\().4h
    sqrshrun \arg0\().8b, \arg0\().8h, #5
// }
.endm

// 16-bit-input counterpart of FILTER_SINGLE_TAG_8BITS: the pair sums are widened to 32 bits
// (smull), reduced with saddlv into the scalar register \arg5, then rounded >>10 and
// narrowed with saturation down to 8 bits.
// NOTE(review): \arg5 is presumably the d view of \arg0 (see parameter list) - confirm at call sites.
.macro UNPACK_FILTER_SINGLE_TAG_16BITS arg0, arg1, arg2, arg3, arg4, arg5
// {   // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
    ext \arg3\().16b, \arg1\().16b, \arg1\().16b, #14 // X[0][1][2][3][4][5]O
    ext \arg4\().16b, \arg3\().16b, \arg3\().16b, #8 // [3][4][5]OX[0][1][2]
    rev64 \arg4\().8h, \arg4\().8h // X[5][4][3][2][1][0]O
    add \arg3\().8h, \arg3\().8h, \arg4\().8h // each 16bits, *[50][41][32][23][14][05]*
    smull \arg3\().4s, \arg3\().4h, \arg2\().4h // 0+1*[50]-5*[41]+20[32]
    saddlv \arg5, \arg3\().4s
    //sshr \arg0\().2d, \arg0\().2d, #4
    sqrshrun \arg0\().2s, \arg0\().2d, #10
    uqxtn \arg0\().4h, \arg0\().4s
    uqxtn \arg0\().8b, \arg0\().8h
// }
.endm

// Loads 16 bytes from each of 4 consecutive rows; \arg0 = pointer (post-incremented by
// stride \arg1 after every row), \arg2..\arg5 = destination vectors.
.macro VEC4_LD1_8BITS_16ELEMENT arg0, arg1, arg2, arg3, arg4, arg5
//{//load 16bytes * 4rows
    ld1 {\arg2\().16b}, [\arg0], \arg1
    ld1 {\arg3\().16b}, [\arg0], \arg1
    ld1 {\arg4\().16b}, [\arg0], \arg1
    ld1 {\arg5\().16b}, [\arg0], \arg1
//}
.endm

// Stores the low 8 bytes of 4 vectors to 4 consecutive rows; \arg0 = pointer
// (post-incremented by stride \arg1 after every row), \arg2..\arg5 = source vectors.
.macro VEC4_ST1_8BITS_8ELEMENT arg0, arg1, arg2, arg3, arg4, arg5
//{
    st1 {\arg2\().8b}, [\arg0], \arg1
    st1 {\arg3\().8b}, [\arg0], \arg1
    st1 {\arg4\().8b}, [\arg0], \arg1
    st1 {\arg5\().8b}, [\arg0], \arg1
//}
.endm

// Four widening adds on the LOW 8 bytes: arg8 = arg0+arg1, arg9 = arg2+arg3,
// arg10 = arg4+arg5, arg11 = arg6+arg7 (16-bit results).
.macro VEC4_UADDL_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11
//{
    uaddl \arg8\().8h, \arg0\().8b, \arg1\().8b
    uaddl \arg9\().8h, \arg2\().8b, \arg3\().8b
    uaddl \arg10\().8h, \arg4\().8b, \arg5\().8b
    uaddl \arg11\().8h, \arg6\().8b, \arg7\().8b
//}
.endm

// Four widening adds on the HIGH 8 bytes, same operand layout as VEC4_UADDL_8BITS.
// The upper-half form must be uaddl2: plain uaddl does not accept .16b source
// arrangements and fails to assemble when this macro is expanded.
.macro VEC4_UADDL2_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11
//{
    uaddl2 \arg8\().8h, \arg0\().16b, \arg1\().16b
    uaddl2 \arg9\().8h, \arg2\().16b, \arg3\().16b
    uaddl2 \arg10\().8h, \arg4\().16b, \arg5\().16b
    uaddl2 \arg11\().8h, \arg6\().16b, \arg7\().16b
//}
.endm

// Four multiply-subtracts on 16-bit lanes: arg8 -= arg0*arg1, arg9 -= arg2*arg3,
// arg10 -= arg4*arg5, arg11 -= arg6*arg7.
.macro VEC4_MLS_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11
//{
    mls \arg8\().8h, \arg0\().8h, \arg1\().8h
    mls \arg9\().8h, \arg2\().8h, \arg3\().8h
    mls \arg10\().8h, \arg4\().8h, \arg5\().8h
    mls \arg11\().8h, \arg6\().8h, \arg7\().8h
//}
.endm

// Four multiply-accumulates on 16-bit lanes: arg8 += arg0*arg1, arg9 += arg2*arg3,
// arg10 += arg4*arg5, arg11 += arg6*arg7.
.macro VEC4_MLA_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11
//{
    mla \arg8\().8h, \arg0\().8h, \arg1\().8h
    mla \arg9\().8h, \arg2\().8h, \arg3\().8h
    mla \arg10\().8h, \arg4\().8h, \arg5\().8h
    mla \arg11\().8h, \arg6\().8h, \arg7\().8h
//}
.endm

// Four rounding shifts right by 5 with unsigned saturation, 16-bit sources arg0..arg3
// into the low 8 bytes of arg4..arg7.
.macro VEC4_SQRSHRUN_16BITS_SHIFT5 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
//{
    sqrshrun \arg4\().8b, \arg0\().8h, #5
    sqrshrun \arg5\().8b, \arg1\().8h, #5
    sqrshrun \arg6\().8b, \arg2\().8h, #5
    sqrshrun \arg7\().8b, \arg3\().8h, #5
//}
.endm

// Same as VEC4_SQRSHRUN_16BITS_SHIFT5 but the results go to the high 8 bytes of arg4..arg7.
.macro VEC4_SQRSHRUN2_16BITS_SHIFT5 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
//{
    sqrshrun2 \arg4\().16b, \arg0\().8h, #5
    sqrshrun2 \arg5\().16b, \arg1\().8h, #5
    sqrshrun2 \arg6\().16b, \arg2\().8h, #5
    sqrshrun2 \arg7\().16b, \arg3\().8h, #5
//}
.endm

// Four rounding narrowing shifts right by 1, 16-bit sources arg0..arg3 into the low
// 8 bytes of arg4..arg7 (used to finish a two-value average).
.macro VEC4_RSHRN_16BITS_SHIFT1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
//{
    rshrn \arg4\().8b, \arg0\().8h, #1
    rshrn \arg5\().8b, \arg1\().8h, #1
    rshrn \arg6\().8b, \arg2\().8h, #1
    rshrn \arg7\().8b, \arg3\().8h, #1
//}
.endm

//(const uint8_t* pSrc {x0}, int32_t iSrcStride{x1}, uint8_t* pDst{x2}, int32_t iDstStride{x3}, int32_t iHeight{x4})
// Horizontal 6-tap filter, 16 output bytes per row, one row per loop iteration.
// Reads 24 bytes per row starting at pSrc-2 (21 are used). The loop tests the counter
// only after decrementing it, so iHeight must be non-zero on entry.
// SIGN_EXTENSION comes from arm_arch64_common_macro.S; presumably it sign-extends the
// 32-bit int argument into the full x register - confirm there.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20WidthEq16_AArch64_neon
    sub x0, x0, #2
    movi v0.8h, #20, lsl #0
    movi v1.8h, #5, lsl #0
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
w16_h_mc_luma_loop:
    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 //only use 21(16+5); v2=src[-2]
    trn1 v2.2d, v2.2d, v3.2d //merge the first two 8-byte loads into one 16-byte vector
    //prfm pldl1strm, [x0]
    ext v5.16b, v2.16b, v4.16b, #1 //v5=src[-1]
    ext v6.16b, v2.16b, v4.16b, #2 //v6=src[0]
    ext v7.16b, v2.16b, v4.16b, #3 //v7=src[1]
    ext v16.16b, v2.16b, v4.16b, #4 //v16=src[2]
    ext v17.16b, v2.16b, v4.16b, #5 //v17=src[3]

    FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
    FILTER_6TAG_8BITS2 v2, v5, v6, v7, v16, v17, v20, v0, v1

    sub x4, x4, #1
    st1 {v20.16b}, [x2], x3 //write 16Byte
    cbnz x4, w16_h_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END

//void McHorVer20WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,int32_t iHeight);
// Horizontal 6-tap filter, 8 output bytes per row, 4 rows per loop iteration
// (the counter drops by 4 each pass and is compared against zero, so iHeight must be
// a non-zero multiple of 4). v8/v9 hold the constants 20/5, hence d8/d9 are saved on
// the stack on entry and restored before returning.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20WidthEq8_AArch64_neon
    sub x0, x0, #2
    stp d8,d9, [sp,#-16]!
    movi v8.8h, #20, lsl #0
    movi v9.8h, #5, lsl #0
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
w8_h_mc_luma_loop:
    VEC4_LD1_8BITS_16ELEMENT x0, x1, v16, v20, v24, v28 //load src[-2] in v16,v20,v24,v28 for 4 row; only use 13(8+5);
    sub x4, x4, #4

    //1st row:
    ext v17.16b, v16.16b, v16.16b, #5 //src[3]
    ext v18.16b, v16.16b, v16.16b, #1 //src[-1]
    ext v19.16b, v16.16b, v16.16b, #4 //src[2]
    //2nd row:
    ext v21.16b, v20.16b, v20.16b, #5 //src[3]
    ext v22.16b, v20.16b, v20.16b, #1 //src[-1]
    ext v23.16b, v20.16b, v20.16b, #4 //src[2]
    //3rd row:
    ext v25.16b, v24.16b, v24.16b, #5 //src[3]
    ext v26.16b, v24.16b, v24.16b, #1 //src[-1]
    ext v27.16b, v24.16b, v24.16b, #4 //src[2]
    //4th row:
    ext v29.16b, v28.16b, v28.16b, #5 //src[3]
    ext v30.16b, v28.16b, v28.16b, #1 //src[-1]
    ext v31.16b, v28.16b, v28.16b, #4 //src[2]

    VEC4_UADDL_8BITS v16, v17, v20, v21, v24, v25, v28, v29, v0, v2, v4, v6 //v0/v2/v4/v6=src[-2]+src[3]
    VEC4_UADDL_8BITS v18, v19, v22, v23, v26, v27, v30, v31, v1, v3, v5, v7 //v1/v3/v5/v7=src[-1]+src[2]
    VEC4_MLS_16BITS v1, v9, v3, v9, v5, v9, v7, v9, v0, v2, v4, v6 //v0/v2/v4/v6 -= 5*(src[-1]+src[2])

    //1st row:
    ext v18.16b, v16.16b, v16.16b, #2 //src[0]
    ext v19.16b, v16.16b, v16.16b, #3 //src[1]
    //2nd row:
    ext v22.16b, v20.16b, v20.16b, #2 //src[0]
    ext v23.16b, v20.16b, v20.16b, #3 //src[1]
    //3rd row:
    ext v26.16b, v24.16b, v24.16b, #2 //src[0]
    ext v27.16b, v24.16b, v24.16b, #3 //src[1]
    //4th row:
    ext v30.16b, v28.16b, v28.16b, #2 //src[0]
    ext v31.16b, v28.16b, v28.16b, #3 //src[1]

    VEC4_UADDL_8BITS v18, v19, v22, v23, v26, v27, v30, v31, v1, v3, v5, v7 //v1/v3/v5/v7=src[0]+src[1]
    VEC4_MLA_16BITS v1, v8, v3, v8, v5, v8, v7, v8, v0, v2, v4, v6 //v0/v2/v4/v6+=20*(src[0]+src[1])

    VEC4_SQRSHRUN_16BITS_SHIFT5 v0, v2, v4, v6, v1, v3, v5, v7

    VEC4_ST1_8BITS_8ELEMENT x2, x3, v1, v3, v5, v7
    cbnz x4, w8_h_mc_luma_loop

    ldp d8,d9,[sp],#16
WELS_ASM_AARCH64_FUNC_END

//void McHorVer20WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                      int32_t iHeight);
// Horizontal 6-tap filter, 4 output bytes per row. Two rows are interleaved into one
// 8-byte vector per loop iteration; the counter is iHeight>>1, so iHeight must be a
// non-zero multiple of 2.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20WidthEq4_AArch64_neon
    sub x0, x0, #2
    movi v0.8h, #20, lsl #0
    movi v1.8h, #5, lsl #0
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    asr x4, x4, #1
w4_h_mc_luma_loop:
    ld1 {v2.16b}, [x0], x1 //only use 9(4+5); 1st row src[-2:6]
    //prfm pldl1strm, [x0]
    ld1 {v3.16b}, [x0], x1 //only use 9(4+5); 2nd row src[-2:6]
    //prfm pldl1strm, [x0]

    zip1 v4.4s, v2.4s, v3.4s // v4=src[-2] 1st:2nd
    ext v17.16b, v4.16b, v4.16b, #8 // v17=src[2:5] 1st:2nd

    ext v2.16b, v2.16b, v4.16b, #1 //1st row src[-1:6]
    ext v3.16b, v3.16b, v4.16b, #1 //2nd row src[-1:6]
    zip1 v5.4s, v2.4s, v3.4s // v5=src[-1:2] 1st:2nd
    ext v7.16b, v5.16b, v4.16b, #8 //v7=src[3:6] 1st:2nd

    ext v2.16b, v2.16b, v4.16b, #1 //1st row src[0:6]
    ext v3.16b, v3.16b, v4.16b, #1 //2nd row src[0:6]
    zip1 v6.4s, v2.4s, v3.4s // v6=src[0:3] 1st:2nd

    ext v2.16b, v2.16b, v4.16b, #1 //1st row src[1:6]
    ext v3.16b, v3.16b, v4.16b, #1 //2nd row src[1:6]
    zip1 v16.4s, v2.4s, v3.4s // v16=src[1:4] 1st:2nd

    FILTER_6TAG_8BITS1 v4, v5, v6, v16, v17, v7, v20, v0, v1

    st1 {v20.s}[0], [x2], x3 //write 4Byte
    st1 {v20.s}[1], [x2], x3 //write 4Byte
    sub x4, x4, #1
    cbnz x4, w4_h_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END

//void McHorVer10WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                       int32_t iHeight);
// Horizontal 6-tap filter averaged (rounded) with src[0], 16 output bytes per row,
// one row per loop iteration. iHeight must be non-zero (counter is tested after decrement).
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer10WidthEq16_AArch64_neon
    sub x0, x0, #2
    movi v0.8h, #20, lsl #0
    movi v1.8h, #5, lsl #0
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
w16_xy_10_mc_luma_loop:
    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 //only use 21(16+5); v2=src[-2]
    trn1 v2.2d, v2.2d, v3.2d //merge the first two 8-byte loads into one 16-byte vector
    //prfm pldl1strm, [x0]
    ext v5.16b, v2.16b, v4.16b, #1 //v5=src[-1]
    ext v6.16b, v2.16b, v4.16b, #2 //v6=src[0]
    ext v7.16b, v2.16b, v4.16b, #3 //v7=src[1]
    ext v16.16b, v2.16b, v4.16b, #4 //v16=src[2]
    ext v17.16b, v2.16b, v4.16b, #5 //v17=src[3]

    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v5, v6, v7, v16, v17, v20, v0, v1
    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v2, v5, v6, v7, v16, v17, v20, v0, v1

    sub x4, x4, #1
    st1 {v20.16b}, [x2], x3 //write 16Byte
    cbnz x4, w16_xy_10_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END

//void McHorVer10WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                      int32_t iHeight);
// Horizontal 6-tap filter averaged (rounded) with src[0], 8 output bytes per row,
// 4 rows per loop iteration (iHeight must be a non-zero multiple of 4).
// v8/v9 hold the constants 20/5, hence d8/d9 are saved on entry and restored on exit.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer10WidthEq8_AArch64_neon
    sub x0, x0, #2
    stp d8,d9, [sp,#-16]!
    movi v8.8h, #20, lsl #0
    movi v9.8h, #5, lsl #0
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
w8_xy_10_mc_luma_loop:
    VEC4_LD1_8BITS_16ELEMENT x0, x1, v16, v20, v24, v28 //load src[-2] in v16,v20,v24,v28 for 4 row; only use 13(8+5);
    sub x4, x4, #4

    //1st row:
    ext v17.16b, v16.16b, v16.16b, #5 //src[3]
    ext v18.16b, v16.16b, v16.16b, #1 //src[-1]
    ext v19.16b, v16.16b, v16.16b, #4 //src[2]
    //2nd row:
    ext v21.16b, v20.16b, v20.16b, #5 //src[3]
    ext v22.16b, v20.16b, v20.16b, #1 //src[-1]
    ext v23.16b, v20.16b, v20.16b, #4 //src[2]
    //3rd row:
    ext v25.16b, v24.16b, v24.16b, #5 //src[3]
    ext v26.16b, v24.16b, v24.16b, #1 //src[-1]
    ext v27.16b, v24.16b, v24.16b, #4 //src[2]
    //4th row:
    ext v29.16b, v28.16b, v28.16b, #5 //src[3]
    ext v30.16b, v28.16b, v28.16b, #1 //src[-1]
    ext v31.16b, v28.16b, v28.16b, #4 //src[2]

    VEC4_UADDL_8BITS v16, v17, v20, v21, v24, v25, v28, v29, v0, v2, v4, v6 //v0/v2/v4/v6=src[-2]+src[3]
    VEC4_UADDL_8BITS v18, v19, v22, v23, v26, v27, v30, v31, v1, v3, v5, v7 //v1/v3/v5/v7=src[-1]+src[2]
    VEC4_MLS_16BITS v1, v9, v3, v9, v5, v9, v7, v9, v0, v2, v4, v6 //v0/v2/v4/v6 -= 5*(src[-1]+src[2])

    //1st row:
    ext v18.16b, v16.16b, v16.16b, #2 //src[0]
    ext v19.16b, v16.16b, v16.16b, #3 //src[1]
    //2nd row:
    ext v22.16b, v20.16b, v20.16b, #2 //src[0]
    ext v23.16b, v20.16b, v20.16b, #3 //src[1]
    //3rd row:
    ext v26.16b, v24.16b, v24.16b, #2 //src[0]
    ext v27.16b, v24.16b, v24.16b, #3 //src[1]
    //4th row:
    ext v30.16b, v28.16b, v28.16b, #2 //src[0]
    ext v31.16b, v28.16b, v28.16b, #3 //src[1]

    VEC4_UADDL_8BITS v18, v19, v22, v23, v26, v27, v30, v31, v1, v3, v5, v7 //v1/v3/v5/v7=src[0]+src[1]
    VEC4_MLA_16BITS v1, v8, v3, v8, v5, v8, v7, v8, v0, v2, v4, v6 //v0/v2/v4/v6+=20*(src[0]+src[1])
    VEC4_SQRSHRUN_16BITS_SHIFT5 v0, v2, v4, v6, v1, v3, v5, v7

    VEC4_UADDL_8BITS v1, v18, v3, v22, v5, v26, v7, v30, v0, v2, v4, v6 //average with src[0]
    VEC4_RSHRN_16BITS_SHIFT1 v0, v2, v4, v6, v1, v3, v5, v7

    VEC4_ST1_8BITS_8ELEMENT x2, x3, v1, v3, v5, v7
    cbnz x4, w8_xy_10_mc_luma_loop

    ldp d8,d9,[sp],#16
WELS_ASM_AARCH64_FUNC_END

//void McHorVer10WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                      int32_t iHeight);
// Horizontal 6-tap filter averaged (rounded) with src[0], 4 output bytes per row.
// Two rows are interleaved per loop iteration; the counter is iHeight>>1, so iHeight
// must be a non-zero multiple of 2.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer10WidthEq4_AArch64_neon
    sub x0, x0, #2
    movi v0.8h, #20, lsl #0
    movi v1.8h, #5, lsl #0
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    asr x4, x4, #1
w4_xy_10_mc_luma_loop:
    ld1 {v2.16b}, [x0], x1 //only use 9(4+5); 1st row src[-2:6]
    //prfm pldl1strm, [x0]
    ld1 {v3.16b}, [x0], x1 //only use 9(4+5); 2nd row src[-2:6]
    //prfm pldl1strm, [x0]

    zip1 v4.4s, v2.4s, v3.4s // v4=src[-2] 1st:2nd
    ext v17.16b, v4.16b, v4.16b, #8 // v17=src[2:5] 1st:2nd

    ext v2.16b, v2.16b, v4.16b, #1 //1st row src[-1:6]
    ext v3.16b, v3.16b, v4.16b, #1 //2nd row src[-1:6]
    zip1 v5.4s, v2.4s, v3.4s // v5=src[-1:2] 1st:2nd
    ext v7.16b, v5.16b, v4.16b, #8 //v7=src[3:6] 1st:2nd

    ext v2.16b, v2.16b, v4.16b, #1 //1st row src[0:6]
    ext v3.16b, v3.16b, v4.16b, #1 //2nd row src[0:6]
    zip1 v6.4s, v2.4s, v3.4s // v6=src[0:3] 1st:2nd

    ext v2.16b, v2.16b, v4.16b, #1 //1st row src[1:6]
    ext v3.16b, v3.16b, v4.16b, #1 //2nd row src[1:6]
    zip1 v16.4s, v2.4s, v3.4s // v16=src[1:4] 1st:2nd

    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v4, v5, v6, v16, v17, v7, v20, v0, v1

    st1 {v20.s}[0], [x2], x3 //write 4Byte
    st1 {v20.s}[1], [x2], x3 //write 4Byte
    sub x4, x4, #1
    cbnz x4, w4_xy_10_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END

//void McHorVer30WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                       int32_t iHeight);
// Horizontal 6-tap filter averaged (rounded) with src[1], 16 output bytes per row,
// one row per loop iteration. iHeight must be non-zero (counter is tested after decrement).
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer30WidthEq16_AArch64_neon
    sub x0, x0, #2
    movi v0.8h, #20, lsl #0
    movi v1.8h, #5, lsl #0
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
w16_xy_30_mc_luma_loop:
    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 //only use 21(16+5); v2=src[-2]
    trn1 v2.2d, v2.2d, v3.2d //merge the first two 8-byte loads into one 16-byte vector
    //prfm pldl1strm, [x0]
    ext v5.16b, v2.16b, v4.16b, #1 //v5=src[-1]
    ext v6.16b, v2.16b, v4.16b, #2 //v6=src[0]
    ext v7.16b, v2.16b, v4.16b, #3 //v7=src[1]
    ext v16.16b, v2.16b, v4.16b, #4 //v16=src[2]
    ext v17.16b, v2.16b, v4.16b, #5 //v17=src[3]

    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v5, v6, v7, v16, v17, v20, v0, v1
    FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v2, v5, v6, v7, v16, v17, v20, v0, v1

    sub x4, x4, #1
    st1 {v20.16b}, [x2], x3 //write 16Byte
    cbnz x4, w16_xy_30_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END

//void McHorVer30WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                      int32_t iHeight);
// Horizontal 6-tap filter averaged (rounded) with src[1], 8 output bytes per row,
// 4 rows per loop iteration (iHeight must be a non-zero multiple of 4).
// v8/v9 hold the constants 20/5, hence d8/d9 are saved on entry and restored on exit.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer30WidthEq8_AArch64_neon
    sub x0, x0, #2
    stp d8,d9, [sp,#-16]!
    movi v8.8h, #20, lsl #0
    movi v9.8h, #5, lsl #0
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
w8_xy_30_mc_luma_loop:
    VEC4_LD1_8BITS_16ELEMENT x0, x1, v16, v20, v24, v28 //load src[-2] in v16,v20,v24,v28 for 4 row; only use 13(8+5);
    sub x4, x4, #4

    //1st row:
    ext v17.16b, v16.16b, v16.16b, #5 //src[3]
    ext v18.16b, v16.16b, v16.16b, #1 //src[-1]
    ext v19.16b, v16.16b, v16.16b, #4 //src[2]
    //2nd row:
    ext v21.16b, v20.16b, v20.16b, #5 //src[3]
    ext v22.16b, v20.16b, v20.16b, #1 //src[-1]
    ext v23.16b, v20.16b, v20.16b, #4 //src[2]
    //3rd row:
    ext v25.16b, v24.16b, v24.16b, #5 //src[3]
    ext v26.16b, v24.16b, v24.16b, #1 //src[-1]
    ext v27.16b, v24.16b, v24.16b, #4 //src[2]
    //4th row:
    ext v29.16b, v28.16b, v28.16b, #5 //src[3]
    ext v30.16b, v28.16b, v28.16b, #1 //src[-1]
    ext v31.16b, v28.16b, v28.16b, #4 //src[2]

    VEC4_UADDL_8BITS v16, v17, v20, v21, v24, v25, v28, v29, v0, v2, v4, v6 //v0/v2/v4/v6=src[-2]+src[3]
    VEC4_UADDL_8BITS v18, v19, v22, v23, v26, v27, v30, v31, v1, v3, v5, v7 //v1/v3/v5/v7=src[-1]+src[2]
    VEC4_MLS_16BITS v1, v9, v3, v9, v5, v9, v7, v9, v0, v2, v4, v6 //v0/v2/v4/v6 -= 5*(src[-1]+src[2])

    //1st row:
    ext v18.16b, v16.16b, v16.16b, #2 //src[0]
    ext v19.16b, v16.16b, v16.16b, #3 //src[1]
    //2nd row:
    ext v22.16b, v20.16b, v20.16b, #2 //src[0]
    ext v23.16b, v20.16b, v20.16b, #3 //src[1]
    //3rd row:
    ext v26.16b, v24.16b, v24.16b, #2 //src[0]
    ext v27.16b, v24.16b, v24.16b, #3 //src[1]
    //4th row:
    ext v30.16b, v28.16b, v28.16b, #2 //src[0]
    ext v31.16b, v28.16b, v28.16b, #3 //src[1]

    VEC4_UADDL_8BITS v18, v19, v22, v23, v26, v27, v30, v31, v1, v3, v5, v7 //v1/v3/v5/v7=src[0]+src[1]
    VEC4_MLA_16BITS v1, v8, v3, v8, v5, v8, v7, v8, v0, v2, v4, v6 //v0/v2/v4/v6+=20*(src[0]+src[1])
    VEC4_SQRSHRUN_16BITS_SHIFT5 v0, v2, v4, v6, v1, v3, v5, v7

    VEC4_UADDL_8BITS v1, v19, v3, v23, v5, v27, v7, v31, v0, v2, v4, v6 //average with src[1]
    VEC4_RSHRN_16BITS_SHIFT1 v0, v2, v4, v6, v1, v3, v5, v7

    VEC4_ST1_8BITS_8ELEMENT x2, x3, v1, v3, v5, v7
    cbnz x4, w8_xy_30_mc_luma_loop

    ldp d8,d9,[sp],#16
WELS_ASM_AARCH64_FUNC_END

//void McHorVer30WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                      int32_t iHeight);
// Horizontal 6-tap filter averaged (rounded) with src[1], 4 output bytes per row.
// Two rows are interleaved per loop iteration; the counter is iHeight>>1, so iHeight
// must be a non-zero multiple of 2.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer30WidthEq4_AArch64_neon
    sub x0, x0, #2
    movi v0.8h, #20, lsl #0
    movi v1.8h, #5, lsl #0
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    asr x4, x4, #1
w4_xy_30_mc_luma_loop:
    ld1 {v2.16b}, [x0], x1 //only use 9(4+5); 1st row src[-2:6]
    //prfm pldl1strm, [x0]
    ld1 {v3.16b}, [x0], x1 //only use 9(4+5); 2nd row src[-2:6]
    //prfm pldl1strm, [x0]

    zip1 v4.4s, v2.4s, v3.4s // v4=src[-2] 1st:2nd
    ext v17.16b, v4.16b, v4.16b, #8 // v17=src[2:5] 1st:2nd

    ext v2.16b, v2.16b, v4.16b, #1 //1st row src[-1:6]
    ext v3.16b, v3.16b, v4.16b, #1 //2nd row src[-1:6]
    zip1 v5.4s, v2.4s, v3.4s // v5=src[-1:2] 1st:2nd
    ext v7.16b, v5.16b, v4.16b, #8 //v7=src[3:6] 1st:2nd

    ext v2.16b, v2.16b, v4.16b, #1 //1st row src[0:6]
    ext v3.16b, v3.16b, v4.16b, #1 //2nd row src[0:6]
    zip1 v6.4s, v2.4s, v3.4s // v6=src[0:3] 1st:2nd

    ext v2.16b, v2.16b, v4.16b, #1 //1st row src[1:6]
    ext v3.16b, v3.16b, v4.16b, #1 //2nd row src[1:6]
    zip1 v16.4s, v2.4s, v3.4s // v16=src[1:4] 1st:2nd

    FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v4, v5, v6, v16, v17, v7, v20, v0, v1

    st1 {v20.s}[0], [x2], x3 //write 4Byte
    st1 {v20.s}[1], [x2], x3 //write 4Byte
    sub x4, x4, #1
    cbnz x4, w4_xy_30_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END

//void McHorVer01WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                       int32_t iHeight);
// 6-tap filter applied across rows (taps are the rows at -2..+3 strides), averaged
// (rounded) with the src[0] row; 16 output bytes per row. Five rows are preloaded, then
// the loop is unrolled 8 times with the six row registers v2..v7 rotating roles, so
// iHeight must be a non-zero multiple of 8.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer01WidthEq16_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    sub x0, x0, x1, lsl #1 //start two rows above pSrc
    movi v0.8h, #20, lsl #0
    movi v1.8h, #5, lsl #0

    //prfm pldl1strm, [x0]
    //prfm pldl1strm, [x0, x1]
    ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride]
    //prfm pldl1strm, [x0, x1]
    ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride]
    //prfm pldl1strm, [x0, x1]
    ld1 {v4.16b}, [x0], x1 // v4=src[0*stride]
    //prfm pldl1strm, [x0, x1]
    ld1 {v5.16b}, [x0], x1 // v5=src[1*stride]
    //prfm pldl1strm, [x0, x1]
    ld1 {v6.16b}, [x0], x1 // v6=src[2*stride]


w16_xy_01_mc_luma_loop:
    //prfm pldl1strm, [x0, x1]
    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
    st1 {v20.16b}, [x2], x3 //write 16Byte : 0 line


    //prfm pldl1strm, [x0, x1]
    ld1 {v2.16b}, [x0], x1 // v2=src[3*stride] (relative to output line 1)
    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
    st1 {v20.16b}, [x2], x3 //write 16Byte : 1 line


    //prfm pldl1strm, [x0, x1]
    ld1 {v3.16b}, [x0], x1 // v3=src[3*stride] (relative to output line 2)
    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v4, v5, v6, v7, v2, v3, v20, v0, v1
    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v4, v5, v6, v7, v2, v3, v20, v0, v1
    st1 {v20.16b}, [x2], x3 //write 16Byte : 2 line


    //prfm pldl1strm, [x0, x1]
    ld1 {v4.16b}, [x0], x1 // v4=src[3*stride] (relative to output line 3)
    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v5, v6, v7, v2, v3, v4, v20, v0, v1
    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v5, v6, v7, v2, v3, v4, v20, v0, v1
    st1 {v20.16b}, [x2], x3 //write 16Byte : 3 line


    //prfm pldl1strm, [x0, x1]
    ld1 {v5.16b}, [x0], x1 // v5=src[3*stride] (relative to output line 4)
    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v6, v7, v2, v3, v4, v5, v20, v0, v1
    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v6, v7, v2, v3, v4, v5, v20, v0, v1
    st1 {v20.16b}, [x2], x3 //write 16Byte : 4 line


    //prfm pldl1strm, [x0, x1]
    ld1 {v6.16b}, [x0], x1 // v6=src[3*stride] (relative to output line 5)
    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v7, v2, v3, v4, v5, v6, v20, v0, v1
    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v7, v2, v3, v4, v5, v6, v20, v0, v1
    st1 {v20.16b}, [x2], x3 //write 16Byte : 5 line

    //prfm pldl1strm, [x0, x1]
    ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] (relative to output line 6)
    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1
    st1 {v20.16b}, [x2], x3 //write 16Byte : 6 line

    //prfm pldl1strm, [x0, x1]
    ld1 {v2.16b}, [x0], x1 // v2=src[3*stride] (relative to output line 7)
    FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
    FILTER_6TAG_8BITS2_AVERAGE_WITH_0 v3, v4, v5, v6, v7, v2, v20, v0, v1
    st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line

    // Rotate the five most recent rows back into v2..v6 for the next pass
    // (v7 is used as a temporary for the old v2).
    mov v3.16b, v5.16b
    mov v5.16b, v7.16b
    mov v7.16b, v2.16b
    mov v2.16b, v4.16b
    mov v4.16b, v6.16b
    mov v6.16b, v7.16b
    sub x4, x4, #8
    cbnz x4, w16_xy_01_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END

//void McHorVer01WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                      int32_t iHeight);
// 6-tap filter applied across rows, averaged (rounded) with the src[0] row; 8 output
// bytes per row, 4 rows per loop iteration (iHeight must be a non-zero multiple of 4).
// Constants 20/5 live in v30/v31; v0..v7 are scratch.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer01WidthEq8_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    sub x0, x0, x1, lsl #1 //start two rows above pSrc
    movi v30.8h, #20, lsl #0
    movi v31.8h, #5, lsl #0

    ld1 {v16.8b}, [x0], x1 // v16=src[-2*stride]
    ld1 {v17.8b}, [x0], x1 // v17=src[-1*stride]
    ld1 {v18.8b}, [x0], x1 // v18=src[0*stride]
    ld1 {v19.8b}, [x0], x1 // v19=src[1*stride]
    ld1 {v20.8b}, [x0], x1 // v20=src[2*stride]

w8_xy_01_mc_luma_loop:
    ld1 {v21.8b}, [x0], x1 // v21=src[3*stride]
    ld1 {v22.8b}, [x0], x1 // v22=src[4*stride]
    ld1 {v23.8b}, [x0], x1 // v23=src[5*stride]
    ld1 {v24.8b}, [x0], x1 // v24=src[6*stride]

    VEC4_UADDL_8BITS v16, v21, v17, v22, v18, v23, v19, v24, v0, v2, v4, v6 //v0/v2/v4/v6 =src[-2]+src[3]
    VEC4_UADDL_8BITS v17, v20, v18, v21, v19, v22, v20, v23, v1, v3, v5, v7 //v1/v3/v5/v7 =src[-1]+src[2]
    VEC4_MLS_16BITS v1, v31, v3, v31, v5, v31, v7, v31, v0, v2, v4, v6 //v0/v2/v4/v6 -=5*(src[-1]+src[2])
    VEC4_UADDL_8BITS v18, v19, v19, v20, v20, v21, v21, v22, v1, v3, v5, v7 //v1/v3/v5/v7 =src[0]+src[1]
    VEC4_MLA_16BITS v1, v30, v3, v30, v5, v30, v7, v30, v0, v2, v4, v6 //v0/v2/v4/v6 += 20*(src[0]+src[1])
    VEC4_SQRSHRUN_16BITS_SHIFT5 v0, v2, v4, v6, v1, v3, v5, v7

    VEC4_UADDL_8BITS v1, v18, v3, v19, v5, v20, v7, v21, v0, v2, v4, v6 //v0/v2/v4/v6 = average with src[0]
    VEC4_RSHRN_16BITS_SHIFT1 v0, v2, v4, v6, v1, v3, v5, v7

    VEC4_ST1_8BITS_8ELEMENT x2, x3, v1, v3, v5, v7 //store 8bytes*4row

    sub x4, x4, #4
    // Slide the 5-row window down by 4 rows for the next pass.
    mov v16.16b, v20.16b
    mov v17.16b, v21.16b
    mov v18.16b, v22.16b
    mov v19.16b, v23.16b
    mov v20.16b, v24.16b

    cbnz x4, w8_xy_01_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END

//void McHorVer01WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                      int32_t iHeight);
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer01WidthEq4_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    sub x0, x0, x1, lsl #1
    movi v0.8h, #20, lsl #0
    movi v1.8h, #5, lsl #0

    //prfm pldl1strm, [x0]
    //prfm pldl1strm, [x0, x1]
    ld1 {v2.s}[0], [x0], x1 // v2=src[-2*stride]
    //prfm pldl1strm, [x0, x1]
    ld1 {v3.s}[0], [x0], x1 // v3=src[-1*stride]
    mov v2.s[1], v3.s[0]
    //prfm pldl1strm, [x0, x1]
    ld1 {v4.s}[0], [x0], x1 // v4=src[0*stride]
    mov v3.s[1], v4.s[0]
    //prfm pldl1strm, [x0, x1]
    ld1 {v5.s}[0], [x0], x1 // v5=src[1*stride]
    mov v4.s[1], v5.s[0]
    //prfm pldl1strm, [x0, x1]
    ld1 {v6.s}[0], [x0], x1 // v6=src[2*stride]
    mov v5.s[1], v6.s[0]

w4_xy_01_mc_luma_loop:
    //prfm pldl1strm, [x0, x1]
    ld1 {v7.s}[0], [x0], x1 //
v7=src[3*stride] 832 mov v6.s[1], v7.s[0] 833 //prfm pldl1strm, [x0, x1] 834 ld1 {v7.s}[1], [x0], x1 // v7=src[4*stride] 835 FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v2, v3, v4, v5, v6, v7, v20, v0, v1 836 st1 {v20.s}[0], [x2], x3 //write 4Byte : 0 line 837 st1 {v20.s}[1], [x2], x3 //write 4Byte : 1 line 838 mov v2.s[0], v7.s[1] 839 840 //prfm pldl1strm, [x0, x1] 841 ld1 {v2.s}[1], [x0], x1 // v2=src[5*stride] 842 //prfm pldl1strm, [x0, x1] 843 ld1 {v3.s}[1], [x0], x1 // v2=src[6*stride] 844 mov v3.s[0], v2.s[1] 845 FILTER_6TAG_8BITS1_AVERAGE_WITH_0 v4, v5, v6, v7, v2, v3, v20, v0, v1 846 st1 {v20.s}[0], [x2], x3 //write 4Byte : 2 line 847 st1 {v20.s}[1], [x2], x3 //write 4Byte : 3 line 848 mov v4.s[0], v3.s[1] 849 850 mov v21.8b, v6.8b 851 mov v6.8b, v4.8b 852 mov v4.8b, v2.8b 853 mov v2.8b, v21.8b 854 mov v21.8b, v3.8b 855 mov v3.8b, v7.8b 856 mov v7.8b, v5.8b 857 mov v5.8b, v21.8b 858 859 sub x4, x4, #4 860 cbnz x4, w4_xy_01_mc_luma_loop 861WELS_ASM_AARCH64_FUNC_END 862 863//void McHorVer03WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, 864// int32_t iHeight); 865WELS_ASM_AARCH64_FUNC_BEGIN McHorVer03WidthEq16_AArch64_neon 866 SIGN_EXTENSION x1,w1 867 SIGN_EXTENSION x3,w3 868 SIGN_EXTENSION x4,w4 869 sub x0, x0, x1, lsl #1 870 movi v0.8h, #20, lsl #0 871 movi v1.8h, #5, lsl #0 872 873 //prfm pldl1strm, [x0] 874 //prfm pldl1strm, [x0, x1] 875 ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride] 876 //prfm pldl1strm, [x0, x1] 877 ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride] 878 //prfm pldl1strm, [x0, x1] 879 ld1 {v4.16b}, [x0], x1 // v4=src[0*stride] 880 //prfm pldl1strm, [x0, x1] 881 ld1 {v5.16b}, [x0], x1 // v5=src[1*stride] 882 //prfm pldl1strm, [x0, x1] 883 ld1 {v6.16b}, [x0], x1 // v6=src[2*stride] 884 885 886w16_xy_03_mc_luma_loop: 887 //prfm pldl1strm, [x0, x1] 888 ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] 889 FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1 890 FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v2, 
v3, v4, v5, v6, v7, v20, v0, v1 891 st1 {v20.16b}, [x2], x3 //write 16Byte : 0 line 892 893 894 //prfm pldl1strm, [x0, x1] 895 ld1 {v2.16b}, [x0], x1 // v2=src[3*stride] 896 FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v3, v4, v5, v6, v7, v2, v20, v0, v1 897 FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v3, v4, v5, v6, v7, v2, v20, v0, v1 898 st1 {v20.16b}, [x2], x3 //write 16Byte : 1 line 899 900 901 //prfm pldl1strm, [x0, x1] 902 ld1 {v3.16b}, [x0], x1 // v3=src[3*stride] 903 FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v4, v5, v6, v7, v2, v3, v20, v0, v1 904 FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v4, v5, v6, v7, v2, v3, v20, v0, v1 905 st1 {v20.16b}, [x2], x3 //write 16Byte : 2 line 906 907 908 //prfm pldl1strm, [x0, x1] 909 ld1 {v4.16b}, [x0], x1 // v4=src[3*stride] 910 FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v5, v6, v7, v2, v3, v4, v20, v0, v1 911 FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v5, v6, v7, v2, v3, v4, v20, v0, v1 912 st1 {v20.16b}, [x2], x3 //write 16Byte : 3 line 913 914 915 //prfm pldl1strm, [x0, x1] 916 ld1 {v5.16b}, [x0], x1 // v5=src[3*stride] 917 FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v6, v7, v2, v3, v4, v5, v20, v0, v1 918 FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v6, v7, v2, v3, v4, v5, v20, v0, v1 919 st1 {v20.16b}, [x2], x3 //write 16Byte : 4 line 920 921 922 //prfm pldl1strm, [x0, x1] 923 ld1 {v6.16b}, [x0], x1 // v6=src[3*stride] 924 FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v7, v2, v3, v4, v5, v6, v20, v0, v1 925 FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v7, v2, v3, v4, v5, v6, v20, v0, v1 926 st1 {v20.16b}, [x2], x3 //write 16Byte : 5 line 927 928 //prfm pldl1strm, [x0, x1] 929 ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] 930 FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1 931 FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1 932 st1 {v20.16b}, [x2], x3 //write 16Byte : 6 line 933 934 //prfm pldl1strm, [x0, x1] 935 ld1 {v2.16b}, [x0], x1 // v2=src[3*stride] 936 FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v3, v4, v5, v6, v7, v2, v20, v0, v1 937 FILTER_6TAG_8BITS2_AVERAGE_WITH_1 v3, v4, 
v5, v6, v7, v2, v20, v0, v1 938 st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line 939 940 mov v3.16b, v5.16b 941 mov v5.16b, v7.16b 942 mov v7.16b, v2.16b 943 mov v2.16b, v4.16b 944 mov v4.16b, v6.16b 945 mov v6.16b, v7.16b 946 sub x4, x4, #8 947 cbnz x4, w16_xy_03_mc_luma_loop 948WELS_ASM_AARCH64_FUNC_END 949 950//void McHorVer03WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, 951// int32_t iHeight); 952WELS_ASM_AARCH64_FUNC_BEGIN McHorVer03WidthEq8_AArch64_neon 953 SIGN_EXTENSION x1,w1 954 SIGN_EXTENSION x3,w3 955 SIGN_EXTENSION x4,w4 956 sub x0, x0, x1, lsl #1 957 movi v30.8h, #20, lsl #0 958 movi v31.8h, #5, lsl #0 959 960 ld1 {v16.8b}, [x0], x1 // v16=src[-2*stride] 961 ld1 {v17.8b}, [x0], x1 // v17=src[-1*stride] 962 ld1 {v18.8b}, [x0], x1 // v18=src[0*stride] 963 ld1 {v19.8b}, [x0], x1 // v19=src[1*stride] 964 ld1 {v20.8b}, [x0], x1 // v20=src[2*stride] 965 966w8_xy_03_mc_luma_loop: 967 ld1 {v21.8b}, [x0], x1 // v21=src[3*stride] 968 ld1 {v22.8b}, [x0], x1 // v22=src[4*stride] 969 ld1 {v23.8b}, [x0], x1 // v23=src[5*stride] 970 ld1 {v24.8b}, [x0], x1 // v24=src[6*stride] 971 972 VEC4_UADDL_8BITS v16, v21, v17, v22, v18, v23, v19, v24, v0, v2, v4, v6 //v0/v2/v4/v6 =src[-2]+src[3] 973 VEC4_UADDL_8BITS v17, v20, v18, v21, v19, v22, v20, v23, v1, v3, v5, v7 //v1/v3/v5/v7 =src[-1]+src[2] 974 VEC4_MLS_16BITS v1, v31, v3, v31, v5, v31, v7, v31, v0, v2, v4, v6 //v0/v2/v4/v6 -=5*(src[-1]+src[2]) 975 VEC4_UADDL_8BITS v18, v19, v19, v20, v20, v21, v21, v22, v1, v3, v5, v7 //v1/v3/v5/v7 =src[0]+src[1] 976 VEC4_MLA_16BITS v1, v30, v3, v30, v5, v30, v7, v30, v0, v2, v4, v6 //v0/v2/v4/v6 += 20*(src[0]+src[1]) 977 VEC4_SQRSHRUN_16BITS_SHIFT5 v0, v2, v4, v6, v1, v3, v5, v7 978 979 VEC4_UADDL_8BITS v1, v19, v3, v20, v5, v21, v7, v22, v0, v2, v4, v6 //v0/v2/v4/v6 = average with src[0] 980 VEC4_RSHRN_16BITS_SHIFT1 v0, v2, v4, v6, v1, v3, v5, v7 981 982 VEC4_ST1_8BITS_8ELEMENT x2, x3, v1, v3, v5, v7 //store 8bytes*4row 983 984 sub 
x4, x4, #4 985 mov v16.16b, v20.16b 986 mov v17.16b, v21.16b 987 mov v18.16b, v22.16b 988 mov v19.16b, v23.16b 989 mov v20.16b, v24.16b 990 991 cbnz x4, w8_xy_03_mc_luma_loop 992WELS_ASM_AARCH64_FUNC_END 993 994//void McHorVer03WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, 995// int32_t iHeight); 996WELS_ASM_AARCH64_FUNC_BEGIN McHorVer03WidthEq4_AArch64_neon 997 SIGN_EXTENSION x1,w1 998 SIGN_EXTENSION x3,w3 999 SIGN_EXTENSION x4,w4 1000 sub x0, x0, x1, lsl #1 1001 movi v0.8h, #20, lsl #0 1002 movi v1.8h, #5, lsl #0 1003 1004 //prfm pldl1strm, [x0] 1005 //prfm pldl1strm, [x0, x1] 1006 ld1 {v2.s}[0], [x0], x1 // v2=src[-2*stride] 1007 //prfm pldl1strm, [x0, x1] 1008 ld1 {v3.s}[0], [x0], x1 // v3=src[-1*stride] 1009 mov v2.s[1], v3.s[0] 1010 //prfm pldl1strm, [x0, x1] 1011 ld1 {v4.s}[0], [x0], x1 // v4=src[0*stride] 1012 mov v3.s[1], v4.s[0] 1013 //prfm pldl1strm, [x0, x1] 1014 ld1 {v5.s}[0], [x0], x1 // v5=src[1*stride] 1015 mov v4.s[1], v5.s[0] 1016 //prfm pldl1strm, [x0, x1] 1017 ld1 {v6.s}[0], [x0], x1 // v6=src[2*stride] 1018 mov v5.s[1], v6.s[0] 1019 1020w4_xy_03_mc_luma_loop: 1021 //prfm pldl1strm, [x0, x1] 1022 ld1 {v7.s}[0], [x0], x1 // v7=src[3*stride] 1023 mov v6.s[1], v7.s[0] 1024 //prfm pldl1strm, [x0, x1] 1025 ld1 {v7.s}[1], [x0], x1 // v7=src[4*stride] 1026 FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v2, v3, v4, v5, v6, v7, v20, v0, v1 1027 st1 {v20.s}[0], [x2], x3 //write 4Byte : 0 line 1028 st1 {v20.s}[1], [x2], x3 //write 4Byte : 1 line 1029 mov v2.s[0], v7.s[1] 1030 1031 //prfm pldl1strm, [x0, x1] 1032 ld1 {v2.s}[1], [x0], x1 // v2=src[5*stride] 1033 //prfm pldl1strm, [x0, x1] 1034 ld1 {v3.s}[1], [x0], x1 // v2=src[6*stride] 1035 mov v3.s[0], v2.s[1] 1036 FILTER_6TAG_8BITS1_AVERAGE_WITH_1 v4, v5, v6, v7, v2, v3, v20, v0, v1 1037 st1 {v20.s}[0], [x2], x3 //write 4Byte : 2 line 1038 st1 {v20.s}[1], [x2], x3 //write 4Byte : 3 line 1039 mov v4.s[0], v3.s[1] 1040 1041 mov v21.8b, v6.8b 1042 mov v6.8b, v4.8b 1043 
mov v4.8b, v2.8b 1044 mov v2.8b, v21.8b 1045 mov v21.8b, v3.8b 1046 mov v3.8b, v7.8b 1047 mov v7.8b, v5.8b 1048 mov v5.8b, v21.8b 1049 1050 sub x4, x4, #4 1051 cbnz x4, w4_xy_03_mc_luma_loop 1052WELS_ASM_AARCH64_FUNC_END 1053 1054//void McHorVer02WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, 1055// int32_t iHeight); 1056WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02WidthEq16_AArch64_neon 1057 SIGN_EXTENSION x1,w1 1058 SIGN_EXTENSION x3,w3 1059 SIGN_EXTENSION x4,w4 1060 sub x0, x0, x1, lsl #1 1061 movi v0.8h, #20, lsl #0 1062 movi v1.8h, #5, lsl #0 1063 1064 //prfm pldl1strm, [x0] 1065 //prfm pldl1strm, [x0, x1] 1066 ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride] 1067 //prfm pldl1strm, [x0, x1] 1068 ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride] 1069 //prfm pldl1strm, [x0, x1] 1070 ld1 {v4.16b}, [x0], x1 // v4=src[0*stride] 1071 //prfm pldl1strm, [x0, x1] 1072 ld1 {v5.16b}, [x0], x1 // v5=src[1*stride] 1073 //prfm pldl1strm, [x0, x1] 1074 ld1 {v6.16b}, [x0], x1 // v6=src[2*stride] 1075 1076 1077w16_xy_02_mc_luma_loop: 1078 //prfm pldl1strm, [x0, x1] 1079 ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] 1080 FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 1081 FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1 1082 st1 {v20.16b}, [x2], x3 //write 16Byte : 0 line 1083 1084 1085 //prfm pldl1strm, [x0, x1] 1086 ld1 {v2.16b}, [x0], x1 // v2=src[3*stride] 1087 FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1 1088 FILTER_6TAG_8BITS2 v3, v4, v5, v6, v7, v2, v20, v0, v1 1089 st1 {v20.16b}, [x2], x3 //write 16Byte : 1 line 1090 1091 1092 //prfm pldl1strm, [x0, x1] 1093 ld1 {v3.16b}, [x0], x1 // v3=src[3*stride] 1094 FILTER_6TAG_8BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1 1095 FILTER_6TAG_8BITS2 v4, v5, v6, v7, v2, v3, v20, v0, v1 1096 st1 {v20.16b}, [x2], x3 //write 16Byte : 2 line 1097 1098 1099 //prfm pldl1strm, [x0, x1] 1100 ld1 {v4.16b}, [x0], x1 // v4=src[3*stride] 1101 FILTER_6TAG_8BITS1 v5, v6, v7, v2, v3, v4, v20, v0, 
v1 1102 FILTER_6TAG_8BITS2 v5, v6, v7, v2, v3, v4, v20, v0, v1 1103 st1 {v20.16b}, [x2], x3 //write 16Byte : 3 line 1104 1105 1106 //prfm pldl1strm, [x0, x1] 1107 ld1 {v5.16b}, [x0], x1 // v5=src[3*stride] 1108 FILTER_6TAG_8BITS1 v6, v7, v2, v3, v4, v5, v20, v0, v1 1109 FILTER_6TAG_8BITS2 v6, v7, v2, v3, v4, v5, v20, v0, v1 1110 st1 {v20.16b}, [x2], x3 //write 16Byte : 4 line 1111 1112 1113 //prfm pldl1strm, [x0, x1] 1114 ld1 {v6.16b}, [x0], x1 // v6=src[3*stride] 1115 FILTER_6TAG_8BITS1 v7, v2, v3, v4, v5, v6, v20, v0, v1 1116 FILTER_6TAG_8BITS2 v7, v2, v3, v4, v5, v6, v20, v0, v1 1117 st1 {v20.16b}, [x2], x3 //write 16Byte : 5 line 1118 1119 //prfm pldl1strm, [x0, x1] 1120 ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] 1121 FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 1122 FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1 1123 st1 {v20.16b}, [x2], x3 //write 16Byte : 6 line 1124 1125 //prfm pldl1strm, [x0, x1] 1126 ld1 {v2.16b}, [x0], x1 // v2=src[3*stride] 1127 FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1 1128 FILTER_6TAG_8BITS2 v3, v4, v5, v6, v7, v2, v20, v0, v1 1129 st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line 1130 1131 mov v3.16b, v5.16b 1132 mov v5.16b, v7.16b 1133 mov v7.16b, v2.16b 1134 mov v2.16b, v4.16b 1135 mov v4.16b, v6.16b 1136 mov v6.16b, v7.16b 1137 sub x4, x4, #8 1138 cbnz x4, w16_xy_02_mc_luma_loop 1139WELS_ASM_AARCH64_FUNC_END 1140 1141//void McHorVer02WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, 1142// int32_t iHeight); 1143WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02WidthEq8_AArch64_neon 1144 SIGN_EXTENSION x1,w1 1145 SIGN_EXTENSION x3,w3 1146 SIGN_EXTENSION x4,w4 1147 sub x0, x0, x1, lsl #1 1148 movi v30.8h, #20, lsl #0 1149 movi v31.8h, #5, lsl #0 1150 1151 ld1 {v16.8b}, [x0], x1 // v16=src[-2*stride] 1152 ld1 {v17.8b}, [x0], x1 // v17=src[-1*stride] 1153 ld1 {v18.8b}, [x0], x1 // v18=src[0*stride] 1154 ld1 {v19.8b}, [x0], x1 // v19=src[1*stride] 1155 ld1 {v20.8b}, 
[x0], x1 // v20=src[2*stride] 1156 1157w8_xy_02_mc_luma_loop: 1158 ld1 {v21.8b}, [x0], x1 // v21=src[3*stride] 1159 ld1 {v22.8b}, [x0], x1 // v22=src[4*stride] 1160 ld1 {v23.8b}, [x0], x1 // v23=src[5*stride] 1161 ld1 {v24.8b}, [x0], x1 // v24=src[6*stride] 1162 1163 VEC4_UADDL_8BITS v16, v21, v17, v22, v18, v23, v19, v24, v0, v2, v4, v6 //v0/v2/v4/v6 =src[-2]+src[3] 1164 VEC4_UADDL_8BITS v17, v20, v18, v21, v19, v22, v20, v23, v1, v3, v5, v7 //v1/v3/v5/v7 =src[-1]+src[2] 1165 VEC4_MLS_16BITS v1, v31, v3, v31, v5, v31, v7, v31, v0, v2, v4, v6 //v0/v2/v4/v6 -=5*(src[-1]+src[2]) 1166 VEC4_UADDL_8BITS v18, v19, v19, v20, v20, v21, v21, v22, v1, v3, v5, v7 //v1/v3/v5/v7 =src[0]+src[1] 1167 VEC4_MLA_16BITS v1, v30, v3, v30, v5, v30, v7, v30, v0, v2, v4, v6 //v0/v2/v4/v6 += 20*(src[0]+src[1]) 1168 VEC4_SQRSHRUN_16BITS_SHIFT5 v0, v2, v4, v6, v1, v3, v5, v7 1169 VEC4_ST1_8BITS_8ELEMENT x2, x3, v1, v3, v5, v7 //store 8bytes*4row 1170 1171 sub x4, x4, #4 1172 mov v16.16b, v20.16b 1173 mov v17.16b, v21.16b 1174 mov v18.16b, v22.16b 1175 mov v19.16b, v23.16b 1176 mov v20.16b, v24.16b 1177 1178 cbnz x4, w8_xy_02_mc_luma_loop 1179WELS_ASM_AARCH64_FUNC_END 1180 1181//void McHorVer02WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, 1182// int32_t iHeight); 1183WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02WidthEq4_AArch64_neon 1184 SIGN_EXTENSION x1,w1 1185 SIGN_EXTENSION x3,w3 1186 SIGN_EXTENSION x4,w4 1187 sub x0, x0, x1, lsl #1 1188 movi v0.8h, #20, lsl #0 1189 movi v1.8h, #5, lsl #0 1190 1191 //prfm pldl1strm, [x0] 1192 //prfm pldl1strm, [x0, x1] 1193 ld1 {v2.s}[0], [x0], x1 // v2=src[-2*stride] 1194 //prfm pldl1strm, [x0, x1] 1195 ld1 {v3.s}[0], [x0], x1 // v3=src[-1*stride] 1196 mov v2.s[1], v3.s[0] 1197 //prfm pldl1strm, [x0, x1] 1198 ld1 {v4.s}[0], [x0], x1 // v4=src[0*stride] 1199 mov v3.s[1], v4.s[0] 1200 //prfm pldl1strm, [x0, x1] 1201 ld1 {v5.s}[0], [x0], x1 // v5=src[1*stride] 1202 mov v4.s[1], v5.s[0] 1203 //prfm pldl1strm, 
[x0, x1] 1204 ld1 {v6.s}[0], [x0], x1 // v6=src[2*stride] 1205 mov v5.s[1], v6.s[0] 1206 1207w4_xy_02_mc_luma_loop: 1208 //prfm pldl1strm, [x0, x1] 1209 ld1 {v7.s}[0], [x0], x1 // v7=src[3*stride] 1210 mov v6.s[1], v7.s[0] 1211 //prfm pldl1strm, [x0, x1] 1212 ld1 {v7.s}[1], [x0], x1 // v7=src[4*stride] 1213 FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 1214 st1 {v20.s}[0], [x2], x3 //write 4Byte : 0 line 1215 st1 {v20.s}[1], [x2], x3 //write 4Byte : 1 line 1216 mov v2.s[0], v7.s[1] 1217 1218 //prfm pldl1strm, [x0, x1] 1219 ld1 {v2.s}[1], [x0], x1 // v2=src[5*stride] 1220 //prfm pldl1strm, [x0, x1] 1221 ld1 {v3.s}[1], [x0], x1 // v2=src[6*stride] 1222 mov v3.s[0], v2.s[1] 1223 FILTER_6TAG_8BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1 1224 st1 {v20.s}[0], [x2], x3 //write 4Byte : 2 line 1225 st1 {v20.s}[1], [x2], x3 //write 4Byte : 3 line 1226 mov v4.s[0], v3.s[1] 1227 1228 mov v21.8b, v6.8b 1229 mov v6.8b, v4.8b 1230 mov v4.8b, v2.8b 1231 mov v2.8b, v21.8b 1232 mov v21.8b, v3.8b 1233 mov v3.8b, v7.8b 1234 mov v7.8b, v5.8b 1235 mov v5.8b, v21.8b 1236 1237 sub x4, x4, #4 1238 cbnz x4, w4_xy_02_mc_luma_loop 1239WELS_ASM_AARCH64_FUNC_END 1240 1241//void McHorVer22WidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, 1242// int32_t iHeight); 1243WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22WidthEq16_AArch64_neon 1244 SIGN_EXTENSION x1,w1 1245 SIGN_EXTENSION x3,w3 1246 SIGN_EXTENSION x4,w4 1247 stp d8, d9, [sp,#-16]! 1248 stp d10, d11, [sp,#-16]! 1249 stp d12, d13, [sp,#-16]! 1250 stp d14, d15, [sp,#-16]! 
1251 sub x0, x0, #2 1252 sub x0, x0, x1, lsl #1 1253 movi v0.8h, #20, lsl #0 1254 movi v1.8h, #5, lsl #0 1255 1256 //prfm pldl1strm, [x0] 1257 //prfm pldl1strm, [x0, x1] 1258 ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2=src[-2*stride] 1259 //prfm pldl1strm, [x0, x1] 1260 ld1 {v5.8b, v6.8b, v7.8b}, [x0], x1 // v5=src[-1*stride] 1261 //prfm pldl1strm, [x0, x1] 1262 ld1 {v8.8b, v9.8b, v10.8b}, [x0], x1 // v8=src[0*stride] 1263 //prfm pldl1strm, [x0, x1] 1264 ld1 {v11.8b, v12.8b, v13.8b}, [x0], x1 // v11=src[1*stride] 1265 //prfm pldl1strm, [x0, x1] 1266 ld1 {v14.8b, v15.8b, v16.8b}, [x0], x1 // v14=src[2*stride] 1267 1268w16_hv_mc_luma_loop: 1269 //prfm pldl1strm, [x0, x1] 1270 ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17=src[3*stride] 1271 // vertical filtered into v20/v21 1272 FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1 1273 FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1 1274 // horizon filtered 1275 UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 1276 FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] 1277 // vertical filtered into v21/v22 1278 FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1 1279 UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 1280 FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] 1281 st1 {v26.16b}, [x2], x3 //write 16Byte : 0 line 1282 1283 //prfm pldl1strm, [x0, x1] 1284 ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2=src[3*stride] 1285 // vertical filtered into v20/v21 1286 FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1 1287 FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1 1288 // horizon filtered 1289 UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 1290 FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] 1291 // vertical filtered into v21/v22 1292 FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1 1293 UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 1294 FILTER_3_IN_16BITS_TO_8BITS2 
v23, v24, v25, v26 //output to v26[1] 1295 st1 {v26.16b}, [x2], x3 //write 16Byte : 1 line 1296 1297 //prfm pldl1strm, [x0, x1] 1298 ld1 {v5.8b, v6.8b, v7.8b}, [x0], x1 // v2=src[3*stride] 1299 // vertical filtered into v20/v21 1300 FILTER_6TAG_8BITS_TO_16BITS1 v8, v11, v14, v17, v2, v5, v20, v0, v1 1301 FILTER_6TAG_8BITS_TO_16BITS1 v9, v12, v15, v18, v3, v6, v21, v0, v1 1302 // horizon filtered 1303 UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 1304 FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] 1305 // vertical filtered into v21/v22 1306 FILTER_6TAG_8BITS_TO_16BITS1 v10, v13, v16, v19, v4, v7, v22, v0, v1 1307 UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 1308 FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] 1309 st1 {v26.16b}, [x2], x3 //write 16Byte : 2 line 1310 1311 //prfm pldl1strm, [x0, x1] 1312 ld1 {v8.8b, v9.8b, v10.8b}, [x0], x1 // v2=src[3*stride] 1313 // vertical filtered into v20/v21 1314 FILTER_6TAG_8BITS_TO_16BITS1 v11, v14, v17, v2, v5, v8, v20, v0, v1 1315 FILTER_6TAG_8BITS_TO_16BITS1 v12, v15, v18, v3, v6, v9, v21, v0, v1 1316 // horizon filtered 1317 UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 1318 FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] 1319 // vertical filtered into v21/v22 1320 FILTER_6TAG_8BITS_TO_16BITS1 v13, v16, v19, v4, v7, v10, v22, v0, v1 1321 UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 1322 FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] 1323 st1 {v26.16b}, [x2], x3 //write 16Byte : 3 line 1324 1325 //prfm pldl1strm, [x0, x1] 1326 ld1 {v11.8b, v12.8b, v13.8b}, [x0], x1 // v2=src[3*stride] 1327 // vertical filtered into v20/v21 1328 FILTER_6TAG_8BITS_TO_16BITS1 v14, v17, v2, v5, v8, v11, v20, v0, v1 1329 FILTER_6TAG_8BITS_TO_16BITS1 v15, v18, v3, v6, v9, v12, v21, v0, v1 1330 // horizon filtered 1331 UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 1332 FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] 1333 // vertical filtered 
into v21/v22 1334 FILTER_6TAG_8BITS_TO_16BITS1 v16, v19, v4, v7, v10, v13, v22, v0, v1 1335 UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 1336 FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] 1337 st1 {v26.16b}, [x2], x3 //write 16Byte : 4 line 1338 1339 //prfm pldl1strm, [x0, x1] 1340 ld1 {v14.8b, v15.8b, v16.8b}, [x0], x1 // v2=src[3*stride] 1341 // vertical filtered into v20/v21 1342 FILTER_6TAG_8BITS_TO_16BITS1 v17, v2, v5, v8, v11, v14, v20, v0, v1 1343 FILTER_6TAG_8BITS_TO_16BITS1 v18, v3, v6, v9, v12, v15, v21, v0, v1 1344 // horizon filtered 1345 UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 1346 FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] 1347 // vertical filtered into v21/v22 1348 FILTER_6TAG_8BITS_TO_16BITS1 v19, v4, v7, v10, v13, v16, v22, v0, v1 1349 UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 1350 FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] 1351 st1 {v26.16b}, [x2], x3 //write 16Byte : 5 line 1352 1353 //prfm pldl1strm, [x0, x1] 1354 ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v2=src[3*stride] 1355 // vertical filtered into v20/v21 1356 FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1 1357 FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1 1358 // horizon filtered 1359 UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 1360 FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] 1361 // vertical filtered into v21/v22 1362 FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1 1363 UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 1364 FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] 1365 st1 {v26.16b}, [x2], x3 //write 16Byte : 6 line 1366 1367 //prfm pldl1strm, [x0, x1] 1368 ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1 // v2=src[3*stride] 1369 // vertical filtered into v20/v21 1370 FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1 1371 FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1 
1372 // horizon filtered 1373 UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 1374 FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] 1375 // vertical filtered into v21/v22 1376 FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1 1377 UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25 1378 FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1] 1379 st1 {v26.16b}, [x2], x3 //write 16Byte : 7 line 1380 1381 mov v5.16b, v11.16b 1382 mov v11.16b, v17.16b 1383 mov v30.16b, v2.16b 1384 mov v2.16b, v8.16b 1385 mov v8.16b, v14.16b 1386 mov v14.16b, v30.16b 1387 1388 mov v6.16b, v12.16b 1389 mov v12.16b, v18.16b 1390 mov v30.16b, v3.16b 1391 mov v3.16b, v9.16b 1392 mov v9.16b, v15.16b 1393 mov v15.16b, v30.16b 1394 1395 mov v7.16b, v13.16b 1396 mov v13.16b, v19.16b 1397 mov v30.16b, v4.16b 1398 mov v4.16b, v10.16b 1399 mov v10.16b, v16.16b 1400 mov v16.16b, v30.16b 1401 1402 sub x4, x4, #8 1403 cbnz x4, w16_hv_mc_luma_loop 1404 1405 ldp d14, d15, [sp], #16 1406 ldp d12, d13, [sp], #16 1407 ldp d10, d11, [sp], #16 1408 ldp d8, d9, [sp], #16 1409WELS_ASM_AARCH64_FUNC_END 1410 1411//void McHorVer22WidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, 1412// int32_t iHeight); 1413WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22WidthEq8_AArch64_neon 1414 SIGN_EXTENSION x1,w1 1415 SIGN_EXTENSION x3,w3 1416 SIGN_EXTENSION x4,w4 1417 sub x0, x0, #2 1418 sub x0, x0, x1, lsl #1 1419 movi v0.8h, #20, lsl #0 1420 movi v1.8h, #5, lsl #0 1421 1422 //prfm pldl1strm, [x0] 1423 //prfm pldl1strm, [x0, x1] 1424 ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride] 1425 //prfm pldl1strm, [x0, x1] 1426 ld1 {v3.16b}, [x0], x1 // v5=src[-1*stride] 1427 //prfm pldl1strm, [x0, x1] 1428 ld1 {v4.16b}, [x0], x1 // v8=src[0*stride] 1429 //prfm pldl1strm, [x0, x1] 1430 ld1 {v5.16b}, [x0], x1 // v11=src[1*stride] 1431 //prfm pldl1strm, [x0, x1] 1432 ld1 {v6.16b}, [x0], x1 // v14=src[2*stride] 1433 1434w8_hv_mc_luma_loop: 1435 //prfm 
pldl1strm, [x0, x1] 1436 ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] 1437 // vertical filtered into v20/v21 1438 FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 1439 FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1 1440 // horizon filtered 1441 UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 1442 FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] 1443 st1 {v26.8b}, [x2], x3 //write 8Byte : 0 line 1444 1445 //prfm pldl1strm, [x0, x1] 1446 ld1 {v2.16b}, [x0], x1 // v2=src[3*stride] 1447 // vertical filtered into v20/v21 1448 FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1 1449 FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v21, v0, v1 1450 // horizon filtered 1451 UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 1452 FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] 1453 st1 {v26.8b}, [x2], x3 //write 8Byte : 1 line 1454 1455 //prfm pldl1strm, [x0, x1] 1456 ld1 {v3.16b}, [x0], x1 // v3=src[3*stride] 1457 // vertical filtered into v20/v21 1458 FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1 1459 FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1 1460 // horizon filtered 1461 UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 1462 FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] 1463 st1 {v26.8b}, [x2], x3 //write 8Byte : 2 line 1464 1465 //prfm pldl1strm, [x0, x1] 1466 ld1 {v4.16b}, [x0], x1 // v4=src[3*stride] 1467 // vertical filtered into v20/v21 1468 FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1 1469 FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v21, v0, v1 1470 // horizon filtered 1471 UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25 1472 FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0] 1473 st1 {v26.8b}, [x2], x3 //write 8Byte : 3 line 1474 1475 1476 mov v5.16b, v3.16b 1477 mov v3.16b, v7.16b 1478 mov v30.16b, v2.16b 1479 mov v2.16b, v6.16b 1480 mov v6.16b, v4.16b 1481 mov v4.16b, v30.16b 1482 
// (tail of the preceding function; instructions unchanged)
    sub x4, x4, #4
    cbnz x4, w8_hv_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END

//void McHorVer22WidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                      int32_t iHeight);
// 4-pixel-wide luma interpolation that filters vertically into 16-bit intermediates and then
// horizontally back to 8 bits (exact arithmetic lives in the FILTER_*/UNPACK_* macros, which are
// defined outside this section).
// x0=pSrc, x1=iSrcStride, x2=pDst, x3=iDstStride, x4=iHeight.
// Reads start 2 rows above and 2 columns left of pSrc, 16 bytes per row.
// Four rows are produced per iteration and the loop exits only when x4 reaches exactly 0,
// so iHeight must be a positive multiple of 4.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22WidthEq4_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    sub x0, x0, #2              // step back 2 columns
    sub x0, x0, x1, lsl #1      // step back 2 rows
    movi v0.8h, #20, lsl #0     // filter multiplier 20
    movi v1.8h, #5, lsl #0      // filter multiplier 5

    ld1 {v2.16b}, [x0], x1      // v2=src[-2*stride]
    ld1 {v3.16b}, [x0], x1      // v3=src[-1*stride]
    ld1 {v4.16b}, [x0], x1      // v4=src[0*stride]
    ld1 {v5.16b}, [x0], x1      // v5=src[1*stride]
    ld1 {v6.16b}, [x0], x1      // v6=src[2*stride]

w4_hv_mc_luma_loop:
    ld1 {v7.16b}, [x0], x1      // v7=src[3*stride]
    // vertical filtered into v20/v21 1st line
    FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
    ld1 {v2.16b}, [x0], x1      // v2=src[4*stride]
    // vertical filtered into v22/v23 2nd line
    FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v22, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v23, v0, v1
    // horizon filtered: pack the low halves of both lines into one register each
    UNPACK_2_16BITS_TO_ABC v20, v21, v24, v25, v26
    UNPACK_2_16BITS_TO_ABC v22, v23, v28, v29, v30
    zip1 v24.2d, v24.2d, v28.2d
    zip1 v25.2d, v25.2d, v29.2d
    zip1 v26.2d, v26.2d, v30.2d
    FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0]
    st1 {v27.s}[0], [x2], x3    //write 4Byte : 0 line
    st1 {v27.s}[1], [x2], x3    //write 4Byte : 1 line

    ld1 {v3.16b}, [x0], x1      // v3=src[5*stride]
    // vertical filtered into v20/v21 3rd line
    FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1
    ld1 {v4.16b}, [x0], x1      // v4=src[6*stride]
    // vertical filtered into v22/v23 4th line
    FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v22, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v23, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v24, v25, v26
    UNPACK_2_16BITS_TO_ABC v22, v23, v28, v29, v30
    zip1 v24.2d, v24.2d, v28.2d
    zip1 v25.2d, v25.2d, v29.2d
    zip1 v26.2d, v26.2d, v30.2d
    FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0]
    st1 {v27.s}[0], [x2], x3    //write 4Byte : 2 line
    st1 {v27.s}[1], [x2], x3    //write 4Byte : 3 line

    // rotate the five live rows back into v2..v6 for the next iteration (v30 is scratch)
    mov v5.16b, v3.16b
    mov v3.16b, v7.16b
    mov v30.16b, v2.16b
    mov v2.16b, v6.16b
    mov v6.16b, v4.16b
    mov v4.16b, v30.16b

    sub x4, x4, #4
    cbnz x4, w4_hv_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END

//void McCopyWidthEq16_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                   int32_t iHeight);
// Copies a 16-byte-wide block, two rows per iteration.
// x0=pSrc, x1=iSrcStride, x2=pDst, x3=iDstStride, x4=iHeight (must be a positive multiple of 2).
WELS_ASM_AARCH64_FUNC_BEGIN McCopyWidthEq16_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
w16_copy_loop:
    ld1 {v0.16b}, [x0], x1      //read 16Byte : 0 line
    st1 {v0.16b}, [x2], x3      //write 16Byte : 0 line
    ld1 {v1.16b}, [x0], x1      //read 16Byte : 1 line
    st1 {v1.16b}, [x2], x3      //write 16Byte : 1 line

    sub x4, x4, #2
    cbnz x4, w16_copy_loop
WELS_ASM_AARCH64_FUNC_END

//void McCopyWidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                  int32_t iHeight);
// Copies an 8-byte-wide block, two rows per iteration.
// x0=pSrc, x1=iSrcStride, x2=pDst, x3=iDstStride, x4=iHeight (must be a positive multiple of 2).
WELS_ASM_AARCH64_FUNC_BEGIN McCopyWidthEq8_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
w8_copy_loop:
    ld1 {v0.8b}, [x0], x1       //read 8Byte : 0 line
    st1 {v0.8b}, [x2], x3       //write 8Byte : 0 line
    ld1 {v1.8b}, [x0], x1       //read 8Byte : 1 line
    st1 {v1.8b}, [x2], x3       //write 8Byte : 1 line

    sub x4, x4, #2
    cbnz x4, w8_copy_loop
WELS_ASM_AARCH64_FUNC_END

// McCopyWidthEq4_AArch64_neon: copies a 4-byte-wide block, two rows per iteration.
// Register use matches the other copy routines: x0=source, x1=source stride, x2=destination,
// x3=destination stride, x4=row count (must be a positive multiple of 2).
WELS_ASM_AARCH64_FUNC_BEGIN McCopyWidthEq4_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
w4_copy_loop:
    ld1 {v0.s}[0], [x0], x1     //read 4Byte : 0 line
    st1 {v0.s}[0], [x2], x3     //write 4Byte : 0 line
    ld1 {v1.s}[0], [x0], x1     //read 4Byte : 1 line
    st1 {v1.s}[0], [x2], x3     //write 4Byte : 1 line

    sub x4, x4, #2
    cbnz x4, w4_copy_loop
WELS_ASM_AARCH64_FUNC_END

//void PixStrideAvgWidthEq16_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
//                                         const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
// Combines two 16-byte-wide sources row by row through AVERAGE_TWO_8BITS1/2 (macro defined
// outside this section; presumably a per-byte average) and stores the result to pDst.
// x0=pDst, x1=iDstStride, x2=pSrcA, x3=iSrcStrideA, x4=pSrcB, x5=iSrcStrideB, x6=iHeight.
// Four rows per iteration: iHeight must be a positive multiple of 4.
WELS_ASM_AARCH64_FUNC_BEGIN PixStrideAvgWidthEq16_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x5,w5
    SIGN_EXTENSION x6,w6
enc_w16_pix_avg_loop:
    ld1 {v0.16b}, [x2], x3      //read 16Byte : src0: 0 line
    ld1 {v1.16b}, [x4], x5      //read 16Byte : src1: 0 line
    ld1 {v2.16b}, [x2], x3      //read 16Byte : src0: 1 line
    ld1 {v3.16b}, [x4], x5      //read 16Byte : src1: 1 line
    ld1 {v4.16b}, [x2], x3      //read 16Byte : src0: 2 line
    ld1 {v5.16b}, [x4], x5      //read 16Byte : src1: 2 line
    ld1 {v6.16b}, [x2], x3      //read 16Byte : src0: 3 line
    ld1 {v7.16b}, [x4], x5      //read 16Byte : src1: 3 line

    AVERAGE_TWO_8BITS1 v16, v0, v1
    AVERAGE_TWO_8BITS2 v16, v0, v1
    st1 {v16.16b}, [x0], x1     //write 16Byte : 0 line

    AVERAGE_TWO_8BITS1 v16, v2, v3
    AVERAGE_TWO_8BITS2 v16, v2, v3
    st1 {v16.16b}, [x0], x1     //write 16Byte : 1 line

    AVERAGE_TWO_8BITS1 v16, v4, v5
    AVERAGE_TWO_8BITS2 v16, v4, v5
    st1 {v16.16b}, [x0], x1     //write 16Byte : 2 line

    AVERAGE_TWO_8BITS1 v16, v6, v7
    AVERAGE_TWO_8BITS2 v16, v6, v7
    st1 {v16.16b}, [x0], x1     //write 16Byte : 3 line

    sub x6, x6, #4
    cbnz x6, enc_w16_pix_avg_loop
WELS_ASM_AARCH64_FUNC_END

//void PixStrideAvgWidthEq8_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA,
//                                        const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
// 8-byte-wide variant of PixStrideAvgWidthEq16: same register assignment, four rows per
// iteration (iHeight must be a positive multiple of 4).
WELS_ASM_AARCH64_FUNC_BEGIN PixStrideAvgWidthEq8_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x5,w5
    SIGN_EXTENSION x6,w6
enc_w8_pix_avg_loop:
    ld1 {v0.8b}, [x2], x3       //read 8Byte : src0: 0 line
    ld1 {v1.8b}, [x4], x5       //read 8Byte : src1: 0 line
    ld1 {v2.8b}, [x2], x3       //read 8Byte : src0: 1 line
    ld1 {v3.8b}, [x4], x5       //read 8Byte : src1: 1 line
    ld1 {v4.8b}, [x2], x3       //read 8Byte : src0: 2 line
    ld1 {v5.8b}, [x4], x5       //read 8Byte : src1: 2 line
    ld1 {v6.8b}, [x2], x3       //read 8Byte : src0: 3 line
    ld1 {v7.8b}, [x4], x5       //read 8Byte : src1: 3 line

    AVERAGE_TWO_8BITS1 v16, v0, v1
    st1 {v16.8b}, [x0], x1      //write 8Byte : 0 line

    AVERAGE_TWO_8BITS1 v16, v2, v3
    st1 {v16.8b}, [x0], x1      //write 8Byte : 1 line

    AVERAGE_TWO_8BITS1 v16, v4, v5
    st1 {v16.8b}, [x0], x1      //write 8Byte : 2 line

    AVERAGE_TWO_8BITS1 v16, v6, v7
    st1 {v16.8b}, [x0], x1      //write 8Byte : 3 line

    sub x6, x6, #4
    cbnz x6, enc_w8_pix_avg_loop
WELS_ASM_AARCH64_FUNC_END

//void PixelAvgWidthEq16_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
//                                     const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
// Combines two 16-byte-wide sources row by row through AVERAGE_TWO_8BITS1/2 and stores to pDst.
// x0=pDst, x1=iDstStride, x2=pSrcA, x3=iSrcAStride, x4=pSrcB, x5=iSrcBStride, x6=iHeight.
// Four rows per iteration: iHeight must be a positive multiple of 4.
WELS_ASM_AARCH64_FUNC_BEGIN PixelAvgWidthEq16_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x5,w5
    SIGN_EXTENSION x6,w6
w16_pix_avg_loop:
    ld1 {v0.16b}, [x2], x3      //read 16Byte : src0: 0 line
    ld1 {v1.16b}, [x4], x5      //read 16Byte : src1: 0 line
    ld1 {v2.16b}, [x2], x3      //read 16Byte : src0: 1 line
    ld1 {v3.16b}, [x4], x5      //read 16Byte : src1: 1 line
    ld1 {v4.16b}, [x2], x3      //read 16Byte : src0: 2 line
    ld1 {v5.16b}, [x4], x5      //read 16Byte : src1: 2 line
    ld1 {v6.16b}, [x2], x3      //read 16Byte : src0: 3 line
    ld1 {v7.16b}, [x4], x5      //read 16Byte : src1: 3 line

    AVERAGE_TWO_8BITS1 v16, v0, v1
    AVERAGE_TWO_8BITS2 v16, v0, v1
    st1 {v16.16b}, [x0], x1     //write 16Byte : 0 line

    AVERAGE_TWO_8BITS1 v16, v2, v3
    AVERAGE_TWO_8BITS2 v16, v2, v3
    st1 {v16.16b}, [x0], x1     //write 16Byte : 1 line

    AVERAGE_TWO_8BITS1 v16, v4, v5
    AVERAGE_TWO_8BITS2 v16, v4, v5
    st1 {v16.16b}, [x0], x1     //write 16Byte : 2 line

    AVERAGE_TWO_8BITS1 v16, v6, v7
    AVERAGE_TWO_8BITS2 v16, v6, v7
    st1 {v16.16b}, [x0], x1     //write 16Byte : 3 line

    sub x6, x6, #4
    cbnz x6, w16_pix_avg_loop
WELS_ASM_AARCH64_FUNC_END

//void PixelAvgWidthEq8_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
//                                    const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
// PixelAvgWidthEq8_AArch64_neon: combines two 8-byte-wide sources row by row through
// AVERAGE_TWO_8BITS1 (macro defined outside this section; presumably a per-byte average).
// x0=pDst, x1=iDstStride, x2=pSrcA, x3=iSrcAStride, x4=pSrcB, x5=iSrcBStride, x6=iHeight.
// Four rows per iteration: iHeight must be a positive multiple of 4.
WELS_ASM_AARCH64_FUNC_BEGIN PixelAvgWidthEq8_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x5,w5
    SIGN_EXTENSION x6,w6
w8_pix_avg_loop:
    ld1 {v0.8b}, [x2], x3       //read 8Byte : src0: 0 line
    ld1 {v1.8b}, [x4], x5       //read 8Byte : src1: 0 line
    ld1 {v2.8b}, [x2], x3       //read 8Byte : src0: 1 line
    ld1 {v3.8b}, [x4], x5       //read 8Byte : src1: 1 line
    ld1 {v4.8b}, [x2], x3       //read 8Byte : src0: 2 line
    ld1 {v5.8b}, [x4], x5       //read 8Byte : src1: 2 line
    ld1 {v6.8b}, [x2], x3       //read 8Byte : src0: 3 line
    ld1 {v7.8b}, [x4], x5       //read 8Byte : src1: 3 line

    AVERAGE_TWO_8BITS1 v16, v0, v1
    st1 {v16.8b}, [x0], x1      //write 8Byte : 0 line

    AVERAGE_TWO_8BITS1 v16, v2, v3
    st1 {v16.8b}, [x0], x1      //write 8Byte : 1 line

    AVERAGE_TWO_8BITS1 v16, v4, v5
    st1 {v16.8b}, [x0], x1      //write 8Byte : 2 line

    AVERAGE_TWO_8BITS1 v16, v6, v7
    st1 {v16.8b}, [x0], x1      //write 8Byte : 3 line

    sub x6, x6, #4
    cbnz x6, w8_pix_avg_loop
WELS_ASM_AARCH64_FUNC_END

//void PixelAvgWidthEq4_AArch64_neon (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
//                                    const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
// 4-byte-wide variant: two rows from each source are gathered into lanes s[0]/s[1] so a single
// AVERAGE_TWO_8BITS1 handles both rows. Two rows per iteration: iHeight must be a positive
// multiple of 2.
WELS_ASM_AARCH64_FUNC_BEGIN PixelAvgWidthEq4_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x5,w5
    SIGN_EXTENSION x6,w6
w4_pix_avg_loop:
    ld1 {v0.s}[0], [x2], x3     //read 4Byte : src0: 0 line
    ld1 {v1.s}[0], [x4], x5     //read 4Byte : src1: 0 line
    ld1 {v0.s}[1], [x2], x3     //read 4Byte : src0: 1 line
    ld1 {v1.s}[1], [x4], x5     //read 4Byte : src1: 1 line
    AVERAGE_TWO_8BITS1 v2, v0, v1
    st1 {v2.s}[0], [x0], x1     //write 4Byte : 0 line
    st1 {v2.s}[1], [x0], x1     //write 4Byte : 1 line

    sub x6, x6, #2
    cbnz x6, w4_pix_avg_loop
WELS_ASM_AARCH64_FUNC_END

//void McChromaWidthEq8_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                    int32_t* pWeights, int32_t iHeight);
// 8-pixel-wide weighted 2x2 interpolation:
//   dst[x] = (A*src[x] + B*src[x+1] + C*src[x+stride] + D*src[x+stride+1] + 32) >> 6
// x0=pSrc, x1=iSrcStride, x2=pDst, x3=iDstStride, x4=pWeights, x5=iHeight.
// The weights A,B,C,D are read as four consecutive BYTES at pWeights (ld4r on .8b), despite the
// int32_t* in the prototype comment. Each row load reads 16 bytes although only 9 are used.
// Four rows per iteration: iHeight must be a positive multiple of 4.
WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq8_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x5,w5
    ld4r {v28.8b, v29.8b, v30.8b, v31.8b}, [x4] //load A/B/C/D
    ld1 {v16.16b}, [x0], x1             // src[x]
    ext v17.16b, v16.16b, v16.16b, #1   // src[x+1]
w8_mc_chroma_loop:
    ld1 {v18.16b}, [x0], x1             // src[x+stride]
    ext v19.16b, v18.16b, v18.16b, #1   // src[x+stride+1]

    ld1 {v20.16b}, [x0], x1             // src[x+2*stride]
    ext v21.16b, v20.16b, v20.16b, #1   // src[x+2*stride+1]

    ld1 {v22.16b}, [x0], x1             // src[x+3*stride]
    ext v23.16b, v22.16b, v22.16b, #1   // src[x+3*stride+1]

    ld1 {v24.16b}, [x0], x1             // src[x+4*stride]
    ext v25.16b, v24.16b, v24.16b, #1   // src[x+4*stride+1]

    // A * top-left
    umull v0.8h, v16.8b, v28.8b
    umull v2.8h, v18.8b, v28.8b
    umull v4.8h, v20.8b, v28.8b
    umull v6.8h, v22.8b, v28.8b

    // + B * top-right
    umlal v0.8h, v17.8b, v29.8b
    umlal v2.8h, v19.8b, v29.8b
    umlal v4.8h, v21.8b, v29.8b
    umlal v6.8h, v23.8b, v29.8b

    // + C * bottom-left
    umlal v0.8h, v18.8b, v30.8b
    umlal v2.8h, v20.8b, v30.8b
    umlal v4.8h, v22.8b, v30.8b
    umlal v6.8h, v24.8b, v30.8b

    // + D * bottom-right
    umlal v0.8h, v19.8b, v31.8b
    umlal v2.8h, v21.8b, v31.8b
    umlal v4.8h, v23.8b, v31.8b
    umlal v6.8h, v25.8b, v31.8b

    // rounding shift by 6 and narrow to bytes
    rshrn v1.8b, v0.8h, #6
    st1 {v1.8b}, [x2], x3

    rshrn v3.8b, v2.8h, #6
    st1 {v3.8b}, [x2], x3

    rshrn v5.8b, v4.8h, #6
    st1 {v5.8b}, [x2], x3

    rshrn v7.8b, v6.8h, #6
    st1 {v7.8b}, [x2], x3

    // the last loaded row becomes the top row of the next iteration
    mov v16.16b, v24.16b
    mov v17.16b, v25.16b
    sub x5, x5, #4
    cbnz x5, w8_mc_chroma_loop
WELS_ASM_AARCH64_FUNC_END

//void McChromaWidthEq4_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                    int32_t* pWeights, int32_t iHeight);
// 4-pixel-wide variant of McChromaWidthEq8 (same formula and register assignment). Two output
// rows are packed into one 8-byte vector with zip1 on .4s so one multiply chain covers both.
// Each row load reads 8 bytes although only 5 are used.
// Two rows per iteration: iHeight must be a positive multiple of 2.
WELS_ASM_AARCH64_FUNC_BEGIN McChromaWidthEq4_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x5,w5
    ld4r {v4.8b, v5.8b, v6.8b, v7.8b}, [x4] //load A/B/C/D
    ld1 {v0.8b}, [x0], x1               // src[x]
    ext v1.8b, v0.8b, v0.8b, #1         // src[x+1]
w4_mc_chroma_loop:
    ld1 {v2.8b}, [x0], x1               // src[x+stride]
    ext v3.8b, v2.8b, v2.8b, #1         // src[x+stride+1]
    ld1 {v18.8b}, [x0], x1              // src[x+2*stride]
    ext v19.8b, v18.8b, v18.8b, #1      // src[x+2*stride+1]

    // low 8 bytes after zip1: first 4 bytes of row n followed by first 4 bytes of row n+1
    zip1 v0.4s, v0.4s, v2.4s
    zip1 v1.4s, v1.4s, v3.4s
    zip1 v2.4s, v2.4s, v18.4s
    zip1 v3.4s, v3.4s, v19.4s

    umull v16.8h, v0.8b, v4.8b
    umlal v16.8h, v1.8b, v5.8b
    umlal v16.8h, v2.8b, v6.8b
    umlal v16.8h, v3.8b, v7.8b
    rshrn v17.8b, v16.8h, #6
    st1 {v17.s}[0], [x2], x3
    st1 {v17.s}[1], [x2], x3

    // the last loaded row becomes the top row of the next iteration
    mov v0.8b, v18.8b
    mov v1.8b, v19.8b
    sub x5, x5, #2
    cbnz x5, w4_mc_chroma_loop
WELS_ASM_AARCH64_FUNC_END

//void McHorVer20Width17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                     int32_t iHeight);// width+1
// Horizontal 6-tap filter, 17 output pixels per row:
//   dst[x] = sat_u8 ((src[x-2] + src[x+3] - 5*(src[x-1] + src[x+2]) + 20*(src[x] + src[x+1]) + 16) >> 5)
// for the first 16 pixels (FILTER_6TAG_8BITS1/2). The 17th pixel goes through
// FILTER_SINGLE_TAG_8BITS with the filter_para table (macro defined outside this section).
// x0=pSrc, x1=iSrcStride, x2=pDst, x3=iDstStride, x4=iHeight (one row per iteration, must be > 0).
// Each row load reads 32 bytes starting at pSrc-2 although only 22 are used.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width17_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    sub x0, x0, #2
    sub x3, x3, #16             // the 16-byte store already advances x2 by x5=16
    mov x5, #16
    movi v0.8h, #20, lsl #0
    movi v1.8h, #5, lsl #0
    ldr q22, filter_para
w17_h_mc_luma_loop:
    ld1 {v2.16b, v3.16b}, [x0], x1      //only use 22(17+5); v2=src[-2]

    ext v5.16b, v2.16b, v3.16b, #1      //v5=src[-1]
    ext v6.16b, v2.16b, v3.16b, #2      //v6=src[0]
    ext v7.16b, v2.16b, v3.16b, #3      //v7=src[1]
    ext v16.16b, v2.16b, v3.16b, #4     //v16=src[2]
    ext v17.16b, v2.16b, v3.16b, #5     //v17=src[3]

    FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
    FILTER_6TAG_8BITS2 v2, v5, v6, v7, v16, v17, v20, v0, v1
    st1 {v20.16b}, [x2], x5             //write 16Byte

    ext v21.8b, v3.8b, v3.8b, #7        // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
    FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
    st1 {v21.b}[0], [x2], x3            //write 17th Byte

    sub x4, x4, #1
    cbnz x4, w17_h_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END

//void McHorVer20Width9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                    int32_t iHeight);// width+1
// Horizontal 6-tap filter, 9 output pixels per row (same formula as McHorVer20Width17 for the
// first 8 pixels; the 9th goes through FILTER_SINGLE_TAG_8BITS).
// x0=pSrc, x1=iSrcStride, x2=pDst, x3=iDstStride, x4=iHeight (one row per iteration, must be > 0).
// Each row load reads 16 bytes starting at pSrc-2 although only 14 are used.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width9_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    sub x0, x0, #2
    sub x3, x3, #8              // the 8-byte store already advances x2 by x5=8
    mov x5, #8
    movi v0.8h, #20, lsl #0
    movi v1.8h, #5, lsl #0
    ldr q22, filter_para
w9_h_mc_luma_loop:
    ld1 {v2.16b}, [x0], x1              //only use 14(9+5); v2=src[-2]
    mov v3.d[0], v2.d[1]                // keep the high 8 source bytes for the 9th pixel
    // Only the low 8 bytes of each ext result are consumed (FILTER_6TAG_8BITS1 reads .8b), and
    // for shifts 1..5 those bytes all come from v2. The second operand is therefore v2 itself
    // rather than a register this function never writes.
    ext v5.16b, v2.16b, v2.16b, #1      //v5=src[-1]
    ext v6.16b, v2.16b, v2.16b, #2      //v6=src[0]
    ext v7.16b, v2.16b, v2.16b, #3      //v7=src[1]
    ext v16.16b, v2.16b, v2.16b, #4     //v16=src[2]
    ext v17.16b, v2.16b, v2.16b, #5     //v17=src[3]

    FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
    st1 {v20.8b}, [x2], x5              //write 8Byte

    ext v21.8b, v3.8b, v3.8b, #7        // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
    FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
    st1 {v21.b}[0], [x2], x3            //write 9th Byte

    sub x4, x4, #1
    cbnz x4, w9_h_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END

//void McHorVer20Width5_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                    int32_t
//                                    iHeight);// width+1
// McHorVer20Width5_AArch64_neon: horizontal 6-tap filter, 5 output pixels per row:
//   dst[x] = sat_u8 ((src[x-2] + src[x+3] - 5*(src[x-1] + src[x+2]) + 20*(src[x] + src[x+1]) + 16) >> 5)
// x0=pSrc, x1=iSrcStride, x2=pDst, x3=iDstStride, x4=iHeight (one row per iteration, must be > 0).
// Each row load reads 16 bytes starting at pSrc-2 although only 10 are used.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width5_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    sub x0, x0, #2
    sub x3, x3, #4              // the 4-byte store already advances x2 by x5=4
    mov x5, #4
    movi v0.8h, #20, lsl #0
    movi v1.8h, #5, lsl #0
w5_h_mc_luma_loop:
    ld1 {v2.16b}, [x0], x1              //only use 10(5+5); v2=src[-2]

    // Only the low 8 bytes of each ext result are consumed (FILTER_6TAG_8BITS1 reads .8b), and
    // for shifts 1..5 those bytes all come from v2. The second operand is therefore v2 itself
    // rather than a register this function never writes.
    ext v5.16b, v2.16b, v2.16b, #1      //v5=src[-1]
    ext v6.16b, v2.16b, v2.16b, #2      //v6=src[0]
    ext v7.16b, v2.16b, v2.16b, #3      //v7=src[1]
    ext v16.16b, v2.16b, v2.16b, #4     //v16=src[2]
    ext v17.16b, v2.16b, v2.16b, #5     //v17=src[3]

    FILTER_6TAG_8BITS1 v2, v5, v6, v7, v16, v17, v20, v0, v1
    st1 {v20.s}[0], [x2], x5            //write 4Byte
    st1 {v20.b}[4], [x2], x3            //write 5th Byte

    sub x4, x4, #1
    cbnz x4, w5_h_mc_luma_loop
WELS_ASM_AARCH64_FUNC_END

//void McHorVer22Width17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                     int32_t iHeight);
// 17-pixel-wide luma interpolation: vertical filter into 16-bit intermediates, then horizontal
// filter back to 8 bits (exact arithmetic lives in the FILTER_*/UNPACK_* macros, which are
// defined outside this section).
// x0=pSrc, x1=iSrcStride, x2=pDst, x3=iDstStride, x4=iHeight.
// Each source row is read as 24 bytes (three 8-byte registers) starting 2 rows above and
// 2 columns left of pSrc.
// The loop emits 8 rows per iteration on iHeight-1 and one extra row follows it, so iHeight
// must be of the form 8*k+1 with k >= 1.
// v8-v15 are used as row storage, so d8-d15 are saved on entry and restored on exit.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width17_AArch64_neon
    stp d8, d9, [sp,#-16]!
    stp d10, d11, [sp,#-16]!
    stp d12, d13, [sp,#-16]!
    stp d14, d15, [sp,#-16]!
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    sub x0, x0, #2
    sub x0, x0, x1, lsl #1
    movi v0.8h, #20, lsl #0
    movi v1.8h, #5, lsl #0
    sub x3, x3, #16             // the 16-byte store already advances x2 by x5=16
    mov x5, #16
    ldr q29, filter_para

    sub x4, x4, #1              // the last row is handled after the loop

    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1     // v2=src[-2*stride]
    ld1 {v5.8b, v6.8b, v7.8b}, [x0], x1     // v5=src[-1*stride]
    ld1 {v8.8b, v9.8b, v10.8b}, [x0], x1    // v8=src[0*stride]
    ld1 {v11.8b, v12.8b, v13.8b}, [x0], x1  // v11=src[1*stride]
    ld1 {v14.8b, v15.8b, v16.8b}, [x0], x1  // v14=src[2*stride]

w17_hv_mc_luma_loop:
    ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1  // v17=src[3*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    // vertical filtered into v21/v22
    FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
    st1 {v26.16b}, [x2], x5                 //write 0:15 Byte : 0 line
    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
    st1 {v26.b}[0], [x2], x3                //write 17th Byte : 0 line

    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1     // v2=src[4*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    // vertical filtered into v21/v22
    FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
    st1 {v26.16b}, [x2], x5                 //write 0:15Byte : 1 line
    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
    st1 {v26.b}[0], [x2], x3                //write 17th Byte : 1 line

    ld1 {v5.8b, v6.8b, v7.8b}, [x0], x1     // v5=src[5*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v8, v11, v14, v17, v2, v5, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS1 v9, v12, v15, v18, v3, v6, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    // vertical filtered into v21/v22
    FILTER_6TAG_8BITS_TO_16BITS1 v10, v13, v16, v19, v4, v7, v22, v0, v1
    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
    st1 {v26.16b}, [x2], x5                 //write 0:15Byte : 2 line
    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
    st1 {v26.b}[0], [x2], x3                //write 17th Byte : 2 line

    ld1 {v8.8b, v9.8b, v10.8b}, [x0], x1    // v8=src[6*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v11, v14, v17, v2, v5, v8, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS1 v12, v15, v18, v3, v6, v9, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    // vertical filtered into v21/v22
    FILTER_6TAG_8BITS_TO_16BITS1 v13, v16, v19, v4, v7, v10, v22, v0, v1
    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
    st1 {v26.16b}, [x2], x5                 //write 0:15Byte : 3 line
    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
    st1 {v26.b}[0], [x2], x3                //write 17th Byte : 3 line

    ld1 {v11.8b, v12.8b, v13.8b}, [x0], x1  // v11=src[7*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v14, v17, v2, v5, v8, v11, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS1 v15, v18, v3, v6, v9, v12, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    // vertical filtered into v21/v22
    FILTER_6TAG_8BITS_TO_16BITS1 v16, v19, v4, v7, v10, v13, v22, v0, v1
    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
    st1 {v26.16b}, [x2], x5                 //write 0:15Byte : 4 line
    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
    st1 {v26.b}[0], [x2], x3                //write 17th Byte : 4 line

    ld1 {v14.8b, v15.8b, v16.8b}, [x0], x1  // v14=src[8*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v17, v2, v5, v8, v11, v14, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS1 v18, v3, v6, v9, v12, v15, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    // vertical filtered into v21/v22
    FILTER_6TAG_8BITS_TO_16BITS1 v19, v4, v7, v10, v13, v16, v22, v0, v1
    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
    st1 {v26.16b}, [x2], x5                 //write 0:15Byte : 5 line
    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
    st1 {v26.b}[0], [x2], x3                //write 17th Byte : 5 line

    ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1  // v17=src[9*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    // vertical filtered into v21/v22
    FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
    st1 {v26.16b}, [x2], x5                 //write 0:15Byte : 6 line
    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
    st1 {v26.b}[0], [x2], x3                //write 17th Byte : 6 line

    ld1 {v2.8b, v3.8b, v4.8b}, [x0], x1     // v2=src[10*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    // vertical filtered into v21/v22
    FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
    st1 {v26.16b}, [x2], x5                 //write 0:15Byte : 7 line
    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
    st1 {v26.b}[0], [x2], x3                //write 17th Byte : 7 line

    // rotate the five newest rows back into the v2/v5/v8/v11/v14 register groups
    // (v30 is scratch), one group of moves per 8-byte column
    mov v5.16b, v11.16b
    mov v11.16b, v17.16b
    mov v30.16b, v2.16b
    mov v2.16b, v8.16b
    mov v8.16b, v14.16b
    mov v14.16b, v30.16b

    mov v6.16b, v12.16b
    mov v12.16b, v18.16b
    mov v30.16b, v3.16b
    mov v3.16b, v9.16b
    mov v9.16b, v15.16b
    mov v15.16b, v30.16b

    mov v7.16b, v13.16b
    mov v13.16b, v19.16b
    mov v30.16b, v4.16b
    mov v4.16b, v10.16b
    mov v10.16b, v16.16b
    mov v16.16b, v30.16b

    sub x4, x4, #8
    cbnz x4, w17_hv_mc_luma_loop

    // final (height+1) row
    ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1  // v17=src[3*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    // vertical filtered into v21/v22
    FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
    UNPACK_2_16BITS_TO_ABC v21, v22, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
    st1 {v26.16b}, [x2], x5                 //write 0:15 Byte : 0 line
    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
    st1 {v26.b}[0], [x2], x3                //write 17th Byte : 0 line

    ldp d14, d15, [sp], #16
    ldp d12, d13, [sp], #16
    ldp d10, d11, [sp], #16
    ldp d8, d9, [sp], #16
WELS_ASM_AARCH64_FUNC_END

//void McHorVer22Width9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                    int32_t iHeight);//width+1&&height+1
// 9-pixel-wide variant of McHorVer22Width17 (same register assignment; one 16-byte load per
// source row). The loop emits 4 rows per iteration on iHeight-1 and one extra row follows it,
// so iHeight must be of the form 4*k+1 with k >= 1.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width9_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    sub x0, x0, #2
    sub x0, x0, x1, lsl #1
    movi v0.8h, #20, lsl #0
    movi v1.8h, #5, lsl #0
    sub x3, x3, #8              // the 8-byte store already advances x2 by x5=8
    mov x5, #8
    ldr q29, filter_para
    sub x4, x4, #1              // the last row is handled after the loop

    ld1 {v2.16b}, [x0], x1      // v2=src[-2*stride]
    ld1 {v3.16b}, [x0], x1      // v3=src[-1*stride]
    ld1 {v4.16b}, [x0], x1      // v4=src[0*stride]
    ld1 {v5.16b}, [x0], x1      // v5=src[1*stride]
    ld1 {v6.16b}, [x0], x1      // v6=src[2*stride]

w9_hv_mc_luma_loop:
    ld1 {v7.16b}, [x0], x1      // v7=src[3*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    st1 {v26.8b}, [x2], x5      //write 0:7Byte : 0 line
    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
    st1 {v26.b}[0], [x2], x3    //write 9th Byte : 0 line

    ld1 {v2.16b}, [x0], x1      // v2=src[4*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    st1 {v26.8b}, [x2], x5      //write 0:7Byte : 1 line
    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
    st1 {v26.b}[0], [x2], x3    //write 9th Byte : 1 line

    ld1 {v3.16b}, [x0], x1      // v3=src[5*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    st1 {v26.8b}, [x2], x5      //write 0:7Byte : 2 line
    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
    st1 {v26.b}[0], [x2], x3    //write 9th Byte : 2 line

    ld1 {v4.16b}, [x0], x1      // v4=src[6*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    st1 {v26.8b}, [x2], x5      //write 0:7Byte : 3 line
    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
    st1 {v26.b}[0], [x2], x3    //write 9th Byte : 3 line

    // rotate the five live rows back into v2..v6 for the next iteration (v30 is scratch)
    mov v5.16b, v3.16b
    mov v3.16b, v7.16b
    mov v30.16b, v2.16b
    mov v2.16b, v6.16b
    mov v6.16b, v4.16b
    mov v4.16b, v30.16b

    sub x4, x4, #4
    cbnz x4, w9_hv_mc_luma_loop

    // final (height+1) row
    ld1 {v7.16b}, [x0], x1      // v7=src[3*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    st1 {v26.8b}, [x2], x5      //write 0:7Byte : 0 line
    UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
    st1 {v26.b}[0], [x2], x3    //write 9th Byte : 0 line
WELS_ASM_AARCH64_FUNC_END

//void McHorVer22Width5_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                    int32_t iHeight);//width+1&&height+1
// 5-pixel-wide variant of McHorVer22Width9: the 5th pixel is taken from byte 4 of the same
// horizontal-filter result instead of a separate single-tap pass. filter_para is still loaded
// into v29 but is not passed to any macro in this function.
// The loop emits 4 rows per iteration on iHeight-1 and one extra row follows it, so iHeight
// must be of the form 4*k+1 with k >= 1.
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width5_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    sub x0, x0, #2
    sub x0, x0, x1, lsl #1
    movi v0.8h, #20, lsl #0
    movi v1.8h, #5, lsl #0
    sub x3, x3, #4              // the 4-byte store already advances x2 by x5=4
    mov x5, #4
    ldr q29, filter_para
    sub x4, x4, #1              // the last row is handled after the loop

    ld1 {v2.16b}, [x0], x1      // v2=src[-2*stride]
    ld1 {v3.16b}, [x0], x1      // v3=src[-1*stride]
    ld1 {v4.16b}, [x0], x1      // v4=src[0*stride]
    ld1 {v5.16b}, [x0], x1      // v5=src[1*stride]
    ld1 {v6.16b}, [x0], x1      // v6=src[2*stride]

w5_hv_mc_luma_loop:
    ld1 {v7.16b}, [x0], x1      // v7=src[3*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    st1 {v26.s}[0], [x2], x5    //write 0:3Byte : 0 line
    st1 {v26.b}[4], [x2], x3    //write 5th Byte : 0 line

    ld1 {v2.16b}, [x0], x1      // v2=src[4*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    st1 {v26.s}[0], [x2], x5    //write 0:3Byte : 1 line
    st1 {v26.b}[4], [x2], x3    //write 5th Byte : 1 line

    ld1 {v3.16b}, [x0], x1      // v3=src[5*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    st1 {v26.s}[0], [x2], x5    //write 0:3Byte : 2 line
    st1 {v26.b}[4], [x2], x3    //write 5th Byte : 2 line

    ld1 {v4.16b}, [x0], x1      // v4=src[6*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    st1 {v26.s}[0], [x2], x5    //write 0:3Byte : 3 line
    st1 {v26.b}[4], [x2], x3    //write 5th Byte : 3 line

    // rotate the five live rows back into v2..v6 for the next iteration (v30 is scratch)
    mov v5.16b, v3.16b
    mov v3.16b, v7.16b
    mov v30.16b, v2.16b
    mov v2.16b, v6.16b
    mov v6.16b, v4.16b
    mov v4.16b, v30.16b

    sub x4, x4, #4
    cbnz x4, w5_hv_mc_luma_loop

    // final (height+1) row
    ld1 {v7.16b}, [x0], x1      // v7=src[3*stride]
    // vertical filtered into v20/v21
    FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
    FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
    // horizon filtered
    UNPACK_2_16BITS_TO_ABC v20, v21, v23, v24, v25
    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
    st1 {v26.s}[0], [x2], x5    //write 0:3Byte : 0 line
    st1 {v26.b}[4], [x2], x3    //write 5th Byte : 0 line
WELS_ASM_AARCH64_FUNC_END

//void McHorVer02Height17_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
//                                      int32_t iHeight);// height+1
WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02Height17_AArch64_neon
    SIGN_EXTENSION x1,w1
    SIGN_EXTENSION x3,w3
    SIGN_EXTENSION x4,w4
    sub x0, x0, x1, lsl #1
    movi v0.8h, #20, lsl #0
    movi v1.8h, #5, lsl #0
    sub x4, x4, #1

    //prfm pldl1strm, [x0]
    //prfm pldl1strm, [x0, x1]
    ld1 {v2.16b}, [x0], x1 // v2=src[-2*stride]
    //prfm pldl1strm, [x0, x1]
    ld1 {v3.16b}, [x0], x1 // v3=src[-1*stride]
    //prfm pldl1strm, [x0, x1]
    ld1 {v4.16b}, [x0], x1 // v4=src[0*stride]
    //prfm pldl1strm, [x0, x1]
    ld1 {v5.16b}, [x0], x1 // v5=src[1*stride]
    //prfm pldl1strm, [x0, x1]
    ld1 {v6.16b}, [x0], x1 // v6=src[2*stride]


w17_v_mc_luma_loop:
//prfm pldl1strm, [x0, x1] 2427 ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] 2428 FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 2429 FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1 2430 st1 {v20.16b}, [x2], x3 //write 16Byte : 0 line 2431 2432 2433 //prfm pldl1strm, [x0, x1] 2434 ld1 {v2.16b}, [x0], x1 // v2=src[4*stride] 2435 FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1 2436 FILTER_6TAG_8BITS2 v3, v4, v5, v6, v7, v2, v20, v0, v1 2437 st1 {v20.16b}, [x2], x3 //write 16Byte : 1 line 2438 2439 2440 //prfm pldl1strm, [x0, x1] 2441 ld1 {v3.16b}, [x0], x1 // v3=src[5*stride] 2442 FILTER_6TAG_8BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1 2443 FILTER_6TAG_8BITS2 v4, v5, v6, v7, v2, v3, v20, v0, v1 2444 st1 {v20.16b}, [x2], x3 //write 16Byte : 2 line 2445 2446 2447 //prfm pldl1strm, [x0, x1] 2448 ld1 {v4.16b}, [x0], x1 // v4=src[6*stride] 2449 FILTER_6TAG_8BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1 2450 FILTER_6TAG_8BITS2 v5, v6, v7, v2, v3, v4, v20, v0, v1 2451 st1 {v20.16b}, [x2], x3 //write 16Byte : 3 line 2452 2453 2454 //prfm pldl1strm, [x0, x1] 2455 ld1 {v5.16b}, [x0], x1 // v5=src[7*stride] 2456 FILTER_6TAG_8BITS1 v6, v7, v2, v3, v4, v5, v20, v0, v1 2457 FILTER_6TAG_8BITS2 v6, v7, v2, v3, v4, v5, v20, v0, v1 2458 st1 {v20.16b}, [x2], x3 //write 16Byte : 4 line 2459 2460 2461 //prfm pldl1strm, [x0, x1] 2462 ld1 {v6.16b}, [x0], x1 // v6=src[8*stride] 2463 FILTER_6TAG_8BITS1 v7, v2, v3, v4, v5, v6, v20, v0, v1 2464 FILTER_6TAG_8BITS2 v7, v2, v3, v4, v5, v6, v20, v0, v1 2465 st1 {v20.16b}, [x2], x3 //write 16Byte : 5 line 2466 2467 //prfm pldl1strm, [x0, x1] 2468 ld1 {v7.16b}, [x0], x1 // v7=src[9*stride] 2469 FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 2470 FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1 2471 st1 {v20.16b}, [x2], x3 //write 16Byte : 6 line 2472 2473 //prfm pldl1strm, [x0, x1] 2474 ld1 {v2.16b}, [x0], x1 // v2=src[10*stride] 2475 FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1 2476 FILTER_6TAG_8BITS2 v3, v4, 
v5, v6, v7, v2, v20, v0, v1 2477 st1 {v20.16b}, [x2], x3 //write 16Byte : 7 line 2478 2479 mov v3.16b, v5.16b 2480 mov v5.16b, v7.16b 2481 mov v7.16b, v2.16b 2482 mov v2.16b, v4.16b 2483 mov v4.16b, v6.16b 2484 mov v6.16b, v7.16b 2485 sub x4, x4, #8 2486 cbnz x4, w17_v_mc_luma_loop 2487 2488 //prfm pldl1strm, [x0, x1] 2489 ld1 {v7.16b}, [x0], x1 // v7=src[3*stride] 2490 FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 2491 FILTER_6TAG_8BITS2 v2, v3, v4, v5, v6, v7, v20, v0, v1 2492 st1 {v20.16b}, [x2], x3 //write 16Byte : last line 2493WELS_ASM_AARCH64_FUNC_END 2494//void McHorVer02Height9_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, 2495// int32_t iHeight);// height+1 2496WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02Height9_AArch64_neon 2497 SIGN_EXTENSION x1,w1 2498 SIGN_EXTENSION x3,w3 2499 SIGN_EXTENSION x4,w4 2500 sub x0, x0, x1, lsl #1 2501 movi v0.8h, #20, lsl #0 2502 movi v1.8h, #5, lsl #0 2503 sub x4, x4, #1 2504 2505 //prfm pldl1strm, [x0] 2506 //prfm pldl1strm, [x0, x1] 2507 ld1 {v2.8b}, [x0], x1 // v2=src[-2*stride] 2508 //prfm pldl1strm, [x0, x1] 2509 ld1 {v3.8b}, [x0], x1 // v3=src[-1*stride] 2510 //prfm pldl1strm, [x0, x1] 2511 ld1 {v4.8b}, [x0], x1 // v4=src[0*stride] 2512 //prfm pldl1strm, [x0, x1] 2513 ld1 {v5.8b}, [x0], x1 // v5=src[1*stride] 2514 //prfm pldl1strm, [x0, x1] 2515 ld1 {v6.8b}, [x0], x1 // v6=src[2*stride] 2516 2517w9_v_mc_luma_loop: 2518 //prfm pldl1strm, [x0, x1] 2519 ld1 {v7.8b}, [x0], x1 // v7=src[3*stride] 2520 FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 2521 st1 {v20.8b}, [x2], x3 //write 8Byte : 0 line 2522 2523 //prfm pldl1strm, [x0, x1] 2524 ld1 {v2.8b}, [x0], x1 // v2=src[4*stride] 2525 FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1 2526 st1 {v20.8b}, [x2], x3 //write 8Byte : 1 line 2527 2528 //prfm pldl1strm, [x0, x1] 2529 ld1 {v3.8b}, [x0], x1 // v3=src[5*stride] 2530 FILTER_6TAG_8BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1 2531 st1 {v20.8b}, [x2], x3 //write 
8Byte : 2 line 2532 2533 //prfm pldl1strm, [x0, x1] 2534 ld1 {v4.8b}, [x0], x1 // v4=src[6*stride] 2535 FILTER_6TAG_8BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1 2536 st1 {v20.8b}, [x2], x3 //write 8Byte : 3 line 2537 2538 mov v5.16b, v3.16b 2539 mov v3.16b, v7.16b 2540 mov v7.16b, v2.16b 2541 mov v2.16b, v6.16b 2542 mov v6.16b, v4.16b 2543 mov v4.16b, v7.16b 2544 sub x4, x4, #4 2545 cbnz x4, w9_v_mc_luma_loop 2546 2547 //prfm pldl1strm, [x0, x1] 2548 ld1 {v7.8b}, [x0], x1 // v7=src[3*stride] 2549 FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 2550 st1 {v20.8b}, [x2], x3 //write 8Byte : 0 line 2551WELS_ASM_AARCH64_FUNC_END 2552 2553//void McHorVer02Height5_AArch64_neon (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, 2554// int32_t iHeight);// height+1 2555WELS_ASM_AARCH64_FUNC_BEGIN McHorVer02Height5_AArch64_neon 2556 SIGN_EXTENSION x1,w1 2557 SIGN_EXTENSION x3,w3 2558 SIGN_EXTENSION x4,w4 2559 sub x0, x0, x1, lsl #1 2560 movi v0.8h, #20, lsl #0 2561 movi v1.8h, #5, lsl #0 2562 sub x4, x4, #1 2563 2564 //prfm pldl1strm, [x0] 2565 //prfm pldl1strm, [x0, x1] 2566 ld1 {v2.8b}, [x0], x1 // v2=src[-2*stride] 2567 //prfm pldl1strm, [x0, x1] 2568 ld1 {v3.8b}, [x0], x1 // v3=src[-1*stride] 2569 //prfm pldl1strm, [x0, x1] 2570 ld1 {v4.8b}, [x0], x1 // v4=src[0*stride] 2571 //prfm pldl1strm, [x0, x1] 2572 ld1 {v5.8b}, [x0], x1 // v5=src[1*stride] 2573 //prfm pldl1strm, [x0, x1] 2574 ld1 {v6.8b}, [x0], x1 // v6=src[2*stride] 2575 2576w5_v_mc_luma_loop: 2577 //prfm pldl1strm, [x0, x1] 2578 ld1 {v7.8b}, [x0], x1 // v7=src[3*stride] 2579 FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 2580 st1 {v20.s}[0], [x2], x3 //write 4Byte : 0 line 2581 2582 //prfm pldl1strm, [x0, x1] 2583 ld1 {v2.8b}, [x0], x1 // v2=src[4*stride] 2584 FILTER_6TAG_8BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1 2585 st1 {v20.s}[0], [x2], x3 //write 4Byte : 1 line 2586 2587 //prfm pldl1strm, [x0, x1] 2588 ld1 {v3.8b}, [x0], x1 // v3=src[5*stride] 2589 FILTER_6TAG_8BITS1 
v4, v5, v6, v7, v2, v3, v20, v0, v1 2590 st1 {v20.s}[0], [x2], x3 //write 4Byte : 2 line 2591 2592 //prfm pldl1strm, [x0, x1] 2593 ld1 {v4.8b}, [x0], x1 // v4=src[6*stride] 2594 FILTER_6TAG_8BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1 2595 st1 {v20.s}[0], [x2], x3 //write 4Byte : 3 line 2596 2597 mov v5.16b, v3.16b 2598 mov v3.16b, v7.16b 2599 mov v7.16b, v2.16b 2600 mov v2.16b, v6.16b 2601 mov v6.16b, v4.16b 2602 mov v4.16b, v7.16b 2603 sub x4, x4, #4 2604 cbnz x4, w5_v_mc_luma_loop 2605 2606 //prfm pldl1strm, [x0, x1] 2607 ld1 {v7.8b}, [x0], x1 // v7=src[3*stride] 2608 FILTER_6TAG_8BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1 2609 st1 {v20.s}[0], [x2], x3 //write 4Byte : 0 line 2610 2611WELS_ASM_AARCH64_FUNC_END 2612 2613#endif 2614 2615