1/*! 2 * \copy 3 * Copyright (c) 2013, Cisco Systems 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * * Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 13 * * Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 21 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 22 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 24 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 28 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 * POSSIBILITY OF SUCH DAMAGE. 
 *
 */

#ifdef HAVE_NEON_AARCH64

#include "arm_arch64_common_macro.S"

// Build the per-pixel "need filter" mask:
//   |p0-q0| < alpha  &&  |p1-p0| < beta  &&  |q1-q0| < beta
// in:  arg0=p1, arg1=p0, arg2=q0, arg3=q1, arg4=alpha, arg5=beta
// out: arg6 = mask (all-ones bytes where the condition holds)
// NOTE: arg4 (alpha) is clobbered and reused as scratch.
.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
    uabd    \arg6\().16b, \arg1\().16b, \arg2\().16b    // |p0-q0|
    cmhi    \arg6\().16b, \arg4\().16b, \arg6\().16b    // alpha > |p0-q0|

    uabd    \arg4\().16b, \arg0\().16b, \arg1\().16b    // |p1-p0|
    cmhi    \arg4\().16b, \arg5\().16b, \arg4\().16b    // beta > |p1-p0|
    and     \arg6\().16b, \arg6\().16b, \arg4\().16b

    uabd    \arg4\().16b, \arg3\().16b, \arg2\().16b    // |q1-q0|
    cmhi    \arg4\().16b, \arg5\().16b, \arg4\().16b    // beta > |q1-q0|
    and     \arg6\().16b, \arg6\().16b, \arg4\().16b
.endm

// Luma bS<4 filter for the p1/q1 sample.
// arg0..arg3 = p2,p1,p0,q0 (or mirrored q2,q1,q0,p0), arg4 = beta,
// arg5 = -Tc0, arg6 = Tc0, arg7 = filter flag,
// arg8 = filtered p1/q1 out, arg9 = |p2-p0|<beta mask (abs'd at end).
// Typical operands: v0, v1, v2, v3, v17(beta), v18(-Tc0), v6(Tc0), v7(flag), v19, v20
.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
    urhadd  \arg8\().16b, \arg2\().16b, \arg3\().16b    // (p0+q0+1)>>1
    uhadd   \arg8\().16b, \arg0\().16b, \arg8\().16b    // (p2 + above)>>1
    usubl   \arg9\().8h, \arg8\().8b, \arg1\().8b       // low half: delta = above - p1
    sqxtn   \arg9\().8b, \arg9\().8h
    usubl2  \arg8\().8h, \arg8\().16b, \arg1\().16b     // high half
    sqxtn2  \arg9\().16b, \arg8\().8h
    smax    \arg8\().16b, \arg9\().16b, \arg5\().16b    // clip delta to [-Tc0, Tc0]
    smin    \arg8\().16b, \arg8\().16b, \arg6\().16b
    uabd    \arg9\().16b, \arg0\().16b, \arg2\().16b    // |p2-p0|
    cmhi    \arg9\().16b, \arg4\().16b, \arg9\().16b    // beta > |p2-p0|
    and     \arg8\().16b, \arg8\().16b, \arg9\().16b    // apply |p2-p0|<beta gate
    and     \arg8\().16b, \arg8\().16b, \arg7\().16b    // apply filter flag
    add     \arg8\().16b, \arg1\().16b, \arg8\().16b    // p1 + clipped delta
    abs     \arg9\().16b, \arg9\().16b                  // mask -> 0/1 per byte (Tc adjust)
.endm

// Luma/chroma bS<4 p0/q0 delta, low 8 lanes:
//   delta = ((q0-p0)<<2 + (p1-q1) + 4) >> 3, saturated to s8.
// arg0..arg3 = p1,p0,q0,q1; arg4 = delta out (8b); arg5,arg6 = scratch.
.macro DIFF_LUMA_LT4_P0_Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6
    usubl   \arg5\().8h, \arg0\().8b, \arg3\().8b       // p1 - q1
    usubl   \arg6\().8h, \arg2\().8b, \arg1\().8b       // q0 - p0
    shl     \arg6\().8h, \arg6\().8h, #2                // (q0-p0)*4
    add     \arg5\().8h, \arg5\().8h, \arg6\().8h
    sqrshrn \arg4\().8b, \arg5\().8h, #3                // rounding >>3 with saturation
.endm

// Same as DIFF_LUMA_LT4_P0_Q0_1 for the high 8 lanes (narrows into arg4's top half).
.macro DIFF_LUMA_LT4_P0_Q0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6
    usubl2  \arg5\().8h, \arg0\().16b, \arg3\().16b
    usubl2  \arg6\().8h, \arg2\().16b, \arg1\().16b
    shl     \arg6\().8h, \arg6\().8h, #2
    add     \arg5\().8h, \arg5\().8h, \arg6\().8h
    sqrshrn2 \arg4\().16b, \arg5\().8h, #3
.endm

// Split signed delta arg0 into positive part (arg1) and negative magnitude (arg0)
// so it can be applied with uqadd/uqsub.
.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
    cmge    \arg1\().16b, \arg0\().16b, #0              // mask of non-negative lanes
    and     \arg1\().16b, \arg0\().16b, \arg1\().16b    // arg1 = max(delta, 0)
    sub     \arg0\().16b, \arg1\().16b, \arg0\().16b    // arg0 = max(-delta, 0)
.endm

// Luma bS==4 strong filter, low 8 lanes: computes candidate p0/p1/p2
// (or mirrored q0/q1/q2) per the H.264 strong-filter formulas.
// arg0=p3(in)/p1'(out low), arg1=p2, arg2=p1, arg3=p0, arg4=q0, arg5=q1,
// arg6=select mask (bsl), arg7=p0'(3-tap) out, arg8/arg9 = scratch.
.macro DIFF_LUMA_EQ4_P2P1P0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
    uaddl   \arg8\().8h, \arg1\().8b, \arg2\().8b       // p2+p1
    uaddl   \arg9\().8h, \arg3\().8b, \arg4\().8b       // p0+q0
    add     \arg9\().8h, \arg9\().8h, \arg8\().8h       // p2+p1+p0+q0

    uaddl   \arg8\().8h, \arg0\().8b, \arg1\().8b       // p3+p2
    shl     \arg8\().8h, \arg8\().8h, #1
    add     \arg8\().8h, \arg9\().8h, \arg8\().8h       // + 2*(p3+p2)

    rshrn   \arg0\().8b, \arg9\().8h, #2                // p1' = (p2+p1+p0+q0+2)>>2
    rshrn   \arg7\().8b, \arg8\().8h, #3                // p2' = (2p3+3p2+p1+p0+q0+4)>>3
    shl     \arg9\().8h, \arg9\().8h, #1
    usubl   \arg8\().8h, \arg5\().8b, \arg1\().8b       // q1-p2
    add     \arg9\().8h, \arg8\().8h, \arg9\().8h       // p2+2p1+2p0+2q0+q1

    uaddl   \arg8\().8h, \arg2\().8b, \arg5\().8b       // p1+q1
    uaddw   \arg8\().8h, \arg8\().8h, \arg2\().8b       // +p1
    uaddw   \arg8\().8h, \arg8\().8h, \arg3\().8b       // +p0

    rshrn   \arg9\().8b, \arg9\().8h, #3                // strong p0'
    rshrn   \arg8\().8b, \arg8\().8h, #2                // weak p0' (3-tap)
    bsl     \arg6\().8b, \arg9\().8b, \arg8\().8b       // select by mask
.endm

// High-8-lane twin of DIFF_LUMA_EQ4_P2P1P0_1 (uses *2 widening ops, narrows
// into the top halves of the destination registers).
.macro DIFF_LUMA_EQ4_P2P1P0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
    uaddl2  \arg8\().8h, \arg1\().16b, \arg2\().16b
    uaddl2  \arg9\().8h, \arg3\().16b, \arg4\().16b
    add     \arg9\().8h, \arg9\().8h, \arg8\().8h

    uaddl2  \arg8\().8h, \arg0\().16b, \arg1\().16b
    shl     \arg8\().8h, \arg8\().8h, #1
    add     \arg8\().8h, \arg9\().8h, \arg8\().8h

    rshrn2  \arg0\().16b, \arg9\().8h, #2
    rshrn2  \arg7\().16b, \arg8\().8h, #3
    shl     \arg9\().8h, \arg9\().8h, #1
    usubl2  \arg8\().8h, \arg5\().16b, \arg1\().16b
    add     \arg9\().8h, \arg8\().8h, \arg9\().8h

    uaddl2  \arg8\().8h, \arg2\().16b, \arg5\().16b
    uaddw2  \arg8\().8h, \arg8\().8h, \arg2\().16b
    uaddw2  \arg8\().8h, \arg8\().8h, \arg3\().16b

    rshrn2  \arg9\().16b, \arg9\().8h, #3
    rshrn2  \arg8\().16b, \arg8\().8h, #2
    bsl     \arg6\().16b, \arg9\().16b, \arg8\().16b
.endm

// Chroma bS==4 filter, low 8 lanes:
//   p0' = (2*p1 + p0_pair...) per H.264: arg6 = (2p1+p0+q1+2)>>2, arg7 = (2q1+q0+p1+2)>>2
// arg0=p1, arg1=p0, arg2=q0, arg3=q1; arg4/arg5 scratch.
.macro DIFF_CHROMA_EQ4_P0Q0_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
    uaddl   \arg4\().8h, \arg0\().8b, \arg3\().8b       // p1+q1
    shl     \arg4\().8h, \arg4\().8h, #1                // 2*(p1+q1)
    usubl   \arg5\().8h, \arg1\().8b, \arg3\().8b       // p0-q1
    add     \arg5\().8h, \arg5\().8h, \arg4\().8h       // 2p1+p0+q1
    rshrn   \arg6\().8b, \arg5\().8h, #2                // p0'
    usubl   \arg5\().8h, \arg2\().8b, \arg0\().8b       // q0-p1
    add     \arg5\().8h, \arg5\().8h, \arg4\().8h       // 2q1+q0+p1
    rshrn   \arg7\().8b, \arg5\().8h, #2                // q0'
.endm

// High-8-lane twin of DIFF_CHROMA_EQ4_P0Q0_1.
.macro DIFF_CHROMA_EQ4_P0Q0_2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
    uaddl2  \arg4\().8h, \arg0\().16b, \arg3\().16b
    shl     \arg4\().8h, \arg4\().8h, #1
    usubl2  \arg5\().8h, \arg1\().16b, \arg3\().16b
    add     \arg5\().8h, \arg5\().8h, \arg4\().8h
    rshrn2  \arg6\().16b, \arg5\().8h, #2
    usubl2  \arg5\().8h, \arg2\().16b, \arg0\().16b
    add     \arg5\().8h, \arg5\().8h, \arg4\().8h
    rshrn2  \arg7\().16b, \arg5\().8h, #2
.endm

// arg3 = per-byte select(arg2 mask ? arg0 : arg1); arg2 is preserved.
.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
    mov     \arg3\().16b, \arg2\().16b
    bsl     \arg3\().16b, \arg0\().16b, \arg1\().16b
.endm

// Transposed load for the horizontal luma bS<4 filter: 3 bytes per row
// from each side of the edge (x2 = left side, x0 = right side), lane arg6.
.macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
    ld3     {\arg0\().b, \arg1\().b, \arg2\().b} [\arg6], [x2], x1
    ld3     {\arg3\().b, \arg4\().b, \arg5\().b} [\arg6], [x0], x1
.endm

// Transposed load for the horizontal luma bS==4 filter: 4 bytes per row
// from each side of the edge (x3 = left side, x0 = right side), lane arg8.
.macro LOAD_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
    ld4     {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg8], [x3], x1
    ld4     {\arg4\().b, \arg5\().b, \arg6\().b, \arg7\().b} [\arg8], [x0], x1
.endm

// Store two interleaved 4-byte rows (lanes arg4 and arg5) back across the edge.
.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
    st4     {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg4], [x0], x1
    st4     {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg5], [x2], x1
.endm

// Store 3 filtered bytes per row on each side of the edge, lane arg6.
.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
    st3     {\arg0\().b, \arg1\().b, \arg2\().b} [\arg6], [x3], x1
    st3     {\arg3\().b, \arg4\().b, \arg5\().b} [\arg6], [x0], x1
.endm

// Transposed chroma load: 4 bytes from [arg4] into lane arg5, advance by stride x2.
.macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
    ld4     {\arg0\().b, \arg1\().b, \arg2\().b, \arg3\().b} [\arg5], [\arg4], x2
.endm

// Transposed chroma store: 2 bytes (p0', q0') from lane arg3 to [arg2], advance by x2.
.macro STORE_CHROMA_DATA_2 arg0, arg1, arg2, arg3
    st2     {\arg0\().b, \arg1\().b} [\arg3], [\arg2], x2
.endm

// Early-out: if the 128-bit mask arg0 is all zero, branch to arg3.
// arg1/arg2 are scratch GPRs (clobbered).
.macro ZERO_JUMP_END arg0, arg1, arg2, arg3
    mov     \arg1, \arg0\().d[0]
    mov     \arg2, \arg0\().d[1]
    orr     \arg1, \arg1, \arg2
    cbz     \arg1, \arg3
.endm

// Gather non-zero-coefficient counts of the current MB plus its top/left
// neighbours and form per-edge sums.
// arg0 = nzc pointer, arg1 = neighbour-availability flags (bit1=top, bit0=left),
// arg2 = stride-ish scale, arg3 = top-dir result, arg4 = left-dir result.
// Clobbers x6, x7, v0, v1, v2.
.macro BS_NZC_CHECK arg0, arg1, arg2, arg3, arg4
    ld1     {v0.16b}, [\arg0]
    // Arrange the input data --- TOP
    ands    x6, \arg1, #2
    cbz     x6, bs_nzc_check_jump0
    sub     x6, \arg0, \arg2, lsl #4
    sub     x6, x6, \arg2, lsl #3
    add     x6, x6, #12
    ld1     {v1.s} [3], [x6]                            // bottom row of top neighbour

bs_nzc_check_jump0:
    ext     v1.16b, v1.16b, v0.16b, #12
    add     \arg3\().16b, v0.16b, v1.16b                // cur + above-row nzc

    // Arrange the input data --- LEFT
    ands    x6, \arg1, #1
    cbz     x6, bs_nzc_check_jump1

    sub     x6, \arg0, #21
    add     x7, x6, #4
    ld1     {v1.b} [12], [x6]                           // right column of left neighbour
    add     x6, x7, #4
    ld1     {v1.b} [13], [x7]
    add     x7, x6, #4
    ld1     {v1.b} [14], [x6]
    ld1     {v1.b} [15], [x7]

bs_nzc_check_jump1:
    // Transpose current-MB nzc so rows become columns for the LEFT direction.
    ins     v2.d[0], v0.d[1]
    zip1    v0.16b, v0.16b, v2.16b
    ins     v2.d[0], v0.d[1]
    zip1    v0.16b, v0.16b, v2.16b
    ext     v1.16b, v1.16b, v0.16b, #12
    add     \arg4\().16b, v0.16b, v1.16b                // cur + left-column nzc
.endm

// Compare 5 rows of motion vectors: sets arg5 lanes where any MV component
// of adjacent rows differs by >= 4 (quarter-pel units).
// in: arg0, arg1(const), arg2(const), arg3(const), arg4(const); out: arg5
// Clobbers w6, v20-v23; arg0 is overwritten with the threshold.
.macro BS_COMPARE_MV arg0, arg1, arg2, arg3, arg4, arg5
    mov     w6, #4
    sabd    v20.8h, \arg0\().8h, \arg1\().8h
    sabd    v21.8h, \arg1\().8h, \arg2\().8h
    dup     \arg0\().8h, w6                             // threshold = 4
    sabd    v22.8h, \arg2\().8h, \arg3\().8h
    sabd    v23.8h, \arg3\().8h, \arg4\().8h

    cmge    v20.8h, v20.8h, \arg0\().8h
    cmge    v21.8h, v21.8h, \arg0\().8h
    cmge    v22.8h, v22.8h, \arg0\().8h
    cmge    v23.8h, v23.8h, \arg0\().8h

    addp    v20.8h, v20.8h, v21.8h                      // fold x/y components
    addp    v21.8h, v22.8h, v23.8h

    addhn   \arg5\().8b, v20.8h, v20.8h                 // narrow to per-edge bytes
    addhn2  \arg5\().16b, v21.8h, v21.8h
.endm

// Load the current MB's MVs (plus top/left neighbour MVs when available)
// and produce per-edge MV-difference flags for both directions.
// arg0 = mv pointer, arg1 = availability flags, arg2 = scale,
// arg3 = top-dir result, arg4 = left-dir result, arg5/arg6 = scratch vregs.
// Clobbers x6, x7, v0-v4, v20-v23.
.macro BS_MV_CHECK arg0, arg1, arg2, arg3, arg4, arg5, arg6
    ldp     q0, q1, [\arg0], #32
    ldp     q2, q3, [\arg0]
    sub     \arg0, \arg0, #32
    // Arrange the input data --- TOP
    ands    x6, \arg1, #2
    cbz     x6, bs_mv_check_jump0
    sub     x6, \arg0, \arg2, lsl #6
    add     x6, x6, #48
    ld1     {v4.16b}, [x6]                              // bottom MV row of top neighbour
bs_mv_check_jump0:
    BS_COMPARE_MV v4, v0, v1, v2, v3, \arg3
    // Arrange the input data --- LEFT
    ands    x6, \arg1, #1
    cbz     x6, bs_mv_check_jump1
    sub     x6, \arg0, #52
    add     x7, x6, #16
    ld1     {v4.s} [0], [x6]                            // right MV column of left neighbour
    add     x6, x7, #16
    ld1     {v4.s} [1], [x7]
    add     x7, x6, #16
    ld1     {v4.s} [2], [x6]
    ld1     {v4.s} [3], [x7]
bs_mv_check_jump1:
    // 4x4 transpose of the MV matrix (32-bit elements) for the LEFT direction.
    zip1    \arg5\().4s, v0.4s, v2.4s
    zip2    \arg6\().4s, v0.4s, v2.4s
    zip1    v0.4s, v1.4s, v3.4s
    zip2    v2.4s, v1.4s, v3.4s
    zip2    v1.4s, \arg5\().4s, v0.4s
    zip1    v0.4s, \arg5\().4s, v0.4s
    zip2    v3.4s, \arg6\().4s, v2.4s
    zip1    v2.4s, \arg6\().4s, v2.4s
    BS_COMPARE_MV v4, v0, v1, v2, v3, \arg4
.endm

// void WelsNonZeroCount_AArch64_neon(int8_t* pNonZeroCount)
// Clamp 24 nzc entries at [x0] to at most 1 (saturate counts to a 0/1 flag).
WELS_ASM_AARCH64_FUNC_BEGIN WelsNonZeroCount_AArch64_neon
    mov     w1, #1
    dup     v3.8b, w1
    ld1     {v0.8b, v1.8b, v2.8b}, [x0]
    umin    v0.8b, v0.8b, v3.8b
    umin    v1.8b, v1.8b, v3.8b
    umin    v2.8b, v2.8b, v3.8b
    st1     {v0.8b, v1.8b, v2.8b}, [x0]
WELS_ASM_AARCH64_FUNC_END


// Vertical-edge luma deblocking, bS < 4.
// (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc)
WELS_ASM_AARCH64_FUNC_BEGIN DeblockLumaLt4V_AArch64_neon //uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc
    dup     v16.16b, w2                 //alpha
    dup     v17.16b, w3                 //beta
    SIGN_EXTENSION x1,w1
    add     x2, x1, x1, lsl #1
    sub     x2, x0, x2                  // x2 = pPix - 3*stride  (p2 row)
    movi    v23.16b, #128
    ld1     {v0.16b}, [x2], x1          // p2
    ld1     {v1.16b}, [x2], x1          // p1
    ld1     {v2.16b}, [x2]              // p0
    ld1     {v3.16b}, [x0], x1          // q0
    ld1     {v4.16b}, [x0], x1          // q1
    ld1     {v5.16b}, [x0]              // q2
    sub     x2, x2, x1                  // rewind to p1 row for stores
    ld4r    {v18.8b, v19.8b, v20.8b, v21.8b}, [x4]
    trn1    v18.2s, v18.2s, v19.2s
    trn1    v20.2s, v20.2s, v21.2s
    trn1    v6.2d, v18.2d, v20.2d       // iTc0: 0000, 1111, 2222, 3333
    cmge    v7.16b, v6.16b, #0          // iTc0 Flag (iTc0 >= 0)

    MASK_MATRIX v1, v2, v3, v4, v16, v17, v18
    and     v7.16b, v7.16b, v18.16b     // need filter flag

    ZERO_JUMP_END v7, x3, x4, DeblockLumaLt4V_AArch64_neon_end

    eor     v18.16b, v18.16b, v18.16b
    sub     v18.16b, v18.16b, v6.16b    // -iTc0: 0000, 1111, 2222, 3333

    DIFF_LUMA_LT4_P1_Q1 v0, v1, v2, v3, v17, v18, v6, v7, v19, v20
    st1     {v19.16b}, [x2], x1         // store filtered p1

    DIFF_LUMA_LT4_P1_Q1 v5, v4, v3, v2, v17, v18, v6, v7, v21, v22

    // iTc = iTc0 + (|p2-p0|<beta) + (|q2-q0|<beta)
    abs     v20.16b, v20.16b
    abs     v22.16b, v22.16b
    add     v6.16b, v6.16b, v20.16b
    add     v6.16b, v6.16b, v22.16b
    eor     v18.16b, v18.16b, v18.16b
    sub     v18.16b, v18.16b, v6.16b    // -iTc

    DIFF_LUMA_LT4_P0_Q0_1 v1, v2, v3, v4, v19, v20, v22
    DIFF_LUMA_LT4_P0_Q0_2 v1, v2, v3, v4, v19, v20, v22

    smax    v19.16b, v19.16b, v18.16b   // clip delta to [-iTc, iTc]
    smin    v19.16b, v19.16b, v6.16b
    and     v19.16b, v19.16b, v7.16b    // gate by filter flag

    EXTRACT_DELTA_INTO_TWO_PART v19, v20
    uqadd   v2.16b, v2.16b, v20.16b     // p0' = clip(p0 + delta)
    uqsub   v2.16b, v2.16b, v19.16b
    st1     {v2.16b}, [x2], x1
    uqsub   v3.16b, v3.16b, v20.16b     // q0' = clip(q0 - delta)
    uqadd   v3.16b, v3.16b, v19.16b
    st1     {v3.16b}, [x2], x1
    st1     {v21.16b}, [x2]             // store filtered q1
DeblockLumaLt4V_AArch64_neon_end:
WELS_ASM_AARCH64_FUNC_END


// Vertical-edge luma deblocking, bS == 4 (strong filter).
// (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta)
WELS_ASM_AARCH64_FUNC_BEGIN DeblockLumaEq4V_AArch64_neon
    dup     v16.16b, w2                 //alpha
    dup     v17.16b, w3                 //beta
    SIGN_EXTENSION x1,w1
    sub     x3, x0, x1, lsl #2          // x3 = pPix - 4*stride (p3 row)
    ld1     {v0.16b}, [x3], x1          // p3
    ld1     {v4.16b}, [x0], x1          // q0
    ld1     {v1.16b}, [x3], x1          // p2
    ld1     {v5.16b}, [x0], x1          // q1
    ld1     {v2.16b}, [x3], x1          // p1
    ld1     {v6.16b}, [x0], x1          // q2
    ld1     {v3.16b}, [x3]              // p0
    ld1     {v7.16b}, [x0]              // q3

    sub     x3, x3, x1, lsl #1          // rewind to p2 row for stores
    MASK_MATRIX v2, v3, v4, v5, v16, v17, v18
    lsr     w2, w2, #2
    add     w2, w2, #2
    dup     v16.16b, w2                 //((alpha >> 2) + 2)
    uabd    v19.16b, v3.16b, v4.16b
    cmhi    v20.16b, v16.16b, v19.16b   //iDetaP0Q0 < ((iAlpha >> 2) + 2)

    uabd    v21.16b, v1.16b, v3.16b
    cmhi    v21.16b, v17.16b, v21.16b   //bDetaP2P0
    and     v21.16b, v21.16b, v20.16b   //(iDetaP0Q0 < ((iAlpha >> 2) + 2)) && bDetaP2P0

    uabd    v22.16b, v6.16b, v4.16b
    cmhi    v22.16b, v17.16b, v22.16b   //bDetaQ2Q0
    and     v22.16b, v22.16b, v20.16b   //(iDetaP0Q0 < ((iAlpha >> 2) + 2)) && bDetaQ2Q0
    and     v20.16b, v20.16b, v18.16b   //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0 && (iDetaP0Q0 < ((iAlpha >> 2) + 2))

    mov     v23.16b, v21.16b
    mov     v24.16b, v21.16b

    // P-side: compute strong-filter candidates in two 8-lane halves.
    mov     v25.16b, v0.16b
    DIFF_LUMA_EQ4_P2P1P0_1 v0, v1, v2, v3, v4, v5, v23, v19, v17, v16
    DIFF_LUMA_EQ4_P2P1P0_2 v25, v1, v2, v3, v4, v5, v24, v19, v17, v16
    ins     v0.d[1], v25.d[1]
    ins     v23.d[1], v24.d[1]
    and     v21.16b, v20.16b, v21.16b
    DIFF_LUMA_EQ4_MASK v19, v1, v21, v17
    st1     {v17.16b}, [x3], x1         // p2'
    DIFF_LUMA_EQ4_MASK v0, v2, v21, v17
    st1     {v17.16b}, [x3], x1         // p1'
    DIFF_LUMA_EQ4_MASK v23, v3, v18, v17
    st1     {v17.16b}, [x3], x1         // p0'

    // Q-side: mirrored operand order.
    mov     v23.16b, v22.16b
    mov     v24.16b, v22.16b
    mov     v25.16b, v7.16b
    DIFF_LUMA_EQ4_P2P1P0_1 v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
    DIFF_LUMA_EQ4_P2P1P0_2 v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
    ins     v7.d[1], v25.d[1]
    ins     v23.d[1], v24.d[1]
    and     v22.16b, v20.16b, v22.16b
    DIFF_LUMA_EQ4_MASK v23, v4, v18, v17
    st1     {v17.16b}, [x3], x1         // q0'
    DIFF_LUMA_EQ4_MASK v7, v5, v22, v17
    st1     {v17.16b}, [x3], x1         // q1'
    DIFF_LUMA_EQ4_MASK v19, v6, v22, v17
    st1     {v17.16b}, [x3], x1         // q2'
DeblockLumaEq4V_AArch64_neon_end:
WELS_ASM_AARCH64_FUNC_END


// Horizontal-edge luma deblocking, bS < 4 (transposed load/store of 16 rows).
// (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc)
WELS_ASM_AARCH64_FUNC_BEGIN DeblockLumaLt4H_AArch64_neon //uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc
    dup     v16.16b, w2                 //alpha
    dup     v17.16b, w3                 //beta
    sub     x2, x0, #3                  // 3 bytes left of the edge
    movi    v23.16b, #128
    SIGN_EXTENSION x1,w1
    // Gather p2..p0 / q0..q2 for 16 rows into lanes 0..15.
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 0
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 1
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 2
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 3
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 4
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 5
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 6
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 7

    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 8
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 9
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 10
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 11
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 12
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 13
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 14
    LOAD_LUMA_DATA_3 v0, v1, v2, v3, v4, v5, 15

    sub     x0, x0, x1, lsl #4          // rewind 16 rows

    ld4r    {v18.8b, v19.8b, v20.8b, v21.8b}, [x4]
    trn1    v18.2s, v18.2s, v19.2s
    trn1    v20.2s, v20.2s, v21.2s
    trn1    v6.2d, v18.2d, v20.2d       // iTc0: 0000, 1111, 2222, 3333
    cmge    v7.16b, v6.16b, #0          // iTc0 Flag

    MASK_MATRIX v1, v2, v3, v4, v16, v17, v18
    and     v7.16b, v7.16b, v18.16b     // need filter flag

    ZERO_JUMP_END v7, x3, x4, DeblockLumaLt4H_AArch64_neon_end

    eor     v18.16b, v18.16b, v18.16b
    sub     v18.16b, v18.16b, v6.16b    // -iTc0: 0000, 1111, 2222, 3333

    DIFF_LUMA_LT4_P1_Q1 v0, v1, v2, v3, v17, v18, v6, v7, v19, v20 //Use Tmp v23,v24
    mov     v25.16b, v19.16b            // filtered p1

    DIFF_LUMA_LT4_P1_Q1 v5, v4, v3, v2, v17, v18, v6, v7, v21, v22 //Use Tmp v23,v24

    // iTc = iTc0 + (|p2-p0|<beta) + (|q2-q0|<beta)
    abs     v20.16b, v20.16b
    abs     v22.16b, v22.16b
    add     v6.16b, v6.16b, v20.16b
    add     v6.16b, v6.16b, v22.16b
    eor     v18.16b, v18.16b, v18.16b
    sub     v18.16b, v18.16b, v6.16b

    DIFF_LUMA_LT4_P0_Q0_1 v1, v2, v3, v4, v19, v20, v22
    DIFF_LUMA_LT4_P0_Q0_2 v1, v2, v3, v4, v19, v20, v22

    smax    v19.16b, v19.16b, v18.16b   // clip delta to [-iTc, iTc]
    smin    v19.16b, v19.16b, v6.16b
    and     v19.16b, v19.16b, v7.16b

    EXTRACT_DELTA_INTO_TWO_PART v19, v20
    uqadd   v2.16b, v2.16b, v20.16b
    uqsub   v2.16b, v2.16b, v19.16b
    mov     v26.16b, v2.16b             // p0'
    uqsub   v3.16b, v3.16b, v20.16b
    uqadd   v3.16b, v3.16b, v19.16b
    mov     v27.16b, v3.16b             // q0'
    mov     v28.16b, v21.16b            // q1'

    // Scatter p1',p0',q0',q1' back: two rows per STORE_LUMA_DATA_4 call.
    sub     x0, x0, #2
    add     x2, x0, x1
    lsl     x1, x1, #1

    STORE_LUMA_DATA_4 v25, v26, v27, v28, 0, 1
    STORE_LUMA_DATA_4 v25, v26, v27, v28, 2, 3
    STORE_LUMA_DATA_4 v25, v26, v27, v28, 4, 5
    STORE_LUMA_DATA_4 v25, v26, v27, v28, 6, 7

    STORE_LUMA_DATA_4 v25, v26, v27, v28, 8, 9
    STORE_LUMA_DATA_4 v25, v26, v27, v28, 10, 11
    STORE_LUMA_DATA_4 v25, v26, v27, v28, 12, 13
    STORE_LUMA_DATA_4 v25, v26, v27, v28, 14, 15
DeblockLumaLt4H_AArch64_neon_end:
WELS_ASM_AARCH64_FUNC_END


// Horizontal-edge luma deblocking, bS == 4 (transposed load/store of 16 rows).
// (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta)
WELS_ASM_AARCH64_FUNC_BEGIN DeblockLumaEq4H_AArch64_neon
    dup     v16.16b, w2                 //alpha
    dup     v17.16b, w3                 //beta
    sub     x3, x0, #4                  // 4 bytes left of the edge
    SIGN_EXTENSION x1,w1
    // Gather p3..p0 / q0..q3 for 16 rows into lanes 0..15.
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 0
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 1
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 2
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 3
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 4
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 5
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 6
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 7

    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 8
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 9
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 10
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 11
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 12
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 13
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 14
    LOAD_LUMA_DATA_4 v0, v1, v2, v3, v4, v5, v6, v7, 15

    sub     x0, x0, x1, lsl #4          // rewind 16 rows
    sub     x3, x0, #3
    MASK_MATRIX v2, v3, v4, v5, v16, v17, v18

    ZERO_JUMP_END v18, x4, x5, DeblockLumaEq4H_AArch64_neon_end

    lsr     w2, w2, #2
    add     w2, w2, #2
    dup     v16.16b, w2                 //((alpha >> 2) + 2)
    uabd    v19.16b, v3.16b, v4.16b
    cmhi    v20.16b, v16.16b, v19.16b   //iDetaP0Q0 < ((iAlpha >> 2) + 2)

    uabd    v21.16b, v1.16b, v3.16b
    cmhi    v21.16b, v17.16b, v21.16b   //bDetaP2P0
    and     v21.16b, v21.16b, v20.16b   //(iDetaP0Q0 < ((iAlpha >> 2) + 2)) && bDetaP2P0

    uabd    v22.16b, v6.16b, v4.16b
    cmhi    v22.16b, v17.16b, v22.16b   //bDetaQ2Q0
    and     v22.16b, v22.16b, v20.16b   //(iDetaP0Q0 < ((iAlpha >> 2) + 2)) && bDetaQ2Q0
    and     v20.16b, v20.16b, v18.16b   //(iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0 && (iDetaP0Q0 < ((iAlpha >> 2) + 2))

    mov     v23.16b, v21.16b
    mov     v24.16b, v21.16b

    // P-side strong filter; results buffered in v26..v28 for transposed store.
    mov     v25.16b, v0.16b
    DIFF_LUMA_EQ4_P2P1P0_1 v0, v1, v2, v3, v4, v5, v23, v19, v17, v16
    DIFF_LUMA_EQ4_P2P1P0_2 v25, v1, v2, v3, v4, v5, v24, v19, v17, v16
    ins     v0.d[1], v25.d[1]
    ins     v23.d[1], v24.d[1]
    and     v21.16b, v20.16b, v21.16b
    DIFF_LUMA_EQ4_MASK v19, v1, v21, v17
    mov     v26.16b, v17.16b            // p2'
    DIFF_LUMA_EQ4_MASK v0, v2, v21, v17
    mov     v27.16b, v17.16b            // p1'
    DIFF_LUMA_EQ4_MASK v23, v3, v18, v17
    mov     v28.16b, v17.16b            // p0'

    // Q-side strong filter; results buffered in v29..v31.
    mov     v23.16b, v22.16b
    mov     v24.16b, v22.16b
    mov     v25.16b, v7.16b
    DIFF_LUMA_EQ4_P2P1P0_1 v7, v6, v5, v4, v3, v2, v23, v19, v17, v16
    DIFF_LUMA_EQ4_P2P1P0_2 v25, v6, v5, v4, v3, v2, v24, v19, v17, v16
    ins     v7.d[1], v25.d[1]
    ins     v23.d[1], v24.d[1]
    and     v22.16b, v20.16b, v22.16b
    DIFF_LUMA_EQ4_MASK v23, v4, v18, v17
    mov     v29.16b, v17.16b            // q0'
    DIFF_LUMA_EQ4_MASK v7, v5, v22, v17
    mov     v30.16b, v17.16b            // q1'
    DIFF_LUMA_EQ4_MASK v19, v6, v22, v17
    mov     v31.16b, v17.16b            // q2'

    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 0
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 1
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 2
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 3
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 4
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 5
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 6
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 7
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 8
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 9
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 10
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 11
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 12
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 13
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 14
    STORE_LUMA_DATA_3 v26, v27, v28, v29, v30, v31, 15
DeblockLumaEq4H_AArch64_neon_end:
WELS_ASM_AARCH64_FUNC_END

// Vertical-edge chroma deblocking, bS < 4. Cb rows go in the low D halves,
// Cr rows in the high D halves, so one 16-byte pass filters both planes.
// (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc)
WELS_ASM_AARCH64_FUNC_BEGIN DeblockChromaLt4V_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc
    dup     v16.16b, w3                 //alpha
    dup     v17.16b, w4                 //beta
    lsl     x3, x2, #1
    sub     x6, x0, x3                  //pPixCb-2*Stride
    sub     x7, x1, x3                  //pPixCr-2*Stride

    ld1     {v0.d} [0], [x6], x2        // Cb p1
    ld1     {v1.d} [0], [x6]            // Cb p0
    ld1     {v2.d} [0], [x0], x2        // Cb q0
    ld1     {v3.d} [0], [x0]            // Cb q1
    ld1     {v0.d} [1], [x7], x2        // Cr p1
    ld1     {v1.d} [1], [x7]            // Cr p0
    ld1     {v2.d} [1], [x1], x2        // Cr q0
    ld1     {v3.d} [1], [x1]            // Cr q1

    ld4r    {v18.8b, v19.8b, v20.8b, v21.8b}, [x5]
    trn1    v18.4h, v18.4h, v19.4h      //0011,0011
    trn1    v20.4h, v20.4h, v21.4h      //2233,2233
    zip1    v6.4s, v18.4s, v20.4s       //iTc0: 0011,2233,0011,2233
    cmgt    v7.16b, v6.16b, #0          // iTc0 Flag (iTc0 > 0)

    MASK_MATRIX v0, v1, v2, v3, v16, v17, v18
    and     v7.16b, v7.16b, v18.16b     // need filter flag

    ZERO_JUMP_END v7, x4, x5, DeblockChromaLt4V_AArch64_neon_end

    eor     v18.16b, v18.16b, v18.16b
    sub     v18.16b, v18.16b, v6.16b    //-iTc0: 0011,2233,0011,2233

    DIFF_LUMA_LT4_P0_Q0_1 v0, v1, v2, v3, v19, v20, v22
    DIFF_LUMA_LT4_P0_Q0_2 v0, v1, v2, v3, v19, v20, v22

    smax    v19.16b, v19.16b, v18.16b   // clip delta to [-iTc0, iTc0]
    smin    v19.16b, v19.16b, v6.16b
    and     v19.16b, v19.16b, v7.16b

    EXTRACT_DELTA_INTO_TWO_PART v19, v20
    uqadd   v1.16b, v1.16b, v20.16b     // p0' = clip(p0 + delta)
    uqsub   v1.16b, v1.16b, v19.16b
    st1     {v1.d} [0], [x6], x2
    st1     {v1.d} [1], [x7], x2
    uqsub   v2.16b, v2.16b, v20.16b     // q0' = clip(q0 - delta)
    uqadd   v2.16b, v2.16b, v19.16b
    st1     {v2.d} [0], [x6]
    st1     {v2.d} [1], [x7]
DeblockChromaLt4V_AArch64_neon_end:
WELS_ASM_AARCH64_FUNC_END

// Horizontal-edge chroma deblocking, bS < 4 (transposed load/store;
// rows 0-7 = Cb, rows 8-15 = Cr).
// (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc)
WELS_ASM_AARCH64_FUNC_BEGIN DeblockChromaLt4H_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta, int8_t* pTc
    dup     v16.16b, w3                 //alpha
    dup     v17.16b, w4                 //beta
    sub     x6, x0, #2                  //pPixCb-2
    sub     x7, x1, #2                  //pPixCr-2

    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 0
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 1
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 2
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 3
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 4
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 5
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 6
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 7

    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 8
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 9
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 10
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 11
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 12
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 13
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 14
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 15

    sub     x0, x0, #1                  // store position: 1 byte left of edge
    sub     x1, x1, #1

    ld4r    {v18.8b, v19.8b, v20.8b, v21.8b}, [x5]
    trn1    v18.4h, v18.4h, v19.4h      //0011,0011
    trn1    v20.4h, v20.4h, v21.4h      //2233,2233
    zip1    v6.4s, v18.4s, v20.4s       //iTc0: 0011,2233,0011,2233
    cmgt    v7.16b, v6.16b, #0          // iTc0 Flag

    MASK_MATRIX v0, v1, v2, v3, v16, v17, v18
    and     v7.16b, v7.16b, v18.16b     // need filter flag

    ZERO_JUMP_END v7, x4, x5, DeblockChromaLt4H_AArch64_neon_end
    eor     v18.16b, v18.16b, v18.16b
    sub     v18.16b, v18.16b, v6.16b    //-iTc0: 0011,2233,0011,2233

    DIFF_LUMA_LT4_P0_Q0_1 v0, v1, v2, v3, v19, v20, v22
    DIFF_LUMA_LT4_P0_Q0_2 v0, v1, v2, v3, v19, v20, v22

    smax    v19.16b, v19.16b, v18.16b   // clip delta to [-iTc0, iTc0]
    smin    v19.16b, v19.16b, v6.16b
    and     v19.16b, v19.16b, v7.16b

    EXTRACT_DELTA_INTO_TWO_PART v19, v20
    uqadd   v1.16b, v1.16b, v20.16b     // p0'
    uqsub   v1.16b, v1.16b, v19.16b
    uqsub   v2.16b, v2.16b, v20.16b     // q0'
    uqadd   v2.16b, v2.16b, v19.16b

    STORE_CHROMA_DATA_2 v1, v2, x0, 0
    STORE_CHROMA_DATA_2 v1, v2, x0, 1
    STORE_CHROMA_DATA_2 v1, v2, x0, 2
    STORE_CHROMA_DATA_2 v1, v2, x0, 3
    STORE_CHROMA_DATA_2 v1, v2, x0, 4
    STORE_CHROMA_DATA_2 v1, v2, x0, 5
    STORE_CHROMA_DATA_2 v1, v2, x0, 6
    STORE_CHROMA_DATA_2 v1, v2, x0, 7

    STORE_CHROMA_DATA_2 v1, v2, x1, 8
    STORE_CHROMA_DATA_2 v1, v2, x1, 9
    STORE_CHROMA_DATA_2 v1, v2, x1, 10
    STORE_CHROMA_DATA_2 v1, v2, x1, 11
    STORE_CHROMA_DATA_2 v1, v2, x1, 12
    STORE_CHROMA_DATA_2 v1, v2, x1, 13
    STORE_CHROMA_DATA_2 v1, v2, x1, 14
    STORE_CHROMA_DATA_2 v1, v2, x1, 15
DeblockChromaLt4H_AArch64_neon_end:
WELS_ASM_AARCH64_FUNC_END

// Vertical-edge chroma deblocking, bS == 4 (Cb in low halves, Cr in high halves).
// (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta)
WELS_ASM_AARCH64_FUNC_BEGIN DeblockChromaEq4V_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta
    dup     v16.16b, w3                 //alpha
    dup     v17.16b, w4                 //beta
    lsl     x3, x2, #1
    sub     x6, x0, x3                  //pPixCb-2*Stride
    sub     x7, x1, x3                  //pPixCr-2*Stride

    ld1     {v0.d} [0], [x6], x2        // Cb p1
    ld1     {v1.d} [0], [x6]            // Cb p0
    ld1     {v2.d} [0], [x0], x2        // Cb q0
    ld1     {v3.d} [0], [x0]            // Cb q1
    ld1     {v0.d} [1], [x7], x2        // Cr p1
    ld1     {v1.d} [1], [x7]            // Cr p0
    ld1     {v2.d} [1], [x1], x2        // Cr q0
    ld1     {v3.d} [1], [x1]            // Cr q1

    MASK_MATRIX v0, v1, v2, v3, v16, v17, v7

    ZERO_JUMP_END v7, x3, x4, DeblockChromaEq4V_AArch64_neon_end

    DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21
    DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21

    mov     v6.16b, v7.16b
    bsl     v6.16b, v20.16b, v1.16b     // p0' where mask, else original p0
    bsl     v7.16b, v21.16b, v2.16b     // q0' where mask, else original q0

    st1     {v6.d} [0], [x6], x2
    st1     {v6.d} [1], [x7], x2

    st1     {v7.d} [0], [x6]
    st1     {v7.d} [1], [x7]
DeblockChromaEq4V_AArch64_neon_end:
WELS_ASM_AARCH64_FUNC_END

// Horizontal-edge chroma deblocking, bS == 4 (transposed; rows 0-7 = Cb, 8-15 = Cr).
// (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta)
WELS_ASM_AARCH64_FUNC_BEGIN DeblockChromaEq4H_AArch64_neon //uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iAlpha, int32_t iBeta
    dup     v16.16b, w3                 //alpha
    dup     v17.16b, w4                 //beta

    sub     x6, x0, #2                  //pPixCb-2
    sub     x7, x1, #2                  //pPixCr-2

    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 0
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 1
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 2
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 3
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 4
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 5
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 6
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x6, 7

    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 8
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 9
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 10
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 11
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 12
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 13
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 14
    LOAD_CHROMA_DATA_4 v0, v1, v2, v3, x7, 15
    sub     x0, x0, #1                  // store position: 1 byte left of edge
    sub     x1, x1, #1

    MASK_MATRIX v0, v1, v2, v3, v16, v17, v7

    ZERO_JUMP_END v7, x3, x4, DeblockChromaEq4H_AArch64_neon_end

    DIFF_CHROMA_EQ4_P0Q0_1 v0, v1, v2, v3, v18, v19, v20, v21
    DIFF_CHROMA_EQ4_P0Q0_2 v0, v1, v2, v3, v18, v19, v20, v21

    mov     v6.16b, v7.16b
    bsl     v6.16b, v20.16b, v1.16b     // p0' where mask, else original p0
    bsl     v7.16b, v21.16b, v2.16b     // q0' where mask, else original q0

    STORE_CHROMA_DATA_2 v6, v7, x0, 0
    STORE_CHROMA_DATA_2 v6, v7, x0, 1
    STORE_CHROMA_DATA_2 v6, v7, x0, 2
    STORE_CHROMA_DATA_2 v6, v7, x0, 3
    STORE_CHROMA_DATA_2 v6, v7, x0, 4
    STORE_CHROMA_DATA_2 v6, v7, x0, 5
    STORE_CHROMA_DATA_2 v6, v7, x0, 6
    STORE_CHROMA_DATA_2 v6, v7, x0, 7

    STORE_CHROMA_DATA_2 v6, v7, x1, 8
    STORE_CHROMA_DATA_2 v6, v7, x1, 9
    STORE_CHROMA_DATA_2 v6, v7, x1, 10
    STORE_CHROMA_DATA_2 v6, v7, x1, 11
    STORE_CHROMA_DATA_2 v6, v7, x1, 12
    STORE_CHROMA_DATA_2 v6, v7, x1, 13
    STORE_CHROMA_DATA_2 v6, v7, x1, 14
    STORE_CHROMA_DATA_2 v6, v7, x1, 15
DeblockChromaEq4H_AArch64_neon_end:
WELS_ASM_AARCH64_FUNC_END


// Compute boundary-strength (bS) values for one macroblock from nzc and MV
// data; writes two 16-byte vectors (left-dir then top-dir) to [x4].
// Presumably (x0=nzc ptr, x1=mv ptr, x2=neighbour flags, x3=scale, x4=bS out)
// — argument roles inferred from the macros above; confirm against the C caller.
WELS_ASM_AARCH64_FUNC_BEGIN DeblockingBSCalcEnc_AArch64_neon
    // Checking the nzc status
    BS_NZC_CHECK x0, x2, x3, v16, v17   //v16,v17 save the nzc status
    // For checking bS[I] = 2
    movi    v0.16b, #0
    cmgt    v16.16b, v16.16b, v0.16b
    cmgt    v17.16b, v17.16b, v0.16b
    movi    v0.16b, #2

    and     v16.16b, v16.16b, v0.16b    //v16 save the nzc check result all the time --- for dir is top
    and     v17.16b, v17.16b, v0.16b    //v17 save the nzc check result all the time --- for dir is left

    // Checking the mv status
    BS_MV_CHECK x1, x2, x3, v18, v19, v5, v6    //v18, v19 save the mv status
    // For checking bS[I] = 1
    movi    v0.16b, #1
    and     v18.16b, v18.16b, v0.16b    //v18 save the mv check result --- for dir is top
    and     v19.16b, v19.16b, v0.16b    //v19 save the mv check result --- for dir is left
    // Check bS[I] is '1' or '2'
    umax    v1.16b, v18.16b, v16.16b
    umax    v0.16b, v19.16b, v17.16b
    st1     {v0.16b, v1.16b}, [x4]
WELS_ASM_AARCH64_FUNC_END


#endif