/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

// Shared entry check for the non-intra loop filters.
//   In:   w2 = alpha, w3 = beta, x4 = address of a 32-bit word holding four
//         signed clip bytes.
//   Out:  w6 and v24.s[0] hold that word.
//   Clobbers: w8, flags.
// Executes "ret" out of the enclosing function when alpha == 0, when
// beta == 0, or when all four clip bytes are negative; otherwise falls
// through at label 2.
.macro  h264_loop_filter_start
        cmp             w2, #0
        ldr             w6, [x4]
        // When alpha == 0 the ccmp does not compare and loads its immediate
        // into NZCV instead; #4 sets Z so the b.eq below also takes the early
        // exit for alpha == 0 (an immediate of #0 would leave Z clear and miss
        // it). When alpha != 0 this is a plain "beta == 0" test.
        ccmp            w3, #0, #4, ne
        mov             v24.s[0], w6
        and             w8, w6, w6, lsl #16             // no flag update: Z from ccmp survives
        b.eq            1f
        ands            w8, w8, w8, lsl #8              // bit 31 = AND of the four sign bits
        b.ge            2f                              // N clear: some clip byte is >= 0
1:
        ret
2:
.endm

// Normal (clipped) luma edge filter on 16 pixels at once.
//   In:   v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
//         v24.s[0] = four clip bytes, w2 = alpha, w3 = beta
//   Out:  v17 = new p1, v16 = new p0, v0 = new q0, v19 = new q1
//   Clobbers: x7, v4, v20-v24, v28, v30
// Branches to the local label 9 (to be provided after the macro by the user)
// when no lane passes the alpha/beta tests.
.macro  h264_loop_filter_luma
        dup             v22.16b, w2                     // alpha
        uxtl            v24.8h, v24.8b                  // uxtl/uxtl/sli/sli: replicate each of the
        uabd            v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
        uxtl            v24.4s, v24.4h                  //   four clip bytes over four byte lanes
        uabd            v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
        sli             v24.8h, v24.8h, #8
        uabd            v30.16b, v2.16b, v0.16b         // abs(q1 - q0)
        sli             v24.4s, v24.4s, #16
        cmhi            v21.16b, v22.16b, v21.16b       // < alpha
        dup             v22.16b, w3                     // beta
        cmlt            v23.16b, v24.16b, #0            // lanes whose clip byte is negative
        cmhi            v28.16b, v22.16b, v28.16b       // < beta
        cmhi            v30.16b, v22.16b, v30.16b       // < beta
        bic             v21.16b, v21.16b, v23.16b       // negative clip disables the lane
        uabd            v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
        and             v21.16b, v21.16b, v28.16b
        uabd            v19.16b, v4.16b, v0.16b         // abs(q2 - q0)
        and             v21.16b, v21.16b, v30.16b       // v21 = per-lane filter mask
        shrn            v30.8b, v21.8h, #4              // squeeze the 128-bit mask into 64 bits
        mov             x7, v30.d[0]
        cmhi            v17.16b, v22.16b, v17.16b       // < beta
        cmhi            v19.16b, v22.16b, v19.16b       // < beta
        cbz             x7, 9f                          // nothing to filter
        and             v17.16b, v17.16b, v21.16b       // p-side extra test, masked
        and             v19.16b, v19.16b, v21.16b       // q-side extra test, masked
        and             v24.16b, v24.16b, v21.16b       // clip = 0 in inactive lanes
        urhadd          v28.16b, v16.16b, v0.16b        // (p0 + q0 + 1) >> 1
        sub             v21.16b, v24.16b, v17.16b       // masks are 0 / -1, so each passing
        uqadd           v23.16b, v18.16b, v24.16b       // p1 + clip
        uhadd           v20.16b, v20.16b, v28.16b       // (p2 + avg) >> 1
        sub             v21.16b, v21.16b, v19.16b       //   side test adds 1 to the delta limit
        uhadd           v28.16b, v4.16b, v28.16b        // (q2 + avg) >> 1
        umin            v23.16b, v23.16b, v20.16b
        uqsub           v22.16b, v18.16b, v24.16b       // p1 - clip
        uqadd           v4.16b, v2.16b, v24.16b         // q1 + clip
        umax            v23.16b, v23.16b, v22.16b       // p1 candidate, within p1 +/- clip
        uqsub           v22.16b, v2.16b, v24.16b        // q1 - clip
        umin            v28.16b, v4.16b, v28.16b
        uxtl            v4.8h, v0.8b
        umax            v28.16b, v28.16b, v22.16b       // q1 candidate, within q1 +/- clip
        uxtl2           v20.8h, v0.16b
        usubw           v4.8h, v4.8h, v16.8b            // q0 - p0
        usubw2          v20.8h, v20.8h, v16.16b
        shl             v4.8h, v4.8h, #2
        shl             v20.8h, v20.8h, #2
        uaddw           v4.8h, v4.8h, v18.8b            // + p1
        uaddw2          v20.8h, v20.8h, v18.16b
        usubw           v4.8h, v4.8h, v2.8b             // - q1
        usubw2          v20.8h, v20.8h, v2.16b
        rshrn           v4.8b, v4.8h, #3                // delta = (4*(q0 - p0) + p1 - q1 + 4) >> 3
        rshrn2          v4.16b, v20.8h, #3
        bsl             v17.16b, v23.16b, v18.16b       // new p1 where the p-side test passed, else p1
        bsl             v19.16b, v28.16b, v2.16b        // new q1 where the q-side test passed, else q1
        neg             v23.16b, v21.16b
        uxtl            v28.8h, v16.8b
        smin            v4.16b, v4.16b, v21.16b         // clamp delta to [-limit, limit]
        uxtl2           v21.8h, v16.16b
        smax            v4.16b, v4.16b, v23.16b
        uxtl            v22.8h, v0.8b
        uxtl2           v24.8h, v0.16b
        saddw           v28.8h, v28.8h, v4.8b           // p0 + delta
        saddw2          v21.8h, v21.8h, v4.16b
        ssubw           v22.8h, v22.8h, v4.8b           // q0 - delta
        ssubw2          v24.8h, v24.8h, v4.16b
        sqxtun          v16.8b, v28.8h                  // saturate back to unsigned bytes
        sqxtun2         v16.16b, v21.8h
        sqxtun          v0.8b, v22.8h
        sqxtun2         v0.16b, v24.8h
.endm

// Luma filter across a horizontal edge (rows above and below x0).
//   x0 = address of the q0 row (16 bytes per row), w1 = row stride (signed),
//   w2 = alpha, w3 = beta, x4 = address of the four clip bytes.
// Reads rows x0 - 3*stride .. x0 + 2*stride and rewrites the four rows
// x0 - 2*stride .. x0 + stride.
function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw            x1, w1

        ld1             {v0.16b}, [x0], x1              // q0
        ld1             {v2.16b}, [x0], x1              // q1
        ld1             {v4.16b}, [x0], x1              // q2
        sub             x0, x0, x1, lsl #2              // back 6 rows in total:
        sub             x0, x0, x1, lsl #1              //   x0 = q0 row - 3*stride
        ld1             {v20.16b}, [x0], x1             // p2
        ld1             {v18.16b}, [x0], x1             // p1
        ld1             {v16.16b}, [x0], x1             // p0

        h264_loop_filter_luma

        sub             x0, x0, x1, lsl #1              // x0 = p1 row
        st1             {v17.16b}, [x0], x1             // p1
        st1             {v16.16b}, [x0], x1             // p0
        st1             {v0.16b}, [x0], x1              // q0
        st1             {v19.16b}, [x0]                 // q1
9:
        ret
endfunc

// Luma filter across a vertical edge (columns left and right of x0).
//   x0 = address of the first pixel right of the edge, w1 = row stride,
//   w2 = alpha, w3 = beta, x4 = address of the four clip bytes.
// Loads 16 rows of 8 bytes starting 4 bytes left of x0, runs the shared luma
// filter on the transposed data and writes 4 bytes per row starting 2 bytes
// left of x0. (transpose_8x16B / transpose_4x16B come from neon.S.)
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start
        sxtw            x1, w1

        sub             x0, x0, #4
        // rows 0-7 fill the low halves
        ld1             {v6.8b}, [x0], x1
        ld1             {v20.8b}, [x0], x1
        ld1             {v18.8b}, [x0], x1
        ld1             {v16.8b}, [x0], x1
        ld1             {v0.8b}, [x0], x1
        ld1             {v2.8b}, [x0], x1
        ld1             {v4.8b}, [x0], x1
        ld1             {v26.8b}, [x0], x1
        // rows 8-15 fill the high halves
        ld1             {v6.d}[1], [x0], x1
        ld1             {v20.d}[1], [x0], x1
        ld1             {v18.d}[1], [x0], x1
        ld1             {v16.d}[1], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v4.d}[1], [x0], x1
        ld1             {v26.d}[1], [x0], x1

        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

        h264_loop_filter_luma

        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27

        sub             x0, x0, x1, lsl #4              // rewind the 16 rows just read
        add             x0, x0, #2                      // 2 bytes left of the edge
        st1             {v17.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0], [x0], x1
        st1             {v19.s}[0], [x0], x1
        st1             {v17.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1], [x0], x1
        st1             {v19.s}[1], [x0], x1
        st1             {v17.s}[2], [x0], x1
        st1             {v16.s}[2], [x0], x1
        st1             {v0.s}[2], [x0], x1
        st1             {v19.s}[2], [x0], x1
        st1             {v17.s}[3], [x0], x1
        st1             {v16.s}[3], [x0], x1
        st1             {v0.s}[3], [x0], x1
        st1             {v19.s}[3], [x0], x1
9:
        ret
endfunc


// Shared entry for the intra filters.
//   In:   w1 = stride, w2 = alpha, w3 = beta
//   Out:  x1 = sign-extended stride, v30 = alpha in every byte,
//         v31 = beta in every byte
//   Clobbers: w4
// Executes "ret" out of the enclosing function only when alpha and beta are
// both zero.
.macro  h264_loop_filter_start_intra
        orr             w4, w2, w3
        cbnz            w4, 1f
        ret
1:
        sxtw            x1, w1
        dup             v30.16b, w2                     // alpha
        dup             v31.16b, w3                     // beta
.endm

// Intra (unclipped) luma edge filter on 16 pixels at once.
//   In:   v4..v7 = p3, p2, p1, p0   v0..v3 = q0, q1, q2, q3
//         v30 = alpha, v31 = beta (byte-replicated)
//   Out:  v5, v6, v7 = new p2, p1, p0   v0, v1, v2 = new q0, q1, q2
//   Clobbers: x4, v16-v31
// Branches to the local label 9 when no lane passes the alpha/beta tests.
.macro  h264_loop_filter_luma_intra
        uabd            v16.16b, v7.16b, v0.16b         // |p0 - q0|
        uabd            v17.16b, v6.16b, v7.16b         // |p1 - p0|
        uabd            v18.16b, v1.16b, v0.16b         // |q1 - q0|
        cmhi            v19.16b, v30.16b, v16.16b       // below alpha
        cmhi            v17.16b, v31.16b, v17.16b       // below beta
        cmhi            v18.16b, v31.16b, v18.16b       // below beta

        movi            v29.16b, #2
        ushr            v30.16b, v30.16b, #2            // alpha >> 2
        add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
        cmhi            v16.16b, v30.16b, v16.16b       // |p0 - q0| below (alpha >> 2) + 2  (if_2)

        and             v19.16b, v19.16b, v17.16b
        and             v19.16b, v19.16b, v18.16b       // v19 = if_1: all three basic tests pass
        shrn            v20.8b, v19.8h, #4              // squeeze the mask into 64 bits
        mov             x4, v20.d[0]
        cbz             x4, 9f                          // no lane to filter

        // v20/v21 = 2*p1 + p0 + q1, v22/v23 = 2*q1 + q0 + p1 (16-bit lanes)
        ushll           v20.8h, v6.8b, #1
        ushll           v22.8h, v1.8b, #1
        ushll2          v21.8h, v6.16b, #1
        ushll2          v23.8h, v1.16b, #1
        uaddw           v20.8h, v20.8h, v7.8b
        uaddw           v22.8h, v22.8h, v0.8b
        uaddw2          v21.8h, v21.8h, v7.16b
        uaddw2          v23.8h, v23.8h, v0.16b
        uaddw           v20.8h, v20.8h, v1.8b
        uaddw           v22.8h, v22.8h, v6.8b
        uaddw2          v21.8h, v21.8h, v1.16b
        uaddw2          v23.8h, v23.8h, v6.16b

        rshrn           v24.8b, v20.8h, #2              // p0'_1
        rshrn           v25.8b, v22.8h, #2              // q0'_1
        rshrn2          v24.16b, v21.8h, #2             // p0'_1
        rshrn2          v25.16b, v23.8h, #2             // q0'_1

        uabd            v17.16b, v5.16b, v7.16b         // |p2 - p0|
        uabd            v18.16b, v2.16b, v0.16b         // |q2 - q0|
        cmhi            v17.16b, v31.16b, v17.16b       // below beta  (if_3)
        cmhi            v18.16b, v31.16b, v18.16b       // below beta  (if_4)

        and             v17.16b, v16.16b, v17.16b       // if_2 && if_3
        and             v18.16b, v16.16b, v18.16b       // if_2 && if_4

        not             v30.16b, v17.16b
        not             v31.16b, v18.16b

        and             v30.16b, v30.16b, v19.16b       // if_1 && !(if_2 && if_3)
        and             v31.16b, v31.16b, v19.16b       // if_1 && !(if_2 && if_4)

        and             v17.16b, v19.16b, v17.16b       // if_1 && if_2 && if_3
        and             v18.16b, v19.16b, v18.16b       // if_1 && if_2 && if_4

        // p side of the second variant: inputs v7, v6, v5, v4, selected by v17
        uaddl           v26.8h, v5.8b, v7.8b
        uaddl2          v27.8h, v5.16b, v7.16b
        uaddw           v26.8h, v26.8h, v0.8b           // p2 + p0 + q0
        uaddw2          v27.8h, v27.8h, v0.16b
        add             v20.8h, v20.8h, v26.8h
        add             v21.8h, v21.8h, v27.8h
        uaddw           v20.8h, v20.8h, v0.8b
        uaddw2          v21.8h, v21.8h, v0.16b
        rshrn           v20.8b, v20.8h, #3              // p0'_2
        rshrn2          v20.16b, v21.8h, #3             // p0'_2
        uaddw           v26.8h, v26.8h, v6.8b           // p2 + p1 + p0 + q0
        uaddw2          v27.8h, v27.8h, v6.16b
        rshrn           v21.8b, v26.8h, #2              // p1'_2
        rshrn2          v21.16b, v27.8h, #2             // p1'_2
        uaddl           v28.8h, v4.8b, v5.8b
        uaddl2          v29.8h, v4.16b, v5.16b
        shl             v28.8h, v28.8h, #1              // 2 * (p3 + p2)
        shl             v29.8h, v29.8h, #1
        add             v28.8h, v28.8h, v26.8h
        add             v29.8h, v29.8h, v27.8h
        rshrn           v19.8b, v28.8h, #3              // p2'_2
        rshrn2          v19.16b, v29.8h, #3             // p2'_2

        // q side of the second variant: inputs v0, v1, v2, v3, selected by v18
        uaddl           v26.8h, v2.8b, v0.8b
        uaddl2          v27.8h, v2.16b, v0.16b
        uaddw           v26.8h, v26.8h, v7.8b           // q2 + q0 + p0
        uaddw2          v27.8h, v27.8h, v7.16b
        add             v22.8h, v22.8h, v26.8h
        add             v23.8h, v23.8h, v27.8h
        uaddw           v22.8h, v22.8h, v7.8b
        uaddw2          v23.8h, v23.8h, v7.16b
        rshrn           v22.8b, v22.8h, #3              // q0'_2
        rshrn2          v22.16b, v23.8h, #3             // q0'_2
        uaddw           v26.8h, v26.8h, v1.8b           // q2 + q1 + q0 + p0
        uaddw2          v27.8h, v27.8h, v1.16b
        rshrn           v23.8b, v26.8h, #2              // q1'_2
        rshrn2          v23.16b, v27.8h, #2             // q1'_2
        uaddl           v28.8h, v2.8b, v3.8b
        uaddl2          v29.8h, v2.16b, v3.16b
        shl             v28.8h, v28.8h, #1              // 2 * (q2 + q3)
        shl             v29.8h, v29.8h, #1
        add             v28.8h, v28.8h, v26.8h
        add             v29.8h, v29.8h, v27.8h
        rshrn           v26.8b, v28.8h, #3              // q2'_2
        rshrn2          v26.16b, v29.8h, #3             // q2'_2

        // merge: each result is inserted only where its mask is set
        bit             v7.16b, v24.16b, v30.16b        // p0'_1
        bit             v0.16b, v25.16b, v31.16b        // q0'_1
        bit             v7.16b, v20.16b, v17.16b        // p0'_2
        bit             v6.16b, v21.16b, v17.16b        // p1'_2
        bit             v5.16b, v19.16b, v17.16b        // p2'_2
        bit             v0.16b, v22.16b, v18.16b        // q0'_2
        bit             v1.16b, v23.16b, v18.16b        // q1'_2
        bit             v2.16b, v26.16b, v18.16b        // q2'_2
.endm

// Intra luma filter across a horizontal edge.
//   x0 = address of the q0 row (16 bytes per row), w1 = stride,
//   w2 = alpha, w3 = beta.
// Reads four rows on each side of the edge, rewrites three on each side.
function ff_h264_v_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        ld1             {v0.16b}, [x0], x1              // q0
        ld1             {v1.16b}, [x0], x1              // q1
        ld1             {v2.16b}, [x0], x1              // q2
        ld1             {v3.16b}, [x0], x1              // q3
        sub             x0, x0, x1, lsl #3              // 8 rows back: the p3 row
        ld1             {v4.16b}, [x0], x1              // p3
        ld1             {v5.16b}, [x0], x1              // p2
        ld1             {v6.16b}, [x0], x1              // p1
        ld1             {v7.16b}, [x0]                  // p0

        h264_loop_filter_luma_intra

        sub             x0, x0, x1, lsl #1              // from the p0 row back to the p2 row
        st1             {v5.16b}, [x0], x1              // p2
        st1             {v6.16b}, [x0], x1              // p1
        st1             {v7.16b}, [x0], x1              // p0
        st1             {v0.16b}, [x0], x1              // q0
        st1             {v1.16b}, [x0], x1              // q1
        st1             {v2.16b}, [x0]                  // q2
9:
        ret
endfunc

// Intra luma filter across a vertical edge.
//   x0 = address of the first pixel right of the edge, w1 = stride,
//   w2 = alpha, w3 = beta.
// Loads 16 rows of 8 bytes starting 4 bytes left of x0, filters the
// transposed data, transposes again and stores all 16 rows of 8 bytes.
function ff_h264_h_loop_filter_luma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0, x0, #4
        // rows 0-7 fill the low halves
        ld1             {v4.8b}, [x0], x1
        ld1             {v5.8b}, [x0], x1
        ld1             {v6.8b}, [x0], x1
        ld1             {v7.8b}, [x0], x1
        ld1             {v0.8b}, [x0], x1
        ld1             {v1.8b}, [x0], x1
        ld1             {v2.8b}, [x0], x1
        ld1             {v3.8b}, [x0], x1
        // rows 8-15 fill the high halves
        ld1             {v4.d}[1], [x0], x1
        ld1             {v5.d}[1], [x0], x1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v3.d}[1], [x0], x1

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        h264_loop_filter_luma_intra

        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

        sub             x0, x0, x1, lsl #4              // rewind the 16 rows just read
        st1             {v4.8b}, [x0], x1
        st1             {v5.8b}, [x0], x1
        st1             {v6.8b}, [x0], x1
        st1             {v7.8b}, [x0], x1
        st1             {v0.8b}, [x0], x1
        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        st1             {v3.8b}, [x0], x1
        st1             {v4.d}[1], [x0], x1
        st1             {v5.d}[1], [x0], x1
        st1             {v6.d}[1], [x0], x1
        st1             {v7.d}[1], [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[1], [x0], x1
        st1             {v2.d}[1], [x0], x1
        st1             {v3.d}[1], [x0], x1
9:
        ret
endfunc

// Normal (clipped) chroma edge filter on 8 pixels.
//   In:   v18 = p1, v16 = p0, v0 = q0, v2 = q1
//         v24.s[0] = four clip bytes, w2 = alpha, w3 = beta
//   Out:  v16 = new p0, v0 = new q0
//   Clobbers: x8, v4, v22-v26, v28, v30
// Branches to the local label 9 when no lane passes the alpha/beta tests.
.macro  h264_loop_filter_chroma
        dup             v22.8b, w2                      // alpha
        dup             v23.8b, w3                      // beta
        uxtl            v24.8h, v24.8b                  // with the sli below: each clip byte
        uabd            v26.8b, v16.8b, v0.8b           // |p0 - q0|
        uabd            v28.8b, v18.8b, v16.8b          // |p1 - p0|
        uabd            v30.8b, v2.8b, v0.8b            // |q1 - q0|
        cmhi            v26.8b, v22.8b, v26.8b          // below alpha
        cmhi            v28.8b, v23.8b, v28.8b          // below beta
        cmhi            v30.8b, v23.8b, v30.8b          // below beta
        uxtl            v4.8h, v0.8b
        and             v26.8b, v26.8b, v28.8b
        usubw           v4.8h, v4.8h, v16.8b            // q0 - p0
        and             v26.8b, v26.8b, v30.8b          // v26 = per-lane filter mask
        shl             v4.8h, v4.8h, #2
        mov             x8, v26.d[0]
        sli             v24.8h, v24.8h, #8              //   covers two adjacent byte lanes
        uaddw           v4.8h, v4.8h, v18.8b            // + p1
        cbz             x8, 9f                          // no lane to filter
        usubw           v4.8h, v4.8h, v2.8b             // - q1
        rshrn           v4.8b, v4.8h, #3                // delta = (4*(q0 - p0) + p1 - q1 + 4) >> 3
        smin            v4.8b, v4.8b, v24.8b            // clamp delta to [-clip, clip]
        neg             v25.8b, v24.8b
        smax            v4.8b, v4.8b, v25.8b
        uxtl            v22.8h, v0.8b
        and             v4.8b, v4.8b, v26.8b            // delta = 0 in inactive lanes
        uxtl            v28.8h, v16.8b
        saddw           v28.8h, v28.8h, v4.8b           // p0 + delta
        ssubw           v22.8h, v22.8h, v4.8b           // q0 - delta
        sqxtun          v16.8b, v28.8h                  // saturate back to unsigned bytes
        sqxtun          v0.8b, v22.8h
.endm

// Chroma filter across a horizontal edge.
//   x0 = address of the q0 row (8 bytes per row), w1 = stride,
//   w2 = alpha, w3 = beta, x4 = address of the four clip bytes.
// Reads two rows on each side of the edge, rewrites the p0 and q0 rows.
function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sxtw            x1, w1

        sub             x0, x0, x1, lsl #1              // the p1 row
        ld1             {v18.8b}, [x0], x1              // p1
        ld1             {v16.8b}, [x0], x1              // p0
        ld1             {v0.8b}, [x0], x1               // q0
        ld1             {v2.8b}, [x0]                   // q1

        h264_loop_filter_chroma

        sub             x0, x0, x1, lsl #1              // from the q1 row back to the p0 row
        st1             {v16.8b}, [x0], x1
        st1             {v0.8b}, [x0], x1
9:
        ret
endfunc

// Chroma filter across a vertical edge, 8 rows.
//   x0 = address of the first pixel right of the edge, w1 = stride,
//   w2 = alpha, w3 = beta, x4 = address of the four clip bytes.
// h_loop_filter_chroma420 is a secondary entry point: it expects x0 to be
// 2 bytes left of the edge already, x1 to be the 64-bit stride and
// v24.s[0] to hold the clip bytes.
function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start
        sxtw            x1, w1

        sub             x0, x0, #2
h_loop_filter_chroma420:
        // 8 rows of 4 bytes, two rows per 32-bit lane pair
        ld1             {v18.s}[0], [x0], x1
        ld1             {v16.s}[0], [x0], x1
        ld1             {v0.s}[0], [x0], x1
        ld1             {v2.s}[0], [x0], x1
        ld1             {v18.s}[1], [x0], x1
        ld1             {v16.s}[1], [x0], x1
        ld1             {v0.s}[1], [x0], x1
        ld1             {v2.s}[1], [x0], x1

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        h264_loop_filter_chroma

        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31

        sub             x0, x0, x1, lsl #3              // rewind the 8 rows just read
        st1             {v18.s}[0], [x0], x1
        st1             {v16.s}[0], [x0], x1
        st1             {v0.s}[0], [x0], x1
        st1             {v2.s}[0], [x0], x1
        st1             {v18.s}[1], [x0], x1
        st1             {v16.s}[1], [x0], x1
        st1             {v0.s}[1], [x0], x1
        st1             {v2.s}[1], [x0], x1
9:
        ret
endfunc

// Chroma filter across a vertical edge, 16 rows, done as two 8-row passes of
// h_loop_filter_chroma420 with the stride doubled: the first pass starts at
// row 0, the second at row 1.
//   Same arguments as ff_h264_h_loop_filter_chroma_neon.
function ff_h264_h_loop_filter_chroma422_neon, export=1
        sxtw            x1, w1
        h264_loop_filter_start
        add             x5, x0, x1                      // x5 = row 1
        sub             x0, x0, #2
        add             x1, x1, x1                      // stride * 2
        mov             x7, x30                         // keep the return address over the bl
        bl              h_loop_filter_chroma420
        mov             x30, x7
        sub             x0, x5, #2
        mov             v24.s[0], w6                    // the first pass overwrote v24: reload clip bytes
        b               h_loop_filter_chroma420         // tail call: its ret returns to our caller
endfunc

// Intra (unclipped) chroma edge filter on 8 pixels.
//   In:   v18 = p1, v16 = p0, v17 = q0, v19 = q1
//         v30 = alpha, v31 = beta (byte-replicated)
//   Out:  v16 = new p0, v17 = new q0
//   Clobbers: x2, v4, v6, v20, v22, v24-v28
// Branches to the local label 9 when no lane passes the alpha/beta tests.
.macro  h264_loop_filter_chroma_intra
        uabd            v26.8b, v16.8b, v17.8b          // |p0 - q0|
        uabd            v27.8b, v18.8b, v16.8b          // |p1 - p0|
        uabd            v28.8b, v19.8b, v17.8b          // |q1 - q0|
        cmhi            v26.8b, v30.8b, v26.8b          // below alpha
        cmhi            v27.8b, v31.8b, v27.8b          // below beta
        cmhi            v28.8b, v31.8b, v28.8b          // below beta
        and             v26.8b, v26.8b, v27.8b
        and             v26.8b, v26.8b, v28.8b          // v26 = per-lane filter mask
        mov             x2, v26.d[0]

        ushll           v4.8h, v18.8b, #1               // 2 * p1
        ushll           v6.8h, v19.8b, #1               // 2 * q1
        cbz             x2, 9f                          // no lane to filter
        uaddl           v20.8h, v16.8b, v19.8b          // p0 + q1
        uaddl           v22.8h, v17.8b, v18.8b          // q0 + p1
        add             v20.8h, v20.8h, v4.8h
        add             v22.8h, v22.8h, v6.8h
        uqrshrn         v24.8b, v20.8h, #2              // (2*p1 + p0 + q1 + 2) >> 2
        uqrshrn         v25.8b, v22.8h, #2              // (2*q1 + q0 + p1 + 2) >> 2
        bit             v16.8b, v24.8b, v26.8b
        bit             v17.8b, v25.8b, v26.8b
.endm

// Intra chroma filter across a horizontal edge.
//   x0 = address of the q0 row (8 bytes per row), w1 = stride,
//   w2 = alpha, w3 = beta.
function ff_h264_v_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x0, x0, x1, lsl #1              // the p1 row
        ld1             {v18.8b}, [x0], x1              // p1
        ld1             {v16.8b}, [x0], x1              // p0
        ld1             {v17.8b}, [x0], x1              // q0
        ld1             {v19.8b}, [x0]                  // q1

        h264_loop_filter_chroma_intra

        sub             x0, x0, x1, lsl #1              // from the q1 row back to the p0 row
        st1             {v16.8b}, [x0], x1
        st1             {v17.8b}, [x0], x1

9:
        ret
endfunc

// Intra chroma filter across a vertical edge, 4 rows.
//   x0 = address of the first pixel right of the edge, w1 = stride,
//   w2 = alpha, w3 = beta.
// x4 walks the rows for loading (from 2 bytes left of the edge), x0 walks
// them for storing the p0/q0 byte pair (from 1 byte left of the edge).
function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4, x0, #2
        sub             x0, x0, #1
        ld1             {v18.8b}, [x4], x1
        ld1             {v16.8b}, [x4], x1
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1

        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1

9:
        ret
endfunc

// Intra chroma filter across a vertical edge, 8 rows.
//   Same arguments as the mbaff variant above.
// h_loop_filter_chroma420_intra is a secondary entry point: it expects
// x4 = load pointer (edge - 2), x0 = store pointer (edge - 1), x1 = 64-bit
// stride and v30/v31 = alpha/beta.
function ff_h264_h_loop_filter_chroma_intra_neon, export=1
        h264_loop_filter_start_intra

        sub             x4, x0, #2
        sub             x0, x0, #1
h_loop_filter_chroma420_intra:
        ld1             {v18.8b}, [x4], x1
        ld1             {v16.8b}, [x4], x1
        ld1             {v17.8b}, [x4], x1
        ld1             {v19.8b}, [x4], x1
        ld1             {v18.s}[1], [x4], x1            // rows 4-7 replace the upper 32 bits
        ld1             {v16.s}[1], [x4], x1
        ld1             {v17.s}[1], [x4], x1
        ld1             {v19.s}[1], [x4], x1

        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29

        h264_loop_filter_chroma_intra

        st2             {v16.b,v17.b}[0], [x0], x1
        st2             {v16.b,v17.b}[1], [x0], x1
        st2             {v16.b,v17.b}[2], [x0], x1
        st2             {v16.b,v17.b}[3], [x0], x1
        st2             {v16.b,v17.b}[4], [x0], x1
        st2             {v16.b,v17.b}[5], [x0], x1
        st2             {v16.b,v17.b}[6], [x0], x1
        st2             {v16.b,v17.b}[7], [x0], x1

9:
        ret
endfunc

// Intra chroma filter across a vertical edge, 16 rows, done as two 8-row
// passes of h_loop_filter_chroma420_intra (rows 0-7, then rows 8-15).
function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
        h264_loop_filter_start_intra
        sub             x4, x0, #2
        add             x5, x0, x1, lsl #3              // x5 = row 8
        sub             x0, x0, #1
        mov             x7, x30                         // keep the return address over the bl
        bl              h_loop_filter_chroma420_intra
        // x4 has already advanced 8 rows through the post-indexed loads; x0 is
        // recomputed because the first pass may leave before its stores.
        sub             x0, x5, #1
        mov             x30, x7
        b               h_loop_filter_chroma420_intra   // tail call: its ret returns to our caller
endfunc

// Bi-directional weighting loops. They are expanded inside biweight_func,
// which sets up:
//   x0, x1 = the two source rows, x2 = stride, w3 = remaining rows,
//   w5, w6 = magnitudes of the two weights, x7 = store pointer,
//   v16.8h = accumulator seed, v18.8h = shift count for sshl.
// macd is the multiply-accumulate used for the x0 source (weight w5), macs the
// one used for the x1 source (weight w6). Every loop ends in its own ret.

// 16 pixels wide, two rows per iteration.
.macro  biweight_16     macs, macd
        dup             v0.16b, w5
        dup             v1.16b, w6
        mov             v4.16b, v16.16b
        mov             v6.16b, v16.16b
1:      subs            w3, w3, #2
        ld1             {v20.16b}, [x0], x2
        \macd           v4.8h, v0.8b, v20.8b
        \macd\()2       v6.8h, v0.16b, v20.16b
        ld1             {v22.16b}, [x1], x2
        \macs           v4.8h, v1.8b, v22.8b
        \macs\()2       v6.8h, v1.16b, v22.16b
        mov             v24.16b, v16.16b
        ld1             {v28.16b}, [x0], x2
        mov             v26.16b, v16.16b
        \macd           v24.8h, v0.8b, v28.8b
        \macd\()2       v26.8h, v0.16b, v28.16b
        ld1             {v30.16b}, [x1], x2
        \macs           v24.8h, v1.8b, v30.8b
        \macs\()2       v26.8h, v1.16b, v30.16b
        sshl            v4.8h, v4.8h, v18.8h
        sshl            v6.8h, v6.8h, v18.8h
        sqxtun          v4.8b, v4.8h
        sqxtun2         v4.16b, v6.8h
        sshl            v24.8h, v24.8h, v18.8h
        sshl            v26.8h, v26.8h, v18.8h
        sqxtun          v24.8b, v24.8h
        sqxtun2         v24.16b, v26.8h
        mov             v6.16b, v16.16b                 // re-seed for the next iteration
        st1             {v4.16b}, [x7], x2
        mov             v4.16b, v16.16b
        st1             {v24.16b}, [x7], x2
        b.ne            1b                              // flags still from the subs above
        ret
.endm

// 8 pixels wide, two rows per iteration.
.macro  biweight_8      macs, macd
        dup             v0.8b, w5
        dup             v1.8b, w6
        mov             v2.16b, v16.16b
        mov             v20.16b, v16.16b
1:      subs            w3, w3, #2
        ld1             {v4.8b}, [x0], x2
        \macd           v2.8h, v0.8b, v4.8b
        ld1             {v5.8b}, [x1], x2
        \macs           v2.8h, v1.8b, v5.8b
        ld1             {v6.8b}, [x0], x2
        \macd           v20.8h, v0.8b, v6.8b
        ld1             {v7.8b}, [x1], x2
        \macs           v20.8h, v1.8b, v7.8b
        sshl            v2.8h, v2.8h, v18.8h
        sqxtun          v2.8b, v2.8h
        sshl            v20.8h, v20.8h, v18.8h
        sqxtun          v4.8b, v20.8h
        mov             v20.16b, v16.16b                // re-seed for the next iteration
        st1             {v2.8b}, [x7], x2
        mov             v2.16b, v16.16b
        st1             {v4.8b}, [x7], x2
        b.ne            1b                              // flags still from the subs above
        ret
.endm

// 4 pixels wide, four rows per iteration; label 2 finishes a pass in which
// only two rows were left.
.macro  biweight_4      macs, macd
        dup             v0.8b, w5
        dup             v1.8b, w6
        mov             v2.16b, v16.16b
        mov             v20.16b, v16.16b
1:      subs            w3, w3, #4
        ld1             {v4.s}[0], [x0], x2
        ld1             {v4.s}[1], [x0], x2
        \macd           v2.8h, v0.8b, v4.8b
        ld1             {v5.s}[0], [x1], x2
        ld1             {v5.s}[1], [x1], x2
        \macs           v2.8h, v1.8b, v5.8b
        b.lt            2f                              // fewer than 4 rows were left
        ld1             {v6.s}[0], [x0], x2
        ld1             {v6.s}[1], [x0], x2
        \macd           v20.8h, v0.8b, v6.8b
        ld1             {v7.s}[0], [x1], x2
        ld1             {v7.s}[1], [x1], x2
        \macs           v20.8h, v1.8b, v7.8b
        sshl            v2.8h, v2.8h, v18.8h
        sqxtun          v2.8b, v2.8h
        sshl            v20.8h, v20.8h, v18.8h
        sqxtun          v4.8b, v20.8h
        mov             v20.16b, v16.16b                // re-seed for the next iteration
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        mov             v2.16b, v16.16b
        st1             {v4.s}[0], [x7], x2
        st1             {v4.s}[1], [x7], x2
        b.ne            1b                              // flags still from the subs above
        ret
2:      sshl            v2.8h, v2.8h, v18.8h
        sqxtun          v2.8b, v2.8h
        st1             {v2.s}[0], [x7], x2
        st1             {v2.s}[1], [x7], x2
        ret
.endm

// Emits one exported bi-weighting function per block width.
//   x0 = first source and destination, x1 = second source, w2 = stride,
//   w3 = number of rows, w4 = shift amount, w5 = weight applied to the x0 data,
//   w6 = weight applied to the x1 data, w7 = offset term.
// Setup: v16.8h = ((w7 + 1) | 1) << w4 seeds every accumulator;
// v18.8h = ~w4 = -(w4 + 1), so sshl by it is an arithmetic right shift by
// w4 + 1. The weights are multiplied as unsigned bytes, so a negative weight
// is negated and paired with umlsl instead of umlal.
.macro  biweight_func   w
function ff_biweight_h264_pixels_\w\()_neon, export=1
        sxtw            x2, w2
        lsr             w8, w5, #31                     // sign bit of w5
        add             w7, w7, #1
        eor             w8, w8, w6, lsr #30             // xor with the top two bits of w6
        orr             w7, w7, #1
        dup             v18.8h, w4
        lsl             w7, w7, w4
        not             v18.16b, v18.16b
        dup             v16.8h, w7
        mov             x7, x0                          // results are stored over the x0 source
        // dispatch on w8: 0 -> 10, 1 -> 20, 2 -> 30, anything else -> 40
        cbz             w8, 10f
        subs            w8, w8, #1
        b.eq            20f
        subs            w8, w8, #1
        b.eq            30f
        b               40f
10:     biweight_\w     umlal, umlal
20:     neg             w5, w5
        biweight_\w     umlal, umlsl
30:     neg             w5, w5
        neg             w6, w6
        biweight_\w     umlsl, umlsl
40:     neg             w6, w6
        biweight_\w     umlsl, umlal
endfunc
.endm

        biweight_func   16
        biweight_func   8
        biweight_func   4

// Single-source weighting loops. They are expanded inside weight_func, which
// sets up:
//   x0 = load pointer, x1 = stride, w2 = remaining rows,
//   w4 = weight magnitude, x5 = store pointer,
//   v16.8h = offset term, v18.8h = shift count for srshl.
// The macro argument is the instruction that combines the offset term with the
// product. Every loop ends in its own ret.

// 16 pixels wide, two rows per iteration.
.macro  weight_16       add
        dup             v0.16b, w4
1:      subs            w2, w2, #2
        ld1             {v20.16b}, [x0], x1
        umull           v4.8h, v0.8b, v20.8b
        umull2          v6.8h, v0.16b, v20.16b
        ld1             {v28.16b}, [x0], x1
        umull           v24.8h, v0.8b, v28.8b
        umull2          v26.8h, v0.16b, v28.16b
        \add            v4.8h, v16.8h, v4.8h
        srshl           v4.8h, v4.8h, v18.8h
        \add            v6.8h, v16.8h, v6.8h
        srshl           v6.8h, v6.8h, v18.8h
        sqxtun          v4.8b, v4.8h
        sqxtun2         v4.16b, v6.8h
        \add            v24.8h, v16.8h, v24.8h
        srshl           v24.8h, v24.8h, v18.8h
        \add            v26.8h, v16.8h, v26.8h
        srshl           v26.8h, v26.8h, v18.8h
        sqxtun          v24.8b, v24.8h
        sqxtun2         v24.16b, v26.8h
        st1             {v4.16b}, [x5], x1
        st1             {v24.16b}, [x5], x1
        b.ne            1b                              // flags still from the subs above
        ret
.endm

// 8 pixels wide, two rows per iteration.
.macro  weight_8        add
        dup             v0.8b, w4
1:      subs            w2, w2, #2
        ld1             {v4.8b}, [x0], x1
        umull           v2.8h, v0.8b, v4.8b
        ld1             {v6.8b}, [x0], x1
        umull           v20.8h, v0.8b, v6.8b
        \add            v2.8h, v16.8h, v2.8h
        srshl           v2.8h, v2.8h, v18.8h
        sqxtun          v2.8b, v2.8h
        \add            v20.8h, v16.8h, v20.8h
        srshl           v20.8h, v20.8h, v18.8h
        sqxtun          v4.8b, v20.8h
        st1             {v2.8b}, [x5], x1
        st1             {v4.8b}, [x5], x1
        b.ne            1b                              // flags still from the subs above
        ret
.endm

// 4 pixels wide, four rows per iteration; label 2 finishes a pass in which
// only two rows were left.
.macro  weight_4        add
        dup             v0.8b, w4
1:      subs            w2, w2, #4
        ld1             {v4.s}[0], [x0], x1
        ld1             {v4.s}[1], [x0], x1
        umull           v2.8h, v0.8b, v4.8b
        b.lt            2f                              // fewer than 4 rows were left
        ld1             {v6.s}[0], [x0], x1
        ld1             {v6.s}[1], [x0], x1
        umull           v20.8h, v0.8b, v6.8b
        \add            v2.8h, v16.8h, v2.8h
        srshl           v2.8h, v2.8h, v18.8h
        sqxtun          v2.8b, v2.8h
        \add            v20.8h, v16.8h, v20.8h
        srshl           v20.8h, v20.8h, v18.8h
        sqxtun          v4.8b, v20.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        st1             {v4.s}[0], [x5], x1
        st1             {v4.s}[1], [x5], x1
        b.ne            1b                              // flags still from the subs above
        ret
2:      \add            v2.8h, v16.8h, v2.8h
        srshl           v2.8h, v2.8h, v18.8h
        sqxtun          v2.8b, v2.8h
        st1             {v2.s}[0], [x5], x1
        st1             {v2.s}[1], [x5], x1
        ret
.endm

// Emits one exported weighting function per block width.
//   x0 = block (read and written in place), w1 = stride, w2 = number of rows,
//   w3 = shift amount, w4 = weight, w5 = offset.
// Setup: v16.8h = w5 << w3. With w3 > 1 the offset is combined with a halving
// add/sub and the rounding shift is by w3 - 1 (v18 = 1 - w3); otherwise a
// plain add/sub is followed by a rounding shift by w3 (v18 = -w3). A negative
// weight is negated and the subtracting form is used.
.macro  weight_func     w
function ff_weight_h264_pixels_\w\()_neon, export=1
        sxtw            x1, w1
        cmp             w3, #1
        mov             w6, #1
        lsl             w5, w5, w3
        dup             v16.8h, w5
        mov             x5, x0
        b.le            20f                             // flags still from the cmp above
        sub             w6, w6, w3
        dup             v18.8h, w6
        cmp             w4, #0
        b.lt            10f
        weight_\w       shadd
10:     neg             w4, w4
        weight_\w       shsub
20:     neg             w6, w3
        dup             v18.8h, w6
        cmp             w4, #0
        b.lt            10f
        weight_\w       add
10:     neg             w4, w4
        weight_\w       sub
endfunc
.endm

        weight_func     16
        weight_func     8
        weight_func     4