;******************************************************************************
;* mpeg4 qpel
;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Conventions used throughout this file:
;   * rN / rNd and mN are the abstract register names set up by cglobal and
;     INIT_MMX (from the included x86util.asm / x86inc machinery).
;   * Every function here is instantiated under "INIT_MMX mmxext", so an mN
;     register is 8 bytes wide and one mova moves 8 pixels.
;   * PW_ROUND and OP_MOV are re-%defined before each macro instantiation to
;     select the rounding constant and the store flavour (plain store or
;     average with the existing destination).
;   * The pw_N constants are external; the comments below assume that pw_N
;     holds packed 16-bit words all equal to N -- TODO confirm at definition.

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
cextern pb_1
cextern pw_3
cextern pw_15
cextern pw_16
cextern pw_20


SECTION .text

;------------------------------------------------------------------------------
; void ff_put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2,
;                               int dstStride, int src1Stride, int h)
;
; Writes h rows of 8 bytes to dst.  Each row is ~PAVGB(~src1_row, ~src2_row).
; NOTE(review): PAVGB is defined outside this file; if it is the usual
; round-up byte average, the double complement turns it into the round-down
; average (a + b) >> 1, which is what "no_rnd" suggests -- confirm.
;
; Register roles (cglobal argument order):
;   r0 = dst    r1 = src1    r2 = src2
;   r3 = dstStride           r4 = src1Stride       r5d = h
;   m6 = all-ones mask used for the complements
;
; src2 has no stride argument: it is read as consecutive 8-byte rows.
;
; Row-count contract visible in the code: if h is odd, one row is processed
; before the loop.  The loop then handles 4 rows per iteration and only exits
; when the counter hits exactly zero, so the count entering the loop must be
; a non-zero multiple of 4.
;------------------------------------------------------------------------------
%macro PUT_NO_RND_PIXELS8_L2 0
cglobal put_no_rnd_pixels8_l2, 6,6
    movsxdifnidn r4, r4d
    movsxdifnidn r3, r3d
    pcmpeqb      m6, m6             ; m6 = 0xFF in every byte
    test        r5d, 1
    je .loop                        ; even h: go straight to the 4-row loop
    ; odd h: peel off a single row first
    mova         m0, [r1]
    mova         m1, [r2]
    add          r1, r4
    add          r2, 8
    pxor         m0, m6
    pxor         m1, m6
    PAVGB        m0, m1
    pxor         m0, m6
    mova       [r0], m0
    add          r0, r3
    dec         r5d
.loop:
    ; rows 0 and 1 of this group of four
    mova         m0, [r1]
    add          r1, r4
    mova         m1, [r1]
    add          r1, r4
    mova         m2, [r2]
    mova         m3, [r2+8]
    pxor         m0, m6
    pxor         m1, m6
    pxor         m2, m6
    pxor         m3, m6
    PAVGB        m0, m2
    PAVGB        m1, m3
    pxor         m0, m6
    pxor         m1, m6
    mova       [r0], m0
    add          r0, r3
    mova       [r0], m1
    add          r0, r3
    ; rows 2 and 3 of this group of four
    mova         m0, [r1]
    add          r1, r4
    mova         m1, [r1]
    add          r1, r4
    mova         m2, [r2+16]
    mova         m3, [r2+24]
    pxor         m0, m6
    pxor         m1, m6
    pxor         m2, m6
    pxor         m3, m6
    PAVGB        m0, m2
    PAVGB        m1, m3
    pxor         m0, m6
    pxor         m1, m6
    mova       [r0], m0
    add          r0, r3
    mova       [r0], m1
    add          r0, r3
    add          r2, 32             ; four 8-byte src2 rows consumed
    sub         r5d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_L2


;------------------------------------------------------------------------------
; void ff_put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2,
;                                int dstStride, int src1Stride, int h)
;
; 16-byte-wide counterpart of put_no_rnd_pixels8_l2: each row is handled as
; two 8-byte halves and written as ~PAVGB(~src1_row, ~src2_row).
;
; Register roles (cglobal argument order):
;   r0 = dst    r1 = src1    r2 = src2
;   r3 = dstStride           r4 = src1Stride       r5d = h
;   m6 = all-ones mask used for the complements
;
; src2 has no stride argument: it is read as consecutive 16-byte rows.
;
; Row-count contract visible in the code: if h is odd, one row is processed
; before the loop.  The loop then handles 2 rows per iteration and only exits
; when the counter hits exactly zero, so the count entering the loop must be
; a non-zero multiple of 2.
;------------------------------------------------------------------------------
%macro PUT_NO_RND_PIXELS16_l2 0
cglobal put_no_rnd_pixels16_l2, 6,6
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    pcmpeqb      m6, m6             ; m6 = 0xFF in every byte
    test        r5d, 1
    je .loop                        ; even h: go straight to the 2-row loop
    ; odd h: peel off a single row first
    mova         m0, [r1]
    mova         m1, [r1+8]
    mova         m2, [r2]
    mova         m3, [r2+8]
    pxor         m0, m6
    pxor         m1, m6
    pxor         m2, m6
    pxor         m3, m6
    PAVGB        m0, m2
    PAVGB        m1, m3
    pxor         m0, m6
    pxor         m1, m6
    add          r1, r4
    add          r2, 16
    mova       [r0], m0
    mova     [r0+8], m1
    add          r0, r3
    dec         r5d
.loop:
    ; first row of the pair
    mova         m0, [r1]
    mova         m1, [r1+8]
    add          r1, r4
    mova         m2, [r2]
    mova         m3, [r2+8]
    pxor         m0, m6
    pxor         m1, m6
    pxor         m2, m6
    pxor         m3, m6
    PAVGB        m0, m2
    PAVGB        m1, m3
    pxor         m0, m6
    pxor         m1, m6
    mova       [r0], m0
    mova     [r0+8], m1
    add          r0, r3
    ; second row of the pair
    mova         m0, [r1]
    mova         m1, [r1+8]
    add          r1, r4
    mova         m2, [r2+16]
    mova         m3, [r2+24]
    pxor         m0, m6
    pxor         m1, m6
    pxor         m2, m6
    pxor         m3, m6
    PAVGB        m0, m2
    PAVGB        m1, m3
    pxor         m0, m6
    pxor         m1, m6
    mova       [r0], m0
    mova     [r0+8], m1
    add          r0, r3
    add          r2, 32             ; two 16-byte src2 rows consumed
    sub         r5d, 2
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS16_l2

;------------------------------------------------------------------------------
; <prefix>_mpeg4_qpel16_h_lowpass   (macro argument 1 = function-name prefix)
;
; Horizontal lowpass filter producing 16 output pixels per row.
;
; Register roles as used by the body:
;   r0  = destination pointer (written through OP_MOV at [r0] and [r0+8])
;   r1  = source pointer (bytes [r1+0] .. [r1+16] of each row are read)
;   r2  = added to r0 after each row (destination stride)
;   r3  = added to r1 after each row (source stride)
;   r4d = row counter; the loop exits when it is decremented to zero
;   m7  = zero, used to widen bytes to words
;   [rsp+8] = 8-byte scratch slot holding outputs 0..3 as words
;
; With p[k] = source byte k of the row, output i is
;   (20*(p[i]+p[i+1]) - 6*(p[i-1]+p[i+2]) + 3*(p[i-2]+p[i+3])
;      - (p[i-3]+p[i+4]) + PW_ROUND) >> 5
; packed with unsigned saturation.  The -6 and +3 terms are formed together
; as 3*(x - 2*y).  Taps that would index below p[0] or above p[16] are
; replaced by pixels reflected back into the row; the pshufw patterns build
; those reflected operands.
;
; Requires PW_ROUND and OP_MOV to be %defined at instantiation time.
;------------------------------------------------------------------------------
%macro MPEG4_QPEL16_H_LOWPASS 1
cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    pxor         m7, m7
.loop:
    ; ---- outputs 0..3 ----
    mova         m0, [r1]
    mova         m1, m0
    mova         m2, m0
    punpcklbw    m0, m7             ; m0 = p0 p1 p2 p3
    punpckhbw    m1, m7             ; m1 = p4 p5 p6 p7
    pshufw       m5, m0, 0x90       ; m5 = p0 p0 p1 p2 (left-edge reflection)
    pshufw       m6, m0, 0x41       ; m6 = p1 p0 p0 p1 (left-edge reflection)
    mova         m3, m2
    mova         m4, m2
    psllq        m2, 8
    psllq        m3, 16
    psllq        m4, 24
    punpckhbw    m2, m7             ; m2 = p3 p4 p5 p6
    punpckhbw    m3, m7             ; m3 = p2 p3 p4 p5
    punpckhbw    m4, m7             ; m4 = p1 p2 p3 p4
    paddw        m5, m3             ; pairs for the -6 taps
    paddw        m6, m2             ; pairs for the +3 taps
    paddw        m5, m5
    psubw        m6, m5             ; (+3 pairs) - 2*(-6 pairs)
    pshufw       m5, m0, 6          ; m5 = p2 p1 p0 p0 (left-edge reflection)
    pmullw       m6, [pw_3]
    paddw        m0, m4             ; pairs for the 20 taps
    paddw        m5, m1             ; pairs for the -1 taps
    pmullw       m0, [pw_20]
    psubw        m0, m5
    paddw        m6, [PW_ROUND]
    paddw        m0, m6
    psraw        m0, 5
    mova    [rsp+8], m0             ; park outputs 0..3 until 4..7 are ready
    ; ---- outputs 4..7 (reuses m1..m4 from above) ----
    mova         m0, [r1+5]
    mova         m5, m0
    mova         m6, m0
    psrlq        m0, 8
    psrlq        m5, 16
    punpcklbw    m0, m7             ; m0 = p6 p7 p8 p9
    punpcklbw    m5, m7             ; m5 = p7 p8 p9 p10
    paddw        m2, m0             ; pairs for the -6 taps
    paddw        m3, m5             ; pairs for the +3 taps
    paddw        m2, m2
    psubw        m3, m2
    mova         m2, m6
    psrlq        m6, 24
    punpcklbw    m2, m7             ; m2 = p5 p6 p7 p8
    punpcklbw    m6, m7             ; m6 = p8 p9 p10 p11
    pmullw       m3, [pw_3]
    paddw        m1, m2             ; pairs for the 20 taps
    paddw        m4, m6             ; pairs for the -1 taps
    pmullw       m1, [pw_20]
    psubw        m3, m4
    paddw        m1, [PW_ROUND]
    paddw        m3, m1
    psraw        m3, 5
    mova         m1, [rsp+8]
    packuswb     m1, m3             ; outputs 0..7 as saturated bytes
    ; m4 is only scratch for OP_MOV here; it is reloaded right below
    OP_MOV     [r0], m1, m4
    ; ---- outputs 8..11 (reuses m0, m2, m5, m6 from above) ----
    mova         m1, [r1+9]
    mova         m4, m1
    mova         m3, m1
    psrlq        m1, 8
    psrlq        m4, 16
    punpcklbw    m1, m7             ; m1 = p10 p11 p12 p13
    punpcklbw    m4, m7             ; m4 = p11 p12 p13 p14
    paddw        m5, m1             ; pairs for the -6 taps
    paddw        m0, m4             ; pairs for the +3 taps
    paddw        m5, m5
    psubw        m0, m5
    mova         m5, m3
    psrlq        m3, 24
    pmullw       m0, [pw_3]
    punpcklbw    m3, m7             ; m3 = p12 p13 p14 p15
    paddw        m2, m3             ; pairs for the -1 taps
    psubw        m0, m2
    mova         m2, m5
    punpcklbw    m2, m7             ; m2 = p9 p10 p11 p12
    punpckhbw    m5, m7             ; m5 = p13 p14 p15 p16
    paddw        m6, m2             ; pairs for the 20 taps
    pmullw       m6, [pw_20]
    paddw        m0, [PW_ROUND]
    paddw        m0, m6
    psraw        m0, 5
    ; ---- outputs 12..15 (right edge: taps past p16 are reflected) ----
    paddw        m3, m5             ; pairs for the 20 taps
    pshufw       m6, m5, 0xf9       ; m6 = p14 p15 p16 p16
    paddw        m6, m4             ; pairs for the -6 taps
    pshufw       m4, m5, 0xbe       ; m4 = p15 p16 p16 p15
    pshufw       m5, m5, 0x6f       ; m5 = p16 p16 p15 p14
    paddw        m4, m1             ; pairs for the +3 taps
    paddw        m5, m2             ; pairs for the -1 taps
    paddw        m6, m6
    psubw        m4, m6
    pmullw       m3, [pw_20]
    pmullw       m4, [pw_3]
    psubw        m3, m5
    paddw        m4, [PW_ROUND]
    paddw        m4, m3
    psraw        m4, 5
    packuswb     m0, m4             ; outputs 8..15 as saturated bytes
    OP_MOV   [r0+8], m0, m4
    add          r1, r3
    add          r0, r2
    dec         r4d
    jne .loop
    REP_RET
%endmacro

; PUT_OP dst, src [, tmp]
; Store flavour "put": writes src to dst with a full-register mova.
; The optional third argument is accepted and ignored so that PUT_OP and
; AVG_OP can be invoked with the same argument list.
%macro PUT_OP 2-3
    mova         %1, %2
%endmacro

; AVG_OP dst, src, tmp
; Store flavour "avg": loads the current dst into tmp, averages it into src
; with pavgb, and writes the result back to dst.  Clobbers src and tmp.
; Declared 2-3, but the body always uses the third argument.
%macro AVG_OP 2-3
    mova         %3, %1
    pavgb        %2, %3
    mova         %1, %2
%endmacro

; Instantiations of the 16-wide horizontal filter:
;   put        -> PW_ROUND = pw_16, plain store
;   avg        -> PW_ROUND = pw_16, average with destination
;   put_no_rnd -> PW_ROUND = pw_15, plain store
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OP
MPEG4_QPEL16_H_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OP
MPEG4_QPEL16_H_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OP
MPEG4_QPEL16_H_LOWPASS put_no_rnd



;------------------------------------------------------------------------------
; <prefix>_mpeg4_qpel8_h_lowpass   (macro argument 1 = function-name prefix)
;
; Horizontal lowpass filter producing 8 output pixels per row; same tap
; formula and edge reflection as the 16-wide version:
;   (20*(p[i]+p[i+1]) - 6*(p[i-1]+p[i+2]) + 3*(p[i-2]+p[i+3])
;      - (p[i-3]+p[i+4]) + PW_ROUND) >> 5, packed with unsigned saturation.
;
; Register roles as used by the body:
;   r0  = destination pointer (8 bytes written through OP_MOV per row)
;   r1  = source pointer; reads 8 bytes at [r1] plus a movh load at [r1+5]
;         (NOTE(review): movh is presumably a 4-byte load under INIT_MMX,
;         i.e. source bytes 0..8 are read per row -- confirm in x86inc)
;   r2  = added to r0 after each row (destination stride)
;   r3  = added to r1 after each row (source stride)
;   r4d = row counter; the loop exits when it is decremented to zero
;   m7  = zero, used to widen bytes to words
;
; cglobal reserves 8 bytes of stack, but no [rsp] access appears in the body.
;
; Requires PW_ROUND and OP_MOV to be %defined at instantiation time.
;------------------------------------------------------------------------------
%macro MPEG4_QPEL8_H_LOWPASS 1
cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    pxor         m7, m7
.loop:
    ; ---- outputs 0..3 ----
    mova         m0, [r1]
    mova         m1, m0
    mova         m2, m0
    punpcklbw    m0, m7             ; m0 = p0 p1 p2 p3
    punpckhbw    m1, m7             ; m1 = p4 p5 p6 p7
    pshufw       m5, m0, 0x90       ; m5 = p0 p0 p1 p2 (left-edge reflection)
    pshufw       m6, m0, 0x41       ; m6 = p1 p0 p0 p1 (left-edge reflection)
    mova         m3, m2
    mova         m4, m2
    psllq        m2, 8
    psllq        m3, 16
    psllq        m4, 24
    punpckhbw    m2, m7             ; m2 = p3 p4 p5 p6
    punpckhbw    m3, m7             ; m3 = p2 p3 p4 p5
    punpckhbw    m4, m7             ; m4 = p1 p2 p3 p4
    paddw        m5, m3             ; pairs for the -6 taps
    paddw        m6, m2             ; pairs for the +3 taps
    paddw        m5, m5
    psubw        m6, m5             ; (+3 pairs) - 2*(-6 pairs)
    pshufw       m5, m0, 0x6        ; m5 = p2 p1 p0 p0 (left-edge reflection)
    pmullw       m6, [pw_3]
    paddw        m0, m4             ; pairs for the 20 taps
    paddw        m5, m1             ; pairs for the -1 taps
    pmullw       m0, [pw_20]
    psubw        m0, m5
    paddw        m6, [PW_ROUND]
    paddw        m0, m6
    psraw        m0, 5
    ; ---- outputs 4..7 (right edge: taps past p8 are reflected) ----
    movh         m5, [r1+5]
    punpcklbw    m5, m7             ; m5 = p5 p6 p7 p8
    pshufw       m6, m5, 0xf9       ; m6 = p6 p7 p8 p8
    paddw        m1, m5             ; pairs for the 20 taps
    paddw        m2, m6             ; pairs for the -6 taps
    pshufw       m6, m5, 0xbe       ; m6 = p7 p8 p8 p7
    pshufw       m5, m5, 0x6f       ; m5 = p8 p8 p7 p6
    paddw        m3, m6             ; pairs for the +3 taps
    paddw        m4, m5             ; pairs for the -1 taps
    paddw        m2, m2
    psubw        m3, m2
    pmullw       m1, [pw_20]
    pmullw       m3, [pw_3]
    psubw        m3, m4
    paddw        m1, [PW_ROUND]
    paddw        m3, m1
    psraw        m3, 5
    packuswb     m0, m3             ; outputs 0..7 as saturated bytes
    OP_MOV     [r0], m0, m4
    add          r1, r3
    add          r0, r2
    dec         r4d
    jne .loop
    REP_RET
%endmacro

; Instantiations of the 8-wide horizontal filter:
;   put        -> PW_ROUND = pw_16, plain store
;   avg        -> PW_ROUND = pw_16, average with destination
;   put_no_rnd -> PW_ROUND = pw_15, plain store
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OP
MPEG4_QPEL8_H_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OP
MPEG4_QPEL8_H_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OP
MPEG4_QPEL8_H_LOWPASS put_no_rnd



;------------------------------------------------------------------------------
; QPEL_V_LOW outer, middle, inner, incoming, dst
;
; Emits one output row (4 pixels) of the vertical lowpass filter.
;
; In:
;   m0, m1 = the two centre rows (packed words); they get the weight 20
;   m2     = row paired with argument 3 (inner)  for the -6 taps
;   m3     = row paired with argument 2 (middle) for the +3 taps
;   arg 1  = memory operand paired with argument 4 for the -1 taps
;   arg 4  = memory operand of the row entering the window; loaded into m0
;   arg 5  = destination operand handed to OP_MOV
;
; Result: (20*(m0+m1) - 6*(m2+arg3) + 3*(m3+arg2) - (arg1+arg4) + PW_ROUND)
;         >> 5, packed with unsigned saturation and written through OP_MOV.
;
; Clobbers m4, m5, m6, and m7 when OP_MOV uses its scratch argument.
; The trailing SWAP renames m0..m3 so the next invocation sees the window
; advanced by one row (NOTE(review): relies on x86inc SWAP rotating the
; names so that the freshly loaded row ends up as m3 -- confirm).
;------------------------------------------------------------------------------
%macro QPEL_V_LOW 5
    paddw        m0, m1
    mova         m4, [pw_20]
    pmullw       m4, m0             ; 20 * (centre pair)
    mova         m0, %4             ; incoming row replaces the oldest one
    mova         m5, %1
    paddw        m5, m0
    psubw        m4, m5             ; subtract the outermost pair
    mova         m5, %2
    mova         m6, %3
    paddw        m5, m3
    paddw        m6, m2
    paddw        m6, m6
    psubw        m5, m6
    pmullw       m5, [pw_3]         ; 3*(middle pair) - 6*(inner pair)
    paddw        m4, [PW_ROUND]
    paddw        m5, m4
    psraw        m5, 5
    packuswb     m5, m5
    OP_MOV       %5, m5, m7
    SWAP 0,1,2,3
%endmacro

;------------------------------------------------------------------------------
; <prefix>_mpeg4_qpel16_v_lowpass   (macro argument 1 = function-name prefix)
;
; Vertical lowpass filter for a 16-pixel-wide block: reads 17 source rows and
; writes 16 output rows.
;
; Register roles as used by the body:
;   r0  = destination pointer
;   r1  = source pointer during .looph; afterwards reused as the destination
;         rewind offset (see below)
;   r2  = destination stride (used in [r0+r2] / [r0+r2*2] addressing)
;   r3  = added to r1 after each source row (source stride)
;   r4d = loop counter (17 source rows, then 4 column strips)
;   r5  = cursor into the stack scratch area
;
; Stack scratch (544 bytes = 4 strips * 0x88): the 16 pixels of each source
; row are widened to words and split into four 4-pixel column strips.  Strip
; k starts at rsp + k*0x88 and row j of a strip is at offset 8*j.
;
; .loopv then filters one strip per iteration with 16 QPEL_V_LOW invocations.
; The repeated offsets in the first invocations (0x0 / 0x8 / 0x10) and in the
; last ones (0x70 / 0x78 / 0x80) supply rows reflected at the top and bottom
; edges of the 17-row window.
;
; Requires PW_ROUND and OP_MOV to be %defined at instantiation time.
;------------------------------------------------------------------------------
%macro MPEG4_QPEL16_V_LOWPASS 1
cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d

    mov         r4d, 17
    mov          r5, rsp
    pxor         m7, m7
.looph:
    ; widen one 16-pixel source row and scatter it into the four strips
    mova         m0, [r1]
    mova         m1, [r1]
    mova         m2, [r1+8]
    mova         m3, [r1+8]
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    mova       [r5], m0
    mova  [r5+0x88], m1
    mova [r5+0x110], m2
    mova [r5+0x198], m3
    add          r5, 8
    add          r1, r3
    dec         r4d
    jne .looph


    ; NOTE: r1 changes meaning here: r1 = 4 - 14*dstStride.
    ; Within one strip r0 is advanced by 2*dstStride seven times, so adding
    ; r1 afterwards returns r0 to the first output row, 4 bytes to the right.
    mov         r4d, 4
    mov          r1, 4
    neg          r2
    lea          r1, [r1+r2*8]
    lea          r1, [r1+r2*4]
    lea          r1, [r1+r2*2]
    neg          r2
    mov          r5, rsp
.loopv:
    ; m7 is only used as OP_MOV scratch inside QPEL_V_LOW from here on
    pxor         m7, m7
    ; prime the sliding window with rows 0..3 of this strip
    mova         m0, [r5+ 0x0]
    mova         m1, [r5+ 0x8]
    mova         m2, [r5+0x10]
    mova         m3, [r5+0x18]
    QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
    QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
    QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
    QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0]
    QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0]
    QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0]
    QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0]
    QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0]
    QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2]

    add          r5, 0x88           ; next column strip in the scratch area
    add          r0, r1             ; back to the top row, 4 pixels right
    dec         r4d
    jne .loopv
    REP_RET
%endmacro

; PUT_OPH dst, src [, tmp]
; Store flavour "put" for the vertical filters: writes src to dst with movh.
; The optional third argument is accepted and ignored.
; NOTE(review): movh is defined outside this file; presumably a half-register
; (4-byte) move under INIT_MMX, matching the 4-pixel strips -- confirm.
%macro PUT_OPH 2-3
    movh         %1, %2
%endmacro

; AVG_OPH dst, src, tmp
; Store flavour "avg" for the vertical filters: loads the current dst into
; tmp with movh, averages it into src with pavgb, and writes the result back
; with movh.  Clobbers src and tmp.  Declared 2-3, but the body always uses
; the third argument.
%macro AVG_OPH 2-3
    movh         %3, %1
    pavgb        %2, %3
    movh         %1, %2
%endmacro

; Instantiations of the 16-wide vertical filter:
;   put        -> PW_ROUND = pw_16, plain store
;   avg        -> PW_ROUND = pw_16, average with destination
;   put_no_rnd -> PW_ROUND = pw_15, plain store
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OPH
MPEG4_QPEL16_V_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OPH
MPEG4_QPEL16_V_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OPH
MPEG4_QPEL16_V_LOWPASS put_no_rnd



;------------------------------------------------------------------------------
; <prefix>_mpeg4_qpel8_v_lowpass   (macro argument 1 = function-name prefix)
;
; Vertical lowpass filter for an 8-pixel-wide block: reads 9 source rows and
; writes 8 output rows.
;
; Register roles as used by the body:
;   r0  = destination pointer
;   r1  = source pointer during .looph; afterwards reused as the destination
;         rewind offset (see below)
;   r2  = destination stride (used in [r0+r2] / [r0+r2*2] addressing)
;   r3  = added to r1 after each source row (source stride)
;   r4d = loop counter (9 source rows, then 2 column strips)
;   r5  = cursor into the stack scratch area
;
; Stack scratch: the 8 pixels of each source row are widened to words and
; split into two 4-pixel column strips.  Strip k starts at rsp + k*0x48 and
; row j of a strip is at offset 8*j.  cglobal reserves 288 bytes; the
; accesses in this body stay within the first 2*0x48 = 144 bytes.
;
; .loopv then filters one strip per iteration with 8 QPEL_V_LOW invocations.
; The repeated offsets in the first invocations (0x0 / 0x8 / 0x10) and in the
; last ones (0x30 / 0x38 / 0x40) supply rows reflected at the top and bottom
; edges of the 9-row window.
;
; Requires PW_ROUND and OP_MOV to be %defined at instantiation time.
;------------------------------------------------------------------------------
%macro MPEG4_QPEL8_V_LOWPASS 1
cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d

    mov         r4d, 9
    mov          r5, rsp
    pxor         m7, m7
.looph:
    ; widen one 8-pixel source row and scatter it into the two strips
    mova         m0, [r1]
    mova         m1, [r1]
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    mova       [r5], m0
    mova  [r5+0x48], m1
    add          r5, 8
    add          r1, r3
    dec         r4d
    jne .looph


    ; NOTE: r1 changes meaning here: r1 = 4 - 6*dstStride.
    ; Within one strip r0 is advanced by 2*dstStride three times, so adding
    ; r1 afterwards returns r0 to the first output row, 4 bytes to the right.
    mov         r4d, 2
    mov          r1, 4
    neg          r2
    lea          r1, [r1+r2*4]
    lea          r1, [r1+r2*2]
    neg          r2
    mov          r5, rsp
.loopv:
    ; m7 is only used as OP_MOV scratch inside QPEL_V_LOW from here on
    pxor         m7, m7
    ; prime the sliding window with rows 0..3 of this strip
    mova         m0, [r5+ 0x0]
    mova         m1, [r5+ 0x8]
    mova         m2, [r5+0x10]
    mova         m3, [r5+0x18]
    QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
    QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
    QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
    QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0]
    QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2]

    add          r5, 0x48           ; next column strip in the scratch area
    add          r0, r1             ; back to the top row, 4 pixels right
    dec         r4d
    jne .loopv
    REP_RET
%endmacro

; Instantiations of the 8-wide vertical filter:
;   put        -> PW_ROUND = pw_16, plain store
;   avg        -> PW_ROUND = pw_16, average with destination
;   put_no_rnd -> PW_ROUND = pw_15, plain store
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OPH
MPEG4_QPEL8_V_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OPH
MPEG4_QPEL8_V_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OPH
MPEG4_QPEL8_V_LOWPASS put_no_rnd