;******************************************************************************
;* mpeg4 qpel
;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
cextern pb_1
cextern pw_3
cextern pw_15
cextern pw_16
cextern pw_20


SECTION .text

;------------------------------------------------------------------------------
; void ff_put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
;
; Averages an 8-byte-wide block of src1 with src2 and stores it to dst.
;   r0 = dst, r1 = src1, r2 = src2, r3 = dstStride, r4 = src1Stride, r5d = h
; src1 advances by src1Stride per row, dst by dstStride; src2 is read as
; tightly packed rows (8 bytes per row).
;
; m6 is set to all ones, so "pxor x, m6" is a bitwise NOT.  Each output is
; NOT(PAVGB(NOT a, NOT b)).  PAVGB comes from x86util.asm; assuming it is a
; byte average that rounds up, the double complement turns it into an average
; that rounds down, which matches the "no_rnd" in the name.
;
; Row count: when h is odd a single row is handled first.  The main loop then
; does 4 rows per iteration and leaves only when the counter hits exactly
; zero, so the remaining count has to be a positive multiple of 4.
;------------------------------------------------------------------------------
%macro PUT_NO_RND_PIXELS8_L2 0
cglobal put_no_rnd_pixels8_l2, 6,6
    movsxdifnidn r4, r4d
    movsxdifnidn r3, r3d
    pcmpeqb      m6, m6             ; m6 = all ones (complement mask)
    test         r5d, 1
    je .loop
    ; odd h: peel one row
    mova         m0, [r1]
    mova         m1, [r2]
    add          r1, r4
    add          r2, 8
    pxor         m0, m6
    pxor         m1, m6
    PAVGB        m0, m1
    pxor         m0, m6
    mova         [r0], m0
    add          r0, r3
    dec          r5d
.loop:
    ; rows 0 and 1 of this group of four
    mova         m0, [r1]
    add          r1, r4
    mova         m1, [r1]
    add          r1, r4
    mova         m2, [r2]
    mova         m3, [r2+8]
    pxor         m0, m6
    pxor         m1, m6
    pxor         m2, m6
    pxor         m3, m6
    PAVGB        m0, m2
    PAVGB        m1, m3
    pxor         m0, m6
    pxor         m1, m6
    mova         [r0], m0
    add          r0, r3
    mova         [r0], m1
    add          r0, r3
    ; rows 2 and 3 of this group of four
    mova         m0, [r1]
    add          r1, r4
    mova         m1, [r1]
    add          r1, r4
    mova         m2, [r2+16]
    mova         m3, [r2+24]
    pxor         m0, m6
    pxor         m1, m6
    pxor         m2, m6
    pxor         m3, m6
    PAVGB        m0, m2
    PAVGB        m1, m3
    pxor         m0, m6
    pxor         m1, m6
    mova         [r0], m0
    add          r0, r3
    mova         [r0], m1
    add          r0, r3
    add          r2, 32             ; 4 packed src2 rows of 8 bytes consumed
    sub          r5d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_L2


;------------------------------------------------------------------------------
; void ff_put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
;
; 16-byte-wide variant of the routine above: each row is handled as two
; 8-byte halves, and src2 is read as tightly packed 16-byte rows.
;   r0 = dst, r1 = src1, r2 = src2, r3 = dstStride, r4 = src1Stride, r5d = h
;
; Row count: when h is odd a single row is handled first.  The main loop then
; does 2 rows per iteration and leaves only when the counter hits exactly
; zero, so the remaining count has to be a positive even number.
;------------------------------------------------------------------------------
%macro PUT_NO_RND_PIXELS16_l2 0
cglobal put_no_rnd_pixels16_l2, 6,6
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    pcmpeqb      m6, m6             ; m6 = all ones (complement mask)
    test         r5d, 1
    je .loop
    ; odd h: peel one row
    mova         m0, [r1]
    mova         m1, [r1+8]
    mova         m2, [r2]
    mova         m3, [r2+8]
    pxor         m0, m6
    pxor         m1, m6
    pxor         m2, m6
    pxor         m3, m6
    PAVGB        m0, m2
    PAVGB        m1, m3
    pxor         m0, m6
    pxor         m1, m6
    add          r1, r4
    add          r2, 16
    mova         [r0], m0
    mova         [r0+8], m1
    add          r0, r3
    dec          r5d
.loop:
    ; first row of the pair
    mova         m0, [r1]
    mova         m1, [r1+8]
    add          r1, r4
    mova         m2, [r2]
    mova         m3, [r2+8]
    pxor         m0, m6
    pxor         m1, m6
    pxor         m2, m6
    pxor         m3, m6
    PAVGB        m0, m2
    PAVGB        m1, m3
    pxor         m0, m6
    pxor         m1, m6
    mova         [r0], m0
    mova         [r0+8], m1
    add          r0, r3
    ; second row of the pair
    mova         m0, [r1]
    mova         m1, [r1+8]
    add          r1, r4
    mova         m2, [r2+16]
    mova         m3, [r2+24]
    pxor         m0, m6
    pxor         m1, m6
    pxor         m2, m6
    pxor         m3, m6
    PAVGB        m0, m2
    PAVGB        m1, m3
    pxor         m0, m6
    pxor         m1, m6
    mova         [r0], m0
    mova         [r0+8], m1
    add          r0, r3
    add          r2, 32             ; 2 packed src2 rows of 16 bytes consumed
    sub          r5d, 2
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS16_l2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS16_l2

;------------------------------------------------------------------------------
; <op>_mpeg4_qpel16_h_lowpass      (%1 = function name prefix)
;
; Horizontal 8-tap low-pass over 16 output pixels per row.  Register roles as
; used below:
;   r0 = dst, r1 = src, r2 = dst stride, r3 = src stride, r4d = row count
;
; For every output pixel i the code evaluates, on 16-bit words,
;   (20*(p[i]  +p[i+1]) - 6*(p[i-1]+p[i+2])
;   + 3*(p[i-2]+p[i+3]) -   (p[i-3]+p[i+4]) + PW_ROUND) >> 5
; and packs the result to bytes with unsigned saturation (packuswb).  This
; assumes the external pw_3 / pw_20 / pw_15 / pw_16 tables hold the word
; constant their name suggests; the factor 6 is produced as 3 * (2 * sum).
; Taps that would fall outside the row are replaced by mirrored pixels from
; inside it, which is what the pshufw selections build.
;
; Each row is read with three 8-byte loads at src+0, src+5 and src+9, i.e.
; bytes src[0..16].  The first four results are parked in [rsp+8] while the
; next four are computed, then both are packed and written through OP_MOV.
; PW_ROUND and OP_MOV are %define'd by the instantiation code after the macro.
; m7 is kept at zero for the byte -> word unpacks.
;------------------------------------------------------------------------------
%macro MPEG4_QPEL16_H_LOWPASS 1
cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    pxor         m7, m7
.loop:
    ; ---- outputs 0..3 ----
    mova         m0, [r1]
    mova         m1, m0
    mova         m2, m0
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    pshufw       m5, m0, 0x90
    pshufw       m6, m0, 0x41
    mova         m3, m2
    mova         m4, m2
    psllq        m2, 8
    psllq        m3, 16
    psllq        m4, 24
    punpckhbw    m2, m7
    punpckhbw    m3, m7
    punpckhbw    m4, m7
    paddw        m5, m3
    paddw        m6, m2
    paddw        m5, m5
    psubw        m6, m5
    pshufw       m5, m0, 6
    pmullw       m6, [pw_3]
    paddw        m0, m4
    paddw        m5, m1
    pmullw       m0, [pw_20]
    psubw        m0, m5
    paddw        m6, [PW_ROUND]
    paddw        m0, m6
    psraw        m0, 5
    mova         [rsp+8], m0        ; park outputs 0..3 until 4..7 are ready
    ; ---- outputs 4..7 ----
    mova         m0, [r1+5]
    mova         m5, m0
    mova         m6, m0
    psrlq        m0, 8
    psrlq        m5, 16
    punpcklbw    m0, m7
    punpcklbw    m5, m7
    paddw        m2, m0
    paddw        m3, m5
    paddw        m2, m2
    psubw        m3, m2
    mova         m2, m6
    psrlq        m6, 24
    punpcklbw    m2, m7
    punpcklbw    m6, m7
    pmullw       m3, [pw_3]
    paddw        m1, m2
    paddw        m4, m6
    pmullw       m1, [pw_20]
    psubw        m3, m4
    paddw        m1, [PW_ROUND]
    paddw        m3, m1
    psraw        m3, 5
    mova         m1, [rsp+8]
    packuswb     m1, m3
    OP_MOV       [r0], m1, m4       ; store dst[0..7]; m4 is the scratch operand
    ; ---- outputs 8..11 ----
    mova         m1, [r1+9]
    mova         m4, m1
    mova         m3, m1
    psrlq        m1, 8
    psrlq        m4, 16
    punpcklbw    m1, m7
    punpcklbw    m4, m7
    paddw        m5, m1
    paddw        m0, m4
    paddw        m5, m5
    psubw        m0, m5
    mova         m5, m3
    psrlq        m3, 24
    pmullw       m0, [pw_3]
    punpcklbw    m3, m7
    paddw        m2, m3
    psubw        m0, m2
    mova         m2, m5
    punpcklbw    m2, m7
    punpckhbw    m5, m7
    paddw        m6, m2
    pmullw       m6, [pw_20]
    paddw        m0, [PW_ROUND]
    paddw        m0, m6
    psraw        m0, 5
    ; ---- outputs 12..15 (right edge, mirrored taps via pshufw) ----
    paddw        m3, m5
    pshufw       m6, m5, 0xf9
    paddw        m6, m4
    pshufw       m4, m5, 0xbe
    pshufw       m5, m5, 0x6f
    paddw        m4, m1
    paddw        m5, m2
    paddw        m6, m6
    psubw        m4, m6
    pmullw       m3, [pw_20]
    pmullw       m4, [pw_3]
    psubw        m3, m5
    paddw        m4, [PW_ROUND]
    paddw        m4, m3
    psraw        m4, 5
    packuswb     m0, m4
    OP_MOV       [r0+8], m0, m4     ; store dst[8..15]
    add          r1, r3
    add          r0, r2
    dec          r4d
    jne .loop
    REP_RET
%endmacro

; Store helpers selected through OP_MOV (full-register width).
;   %1 = destination memory operand, %2 = result register,
;   %3 = scratch register (accepted but unused by PUT_OP).

; PUT_OP: plain store of the result.
%macro PUT_OP 2-3
    mova         %1, %2
%endmacro

; AVG_OP: load the current destination, pavgb it with the result, store back.
%macro AVG_OP 2-3
    mova         %3, %1
    pavgb        %2, %3
    mova         %1, %2
%endmacro

; put / avg use pw_16 as rounding term, put_no_rnd uses pw_15.
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OP
MPEG4_QPEL16_H_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OP
MPEG4_QPEL16_H_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OP
MPEG4_QPEL16_H_LOWPASS put_no_rnd



;------------------------------------------------------------------------------
; <op>_mpeg4_qpel8_h_lowpass       (%1 = function name prefix)
;
; 8-pixel-wide version of the horizontal low-pass: same arithmetic as the
; 16-wide macro, i.e.
;   (20*(p[i]  +p[i+1]) - 6*(p[i-1]+p[i+2])
;   + 3*(p[i-2]+p[i+3]) -   (p[i-3]+p[i+4]) + PW_ROUND) >> 5
; with out-of-row taps replaced by mirrored in-row pixels via pshufw.
;   r0 = dst, r1 = src, r2 = dst stride, r3 = src stride, r4d = row count
;
; Each row is read with an 8-byte load at src+0 plus a movh load at src+5
; (movh is the x86inc half-register move; presumably 4 bytes under INIT_MMX,
; which would make the row footprint src[0..8]).
; NOTE(review): 8 bytes of stack are requested in cglobal but no rsp access
; is visible in this macro body - confirm before removing.
;------------------------------------------------------------------------------
%macro MPEG4_QPEL8_H_LOWPASS 1
cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    pxor         m7, m7             ; zero, for byte -> word unpacks
.loop:
    ; ---- outputs 0..3 ----
    mova         m0, [r1]
    mova         m1, m0
    mova         m2, m0
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    pshufw       m5, m0, 0x90
    pshufw       m6, m0, 0x41
    mova         m3, m2
    mova         m4, m2
    psllq        m2, 8
    psllq        m3, 16
    psllq        m4, 24
    punpckhbw    m2, m7
    punpckhbw    m3, m7
    punpckhbw    m4, m7
    paddw        m5, m3
    paddw        m6, m2
    paddw        m5, m5
    psubw        m6, m5
    pshufw       m5, m0, 0x6
    pmullw       m6, [pw_3]
    paddw        m0, m4
    paddw        m5, m1
    pmullw       m0, [pw_20]
    psubw        m0, m5
    paddw        m6, [PW_ROUND]
    paddw        m0, m6
    psraw        m0, 5
    ; ---- outputs 4..7 (right edge, mirrored taps via pshufw) ----
    movh         m5, [r1+5]
    punpcklbw    m5, m7
    pshufw       m6, m5, 0xf9
    paddw        m1, m5
    paddw        m2, m6
    pshufw       m6, m5, 0xbe
    pshufw       m5, m5, 0x6f
    paddw        m3, m6
    paddw        m4, m5
    paddw        m2, m2
    psubw        m3, m2
    pmullw       m1, [pw_20]
    pmullw       m3, [pw_3]
    psubw        m3, m4
    paddw        m1, [PW_ROUND]
    paddw        m3, m1
    psraw        m3, 5
    packuswb     m0, m3
    OP_MOV       [r0], m0, m4       ; store dst[0..7]; m4 is the scratch operand
    add          r1, r3
    add          r0, r2
    dec          r4d
    jne .loop
    REP_RET
%endmacro

; put / avg use pw_16 as rounding term, put_no_rnd uses pw_15.
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OP
MPEG4_QPEL8_H_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OP
MPEG4_QPEL8_H_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OP
MPEG4_QPEL8_H_LOWPASS put_no_rnd



;------------------------------------------------------------------------------
; QPEL_V_LOW %1, %2, %3, %4, %5
;
; One output row of the vertical low-pass for a 4-pixel-wide column of words.
; On entry m0..m3 hold four consecutive rows of the column; %1..%4 are memory
; operands naming the other four rows of the 8-tap window, %5 is the
; destination.  The value stored is
;   (20*(m0+m1) - 6*(%3+m2) + 3*(%2+m3) - (%1+%4) + PW_ROUND) >> 5
; packed to bytes with unsigned saturation and written through OP_MOV.
;
; %4 (the next row down) is loaded into m0 once the old m0 has been consumed,
; and the trailing SWAP renames m0..m3 so that the register window has moved
; down by one row for the next invocation.  m4, m5, m6 are clobbered; m7 is
; handed to OP_MOV as its scratch register.
;------------------------------------------------------------------------------
%macro QPEL_V_LOW 5
    paddw        m0, m1
    mova         m4, [pw_20]
    pmullw       m4, m0
    mova         m0, %4
    mova         m5, %1
    paddw        m5, m0
    psubw        m4, m5
    mova         m5, %2
    mova         m6, %3
    paddw        m5, m3
    paddw        m6, m2
    paddw        m6, m6
    psubw        m5, m6
    pmullw       m5, [pw_3]
    paddw        m4, [PW_ROUND]
    paddw        m5, m4
    psraw        m5, 5
    packuswb     m5, m5
    OP_MOV       %5, m5, m7
    SWAP 0,1,2,3
%endmacro

;------------------------------------------------------------------------------
; <op>_mpeg4_qpel16_v_lowpass      (%1 = function name prefix)
;
; Vertical 8-tap low-pass producing a 16x16 block.
;   r0 = dst, r1 = src, r2 = dst stride, r3 = src stride
;
; Pass 1 (.looph): 17 source rows of 16 bytes are widened to words and stored
; in a 544-byte stack buffer, transposed into four column groups of 4 pixels:
; group k of row i lives at rsp + k*0x88 + i*8.
;
; Pass 2 (.loopv): for each of the 4 column groups, 16 QPEL_V_LOW invocations
; emit 16 rows of 4 pixels.  The operand lists near the top and the bottom
; repeat in-range offsets (e.g. 0x0 twice, 0x80/0x78/0x70 going back up),
; which is how taps beyond the first/last buffered row are mirrored.
;------------------------------------------------------------------------------
%macro MPEG4_QPEL16_V_LOWPASS 1
cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d

    mov          r4d, 17            ; 17 input rows
    mov          r5, rsp
    pxor         m7, m7             ; zero, for byte -> word unpacks
.looph:
    mova         m0, [r1]
    mova         m1, [r1]
    mova         m2, [r1+8]
    mova         m3, [r1+8]
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    punpcklbw    m2, m7
    punpckhbw    m3, m7
    mova         [r5], m0
    mova         [r5+0x88], m1
    mova         [r5+0x110], m2
    mova         [r5+0x198], m3
    add          r5, 8
    add          r1, r3
    dec          r4d
    jne .looph


    ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 14*dstStride
    ; The source pointer is no longer needed.  r1 becomes the step applied to
    ; dst after each column group: dst has moved down 14 strides by then (seven
    ; "lea r0, [r0+r2*2]"), so this returns to the top row, 4 bytes further right.
    mov          r4d, 4             ; 4 column groups
    mov          r1, 4
    neg          r2
    lea          r1, [r1+r2*8]
    lea          r1, [r1+r2*4]
    lea          r1, [r1+r2*2]
    neg          r2
    mov          r5, rsp
.loopv:
    pxor         m7, m7
    mova         m0, [r5+ 0x0]
    mova         m1, [r5+ 0x8]
    mova         m2, [r5+0x10]
    mova         m3, [r5+0x18]
    QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
    QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
    QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
    QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0]
    QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0]
    QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0]
    QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0]
    QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0]
    QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2]

    add          r5, 0x88           ; next column group in the scratch buffer
    add          r0, r1             ; back to the top row, 4 bytes right
    dec          r4d
    jne .loopv
    REP_RET
%endmacro

; Store helpers selected through OP_MOV for the vertical filters.  Same shape
; as PUT_OP / AVG_OP but using movh, the x86inc half-register move, because
; QPEL_V_LOW produces only 4 result bytes per invocation.
;   %1 = destination memory operand, %2 = result register,
;   %3 = scratch register (accepted but unused by PUT_OPH).

; PUT_OPH: plain store of the result.
%macro PUT_OPH 2-3
    movh         %1, %2
%endmacro

; AVG_OPH: load the current destination, pavgb it with the result, store back.
%macro AVG_OPH 2-3
    movh         %3, %1
    pavgb        %2, %3
    movh         %1, %2
%endmacro

; put / avg use pw_16 as rounding term, put_no_rnd uses pw_15.
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OPH
MPEG4_QPEL16_V_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OPH
MPEG4_QPEL16_V_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OPH
MPEG4_QPEL16_V_LOWPASS put_no_rnd



;------------------------------------------------------------------------------
; <op>_mpeg4_qpel8_v_lowpass       (%1 = function name prefix)
;
; Vertical 8-tap low-pass producing an 8x8 block.
;   r0 = dst, r1 = src, r2 = dst stride, r3 = src stride
;
; Pass 1 (.looph): 9 source rows of 8 bytes are widened to words and stored
; on the stack as two column groups of 4 pixels: group k of row i lives at
; rsp + k*0x48 + i*8.
;
; Pass 2 (.loopv): for each of the 2 column groups, 8 QPEL_V_LOW invocations
; emit 8 rows of 4 pixels, mirroring taps beyond the first/last buffered row
; through repeated in-range offsets in the operand lists.
;
; NOTE(review): 288 bytes are reserved but the accesses below stay within
; rsp+0 .. rsp+0x8f (144 bytes) - confirm before shrinking the allocation.
;------------------------------------------------------------------------------
%macro MPEG4_QPEL8_V_LOWPASS 1
cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d

    mov          r4d, 9             ; 9 input rows
    mov          r5, rsp
    pxor         m7, m7             ; zero, for byte -> word unpacks
.looph:
    mova         m0, [r1]
    mova         m1, [r1]
    punpcklbw    m0, m7
    punpckhbw    m1, m7
    mova         [r5], m0
    mova         [r5+0x48], m1
    add          r5, 8
    add          r1, r3
    dec          r4d
    jne .looph


    ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 6*dstStride
    ; The source pointer is no longer needed.  r1 becomes the step applied to
    ; dst after each column group: dst has moved down 6 strides by then (three
    ; "lea r0, [r0+r2*2]"), so this returns to the top row, 4 bytes further right.
    mov          r4d, 2             ; 2 column groups
    mov          r1, 4
    neg          r2
    lea          r1, [r1+r2*4]
    lea          r1, [r1+r2*2]
    neg          r2
    mov          r5, rsp
.loopv:
    pxor         m7, m7
    mova         m0, [r5+ 0x0]
    mova         m1, [r5+ 0x8]
    mova         m2, [r5+0x10]
    mova         m3, [r5+0x18]
    QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
    QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
    QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
    QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2]
    lea          r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0]
    QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2]

    add          r5, 0x48           ; next column group in the scratch buffer
    add          r0, r1             ; back to the top row, 4 bytes right
    dec          r4d
    jne .loopv
    REP_RET
%endmacro

; put / avg use pw_16 as rounding term, put_no_rnd uses pw_15.
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OPH
MPEG4_QPEL8_V_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OPH
MPEG4_QPEL8_V_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OPH
MPEG4_QPEL8_V_LOWPASS put_no_rnd