;******************************************************************************
;*
;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
;* Copyright (c) 2013 Daniel Kang
;*
;* SIMD-optimized halfpel functions
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
cextern pb_1
cextern pw_2
pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7

cextern pw_8192

SECTION .text

; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal put_pixels16_x2, 4,5,4
%else
cglobal put_pixels8_x2, 4,5
%endif
    lea       r4, [r2*2]
.loop:
    movu      m0, [r1+1]
    movu      m1, [r1+r2+1]
%if cpuflag(sse2)
    movu      m2, [r1]
    movu      m3, [r1+r2]
    pavgb     m0, m2
    pavgb     m1, m3
%else
    PAVGB     m0, [r1]
    PAVGB     m1, [r1+r2]
%endif
    mova      [r0], m0
    mova      [r0+r2], m1
    add       r1, r4
    add       r0, r4
    movu      m0, [r1+1]
    movu      m1, [r1+r2+1]
%if cpuflag(sse2)
    movu      m2, [r1]
    movu      m3, [r1+r2]
    pavgb     m0, m2
    pavgb     m1, m3
%else
    PAVGB     m0, [r1]
    PAVGB     m1, [r1+r2]
%endif
    add       r1, r4
    mova      [r0], m0
    mova      [r0+r2], m1
    add       r0, r4
    sub       r3d, 4
    jne       .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_X2
INIT_MMX 3dnow
PUT_PIXELS8_X2


; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS_16 0
cglobal put_pixels16_x2, 4,5
    lea       r4, [r2*2]
.loop:
    mova      m0, [r1]
    mova      m1, [r1+r2]
    mova      m2, [r1+8]
    mova      m3, [r1+r2+8]
    PAVGB     m0, [r1+1]
    PAVGB     m1, [r1+r2+1]
    PAVGB     m2, [r1+9]
    PAVGB     m3, [r1+r2+9]
    mova      [r0], m0
    mova      [r0+r2], m1
    mova      [r0+8], m2
    mova      [r0+r2+8], m3
    add       r1, r4
    add       r0, r4
    mova      m0, [r1]
    mova      m1, [r1+r2]
    mova      m2, [r1+8]
    mova      m3, [r1+r2+8]
    PAVGB     m0, [r1+1]
    PAVGB     m1, [r1+r2+1]
    PAVGB     m2, [r1+9]
    PAVGB     m3, [r1+r2+9]
    add       r1, r4
    mova      [r0], m0
    mova      [r0+r2], m1
    mova      [r0+8], m2
    mova      [r0+r2+8], m3
    add       r0, r4
    sub       r3d, 4
    jne       .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS_16
INIT_MMX 3dnow
PUT_PIXELS_16
; The 8_X2 macro above can be reused here: built with XMM registers it
; processes 16 pixels per row
INIT_XMM sse2
PUT_PIXELS8_X2

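; The put_no_rnd_* versions below need the truncating average (a + b) >> 1,
; while pavgb/pavgusb compute the rounded (a + b + 1) >> 1. Saturating-
; subtracting 1 from one operand first (psubusb against pb_1) converts one
; into the other:
;     pavgb(a - 1, b) = (a - 1 + b + 1) >> 1 = (a + b) >> 1    for a >= 1
; When a == 0 the subtraction saturates and the result can be high by one,
; so, as with avg_approx further down, this is only suitable where
; bit-exact output is not required.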
; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_NO_RND_PIXELS8_X2 0
cglobal put_no_rnd_pixels8_x2, 4,5
    mova      m6, [pb_1]
    lea       r4, [r2*2]
.loop:
    mova      m0, [r1]
    mova      m2, [r1+r2]
    mova      m1, [r1+1]
    mova      m3, [r1+r2+1]
    add       r1, r4
    psubusb   m0, m6
    psubusb   m2, m6
    PAVGB     m0, m1
    PAVGB     m2, m3
    mova      [r0], m0
    mova      [r0+r2], m2
    mova      m0, [r1]
    mova      m1, [r1+1]
    mova      m2, [r1+r2]
    mova      m3, [r1+r2+1]
    add       r0, r4
    add       r1, r4
    psubusb   m0, m6
    psubusb   m2, m6
    PAVGB     m0, m1
    PAVGB     m2, m3
    mova      [r0], m0
    mova      [r0+r2], m2
    add       r0, r4
    sub       r3d, 4
    jne       .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2


; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal put_pixels16_y2, 4,5,3
%else
cglobal put_pixels8_y2, 4,5
%endif
    lea       r4, [r2*2]
    movu      m0, [r1]
    sub       r0, r2
.loop:
    movu      m1, [r1+r2]
    movu      m2, [r1+r4]
    add       r1, r4
    PAVGB     m0, m1
    PAVGB     m1, m2
    mova      [r0+r2], m0
    mova      [r0+r4], m1
    movu      m1, [r1+r2]
    movu      m0, [r1+r4]
    add       r0, r4
    add       r1, r4
    PAVGB     m2, m1
    PAVGB     m1, m0
    mova      [r0+r2], m2
    mova      [r0+r4], m1
    add       r0, r4
    sub       r3d, 4
    jne       .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_Y2
INIT_MMX 3dnow
PUT_PIXELS8_Y2
; actually, put_pixels16_y2_sse2
INIT_XMM sse2
PUT_PIXELS8_Y2


; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_NO_RND_PIXELS8_Y2 0
cglobal put_no_rnd_pixels8_y2, 4,5
    mova      m6, [pb_1]
    lea       r4, [r2+r2]
    mova      m0, [r1]
    sub       r0, r2
.loop:
    mova      m1, [r1+r2]
    mova      m2, [r1+r4]
    add       r1, r4
    psubusb   m1, m6
    PAVGB     m0, m1
    PAVGB     m1, m2
    mova      [r0+r2], m0
    mova      [r0+r4], m1
    mova      m1, [r1+r2]
    mova      m0, [r1+r4]
    add       r0, r4
    add       r1, r4
    psubusb   m1, m6
    PAVGB     m2, m1
    PAVGB     m1, m0
    mova      [r0+r2], m2
    mova      [r0+r4], m1
    add       r0, r4
    sub       r3d, 4
    jne       .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2


; void ff_avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8 0
cglobal avg_pixels8, 4,5
    lea       r4, [r2*2]
.loop:
    mova      m0, [r0]
    mova      m1, [r0+r2]
    PAVGB     m0, [r1]
    PAVGB     m1, [r1+r2]
    mova      [r0], m0
    mova      [r0+r2], m1
    add       r1, r4
    add       r0, r4
    mova      m0, [r0]
    mova      m1, [r0+r2]
    PAVGB     m0, [r1]
    PAVGB     m1, [r1+r2]
    add       r1, r4
    mova      [r0], m0
    mova      [r0+r2], m1
    add       r0, r4
    sub       r3d, 4
    jne       .loop
    REP_RET
%endmacro

INIT_MMX 3dnow
AVG_PIXELS8

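; Plain MMX has no byte-average instruction, so for the mmx version of
; AVG_PIXELS8_X2 below the PAVGB macro (from x86util) emulates it with
; logic ops and a shift; a standard formulation of the rounded average is
;     (a + b + 1) >> 1 = (a | b) - ((a ^ b) >> 1)
; Because MMX only shifts the full 64-bit register, the low bit of every
; byte must be cleared before the shift so bits cannot leak into the byte
; below; the pcmpeqd + paddb pair builds that 0xFE-in-every-byte mask (in
; m5) without loading a constant.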
; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal avg_pixels16_x2, 4,5,4
%else
cglobal avg_pixels8_x2, 4,5
%endif
    lea       r4, [r2*2]
%if notcpuflag(mmxext)
    pcmpeqd   m5, m5
    paddb     m5, m5            ; m5 = 0xFE in every byte
%endif
.loop:
    movu      m0, [r1]
    movu      m2, [r1+r2]
%if cpuflag(sse2)
    movu      m1, [r1+1]
    movu      m3, [r1+r2+1]
    pavgb     m0, m1
    pavgb     m2, m3
%else
    PAVGB     m0, [r1+1], m3, m5
    PAVGB     m2, [r1+r2+1], m4, m5
%endif
    PAVGB     m0, [r0], m3, m5
    PAVGB     m2, [r0+r2], m4, m5
    add       r1, r4
    mova      [r0], m0
    mova      [r0+r2], m2
    movu      m0, [r1]
    movu      m2, [r1+r2]
%if cpuflag(sse2)
    movu      m1, [r1+1]
    movu      m3, [r1+r2+1]
    pavgb     m0, m1
    pavgb     m2, m3
%else
    PAVGB     m0, [r1+1], m3, m5
    PAVGB     m2, [r1+r2+1], m4, m5
%endif
    add       r0, r4
    add       r1, r4
    PAVGB     m0, [r0], m3, m5
    PAVGB     m2, [r0+r2], m4, m5
    mova      [r0], m0
    mova      [r0+r2], m2
    add       r0, r4
    sub       r3d, 4
    jne       .loop
    REP_RET
%endmacro

INIT_MMX mmx
AVG_PIXELS8_X2
INIT_MMX mmxext
AVG_PIXELS8_X2
INIT_MMX 3dnow
AVG_PIXELS8_X2
; actually avg_pixels16_x2
INIT_XMM sse2
AVG_PIXELS8_X2


; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal avg_pixels16_y2, 4,5,3
%else
cglobal avg_pixels8_y2, 4,5
%endif
    lea       r4, [r2*2]
    movu      m0, [r1]
    sub       r0, r2
.loop:
    movu      m1, [r1+r2]
    movu      m2, [r1+r4]
    add       r1, r4
    PAVGB     m0, m1
    PAVGB     m1, m2
    PAVGB     m0, [r0+r2]
    PAVGB     m1, [r0+r4]
    mova      [r0+r2], m0
    mova      [r0+r4], m1
    movu      m1, [r1+r2]
    movu      m0, [r1+r4]
    PAVGB     m2, m1
    PAVGB     m1, m0
    add       r0, r4
    add       r1, r4
    PAVGB     m2, [r0+r2]
    PAVGB     m1, [r0+r4]
    mova      [r0+r2], m2
    mova      [r0+r4], m1
    add       r0, r4
    sub       r3d, 4
    jne       .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_Y2
INIT_MMX 3dnow
AVG_PIXELS8_Y2
; actually avg_pixels16_y2
INIT_XMM sse2
AVG_PIXELS8_Y2


; void ff_avg_approx_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Note this is not correctly rounded, and is therefore only used for
; not-bitexact output
%macro AVG_APPROX_PIXELS8_XY2 0
cglobal avg_approx_pixels8_xy2, 4,5
    mova      m6, [pb_1]
    lea       r4, [r2*2]
    mova      m0, [r1]
    PAVGB     m0, [r1+1]
.loop:
    mova      m2, [r1+r4]
    mova      m1, [r1+r2]
    psubusb   m2, m6
    PAVGB     m1, [r1+r2+1]
    PAVGB     m2, [r1+r4+1]
    add       r1, r4
    PAVGB     m0, m1
    PAVGB     m1, m2
    PAVGB     m0, [r0]
    PAVGB     m1, [r0+r2]
    mova      [r0], m0
    mova      [r0+r2], m1
    mova      m1, [r1+r2]
    mova      m0, [r1+r4]
    PAVGB     m1, [r1+r2+1]
    PAVGB     m0, [r1+r4+1]
    add       r0, r4
    add       r1, r4
    PAVGB     m2, m1
    PAVGB     m1, m0
    PAVGB     m2, [r0]
    PAVGB     m1, [r0+r2]
    mova      [r0], m2
    mova      [r0+r2], m1
    add       r0, r4
    sub       r3d, 4
    jne       .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_APPROX_PIXELS8_XY2
INIT_MMX 3dnow
AVG_APPROX_PIXELS8_XY2

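; The *_pixels*_xy2 functions below interpolate in both directions at once:
;     dst[x] = (A + B + C + D + 2) >> 2
; where A = src[x], B = src[x+1], C = src[x+line_size] and
; D = src[x+line_size+1]. A rough C equivalent of the 8-pixel put case
; (reference sketch only, not built):
;     for (int i = 0; i < h; i++, pixels += line_size, block += line_size)
;         for (int x = 0; x < 8; x++)
;             block[x] = (pixels[x] + pixels[x + 1] +
;                         pixels[x + line_size] +
;                         pixels[x + line_size + 1] + 2) >> 2;
; The macro widens bytes to words (punpck{l,h}bw against the zero in m7)
; and keeps each row's horizontal pair sum live across loop iterations, so
; every source row is loaded and widened only once rather than twice.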
; void ff_{put,avg}_pixels{8,16}_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro SET_PIXELS_XY2 1
%if cpuflag(sse2)
cglobal %1_pixels16_xy2, 4,5,8
%else
cglobal %1_pixels8_xy2, 4,5
%endif
    pxor      m7, m7
    mova      m6, [pw_2]
    movu      m0, [r1]
    movu      m4, [r1+1]
    mova      m1, m0
    mova      m5, m4
    punpcklbw m0, m7
    punpcklbw m4, m7
    punpckhbw m1, m7
    punpckhbw m5, m7
    paddusw   m4, m0
    paddusw   m5, m1
    xor       r4, r4
    add       r1, r2
.loop:
    movu      m0, [r1+r4]
    movu      m2, [r1+r4+1]
    mova      m1, m0
    mova      m3, m2
    punpcklbw m0, m7
    punpcklbw m2, m7
    punpckhbw m1, m7
    punpckhbw m3, m7
    paddusw   m0, m2
    paddusw   m1, m3
    paddusw   m4, m6
    paddusw   m5, m6
    paddusw   m4, m0
    paddusw   m5, m1
    psrlw     m4, 2
    psrlw     m5, 2
%ifidn %1, avg
    mova      m3, [r0+r4]
    packuswb  m4, m5
    PAVGB     m4, m3
%else
    packuswb  m4, m5
%endif
    mova      [r0+r4], m4
    add       r4, r2

    movu      m2, [r1+r4]
    movu      m4, [r1+r4+1]
    mova      m3, m2
    mova      m5, m4
    punpcklbw m2, m7
    punpcklbw m4, m7
    punpckhbw m3, m7
    punpckhbw m5, m7
    paddusw   m4, m2
    paddusw   m5, m3
    paddusw   m0, m6
    paddusw   m1, m6
    paddusw   m0, m4
    paddusw   m1, m5
    psrlw     m0, 2
    psrlw     m1, 2
%ifidn %1, avg
    mova      m3, [r0+r4]
    packuswb  m0, m1
    PAVGB     m0, m3
%else
    packuswb  m0, m1
%endif
    mova      [r0+r4], m0
    add       r4, r2
    sub       r3d, 2
    jnz       .loop
    REP_RET
%endmacro

INIT_MMX mmxext
SET_PIXELS_XY2 avg
INIT_MMX 3dnow
SET_PIXELS_XY2 avg
INIT_XMM sse2
SET_PIXELS_XY2 put
SET_PIXELS_XY2 avg

%macro SSSE3_PIXELS_XY2 1-2
%if %0 == 2 ; XMM (16-pixel) variant
cglobal %1_pixels16_xy2, 4,5,%2
    mova      m4, [pb_interleave16]
%else
cglobal %1_pixels8_xy2, 4,5
    mova      m4, [pb_interleave8]
%endif
    mova      m5, [pb_1]
    movu      m0, [r1]
    movu      m1, [r1+1]
    pmaddubsw m0, m5
    pmaddubsw m1, m5
    xor       r4, r4
    add       r1, r2
.loop:
    movu      m2, [r1+r4]
    movu      m3, [r1+r4+1]
    pmaddubsw m2, m5
    pmaddubsw m3, m5
    paddusw   m0, m2
    paddusw   m1, m3
    pmulhrsw  m0, [pw_8192]
    pmulhrsw  m1, [pw_8192]
%ifidn %1, avg
    mova      m6, [r0+r4]
    packuswb  m0, m1
    pshufb    m0, m4
    pavgb     m0, m6
%else
    packuswb  m0, m1
    pshufb    m0, m4
%endif
    mova      [r0+r4], m0
    add       r4, r2

    movu      m0, [r1+r4]
    movu      m1, [r1+r4+1]
    pmaddubsw m0, m5
    pmaddubsw m1, m5
    paddusw   m2, m0
    paddusw   m3, m1
    pmulhrsw  m2, [pw_8192]
    pmulhrsw  m3, [pw_8192]
%ifidn %1, avg
    mova      m6, [r0+r4]
    packuswb  m2, m3
    pshufb    m2, m4
    pavgb     m2, m6
%else
    packuswb  m2, m3
    pshufb    m2, m4
%endif
    mova      [r0+r4], m2
    add       r4, r2
    sub       r3d, 2
    jnz       .loop
    REP_RET
%endmacro

INIT_MMX ssse3
SSSE3_PIXELS_XY2 put
SSSE3_PIXELS_XY2 avg
INIT_XMM ssse3
SSSE3_PIXELS_XY2 put, 6
SSSE3_PIXELS_XY2 avg, 7
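; In the SSSE3 versions above, pmaddubsw against pb_1 sums horizontally
; adjacent byte pairs straight into words, replacing SET_PIXELS_XY2's
; unpack-and-add sequence, and pmulhrsw against pw_8192 performs the
; rounded divide by four, since ((x * 8192 >> 14) + 1) >> 1 = (x + 2) >> 2.
; packuswb leaves the even- and odd-indexed output pixels in separate
; halves of the register, so pshufb with pb_interleave{8,16} restores
; pixel order before the store.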