;******************************************************************************
;*
;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
;* Copyright (c) 2013 Daniel Kang
;*
;* SIMD-optimized halfpel functions
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
cextern pb_1
cextern pw_2
pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7

cextern pw_8192

SECTION .text

; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal put_pixels16_x2, 4,5,4
%else
cglobal put_pixels8_x2, 4,5
%endif
    lea          r4, [r2*2]
.loop:
    movu         m0, [r1+1]
    movu         m1, [r1+r2+1]
%if cpuflag(sse2)
    movu         m2, [r1]
    movu         m3, [r1+r2]
    pavgb        m0, m2
    pavgb        m1, m3
%else
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
%endif
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r1, r4
    add          r0, r4
    movu         m0, [r1+1]
    movu         m1, [r1+r2+1]
%if cpuflag(sse2)
    movu         m2, [r1]
    movu         m3, [r1+r2]
    pavgb        m0, m2
    pavgb        m1, m3
%else
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
%endif
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_X2


; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS_16 0
cglobal put_pixels16_x2, 4,5
    lea          r4, [r2*2]
.loop:
    mova         m0, [r1]
    mova         m1, [r1+r2]
    mova         m2, [r1+8]
    mova         m3, [r1+r2+8]
    PAVGB        m0, [r1+1]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+9]
    PAVGB        m3, [r1+r2+9]
    mova       [r0], m0
    mova    [r0+r2], m1
    mova     [r0+8], m2
    mova  [r0+r2+8], m3
    add          r1, r4
    add          r0, r4
    mova         m0, [r1]
    mova         m1, [r1+r2]
    mova         m2, [r1+8]
    mova         m3, [r1+r2+8]
    PAVGB        m0, [r1+1]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+9]
    PAVGB        m3, [r1+r2+9]
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    mova     [r0+8], m2
    mova  [r0+r2+8], m3
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS_16
; The 8_X2 macro can easily be used here
INIT_XMM sse2
PUT_PIXELS8_X2

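
; The no_rnd variants below rely on pavgb rounding upwards: pavgb computes
; (a + b + 1) >> 1, so first subtracting pb_1 from one operand with unsigned
; saturation (psubusb) yields the truncating average (a + b) >> 1 whenever
; that operand is nonzero; only bytes that are already 0 can still round up.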
; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
INIT_MMX mmxext
cglobal put_no_rnd_pixels8_x2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2*2]
.loop:
    mova         m0, [r1]
    mova         m2, [r1+r2]
    mova         m1, [r1+1]
    mova         m3, [r1+r2+1]
    add          r1, r4
    psubusb      m0, m6
    psubusb      m2, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    mova       [r0], m0
    mova    [r0+r2], m2
    mova         m0, [r1]
    mova         m1, [r1+1]
    mova         m2, [r1+r2]
    mova         m3, [r1+r2+1]
    add          r0, r4
    add          r1, r4
    psubusb      m0, m6
    psubusb      m2, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    mova       [r0], m0
    mova    [r0+r2], m2
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET


; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal put_pixels16_y2, 4,5,3
%else
cglobal put_pixels8_y2, 4,5
%endif
    lea          r4, [r2*2]
    movu         m0, [r1]
    sub          r0, r2
.loop:
    movu         m1, [r1+r2]
    movu         m2, [r1+r4]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    movu         m1, [r1+r2]
    movu         m0, [r1+r4]
    add          r0, r4
    add          r1, r4
    PAVGB        m2, m1
    PAVGB        m1, m0
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_Y2
; actually, put_pixels16_y2_sse2
INIT_XMM sse2
PUT_PIXELS8_Y2


; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
INIT_MMX mmxext
cglobal put_no_rnd_pixels8_y2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2+r2]
    mova         m0, [r1]
    sub          r0, r2
.loop:
    mova         m1, [r1+r2]
    mova         m2, [r1+r4]
    add          r1, r4
    psubusb      m1, m6
    PAVGB        m0, m1
    PAVGB        m1, m2
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    mova         m1, [r1+r2]
    mova         m0, [r1+r4]
    add          r0, r4
    add          r1, r4
    psubusb      m1, m6
    PAVGB        m2, m1
    PAVGB        m1, m0
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET


; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal avg_pixels16_x2, 4,5,4
%else
cglobal avg_pixels8_x2, 4,5
%endif
    lea          r4, [r2*2]
.loop:
    movu         m0, [r1]
    movu         m2, [r1+r2]
%if cpuflag(sse2)
    movu         m1, [r1+1]
    movu         m3, [r1+r2+1]
    pavgb        m0, m1
    pavgb        m2, m3
%else
    PAVGB        m0, [r1+1], m3, m5
    PAVGB        m2, [r1+r2+1], m4, m5
%endif
    PAVGB        m0, [r0], m3, m5
    PAVGB        m2, [r0+r2], m4, m5
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m2
    movu         m0, [r1]
    movu         m2, [r1+r2]
%if cpuflag(sse2)
    movu         m1, [r1+1]
    movu         m3, [r1+r2+1]
    pavgb        m0, m1
    pavgb        m2, m3
%else
    PAVGB        m0, [r1+1], m3, m5
    PAVGB        m2, [r1+r2+1], m4, m5
%endif
    add          r0, r4
    add          r1, r4
    PAVGB        m0, [r0], m3, m5
    PAVGB        m2, [r0+r2], m4, m5
    mova       [r0], m0
    mova    [r0+r2], m2
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_X2
; actually avg_pixels16_x2
INIT_XMM sse2
AVG_PIXELS8_X2


; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal avg_pixels16_y2, 4,5,3
%else
cglobal avg_pixels8_y2, 4,5
%endif
    lea          r4, [r2*2]
    movu         m0, [r1]
    sub          r0, r2
.loop:
    movu         m1, [r1+r2]
    movu         m2, [r1+r4]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    PAVGB        m0, [r0+r2]
    PAVGB        m1, [r0+r4]
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    movu         m1, [r1+r2]
    movu         m0, [r1+r4]
    PAVGB        m2, m1
    PAVGB        m1, m0
    add          r0, r4
    add          r1, r4
    PAVGB        m2, [r0+r2]
    PAVGB        m1, [r0+r4]
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_Y2
; actually avg_pixels16_y2
INIT_XMM sse2
AVG_PIXELS8_Y2


; void ff_avg_approx_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Note this is not correctly rounded, and is therefore used for
; not-bitexact output
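; The interpolation is done as a cascade of byte averages: two horizontal
; pavgb results are themselves averaged with pavgb. The psubusb by pb_1 on
; some of the input rows offsets part of the round-up bias this cascade
; accumulates, but the result can still differ from the bit-exact
; (A + B + C + D + 2) >> 2.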
INIT_MMX mmxext
cglobal avg_approx_pixels8_xy2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2*2]
    mova         m0, [r1]
    PAVGB        m0, [r1+1]
.loop:
    mova         m2, [r1+r4]
    mova         m1, [r1+r2]
    psubusb      m2, m6
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+r4+1]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    PAVGB        m0, [r0]
    PAVGB        m1, [r0+r2]
    mova       [r0], m0
    mova    [r0+r2], m1
    mova         m1, [r1+r2]
    mova         m0, [r1+r4]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m0, [r1+r4+1]
    add          r0, r4
    add          r1, r4
    PAVGB        m2, m1
    PAVGB        m1, m0
    PAVGB        m2, [r0]
    PAVGB        m1, [r0+r2]
    mova       [r0], m2
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET


; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; (also generates put_pixels16_xy2_sse2 and avg_pixels8_xy2_mmxext)
%macro SET_PIXELS_XY2 1
%if cpuflag(sse2)
cglobal %1_pixels16_xy2, 4,5,8
%else
cglobal %1_pixels8_xy2, 4,5
%endif
    pxor         m7, m7
    mova         m6, [pw_2]
    movu         m0, [r1]
    movu         m4, [r1+1]
    mova         m1, m0
    mova         m5, m4
    punpcklbw    m0, m7
    punpcklbw    m4, m7
    punpckhbw    m1, m7
    punpckhbw    m5, m7
    paddusw      m4, m0
    paddusw      m5, m1
    xor          r4, r4
    add          r1, r2
.loop:
    movu         m0, [r1+r4]
    movu         m2, [r1+r4+1]
    mova         m1, m0
    mova         m3, m2
    punpcklbw    m0, m7
    punpcklbw    m2, m7
    punpckhbw    m1, m7
    punpckhbw    m3, m7
    paddusw      m0, m2
    paddusw      m1, m3
    paddusw      m4, m6
    paddusw      m5, m6
    paddusw      m4, m0
    paddusw      m5, m1
    psrlw        m4, 2
    psrlw        m5, 2
%ifidn %1, avg
    mova         m3, [r0+r4]
    packuswb     m4, m5
    PAVGB        m4, m3
%else
    packuswb     m4, m5
%endif
    mova    [r0+r4], m4
    add          r4, r2

    movu         m2, [r1+r4]
    movu         m4, [r1+r4+1]
    mova         m3, m2
    mova         m5, m4
    punpcklbw    m2, m7
    punpcklbw    m4, m7
    punpckhbw    m3, m7
    punpckhbw    m5, m7
    paddusw      m4, m2
    paddusw      m5, m3
    paddusw      m0, m6
    paddusw      m1, m6
    paddusw      m0, m4
    paddusw      m1, m5
    psrlw        m0, 2
    psrlw        m1, 2
%ifidn %1, avg
    mova         m3, [r0+r4]
    packuswb     m0, m1
    PAVGB        m0, m3
%else
    packuswb     m0, m1
%endif
    mova    [r0+r4], m0
    add          r4, r2
    sub         r3d, 2
    jnz .loop
    REP_RET
%endmacro

INIT_MMX mmxext
SET_PIXELS_XY2 avg
INIT_XMM sse2
SET_PIXELS_XY2 put
SET_PIXELS_XY2 avg

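; SET_PIXELS_XY2 above computes the bit-exact rounded average by widening to
; words (punpck*bw against zero), summing the four neighbours plus pw_2 and
; shifting right by 2. The SSSE3 variants below reach the same result with
; fewer operations: pmaddubsw against pb_1 yields the horizontal pair sums,
; pmulhrsw against pw_8192 computes (x + 2) >> 2, and the pshufb with
; pb_interleave* restores pixel order after packuswb.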
%macro SSSE3_PIXELS_XY2 1-2
%if %0 == 2 ; xmm version: %2 is the number of xmm registers used
cglobal %1_pixels16_xy2, 4,5,%2
    mova         m4, [pb_interleave16]
%else
cglobal %1_pixels8_xy2, 4,5
    mova         m4, [pb_interleave8]
%endif
    mova         m5, [pb_1]
    movu         m0, [r1]
    movu         m1, [r1+1]
    pmaddubsw    m0, m5
    pmaddubsw    m1, m5
    xor          r4, r4
    add          r1, r2
.loop:
    movu         m2, [r1+r4]
    movu         m3, [r1+r4+1]
    pmaddubsw    m2, m5
    pmaddubsw    m3, m5
    paddusw      m0, m2
    paddusw      m1, m3
    pmulhrsw     m0, [pw_8192]
    pmulhrsw     m1, [pw_8192]
%ifidn %1, avg
    mova         m6, [r0+r4]
    packuswb     m0, m1
    pshufb       m0, m4
    pavgb        m0, m6
%else
    packuswb     m0, m1
    pshufb       m0, m4
%endif
    mova    [r0+r4], m0
    add          r4, r2

    movu         m0, [r1+r4]
    movu         m1, [r1+r4+1]
    pmaddubsw    m0, m5
    pmaddubsw    m1, m5
    paddusw      m2, m0
    paddusw      m3, m1
    pmulhrsw     m2, [pw_8192]
    pmulhrsw     m3, [pw_8192]
%ifidn %1, avg
    mova         m6, [r0+r4]
    packuswb     m2, m3
    pshufb       m2, m4
    pavgb        m2, m6
%else
    packuswb     m2, m3
    pshufb       m2, m4
%endif
    mova    [r0+r4], m2
    add          r4, r2
    sub         r3d, 2
    jnz .loop
    REP_RET
%endmacro

INIT_MMX ssse3
SSSE3_PIXELS_XY2 put
SSSE3_PIXELS_XY2 avg
INIT_XMM ssse3
SSSE3_PIXELS_XY2 put, 6
SSSE3_PIXELS_XY2 avg, 7