;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; ----------------------------------------------------------------------------
; SSE2 intra-prediction kernels, written against the x86inc.asm macro layer
; (Intel operand order; calling convention and register naming are abstracted
; by cglobal / DEFINE_ARGS).
;
; Conventions used throughout this file:
;   * Arguments arrive in the order named on each cglobal line:
;       dst, stride, above [, left]
;     Every kernel writes an NxN block of bytes at dst, one row every
;     `stride` bytes, and returns nothing.
;   * "above" is the row of pixels over the block, "left" the column to its
;     left; above[-1] is referred to as "tl" (top-left) in the comments.
;   * Kernels that address constants through GLOBAL() bracket their body with
;     GET_GOT / RESTORE_GOT.
;   * The 16x16 and 32x32 kernels use mova (x86inc's aligned move) on above,
;     left and dst, so those pointers and the stride are presumably required
;     to be 16-byte aligned -- confirm against the callers.
;   * Loops are terminated with REP_RET because the return directly follows a
;     conditional branch.
; ----------------------------------------------------------------------------

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pb_1:   times 16 db 1
; Rounding terms for the two-edge DC average (N above + N left pixels).
pw_4:   times 8 dw 4
pw_8:   times 8 dw 8
pw_16:  times 8 dw 16
pw_32:  times 8 dw 32
dc_128: times 16 db 128
; Rounding terms for the single-edge DC average (N pixels): pw2_N holds N/2.
pw2_4:  times 8 dw 2
pw2_8:  times 8 dw 4
pw2_16: times 8 dw 8
pw2_32: times 8 dw 16

SECTION .text

; ------------------------------------------
; input: x, y, z, result
;
; trick from pascal
; (x+2y+z+2)>>2 can be calculated as:
; result = avg(x,z)
; result -= xor(x,z) & 1
; result = avg(result,y)
;
; Operates on every byte lane independently.
; Clobbers: %3 (z) is overwritten with (x ^ z) & 1; %1 and %2 are preserved.
; ------------------------------------------
%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
  pavgb               %4, %1, %3
  pxor                %3, %1
  pand                %3, [GLOBAL(pb_1)]
  psubb               %4, %3
  pavgb               %4, %2
%endmacro

; ----------------------------------------------------------------------------
; d45_predictor_4x4(dst, stride, above)
; Builds f[i] = (above[i] + 2*above[i+1] + above[i+2] + 2) >> 2 and stores
; f[r..r+3] as row r.  Reads above[0..7].  The last pixel of the last row
; would need above[8], so it is overwritten with above[7] instead.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq                m0, [aboveq]
  DEFINE_ARGS dst, stride, temp
  psrldq              m1, m0, 1
  psrldq              m2, m0, 2
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3

  ; store 4 lines
  movd      [dstq        ], m3
  psrlq               m3, 8
  movd      [dstq+strideq], m3
  lea               dstq, [dstq+strideq*2]
  psrlq               m3, 8
  movd      [dstq        ], m3
  psrlq               m3, 8
  movd      [dstq+strideq], m3
  psrlq               m0, 56                ; low byte = above[7]
  movd             tempq, m0
  mov    [dstq+strideq+3], tempb            ; patch bottom-right pixel

  RESTORE_GOT
  RET

; ----------------------------------------------------------------------------
; d45_predictor_8x8(dst, stride, above)
; Loads 16 bytes of above, applies the (x+2y+z+2)>>2 filter across
; neighbouring bytes, then appends eight copies of above[7].  Each row is the
; next 8-byte window of that 16-byte vector, advancing one byte per row.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
  GET_GOT     goffsetq

  movu                m1, [aboveq]
  pslldq              m0, m1, 1
  psrldq              m2, m1, 1
  DEFINE_ARGS dst, stride, stride3
  lea           stride3q, [strideq*3]
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
  punpckhbw           m0, m0              ; 7 7
  punpcklwd           m0, m0              ; 7 7 7 7
  punpckldq           m0, m0              ; 7 7 7 7 7 7 7 7
  punpcklqdq          m3, m0              ; -1 0 1 2 3 4 5 6 7 7 7 7 7 7 7 7

  ; store 4 lines
  psrldq              m3, 1
  movq    [dstq          ], m3
  psrldq              m3, 1
  movq    [dstq+strideq  ], m3
  psrldq              m3, 1
  movq    [dstq+strideq*2], m3
  psrldq              m3, 1
  movq    [dstq+stride3q ], m3
  lea               dstq, [dstq+strideq*4]

  ; store next 4 lines
  psrldq              m3, 1
  movq    [dstq          ], m3
  psrldq              m3, 1
  movq    [dstq+strideq  ], m3
  psrldq              m3, 1
  movq    [dstq+strideq*2], m3
  psrldq              m3, 1
  movq    [dstq+stride3q ], m3

  RESTORE_GOT
  RET

; ----------------------------------------------------------------------------
; d207_predictor_4x4(dst, stride, unused, left)
; Uses only the four left pixels a,b,c,d.  Interleaves the 2-tap averages
; (ab, bc, cd, d) with the 3-tap filtered values (a2bc, b2cd, c3d, d); each
; row starts two bytes further into that sequence and the last row is d d d d.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal d207_predictor_4x4, 4, 4, 5, dst, stride, unused, left, goffset
  GET_GOT     goffsetq

  movd                m0, [leftq]                ; abcd [byte]
  punpcklbw           m4, m0, m0                 ; aabb ccdd
  punpcklwd           m4, m4                     ; aaaa bbbb cccc dddd
  psrldq              m4, 12                     ; dddd
  punpckldq           m0, m4                     ; abcd dddd
  psrldq              m1, m0, 1                  ; bcdd
  psrldq              m2, m0, 2                  ; cddd

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3   ; a2bc b2cd c3d d
  pavgb               m1, m0                     ; ab, bc, cd, d [byte]

  punpcklbw           m1, m3             ; ab, a2bc, bc, b2cd, cd, c3d, d, d
  movd    [dstq        ], m1
  psrlq               m1, 16             ; bc, b2cd, cd, c3d, d, d
  movd    [dstq+strideq], m1

  lea               dstq, [dstq+strideq*2]
  psrlq               m1, 16             ; cd, c3d, d, d
  movd    [dstq        ], m1
  movd    [dstq+strideq], m4             ; d, d, d, d
  RESTORE_GOT
  RET

; ----------------------------------------------------------------------------
; dc_predictor_4x4(dst, stride, above, left)
; Fills the block with (sum(above[0..3]) + sum(left[0..3]) + 4) >> 3.
; psadbw against zero is used as a horizontal byte sum.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movd                  m2, [leftq]
  movd                  m0, [aboveq]
  pxor                  m1, m1
  punpckldq             m0, m2            ; above[0..3] | left[0..3]
  psadbw                m0, m1
  paddw                 m0, [GLOBAL(pw_4)]
  psraw                 m0, 3
  pshuflw               m0, m0, 0x0
  packuswb              m0, m0
  movd      [dstq        ], m0
  movd      [dstq+strideq], m0
  lea                 dstq, [dstq+strideq*2]
  movd      [dstq        ], m0
  movd      [dstq+strideq], m0

  RESTORE_GOT
  RET

; ----------------------------------------------------------------------------
; dc_left_predictor_4x4(dst, stride, above, left)
; Fills the block with (sum(left[0..3]) + 2) >> 2.  above is not read.
; Only dst and stride are auto-loaded; left is fetched with movifnidn.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
  movifnidn          leftq, leftmp
  GET_GOT     goffsetq

  pxor                  m1, m1
  movd                  m0, [leftq]
  psadbw                m0, m1
  paddw                 m0, [GLOBAL(pw2_4)]
  psraw                 m0, 2
  pshuflw               m0, m0, 0x0
  packuswb              m0, m0
  movd      [dstq        ], m0
  movd      [dstq+strideq], m0
  lea                 dstq, [dstq+strideq*2]
  movd      [dstq        ], m0
  movd      [dstq+strideq], m0

  RESTORE_GOT
  RET

; ----------------------------------------------------------------------------
; dc_top_predictor_4x4(dst, stride, above, left)
; Fills the block with (sum(above[0..3]) + 2) >> 2.  left is not read.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  movd                  m0, [aboveq]
  psadbw                m0, m1
  paddw                 m0, [GLOBAL(pw2_4)]
  psraw                 m0, 2
  pshuflw               m0, m0, 0x0
  packuswb              m0, m0
  movd      [dstq        ], m0
  movd      [dstq+strideq], m0
  lea                 dstq, [dstq+strideq*2]
  movd      [dstq        ], m0
  movd      [dstq+strideq], m0

  RESTORE_GOT
  RET

; ----------------------------------------------------------------------------
; dc_predictor_8x8(dst, stride, above, left)
; Fills the block with (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4.
; The byte value is duplicated into a word (punpcklbw) and that word is
; broadcast across the low 8 bytes (pshuflw).
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  movq                  m0, [aboveq]
  movq                  m2, [leftq]
  DEFINE_ARGS dst, stride, stride3
  lea             stride3q, [strideq*3]
  psadbw                m0, m1
  psadbw                m2, m1
  paddw                 m0, m2
  paddw                 m0, [GLOBAL(pw_8)]
  psraw                 m0, 4
  punpcklbw             m0, m0
  pshuflw               m0, m0, 0x0
  movq    [dstq          ], m0
  movq    [dstq+strideq  ], m0
  movq    [dstq+strideq*2], m0
  movq    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  movq    [dstq          ], m0
  movq    [dstq+strideq  ], m0
  movq    [dstq+strideq*2], m0
  movq    [dstq+stride3q ], m0

  RESTORE_GOT
  RET

; ----------------------------------------------------------------------------
; dc_top_predictor_8x8(dst, stride, above, left)
; Fills the block with (sum(above[0..7]) + 4) >> 3.  left is not read.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  movq                  m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3
  lea             stride3q, [strideq*3]
  psadbw                m0, m1
  paddw                 m0, [GLOBAL(pw2_8)]
  psraw                 m0, 3
  punpcklbw             m0, m0
  pshuflw               m0, m0, 0x0
  movq    [dstq          ], m0
  movq    [dstq+strideq  ], m0
  movq    [dstq+strideq*2], m0
  movq    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  movq    [dstq          ], m0
  movq    [dstq+strideq  ], m0
  movq    [dstq+strideq*2], m0
  movq    [dstq+stride3q ], m0

  RESTORE_GOT
  RET

; ----------------------------------------------------------------------------
; dc_left_predictor_8x8(dst, stride, above, left)
; Fills the block with (sum(left[0..7]) + 4) >> 3.  above is not read.
; left is fetched with movifnidn before DEFINE_ARGS renames the registers.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
  movifnidn          leftq, leftmp
  GET_GOT     goffsetq

  pxor                  m1, m1
  movq                  m0, [leftq]
  DEFINE_ARGS dst, stride, stride3
  lea             stride3q, [strideq*3]
  psadbw                m0, m1
  paddw                 m0, [GLOBAL(pw2_8)]
  psraw                 m0, 3
  punpcklbw             m0, m0
  pshuflw               m0, m0, 0x0
  movq    [dstq          ], m0
  movq    [dstq+strideq  ], m0
  movq    [dstq+strideq*2], m0
  movq    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  movq    [dstq          ], m0
  movq    [dstq+strideq  ], m0
  movq    [dstq+strideq*2], m0
  movq    [dstq+stride3q ], m0

  RESTORE_GOT
  RET

; ----------------------------------------------------------------------------
; dc_128_predictor_4x4(dst, stride, above, left)
; Fills the block with the constant 128.  above and left are not read.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3
  lea             stride3q, [strideq*3]
  movd                  m0, [GLOBAL(dc_128)]
  movd    [dstq          ], m0
  movd    [dstq+strideq  ], m0
  movd    [dstq+strideq*2], m0
  movd    [dstq+stride3q ], m0
  RESTORE_GOT
  RET

; ----------------------------------------------------------------------------
; dc_128_predictor_8x8(dst, stride, above, left)
; Fills the block with the constant 128.  above and left are not read.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3
  lea             stride3q, [strideq*3]
  movq                  m0, [GLOBAL(dc_128)]
  movq    [dstq          ], m0
  movq    [dstq+strideq  ], m0
  movq    [dstq+strideq*2], m0
  movq    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  movq    [dstq          ], m0
  movq    [dstq+strideq  ], m0
  movq    [dstq+strideq*2], m0
  movq    [dstq+stride3q ], m0
  RESTORE_GOT
  RET

; ----------------------------------------------------------------------------
; dc_predictor_16x16(dst, stride, above, left)
; Fills the block with (sum(above[0..15]) + sum(left[0..15]) + 16) >> 5.
; psadbw yields one partial sum per 64-bit half; movhlps folds the high half
; onto the low one.  The loop writes 4 rows per iteration, 4 iterations.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  mova                  m0, [aboveq]
  mova                  m2, [leftq]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 4
  psadbw                m0, m1
  psadbw                m2, m1
  paddw                 m0, m2
  movhlps               m2, m0
  paddw                 m0, m2
  paddw                 m0, [GLOBAL(pw_16)]
  psraw                 m0, 5
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0
  packuswb              m0, m0
.loop:
  mova    [dstq          ], m0
  mova    [dstq+strideq  ], m0
  mova    [dstq+strideq*2], m0
  mova    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

; ----------------------------------------------------------------------------
; dc_top_predictor_16x16(dst, stride, above, left)
; Fills the block with (sum(above[0..15]) + 8) >> 4.  left is not read.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  mova                  m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 4
  psadbw                m0, m1
  movhlps               m2, m0            ; only the low half of m2 is used
  paddw                 m0, m2
  paddw                 m0, [GLOBAL(pw2_16)]
  psraw                 m0, 4
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0
  packuswb              m0, m0
.loop:
  mova    [dstq          ], m0
  mova    [dstq+strideq  ], m0
  mova    [dstq+strideq*2], m0
  mova    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

; ----------------------------------------------------------------------------
; dc_left_predictor_16x16(dst, stride, above, left)
; Fills the block with (sum(left[0..15]) + 8) >> 4.  above is not read.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  mova                  m0, [leftq]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 4
  psadbw                m0, m1
  movhlps               m2, m0            ; only the low half of m2 is used
  paddw                 m0, m2
  paddw                 m0, [GLOBAL(pw2_16)]
  psraw                 m0, 4
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0
  packuswb              m0, m0
.loop:
  mova    [dstq          ], m0
  mova    [dstq+strideq  ], m0
  mova    [dstq+strideq*2], m0
  mova    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

; ----------------------------------------------------------------------------
; dc_128_predictor_16x16(dst, stride, above, left)
; Fills the block with the constant 128.  above and left are not read.
; Ends with REP_RET like the other looped kernels in this file.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 4
  mova                  m0, [GLOBAL(dc_128)]
.loop:
  mova    [dstq          ], m0
  mova    [dstq+strideq  ], m0
  mova    [dstq+strideq*2], m0
  mova    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop
  RESTORE_GOT
  REP_RET


; ----------------------------------------------------------------------------
; dc_predictor_32x32(dst, stride, above, left)
; Fills the block with (sum(above[0..31]) + sum(left[0..31]) + 32) >> 6.
; The loop writes 4 rows (two 16-byte stores each) per iteration, 8 iterations.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  mova                  m0, [aboveq]
  mova                  m2, [aboveq+16]
  mova                  m3, [leftq]
  mova                  m4, [leftq+16]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 8
  psadbw                m0, m1
  psadbw                m2, m1
  psadbw                m3, m1
  psadbw                m4, m1
  paddw                 m0, m2
  paddw                 m0, m3
  paddw                 m0, m4
  movhlps               m2, m0
  paddw                 m0, m2
  paddw                 m0, [GLOBAL(pw_32)]
  psraw                 m0, 6
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0
  packuswb              m0, m0
.loop:
  mova [dstq             ], m0
  mova [dstq          +16], m0
  mova [dstq+strideq     ], m0
  mova [dstq+strideq  +16], m0
  mova [dstq+strideq*2   ], m0
  mova [dstq+strideq*2+16], m0
  mova [dstq+stride3q    ], m0
  mova [dstq+stride3q +16], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

; ----------------------------------------------------------------------------
; dc_top_predictor_32x32(dst, stride, above, left)
; Fills the block with (sum(above[0..31]) + 16) >> 5.  left is not read.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  mova                  m0, [aboveq]
  mova                  m2, [aboveq+16]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 8
  psadbw                m0, m1
  psadbw                m2, m1
  paddw                 m0, m2
  movhlps               m2, m0
  paddw                 m0, m2
  paddw                 m0, [GLOBAL(pw2_32)]
  psraw                 m0, 5
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0
  packuswb              m0, m0
.loop:
  mova [dstq             ], m0
  mova [dstq          +16], m0
  mova [dstq+strideq     ], m0
  mova [dstq+strideq  +16], m0
  mova [dstq+strideq*2   ], m0
  mova [dstq+strideq*2+16], m0
  mova [dstq+stride3q    ], m0
  mova [dstq+stride3q +16], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

; ----------------------------------------------------------------------------
; dc_left_predictor_32x32(dst, stride, above, left)
; Fills the block with (sum(left[0..31]) + 16) >> 5.  above is not read.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor                  m1, m1
  mova                  m0, [leftq]
  mova                  m2, [leftq+16]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 8
  psadbw                m0, m1
  psadbw                m2, m1
  paddw                 m0, m2
  movhlps               m2, m0
  paddw                 m0, m2
  paddw                 m0, [GLOBAL(pw2_32)]
  psraw                 m0, 5
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0
  packuswb              m0, m0
.loop:
  mova [dstq             ], m0
  mova [dstq          +16], m0
  mova [dstq+strideq     ], m0
  mova [dstq+strideq  +16], m0
  mova [dstq+strideq*2   ], m0
  mova [dstq+strideq*2+16], m0
  mova [dstq+stride3q    ], m0
  mova [dstq+stride3q +16], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

; ----------------------------------------------------------------------------
; dc_128_predictor_32x32(dst, stride, above, left)
; Fills the block with the constant 128.  above and left are not read.
; Ends with REP_RET like the other looped kernels in this file.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3, lines4
  lea             stride3q, [strideq*3]
  mov              lines4d, 8
  mova                  m0, [GLOBAL(dc_128)]
.loop:
  mova [dstq             ], m0
  mova [dstq          +16], m0
  mova [dstq+strideq     ], m0
  mova [dstq+strideq  +16], m0
  mova [dstq+strideq*2   ], m0
  mova [dstq+strideq*2+16], m0
  mova [dstq+stride3q    ], m0
  mova [dstq+stride3q +16], m0
  lea                 dstq, [dstq+strideq*4]
  dec              lines4d
  jnz .loop
  RESTORE_GOT
  REP_RET

; ----------------------------------------------------------------------------
; v_predictor_4x4(dst, stride, above)
; Copies above[0..3] into each of the 4 rows.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
  movd                  m0, [aboveq]
  movd      [dstq        ], m0
  movd      [dstq+strideq], m0
  lea                 dstq, [dstq+strideq*2]
  movd      [dstq        ], m0
  movd      [dstq+strideq], m0
  RET

; ----------------------------------------------------------------------------
; v_predictor_8x8(dst, stride, above)
; Copies above[0..7] into each of the 8 rows.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
  movq                  m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3
  lea             stride3q, [strideq*3]
  movq    [dstq          ], m0
  movq    [dstq+strideq  ], m0
  movq    [dstq+strideq*2], m0
  movq    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  movq    [dstq          ], m0
  movq    [dstq+strideq  ], m0
  movq    [dstq+strideq*2], m0
  movq    [dstq+stride3q ], m0
  RET

; ----------------------------------------------------------------------------
; v_predictor_16x16(dst, stride, above)
; Copies above[0..15] into each of the 16 rows (4 rows per loop iteration).
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
  mova                  m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, nlines4
  lea             stride3q, [strideq*3]
  mov              nlines4d, 4
.loop:
  mova    [dstq          ], m0
  mova    [dstq+strideq  ], m0
  mova    [dstq+strideq*2], m0
  mova    [dstq+stride3q ], m0
  lea                 dstq, [dstq+strideq*4]
  dec             nlines4d
  jnz .loop
  REP_RET

; ----------------------------------------------------------------------------
; v_predictor_32x32(dst, stride, above)
; Copies above[0..31] into each of the 32 rows (4 rows per loop iteration).
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
  mova                  m0, [aboveq]
  mova                  m1, [aboveq+16]
  DEFINE_ARGS dst, stride, stride3, nlines4
  lea             stride3q, [strideq*3]
  mov              nlines4d, 8
.loop:
  mova [dstq             ], m0
  mova [dstq          +16], m1
  mova [dstq+strideq     ], m0
  mova [dstq+strideq  +16], m1
  mova [dstq+strideq*2   ], m0
  mova [dstq+strideq*2+16], m1
  mova [dstq+stride3q    ], m0
  mova [dstq+stride3q +16], m1
  lea                 dstq, [dstq+strideq*4]
  dec             nlines4d
  jnz .loop
  REP_RET

; ----------------------------------------------------------------------------
; h_predictor_4x4(dst, stride, line, left)
; Row r is left[r] repeated 4 times.  The third argument is not read.
; Two punpcklbw passes expand each left byte to a dword of four copies.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
  movifnidn          leftq, leftmp
  movd                  m0, [leftq]
  punpcklbw             m0, m0
  punpcklbw             m0, m0            ; l1 to l4 each repeated 4 times
  pshufd                m1, m0, 0x1
  movd      [dstq        ], m0
  movd      [dstq+strideq], m1
  pshufd                m2, m0, 0x2
  lea                 dstq, [dstq+strideq*2]
  pshufd                m3, m0, 0x3
  movd      [dstq        ], m2
  movd      [dstq+strideq], m3
  RET

; ----------------------------------------------------------------------------
; h_predictor_8x8(dst, stride, line, left)
; Row r is left[r] repeated 8 times.  The third argument's register is reused
; as a loop counter running from -2 up to 0; each iteration writes 4 rows.
; The jnz tests the flags set by inc (lea does not modify flags).
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
  movifnidn          leftq, leftmp
  mov                lineq, -2
  DEFINE_ARGS dst, stride, line, left, stride3
  lea             stride3q, [strideq*3]
  movq                  m0, [leftq    ]
  punpcklbw             m0, m0              ; l1 l1 l2 l2 ... l8 l8
.loop:
  pshuflw               m1, m0, 0x0         ; l1 l1 l1 l1 l1 l1 l1 l1
  pshuflw               m2, m0, 0x55        ; l2 l2 l2 l2 l2 l2 l2 l2
  movq      [dstq        ], m1
  movq      [dstq+strideq], m2
  pshuflw               m1, m0, 0xaa
  pshuflw               m2, m0, 0xff
  movq    [dstq+strideq*2], m1
  movq    [dstq+stride3q ], m2
  pshufd                m0, m0, 0xe         ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
  inc                lineq
  lea                 dstq, [dstq+strideq*4]
  jnz .loop
  REP_RET

; ----------------------------------------------------------------------------
; h_predictor_16x16(dst, stride, line, left)
; Row r is left[r] repeated 16 times.  Each iteration consumes 4 left pixels
; and writes 4 rows; the counter runs from -4 up to 0.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
  movifnidn          leftq, leftmp
  mov                lineq, -4
  DEFINE_ARGS dst, stride, line, left, stride3
  lea             stride3q, [strideq*3]
.loop:
  movd                  m0, [leftq]
  punpcklbw             m0, m0
  punpcklbw             m0, m0              ; l1 to l4 each repeated 4 times
  pshufd                m1, m0, 0x0         ; l1 repeated 16 times
  pshufd                m2, m0, 0x55        ; l2 repeated 16 times
  mova    [dstq          ], m1
  mova    [dstq+strideq  ], m2
  pshufd                m1, m0, 0xaa
  pshufd                m2, m0, 0xff
  mova    [dstq+strideq*2], m1
  mova    [dstq+stride3q ], m2
  inc                lineq
  lea                leftq, [leftq+4       ]
  lea                 dstq, [dstq+strideq*4]
  jnz .loop
  REP_RET

; ----------------------------------------------------------------------------
; h_predictor_32x32(dst, stride, line, left)
; Row r is left[r] repeated 32 times.  Each iteration consumes 4 left pixels
; and writes 4 rows; the counter runs from -8 up to 0.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
  movifnidn              leftq, leftmp
  mov                    lineq, -8
  DEFINE_ARGS dst, stride, line, left, stride3
  lea                 stride3q, [strideq*3]
.loop:
  movd                      m0, [leftq]
  punpcklbw                 m0, m0
  punpcklbw                 m0, m0              ; l1 to l4 each repeated 4 times
  pshufd                    m1, m0, 0x0         ; l1 repeated 16 times
  pshufd                    m2, m0, 0x55        ; l2 repeated 16 times
  mova     [dstq             ], m1
  mova     [dstq+16          ], m1
  mova     [dstq+strideq     ], m2
  mova     [dstq+strideq+16  ], m2
  pshufd                    m1, m0, 0xaa
  pshufd                    m2, m0, 0xff
  mova     [dstq+strideq*2   ], m1
  mova     [dstq+strideq*2+16], m1
  mova     [dstq+stride3q    ], m2
  mova     [dstq+stride3q+16 ], m2
  inc                    lineq
  lea                    leftq, [leftq+4       ]
  lea                     dstq, [dstq+strideq*4]
  jnz .loop
  REP_RET

; ----------------------------------------------------------------------------
; tm_predictor_4x4(dst, stride, above, left)
; dst[r][c] = clamp_u8(left[r] + above[c] - above[-1]).
; Arithmetic is done in 16-bit lanes; packuswb provides the 0..255 clamp.
; NOTE(review): the movq reads 8 bytes starting at above[-1], i.e. up to
; above[6], although only above[-1..3] are used -- confirm the caller's buffer
; covers that range.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left
  pxor                  m1, m1
  movq                  m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x
  punpcklbw             m0, m1
  pshuflw               m2, m0, 0x0   ; [63:0] tl tl tl tl [word]
  psrldq                m0, 2
  psubw                 m0, m2        ; [63:0] t1-tl t2-tl t3-tl t4-tl [word]
  movd                  m2, [leftq]
  punpcklbw             m2, m1
  pshuflw               m4, m2, 0x0   ; [63:0] l1 l1 l1 l1 [word]
  pshuflw               m3, m2, 0x55  ; [63:0] l2 l2 l2 l2 [word]
  paddw                 m4, m0
  paddw                 m3, m0
  packuswb              m4, m4
  packuswb              m3, m3
  movd      [dstq        ], m4
  movd      [dstq+strideq], m3
  lea                 dstq, [dstq+strideq*2]
  pshuflw               m4, m2, 0xaa
  pshuflw               m3, m2, 0xff
  paddw                 m4, m0
  paddw                 m3, m0
  packuswb              m4, m4
  packuswb              m3, m3
  movd      [dstq        ], m4
  movd      [dstq+strideq], m3
  RET

; ----------------------------------------------------------------------------
; tm_predictor_8x8(dst, stride, above, left)
; dst[r][c] = clamp_u8(left[r] + above[c] - above[-1]).
; Each iteration produces two rows (low and high halves of one packed
; register) and shifts two left words out of m2; the counter runs -4 .. 0.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left
  pxor                  m1, m1
  movd                  m2, [aboveq-1]
  movq                  m0, [aboveq]
  punpcklbw             m2, m1
  punpcklbw             m0, m1        ; t1 t2 t3 t4 t5 t6 t7 t8 [word]
  pshuflw               m2, m2, 0x0   ; [63:0] tl tl tl tl [word]
  DEFINE_ARGS dst, stride, line, left
  mov                lineq, -4
  punpcklqdq            m2, m2        ; tl tl tl tl tl tl tl tl [word]
  psubw                 m0, m2        ; t1-tl t2-tl ... t8-tl [word]
  movq                  m2, [leftq]
  punpcklbw             m2, m1        ; l1 l2 l3 l4 l5 l6 l7 l8 [word]
.loop:
  pshuflw               m4, m2, 0x0   ; [63:0] l1 l1 l1 l1 [word]
  pshuflw               m3, m2, 0x55  ; [63:0] l2 l2 l2 l2 [word]
  punpcklqdq            m4, m4        ; l1 l1 l1 l1 l1 l1 l1 l1 [word]
  punpcklqdq            m3, m3        ; l2 l2 l2 l2 l2 l2 l2 l2 [word]
  paddw                 m4, m0
  paddw                 m3, m0
  packuswb              m4, m3
  movq      [dstq        ], m4
  movhps    [dstq+strideq], m4
  lea                 dstq, [dstq+strideq*2]
  psrldq                m2, 4
  inc                lineq
  jnz .loop
  REP_RET

; ----------------------------------------------------------------------------
; tm_predictor_16x16(dst, stride, above, left)
; dst[r][c] = clamp_u8(left[r] + above[c] - above[-1]).
; tl is taken from the top byte of the aligned 16-byte load at above-16.
; Each iteration writes row r (from the low half of left, m3) and row r+8
; (from the high half of left, m5); the counter runs -8 .. 0.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left
  pxor                  m1, m1
  mova                  m2, [aboveq-16];
  mova                  m0, [aboveq]   ; t1 t2 ... t16 [byte]
  punpckhbw             m2, m1         ; [127:112] tl [word]
  punpckhbw             m4, m0, m1
  punpcklbw             m0, m1         ; m0:m4 t1 t2 ... t16 [word]
  DEFINE_ARGS dst, stride, line, left, stride8
  mov                lineq, -8
  pshufhw               m2, m2, 0xff
  mova                  m3, [leftq]    ; l1 l2 ... l16 [byte]
  punpckhqdq            m2, m2         ; tl repeated 8 times [word]
  psubw                 m0, m2
  psubw                 m4, m2         ; m0:m4 t1-tl t2-tl ... t16-tl [word]
  punpckhbw             m5, m3, m1
  punpcklbw             m3, m1         ; m3:m5 l1 l2 ... l16 [word]
  lea             stride8q, [strideq*8]
.loop:
  pshuflw               m6, m3, 0x0
  pshuflw               m7, m5, 0x0
  punpcklqdq            m6, m6         ; l1 repeated 8 times [word]
  punpcklqdq            m7, m7         ; l9 repeated 8 times [word]
  paddw                 m1, m6, m0
  paddw                 m6, m4         ; m1:m6 ti-tl+l1 [i=1,16] [word]
  psrldq                m5, 2
  packuswb              m1, m6
  mova     [dstq         ], m1
  paddw                 m1, m7, m0
  paddw                 m7, m4         ; m1:m7 ti-tl+l9 [i=1,16] [word]
  psrldq                m3, 2
  packuswb              m1, m7
  mova     [dstq+stride8q], m1
  inc                lineq
  lea                 dstq, [dstq+strideq]
  jnz .loop
  REP_RET

; ----------------------------------------------------------------------------
; tm_predictor_32x32(dst, stride, above, left)
; dst[r][c] = clamp_u8(left[r] + above[c] - above[-1]).
; m0:m3 and m4:m5 hold above[0..31] - tl as words.  left is advanced by 32 and
; indexed with the negative counter (-16 .. 0, two rows per iteration).
; NOTE(review): each iteration loads 4 bytes at left + 2*row_pair but uses
; only 2, so the last iteration reads left[30..33] -- two bytes past the 32
; left pixels.  Confirm the caller's buffer is padded accordingly.
; ----------------------------------------------------------------------------
INIT_XMM sse2
cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left
  pxor                  m1, m1
  movd                  m2, [aboveq-1]
  mova                  m0, [aboveq]
  mova                  m4, [aboveq+16]
  punpcklbw             m2, m1
  punpckhbw             m3, m0, m1
  punpckhbw             m5, m4, m1
  punpcklbw             m0, m1
  punpcklbw             m4, m1
  pshuflw               m2, m2, 0x0
  DEFINE_ARGS dst, stride, line, left
  mov                lineq, -16
  punpcklqdq            m2, m2         ; tl repeated 8 times [word]
  add                leftq, 32
  psubw                 m0, m2
  psubw                 m3, m2
  psubw                 m4, m2
  psubw                 m5, m2
.loop:
  movd                  m2, [leftq+lineq*2]
  pxor                  m1, m1         ; m1 is reused as scratch below
  punpcklbw             m2, m1
  pshuflw               m7, m2, 0x55
  pshuflw               m2, m2, 0x0
  punpcklqdq            m2, m2         ; first left pixel of the pair [word]
  punpcklqdq            m7, m7         ; second left pixel of the pair [word]
  paddw                 m6, m2, m3
  paddw                 m1, m2, m0
  packuswb              m1, m6
  mova   [dstq           ], m1
  paddw                 m6, m2, m5
  paddw                 m1, m2, m4
  packuswb              m1, m6
  mova   [dstq+16        ], m1
  paddw                 m6, m7, m3
  paddw                 m1, m7, m0
  packuswb              m1, m6
  mova   [dstq+strideq   ], m1
  paddw                 m6, m7, m5
  paddw                 m1, m7, m4
  packuswb              m1, m6
  mova   [dstq+strideq+16], m1
  lea                 dstq, [dstq+strideq*2]
  inc                lineq
  jnz .loop
  REP_RET