;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

;------------------------------------------------------------------------------
; SSE2 intra predictors (DC / DC-left / DC-top / DC-128 / V / H) for 4x4, 8x8,
; 16x16 and 32x32 blocks of 8-bit pixels.
;
; Conventions used throughout this file:
;   * Syntax is NASM/Yasm (Intel operand order) on top of the x86inc macros.
;   * Every function takes its arguments in the order named on its cglobal
;     line: dst, stride, above, left. Each function writes an NxN block of
;     bytes to dst, advancing by stride bytes per row.
;   * NOTE(review): the cglobal numbers are presumably x86inc's
;     "args loaded, GPRs used, XMM registers used" - confirm against x86inc.
;   * NOTE(review): mova is presumably x86inc's aligned move; if so, the
;     16x16 and 32x32 variants need 16-byte aligned above/left/dst rows -
;     confirm against x86inc and the callers.
;   * psadbw against a zeroed register yields, per 64-bit half, the sum of
;     the eight source bytes; this is how the DC variants add up pixels.
;------------------------------------------------------------------------------

SECTION_RODATA
pb_1:    times 16 db 1
; Rounding terms for the two-edge DC average (above + left): half the divisor.
pw_4:    times 8 dw 4
pw_8:    times 8 dw 8
pw_16:   times 8 dw 16
pw_32:   times 8 dw 32
; Fill value written by the dc_128 predictors.
dc_128:  times 16 db 128
; Rounding terms for the one-edge DC average (above only or left only).
pw2_4:   times 8 dw 2
pw2_8:   times 8 dw 4
pw2_16:  times 8 dw 8
pw2_32:  times 8 dw 16

SECTION .text

;------------------------------------------------------------------------------
; dc_predictor_4x4(dst, stride, above, left)
; Fills a 4x4 block with (sum(above[0..3]) + sum(left[0..3]) + 4) >> 3.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  movd        m2, [leftq]
  movd        m0, [aboveq]
  pxor        m1, m1
  punpckldq   m0, m2                    ; low qword = above[0..3], left[0..3]
  psadbw      m0, m1                    ; sum of the 8 edge bytes
  paddw       m0, [GLOBAL(pw_4)]        ; + rounding
  psraw       m0, 3                     ; / 8
  pshuflw     m0, m0, 0x0               ; broadcast DC to the 4 low words
  packuswb    m0, m0                    ; words -> bytes
  movd        [dstq        ], m0
  movd        [dstq+strideq], m0
  lea         dstq, [dstq+strideq*2]
  movd        [dstq        ], m0
  movd        [dstq+strideq], m0

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; dc_left_predictor_4x4(dst, stride, above, left)
; Fills a 4x4 block with (sum(left[0..3]) + 2) >> 2. above is not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
  ; Only two args are requested on the cglobal line, so fetch left explicitly.
  movifnidn   leftq, leftmp
  GET_GOT     goffsetq

  pxor        m1, m1
  movd        m0, [leftq]
  psadbw      m0, m1                    ; sum of the 4 left bytes
  paddw       m0, [GLOBAL(pw2_4)]       ; + rounding
  psraw       m0, 2                     ; / 4
  pshuflw     m0, m0, 0x0
  packuswb    m0, m0
  movd        [dstq        ], m0
  movd        [dstq+strideq], m0
  lea         dstq, [dstq+strideq*2]
  movd        [dstq        ], m0
  movd        [dstq+strideq], m0

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; dc_top_predictor_4x4(dst, stride, above, left)
; Fills a 4x4 block with (sum(above[0..3]) + 2) >> 2. left is not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  movd        m0, [aboveq]
  psadbw      m0, m1                    ; sum of the 4 above bytes
  paddw       m0, [GLOBAL(pw2_4)]       ; + rounding
  psraw       m0, 2                     ; / 4
  pshuflw     m0, m0, 0x0
  packuswb    m0, m0
  movd        [dstq        ], m0
  movd        [dstq+strideq], m0
  lea         dstq, [dstq+strideq*2]
  movd        [dstq        ], m0
  movd        [dstq+strideq], m0

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; dc_predictor_8x8(dst, stride, above, left)
; Fills an 8x8 block with (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  movq        m0, [aboveq]
  movq        m2, [leftq]
  ; above/left are consumed; reuse their registers under new names.
  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  psadbw      m0, m1
  psadbw      m2, m1
  paddw       m0, m2                    ; sum of the 16 edge bytes
  paddw       m0, [GLOBAL(pw_8)]        ; + rounding
  psraw       m0, 4                     ; / 16
  punpcklbw   m0, m0                    ; DC byte duplicated into a word
  pshuflw     m0, m0, 0x0               ; ... and into all 8 low bytes
  movq        [dstq          ], m0
  movq        [dstq+strideq  ], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q ], m0
  lea         dstq, [dstq+strideq*4]
  movq        [dstq          ], m0
  movq        [dstq+strideq  ], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q ], m0

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; dc_top_predictor_8x8(dst, stride, above, left)
; Fills an 8x8 block with (sum(above[0..7]) + 4) >> 3. left is not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  movq        m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  psadbw      m0, m1                    ; sum of the 8 above bytes
  paddw       m0, [GLOBAL(pw2_8)]       ; + rounding
  psraw       m0, 3                     ; / 8
  punpcklbw   m0, m0
  pshuflw     m0, m0, 0x0
  movq        [dstq          ], m0
  movq        [dstq+strideq  ], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q ], m0
  lea         dstq, [dstq+strideq*4]
  movq        [dstq          ], m0
  movq        [dstq+strideq  ], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q ], m0

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; dc_left_predictor_8x8(dst, stride, above, left)
; Fills an 8x8 block with (sum(left[0..7]) + 4) >> 3. above is not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
  ; Only two args are requested on the cglobal line, so fetch left explicitly.
  movifnidn   leftq, leftmp
  GET_GOT     goffsetq

  pxor        m1, m1
  movq        m0, [leftq]
  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  psadbw      m0, m1                    ; sum of the 8 left bytes
  paddw       m0, [GLOBAL(pw2_8)]       ; + rounding
  psraw       m0, 3                     ; / 8
  punpcklbw   m0, m0
  pshuflw     m0, m0, 0x0
  movq        [dstq          ], m0
  movq        [dstq+strideq  ], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q ], m0
  lea         dstq, [dstq+strideq*4]
  movq        [dstq          ], m0
  movq        [dstq+strideq  ], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q ], m0

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; dc_128_predictor_4x4(dst, stride, above, left)
; Fills a 4x4 block with the constant 128. above and left are not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  movd        m0, [GLOBAL(dc_128)]
  movd        [dstq          ], m0
  movd        [dstq+strideq  ], m0
  movd        [dstq+strideq*2], m0
  movd        [dstq+stride3q ], m0
  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; dc_128_predictor_8x8(dst, stride, above, left)
; Fills an 8x8 block with the constant 128. above and left are not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  movq        m0, [GLOBAL(dc_128)]
  movq        [dstq          ], m0
  movq        [dstq+strideq  ], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q ], m0
  lea         dstq, [dstq+strideq*4]
  movq        [dstq          ], m0
  movq        [dstq+strideq  ], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q ], m0
  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; dc_predictor_16x16(dst, stride, above, left)
; Fills a 16x16 block with (sum(above[0..15]) + sum(left[0..15]) + 16) >> 5.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  mova        m0, [aboveq]
  mova        m2, [leftq]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea         stride3q, [strideq*3]
  mov         lines4d, 4                ; 4 iterations x 4 rows
  psadbw      m0, m1
  psadbw      m2, m1
  paddw       m0, m2                    ; per-half partial sums
  movhlps     m2, m0                    ; bring the high-half sum down
  paddw       m0, m2                    ; sum of the 32 edge bytes
  paddw       m0, [GLOBAL(pw_16)]       ; + rounding
  psraw       m0, 5                     ; / 32
  pshuflw     m0, m0, 0x0               ; DC in the 4 low words
  punpcklqdq  m0, m0                    ; DC in all 8 words
  packuswb    m0, m0                    ; DC in all 16 bytes
.loop:
  mova        [dstq          ], m0
  mova        [dstq+strideq  ], m0
  mova        [dstq+strideq*2], m0
  mova        [dstq+stride3q ], m0
  lea         dstq, [dstq+strideq*4]
  dec         lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET


;------------------------------------------------------------------------------
; dc_top_predictor_16x16(dst, stride, above, left)
; Fills a 16x16 block with (sum(above[0..15]) + 8) >> 4. left is not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  mova        m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea         stride3q, [strideq*3]
  mov         lines4d, 4                ; 4 iterations x 4 rows
  psadbw      m0, m1
  movhlps     m2, m0                    ; only the low word of m2 is used below
  paddw       m0, m2                    ; sum of the 16 above bytes
  paddw       m0, [GLOBAL(pw2_16)]      ; + rounding
  psraw       m0, 4                     ; / 16
  pshuflw     m0, m0, 0x0
  punpcklqdq  m0, m0
  packuswb    m0, m0
.loop:
  mova        [dstq          ], m0
  mova        [dstq+strideq  ], m0
  mova        [dstq+strideq*2], m0
  mova        [dstq+stride3q ], m0
  lea         dstq, [dstq+strideq*4]
  dec         lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

;------------------------------------------------------------------------------
; dc_left_predictor_16x16(dst, stride, above, left)
; Fills a 16x16 block with (sum(left[0..15]) + 8) >> 4. above is not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  mova        m0, [leftq]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea         stride3q, [strideq*3]
  mov         lines4d, 4                ; 4 iterations x 4 rows
  psadbw      m0, m1
  movhlps     m2, m0                    ; only the low word of m2 is used below
  paddw       m0, m2                    ; sum of the 16 left bytes
  paddw       m0, [GLOBAL(pw2_16)]      ; + rounding
  psraw       m0, 4                     ; / 16
  pshuflw     m0, m0, 0x0
  punpcklqdq  m0, m0
  packuswb    m0, m0
.loop:
  mova        [dstq          ], m0
  mova        [dstq+strideq  ], m0
  mova        [dstq+strideq*2], m0
  mova        [dstq+stride3q ], m0
  lea         dstq, [dstq+strideq*4]
  dec         lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

;------------------------------------------------------------------------------
; dc_128_predictor_16x16(dst, stride, above, left)
; Fills a 16x16 block with the constant 128. above and left are not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3, lines4
  lea         stride3q, [strideq*3]
  mov         lines4d, 4                ; 4 iterations x 4 rows
  mova        m0, [GLOBAL(dc_128)]
.loop:
  mova        [dstq          ], m0
  mova        [dstq+strideq  ], m0
  mova        [dstq+strideq*2], m0
  mova        [dstq+stride3q ], m0
  lea         dstq, [dstq+strideq*4]
  dec         lines4d
  jnz .loop
  RESTORE_GOT
  ; REP_RET, matching the other loop-based predictors in this file whose
  ; return likewise comes after the jnz.
  REP_RET


;------------------------------------------------------------------------------
; dc_predictor_32x32(dst, stride, above, left)
; Fills a 32x32 block with (sum(above[0..31]) + sum(left[0..31]) + 32) >> 6.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  mova        m0, [aboveq]
  mova        m2, [aboveq+16]
  mova        m3, [leftq]
  mova        m4, [leftq+16]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea         stride3q, [strideq*3]
  mov         lines4d, 8                ; 8 iterations x 4 rows
  psadbw      m0, m1
  psadbw      m2, m1
  psadbw      m3, m1
  psadbw      m4, m1
  paddw       m0, m2
  paddw       m0, m3
  paddw       m0, m4                    ; per-half partial sums
  movhlps     m2, m0                    ; bring the high-half sum down
  paddw       m0, m2                    ; sum of the 64 edge bytes
  paddw       m0, [GLOBAL(pw_32)]       ; + rounding
  psraw       m0, 6                     ; / 64
  pshuflw     m0, m0, 0x0
  punpcklqdq  m0, m0
  packuswb    m0, m0
.loop:
  mova        [dstq             ], m0
  mova        [dstq          +16], m0
  mova        [dstq+strideq     ], m0
  mova        [dstq+strideq  +16], m0
  mova        [dstq+strideq*2   ], m0
  mova        [dstq+strideq*2+16], m0
  mova        [dstq+stride3q    ], m0
  mova        [dstq+stride3q +16], m0
  lea         dstq, [dstq+strideq*4]
  dec         lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

;------------------------------------------------------------------------------
; dc_top_predictor_32x32(dst, stride, above, left)
; Fills a 32x32 block with (sum(above[0..31]) + 16) >> 5. left is not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  mova        m0, [aboveq]
  mova        m2, [aboveq+16]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea         stride3q, [strideq*3]
  mov         lines4d, 8                ; 8 iterations x 4 rows
  psadbw      m0, m1
  psadbw      m2, m1
  paddw       m0, m2                    ; per-half partial sums
  movhlps     m2, m0                    ; bring the high-half sum down
  paddw       m0, m2                    ; sum of the 32 above bytes
  paddw       m0, [GLOBAL(pw2_32)]      ; + rounding
  psraw       m0, 5                     ; / 32
  pshuflw     m0, m0, 0x0
  punpcklqdq  m0, m0
  packuswb    m0, m0
.loop:
  mova        [dstq             ], m0
  mova        [dstq          +16], m0
  mova        [dstq+strideq     ], m0
  mova        [dstq+strideq  +16], m0
  mova        [dstq+strideq*2   ], m0
  mova        [dstq+strideq*2+16], m0
  mova        [dstq+stride3q    ], m0
  mova        [dstq+stride3q +16], m0
  lea         dstq, [dstq+strideq*4]
  dec         lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

;------------------------------------------------------------------------------
; dc_left_predictor_32x32(dst, stride, above, left)
; Fills a 32x32 block with (sum(left[0..31]) + 16) >> 5. above is not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  mova        m0, [leftq]
  mova        m2, [leftq+16]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea         stride3q, [strideq*3]
  mov         lines4d, 8                ; 8 iterations x 4 rows
  psadbw      m0, m1
  psadbw      m2, m1
  paddw       m0, m2                    ; per-half partial sums
  movhlps     m2, m0                    ; bring the high-half sum down
  paddw       m0, m2                    ; sum of the 32 left bytes
  paddw       m0, [GLOBAL(pw2_32)]      ; + rounding
  psraw       m0, 5                     ; / 32
  pshuflw     m0, m0, 0x0
  punpcklqdq  m0, m0
  packuswb    m0, m0
.loop:
  mova        [dstq             ], m0
  mova        [dstq          +16], m0
  mova        [dstq+strideq     ], m0
  mova        [dstq+strideq  +16], m0
  mova        [dstq+strideq*2   ], m0
  mova        [dstq+strideq*2+16], m0
  mova        [dstq+stride3q    ], m0
  mova        [dstq+stride3q +16], m0
  lea         dstq, [dstq+strideq*4]
  dec         lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

;------------------------------------------------------------------------------
; dc_128_predictor_32x32(dst, stride, above, left)
; Fills a 32x32 block with the constant 128. above and left are not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3, lines4
  lea         stride3q, [strideq*3]
  mov         lines4d, 8                ; 8 iterations x 4 rows
  mova        m0, [GLOBAL(dc_128)]
.loop:
  mova        [dstq             ], m0
  mova        [dstq          +16], m0
  mova        [dstq+strideq     ], m0
  mova        [dstq+strideq  +16], m0
  mova        [dstq+strideq*2   ], m0
  mova        [dstq+strideq*2+16], m0
  mova        [dstq+stride3q    ], m0
  mova        [dstq+stride3q +16], m0
  lea         dstq, [dstq+strideq*4]
  dec         lines4d
  jnz .loop
  RESTORE_GOT
  ; REP_RET, matching the other loop-based predictors in this file whose
  ; return likewise comes after the jnz.
  REP_RET

;------------------------------------------------------------------------------
; v_predictor_4x4(dst, stride, above)
; Copies the 4 bytes at above into each of the 4 rows of dst.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
  movd        m0, [aboveq]
  movd        [dstq        ], m0
  movd        [dstq+strideq], m0
  lea         dstq, [dstq+strideq*2]
  movd        [dstq        ], m0
  movd        [dstq+strideq], m0
  RET

;------------------------------------------------------------------------------
; v_predictor_8x8(dst, stride, above)
; Copies the 8 bytes at above into each of the 8 rows of dst.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
  movq        m0, [aboveq]
  ; above is consumed; its register becomes stride3.
  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  movq        [dstq          ], m0
  movq        [dstq+strideq  ], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q ], m0
  lea         dstq, [dstq+strideq*4]
  movq        [dstq          ], m0
  movq        [dstq+strideq  ], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q ], m0
  RET

;------------------------------------------------------------------------------
; v_predictor_16x16(dst, stride, above)
; Copies the 16 bytes at above into each of the 16 rows of dst.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
  mova        m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, nlines4
  lea         stride3q, [strideq*3]
  mov         nlines4d, 4               ; 4 iterations x 4 rows
.loop:
  mova        [dstq          ], m0
  mova        [dstq+strideq  ], m0
  mova        [dstq+strideq*2], m0
  mova        [dstq+stride3q ], m0
  lea         dstq, [dstq+strideq*4]
  dec         nlines4d
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; v_predictor_32x32(dst, stride, above)
; Copies the 32 bytes at above into each of the 32 rows of dst.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
  mova        m0, [aboveq]
  mova        m1, [aboveq+16]
  DEFINE_ARGS dst, stride, stride3, nlines4
  lea         stride3q, [strideq*3]
  mov         nlines4d, 8               ; 8 iterations x 4 rows
.loop:
  mova        [dstq             ], m0
  mova        [dstq          +16], m1
  mova        [dstq+strideq     ], m0
  mova        [dstq+strideq  +16], m1
  mova        [dstq+strideq*2   ], m0
  mova        [dstq+strideq*2+16], m1
  mova        [dstq+stride3q    ], m0
  mova        [dstq+stride3q +16], m1
  lea         dstq, [dstq+strideq*4]
  dec         nlines4d
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; h_predictor_4x4(dst, stride, <unused>, left)
; Row r of the 4x4 block is filled with left[r]. The third argument slot is
; named "line" here and is not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
  ; Only two args are requested on the cglobal line, so fetch left explicitly.
  movifnidn   leftq, leftmp
  movd        m0, [leftq]
  punpcklbw   m0, m0
  punpcklbw   m0, m0                    ; l1 to l4 each repeated 4 times
  pshufd      m1, m0, 0x1               ; low dword = l2 x4
  movd        [dstq        ], m0
  movd        [dstq+strideq], m1
  pshufd      m2, m0, 0x2               ; low dword = l3 x4
  lea         dstq, [dstq+strideq*2]
  pshufd      m3, m0, 0x3               ; low dword = l4 x4
  movd        [dstq        ], m2
  movd        [dstq+strideq], m3
  RET

;------------------------------------------------------------------------------
; h_predictor_8x8(dst, stride, <unused>, left)
; Row r of the 8x8 block is filled with left[r]. The third argument register
; is reused as the loop counter "line", counting up from -2 to 0 (two
; iterations of four rows each).
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
  movifnidn   leftq, leftmp
  mov         lineq, -2
  DEFINE_ARGS dst, stride, line, left, stride3
  lea         stride3q, [strideq*3]
  movq        m0, [leftq]
  punpcklbw   m0, m0                    ; l1 l1 l2 l2 ... l8 l8
.loop:
  pshuflw     m1, m0, 0x0               ; l1 l1 l1 l1 l1 l1 l1 l1
  pshuflw     m2, m0, 0x55              ; l2 l2 l2 l2 l2 l2 l2 l2
  movq        [dstq          ], m1
  movq        [dstq+strideq  ], m2
  pshuflw     m1, m0, 0xaa
  pshuflw     m2, m0, 0xff
  movq        [dstq+strideq*2], m1
  movq        [dstq+stride3q ], m2
  pshufd      m0, m0, 0xe               ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
  inc         lineq
  lea         dstq, [dstq+strideq*4]    ; lea leaves the flags from inc intact
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; h_predictor_16x16(dst, stride, <unused>, left)
; Row r of the 16x16 block is filled with left[r]. Each iteration consumes 4
; bytes of left and writes 4 rows; "line" counts up from -4 to 0.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
  movifnidn   leftq, leftmp
  mov         lineq, -4
  DEFINE_ARGS dst, stride, line, left, stride3
  lea         stride3q, [strideq*3]
.loop:
  movd        m0, [leftq]
  punpcklbw   m0, m0
  punpcklbw   m0, m0                    ; l1 to l4 each repeated 4 times
  pshufd      m1, m0, 0x0               ; l1 repeated 16 times
  pshufd      m2, m0, 0x55              ; l2 repeated 16 times
  mova        [dstq          ], m1
  mova        [dstq+strideq  ], m2
  pshufd      m1, m0, 0xaa
  pshufd      m2, m0, 0xff
  mova        [dstq+strideq*2], m1
  mova        [dstq+stride3q ], m2
  inc         lineq
  lea         leftq, [leftq+4]          ; lea leaves the flags from inc intact
  lea         dstq, [dstq+strideq*4]
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; h_predictor_32x32(dst, stride, <unused>, left)
; Row r of the 32x32 block is filled with left[r]. Each iteration consumes 4
; bytes of left and writes 4 rows of 32 bytes; "line" counts up from -8 to 0.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
  movifnidn   leftq, leftmp
  mov         lineq, -8
  DEFINE_ARGS dst, stride, line, left, stride3
  lea         stride3q, [strideq*3]
.loop:
  movd        m0, [leftq]
  punpcklbw   m0, m0
  punpcklbw   m0, m0                    ; l1 to l4 each repeated 4 times
  pshufd      m1, m0, 0x0               ; l1 repeated 16 times
  pshufd      m2, m0, 0x55              ; l2 repeated 16 times
  mova        [dstq             ], m1
  mova        [dstq+16          ], m1
  mova        [dstq+strideq     ], m2
  mova        [dstq+strideq+16  ], m2
  pshufd      m1, m0, 0xaa
  pshufd      m2, m0, 0xff
  mova        [dstq+strideq*2   ], m1
  mova        [dstq+strideq*2+16], m1
  mova        [dstq+stride3q    ], m2
  mova        [dstq+stride3q+16 ], m2
  inc         lineq
  lea         leftq, [leftq+4]          ; lea leaves the flags from inc intact
  lea         dstq, [dstq+strideq*4]
  jnz .loop
  REP_RET