;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; SSSE3 directional intra predictors (h, d45, d63, d153, d207) for
; 4x4, 8x8, 16x16 and 32x32 blocks.
;
; Syntax: NASM/yasm, written against the x86inc abstraction layer
; (cglobal / DEFINE_ARGS / mN registers / 3-operand emulation).
; GET_GOT, GLOBAL() and RESTORE_GOT are not defined in this file; they are
; expected to come from the build's include chain.
;
; Register/argument naming follows each cglobal line: the first four
; argument slots are dst, stride, above, left.  Slots that a function does
; not need are renamed and reused as scratch (e.g. "line", "stride3").
; NOTE(review): presumably the C prototype is
;   (uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
;    const uint8_t *left) -- confirm against the declaring header.

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA

; pshufb control vectors.  The name spells the source byte index selected for
; each destination byte (hex digits); short names are zero-padded to 16 bytes.
pb_1:                 times 16 db 1
sh_b01234577:         db  0,  1,  2,  3,  4,  5,  7,  7,  0,  0,  0,  0,  0,  0,  0,  0
sh_b12345677:         db  1,  2,  3,  4,  5,  6,  7,  7,  0,  0,  0,  0,  0,  0,  0,  0
sh_b23456777:         db  2,  3,  4,  5,  6,  7,  7,  7,  0,  0,  0,  0,  0,  0,  0,  0
sh_b0123456777777777: db  0,  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7
sh_b1234567777777777: db  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7
sh_b2345677777777777: db  2,  3,  4,  5,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7
sh_b123456789abcdeff: db  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15
sh_b23456789abcdefff: db  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15
sh_b32104567:         db  3,  2,  1,  0,  4,  5,  6,  7,  0,  0,  0,  0,  0,  0,  0,  0
sh_b8091a2b345:       db  8,  0,  9,  1, 10,  2, 11,  3,  4,  5,  0,  0,  0,  0,  0,  0
sh_b76543210:         db  7,  6,  5,  4,  3,  2,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0
sh_b65432108:         db  6,  5,  4,  3,  2,  1,  0,  8,  0,  0,  0,  0,  0,  0,  0,  0
sh_b54321089:         db  5,  4,  3,  2,  1,  0,  8,  9,  0,  0,  0,  0,  0,  0,  0,  0
sh_b89abcdef:         db  8,  9, 10, 11, 12, 13, 14, 15,  0,  0,  0,  0,  0,  0,  0,  0
sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0
sh_b1233:             db  1,  2,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
sh_b2333:             db  2,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0

SECTION .text

; ----------------------------------------------------------------------------
; h_predictor_NxN: row r of dst is filled with byte left[r].
;
; Only dst and stride are loaded by the prologue; the third argument slot is
; reused as the (negative) loop counter "line", and left is fetched with
; movifnidn.  Two rows are produced per iteration; pshufb with an all-zero
; control vector (m0) broadcasts byte 0 of the loaded dword.
;
; NOTE: movd loads 4 bytes although only byte 0 is used, so the final
; iteration reads up to 3 bytes past left[N-1].
; ----------------------------------------------------------------------------
INIT_MMX ssse3
cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, left
  movifnidn   leftq, leftmp
  add         leftq, 4                      ; leftq = one past the last row
  mov         lineq, -2                     ; 2 iterations x 2 rows
  pxor        m0, m0                        ; broadcast-byte-0 shuffle control
.loop:
  movd        m1, [leftq+lineq*2]           ; even row sample in byte 0
  movd        m2, [leftq+lineq*2+1]         ; odd row sample in byte 0
  pshufb      m1, m0
  pshufb      m2, m0
  movd        [dstq], m1
  movd        [dstq+strideq], m2
  lea         dstq, [dstq+strideq*2]
  inc         lineq
  jnz .loop
  REP_RET

INIT_MMX ssse3
cglobal h_predictor_8x8, 2, 4, 3, dst, stride, line, left
  movifnidn   leftq, leftmp
  add         leftq, 8                      ; leftq = one past the last row
  mov         lineq, -4                     ; 4 iterations x 2 rows
  pxor        m0, m0                        ; broadcast-byte-0 shuffle control
.loop:
  movd        m1, [leftq+lineq*2]
  movd        m2, [leftq+lineq*2+1]
  pshufb      m1, m0
  pshufb      m2, m0
  movq        [dstq], m1
  movq        [dstq+strideq], m2
  lea         dstq, [dstq+strideq*2]
  inc         lineq
  jnz .loop
  REP_RET

; The XMM variants store with mova, i.e. aligned 16-byte stores to dst.
INIT_XMM ssse3
cglobal h_predictor_16x16, 2, 4, 3, dst, stride, line, left
  movifnidn   leftq, leftmp
  add         leftq, 16                     ; leftq = one past the last row
  mov         lineq, -8                     ; 8 iterations x 2 rows
  pxor        m0, m0                        ; broadcast-byte-0 shuffle control
.loop:
  movd        m1, [leftq+lineq*2]
  movd        m2, [leftq+lineq*2+1]
  pshufb      m1, m0
  pshufb      m2, m0
  mova        [dstq], m1
  mova        [dstq+strideq], m2
  lea         dstq, [dstq+strideq*2]
  inc         lineq
  jnz .loop
  REP_RET

INIT_XMM ssse3
cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left
  movifnidn   leftq, leftmp
  add         leftq, 32                     ; leftq = one past the last row
  mov         lineq, -16                    ; 16 iterations x 2 rows
  pxor        m0, m0                        ; broadcast-byte-0 shuffle control
.loop:
  movd        m1, [leftq+lineq*2]
  movd        m2, [leftq+lineq*2+1]
  pshufb      m1, m0
  pshufb      m2, m0
  mova        [dstq], m1                    ; each row is two 16-byte halves
  mova        [dstq+16], m1
  mova        [dstq+strideq], m2
  mova        [dstq+strideq+16], m2
  lea         dstq, [dstq+strideq*2]
  inc         lineq
  jnz .loop
  REP_RET

; ----------------------------------------------------------------------------
; d45_predictor_NxN: reads only `above`.
;
; A 3-tap filter (x + 2y + z + 2) >> 2 is applied along the above row, with
; the last sample replicated by the shuffle tables.  Row 0 is the filtered
; vector; every following row is the previous one shifted down by one byte
; (again replicating the last byte where a shuffle is used).
; The 3-tap computation is the same avg/xor/and/sub/avg sequence as the
; X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 macro defined further down, written inline.
; ----------------------------------------------------------------------------
INIT_MMX ssse3
cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq        m0, [aboveq]
  pshufb      m2, m0, [GLOBAL(sh_b23456777)]    ; z: above shifted by 2
  pshufb      m1, m0, [GLOBAL(sh_b01234577)]    ; x: above, byte 6 taken from 7
  pshufb      m0, [GLOBAL(sh_b12345677)]        ; y: above shifted by 1
  pavgb       m3, m2, m1                        ; avg(x, z), rounded up
  pxor        m2, m1
  pand        m2, [GLOBAL(pb_1)]                ; rounding bit of avg(x, z)
  psubb       m3, m2                            ; (x + z) >> 1, truncated
  pavgb       m0, m3                            ; (x + 2y + z + 2) >> 2

  ; store 4 lines, shifting one byte per line
  movd        [dstq], m0
  psrlq       m0, 8
  movd        [dstq+strideq], m0
  lea         dstq, [dstq+strideq*2]
  psrlq       m0, 8
  movd        [dstq], m0
  psrlq       m0, 8
  movd        [dstq+strideq], m0

  RESTORE_GOT
  RET

INIT_MMX ssse3
cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq        m0, [aboveq]
  mova        m1, [GLOBAL(sh_b12345677)]        ; shift-by-1 control, reused per row
  DEFINE_ARGS dst, stride, stride3              ; above no longer needed
  lea         stride3q, [strideq*3]
  pshufb      m2, m0, [GLOBAL(sh_b23456777)]    ; z: above shifted by 2
  pavgb       m3, m2, m0                        ; avg(x, z), rounded up
  pxor        m2, m0
  pshufb      m0, m1                            ; y: above shifted by 1
  pand        m2, [GLOBAL(pb_1)]
  psubb       m3, m2                            ; (x + z) >> 1, truncated
  pavgb       m0, m3                            ; (x + 2y + z + 2) >> 2

  ; store 4 lines
  movq        [dstq], m0
  pshufb      m0, m1
  movq        [dstq+strideq], m0
  pshufb      m0, m1
  movq        [dstq+strideq*2], m0
  pshufb      m0, m1
  movq        [dstq+stride3q], m0
  pshufb      m0, m1
  lea         dstq, [dstq+strideq*4]

  ; store next 4 lines
  movq        [dstq], m0
  pshufb      m0, m1
  movq        [dstq+strideq], m0
  pshufb      m0, m1
  movq        [dstq+strideq*2], m0
  pshufb      m0, m1
  movq        [dstq+stride3q], m0

  RESTORE_GOT
  RET

INIT_XMM ssse3
cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
  GET_GOT     goffsetq

  mova        m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, dst8, line
  lea         stride3q, [strideq*3]
  lea         dst8q, [dstq+strideq*8]           ; dst8q = row 8
  mova        m1, [GLOBAL(sh_b123456789abcdeff)] ; shift-by-1 control
  pshufb      m2, m0, [GLOBAL(sh_b23456789abcdefff)]
  pavgb       m3, m2, m0
  pxor        m2, m0
  pshufb      m0, m1
  pand        m2, [GLOBAL(pb_1)]
  psubb       m3, m2
  pavgb       m0, m3                            ; filtered row 0

  ; rows 0-7 in full; the high 8 bytes of row r are also the left half of
  ; row r+8, so they are stored there at the same time
  mov         lined, 2
.loop:
  mova        [dstq], m0
  movhps      [dst8q], m0
  pshufb      m0, m1
  mova        [dstq+strideq], m0
  movhps      [dst8q+strideq], m0
  pshufb      m0, m1
  mova        [dstq+strideq*2], m0
  movhps      [dst8q+strideq*2], m0
  pshufb      m0, m1
  mova        [dstq+stride3q], m0
  movhps      [dst8q+stride3q], m0
  pshufb      m0, m1
  lea         dstq, [dstq+strideq*4]
  lea         dst8q, [dst8q+strideq*4]
  dec         lined
  jnz .loop

  ; bottom-right 8x8 block: after 8 shifts the high half of m0 holds only
  ; the replicated last sample; dstq now points at row 8
  movhps      [dstq+8], m0
  movhps      [dstq+strideq+8], m0
  movhps      [dstq+strideq*2+8], m0
  movhps      [dstq+stride3q+8], m0
  lea         dstq, [dstq+strideq*4]
  movhps      [dstq+8], m0
  movhps      [dstq+strideq+8], m0
  movhps      [dstq+strideq*2+8], m0
  movhps      [dstq+stride3q+8], m0

  RESTORE_GOT
  RET

INIT_XMM ssse3
cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset
  GET_GOT     goffsetq

  mova        m0, [aboveq]                      ; above[ 0..15]
  mova        m4, [aboveq+16]                   ; above[16..31]
  DEFINE_ARGS dst, stride, stride3, dst16, line
  lea         stride3q, [strideq*3]
  lea         dst16q, [dstq+strideq*8]
  lea         dst16q, [dst16q+strideq*8]        ; dst16q = row 16
  mova        m1, [GLOBAL(sh_b123456789abcdeff)] ; shift-by-1 control
  pshufb      m2, m4, [GLOBAL(sh_b23456789abcdefff)]
  pavgb       m3, m2, m4
  pxor        m2, m4
  palignr     m5, m4, m0, 1                     ; above[1..16]
  palignr     m6, m4, m0, 2                     ; above[2..17]
  pshufb      m4, m1
  pand        m2, [GLOBAL(pb_1)]
  psubb       m3, m2
  pavgb       m4, m3                            ; filtered high 16 bytes
  pavgb       m3, m0, m6
  pxor        m0, m6
  pand        m0, [GLOBAL(pb_1)]
  psubb       m3, m0
  pavgb       m5, m3                            ; filtered low 16 bytes

  ; rows 0-15 in full; the high half of row r is also the left half of
  ; row r+16, so it is stored there at the same time
  mov         lined, 4
.loop:
  mova        [dstq], m5
  mova        [dstq+16], m4
  mova        [dst16q], m4
  palignr     m3, m4, m5, 1
  pshufb      m4, m1
  mova        [dstq+strideq], m3
  mova        [dstq+strideq+16], m4
  mova        [dst16q+strideq], m4
  palignr     m5, m4, m3, 1
  pshufb      m4, m1
  mova        [dstq+strideq*2], m5
  mova        [dstq+strideq*2+16], m4
  mova        [dst16q+strideq*2], m4
  palignr     m3, m4, m5, 1
  pshufb      m4, m1
  mova        [dstq+stride3q], m3
  mova        [dstq+stride3q+16], m4
  mova        [dst16q+stride3q], m4
  palignr     m5, m4, m3, 1
  pshufb      m4, m1
  lea         dstq, [dstq+strideq*4]
  lea         dst16q, [dst16q+strideq*4]
  dec         lined
  jnz .loop

  ; right half of rows 16-31: m4 now holds only the replicated last sample;
  ; dstq points at row 16
  mova        [dstq+16], m4
  mova        [dstq+strideq+16], m4
  mova        [dstq+strideq*2+16], m4
  mova        [dstq+stride3q+16], m4
  lea         dstq, [dstq+strideq*4]
  mova        [dstq+16], m4
  mova        [dstq+strideq+16], m4
  mova        [dstq+strideq*2+16], m4
  mova        [dstq+stride3q+16], m4
  lea         dstq, [dstq+strideq*4]
  mova        [dstq+16], m4
  mova        [dstq+strideq+16], m4
  mova        [dstq+strideq*2+16], m4
  mova        [dstq+stride3q+16], m4
  lea         dstq, [dstq+strideq*4]
  mova        [dstq+16], m4
  mova        [dstq+strideq+16], m4
  mova        [dstq+strideq*2+16], m4
  mova        [dstq+stride3q+16], m4

  RESTORE_GOT
  RET

; ------------------------------------------
; input: x, y, z, result
;
; trick from pascal
; (x+2y+z+2)>>2 can be calculated as:
; result = avg(x,z)
; result -= xor(x,z) & 1
; result = avg(result,y)
;
; Register contract:
;   %1 (x), %2 (y)  read only
;   %3 (z)          clobbered (left holding (x ^ z) & 1)
;   %4 (result)     written; must be distinct from %1, %2 and %3
; Requires GLOBAL() to be usable, i.e. GET_GOT must already have run.
; ------------------------------------------
%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
  pavgb       %4, %1, %3
  pxor        %3, %1
  pand        %3, [GLOBAL(pb_1)]
  psubb       %4, %3
  pavgb       %4, %2
%endmacro

; ----------------------------------------------------------------------------
; d63_predictor_NxN: reads only `above`.
;
; Two vectors are built from the above row (last sample replicated):
;   2-tap: avg(a[i], a[i+1])              -> even rows
;   3-tap: (a[i] + 2a[i+1] + a[i+2] + 2)>>2 -> odd rows
; Both vectors are shifted down by one byte after every pair of rows.
; ----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq        m3, [aboveq]
  pshufb      m1, m3, [GLOBAL(sh_b23456777)]    ; above shifted by 2
  pshufb      m2, m3, [GLOBAL(sh_b12345677)]    ; above shifted by 1

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4  ; m4 = 3-tap (odd rows)
  pavgb       m3, m2                            ; m3 = 2-tap (even rows)

  ; store 4 lines
  movd        [dstq], m3
  movd        [dstq+strideq], m4
  lea         dstq, [dstq+strideq*2]
  psrldq      m3, 1
  psrldq      m4, 1
  movd        [dstq], m3
  movd        [dstq+strideq], m4
  RESTORE_GOT
  RET

INIT_XMM ssse3
cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq        m3, [aboveq]
  DEFINE_ARGS dst, stride, stride3              ; above no longer needed
  lea         stride3q, [strideq*3]
  ; The shifted copies only select source bytes 0-7, so they can be taken
  ; from m3 before it is extended in place.
  pshufb      m1, m3, [GLOBAL(sh_b2345677777777777)] ; above shifted by 2
  pshufb      m2, m3, [GLOBAL(sh_b1234567777777777)] ; above shifted by 1
  pshufb      m3, [GLOBAL(sh_b0123456777777777)]     ; above, byte 7 replicated

  ; The macro only reads its first operand, so m3 serves as both the 3-tap
  ; input and (afterwards) the 2-tap accumulator.
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4  ; m4 = 3-tap (odd rows)
  pavgb       m3, m2                            ; m3 = 2-tap (even rows)

  ; store 4 lines
  movq        [dstq], m3
  movq        [dstq+strideq], m4
  psrldq      m3, 1
  psrldq      m4, 1
  movq        [dstq+strideq*2], m3
  movq        [dstq+stride3q], m4
  lea         dstq, [dstq+strideq*4]
  psrldq      m3, 1
  psrldq      m4, 1

  ; store 4 lines
  movq        [dstq], m3
  movq        [dstq+strideq], m4
  psrldq      m3, 1
  psrldq      m4, 1
  movq        [dstq+strideq*2], m3
  movq        [dstq+stride3q], m4
  RESTORE_GOT
  RET

INIT_XMM ssse3
cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset
  GET_GOT     goffsetq

  mova        m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, line
  lea         stride3q, [strideq*3]
  mova        m1, [GLOBAL(sh_b123456789abcdeff)] ; shift-by-1 control
  pshufb      m2, m0, [GLOBAL(sh_b23456789abcdefff)]
  pshufb      m3, m0, m1

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4  ; m4 = 3-tap (odd rows)
  pavgb       m0, m3                            ; m0 = 2-tap (even rows)

  mov         lined, 4                          ; 4 iterations x 4 rows
.loop:
  mova        [dstq], m0
  mova        [dstq+strideq], m4
  pshufb      m0, m1
  pshufb      m4, m1
  mova        [dstq+strideq*2], m0
  mova        [dstq+stride3q], m4
  pshufb      m0, m1
  pshufb      m4, m1
  lea         dstq, [dstq+strideq*4]
  dec         lined
  jnz .loop
  RESTORE_GOT
  REP_RET

INIT_XMM ssse3
cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset
  GET_GOT     goffsetq

  mova        m0, [aboveq]                      ; above[ 0..15]
  mova        m7, [aboveq+16]                   ; above[16..31]
  DEFINE_ARGS dst, stride, stride3, line
  mova        m1, [GLOBAL(sh_b123456789abcdeff)] ; shift-by-1 control
  lea         stride3q, [strideq*3]
  pshufb      m2, m7, [GLOBAL(sh_b23456789abcdefff)]
  pshufb      m3, m7, m1

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4  ; m4 = 3-tap, high half
  palignr     m6, m7, m0, 1                     ; above[1..16]
  palignr     m5, m7, m0, 2                     ; above[2..17]
  pavgb       m7, m3                            ; m7 = 2-tap, high half

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2  ; m2 = 3-tap, low half
  pavgb       m0, m6                            ; m0 = 2-tap, low half

  mov         lined, 8                          ; 8 iterations x 4 rows
.loop:
  mova        [dstq], m0
  mova        [dstq+16], m7
  mova        [dstq+strideq], m2
  mova        [dstq+strideq+16], m4
  palignr     m3, m7, m0, 1                     ; shifted 2-tap low half
  palignr     m5, m4, m2, 1                     ; shifted 3-tap low half
  pshufb      m7, m1
  pshufb      m4, m1

  mova        [dstq+strideq*2], m3
  mova        [dstq+strideq*2+16], m7
  mova        [dstq+stride3q], m5
  mova        [dstq+stride3q+16], m4
  palignr     m0, m7, m3, 1
  palignr     m2, m4, m5, 1
  pshufb      m7, m1
  pshufb      m4, m1
  lea         dstq, [dstq+strideq*4]
  dec         lined
  jnz .loop
  RESTORE_GOT
  REP_RET

; ----------------------------------------------------------------------------
; d153_predictor_NxN: reads `left` and above[-1 ..].
;
; The left column is reversed and joined with the top-left/above samples;
; 2-tap ("A") and 3-tap ("B") averages along the left edge are interleaved,
; followed by 3-tap averages along the top edge ("C", "D", ...).  Each row up
; from the bottom is the same byte stream advanced by two bytes; the layout
; diagrams inside each function show the result.
; ----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT     goffsetq
  movd        m0, [leftq]                       ; l1, l2, l3, l4
  movd        m1, [aboveq-1]                    ; tl, t1, t2, t3
  punpckldq   m0, m1                            ; l1, l2, l3, l4, tl, t1, t2, t3
  pshufb      m0, [GLOBAL(sh_b32104567)]        ; l4, l3, l2, l1, tl, t1, t2, t3
  psrldq      m1, m0, 1                         ; l3, l2, l1, tl, t1, t2, t3
  psrldq      m2, m0, 2                         ; l2, l1, tl, t1, t2, t3
  ; comments below are for a predictor like this
  ; A1 B1 C1 D1
  ; A2 B2 A1 B1
  ; A3 B3 A2 B2
  ; A4 B4 A3 B3
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3  ; 3-tap avg B4 B3 B2 B1 C1 D1
  pavgb       m1, m0                            ; 2-tap avg A4 A3 A2 A1

  punpcklqdq  m3, m1                            ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..

  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  pshufb      m3, [GLOBAL(sh_b8091a2b345)]      ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
  movd        [dstq+stride3q], m3
  psrldq      m3, 2                             ; A3 B3 A2 B2 A1 B1 C1 D1 ..
  movd        [dstq+strideq*2], m3
  psrldq      m3, 2                             ; A2 B2 A1 B1 C1 D1 ..
  movd        [dstq+strideq], m3
  psrldq      m3, 2                             ; A1 B1 C1 D1 ..
  movd        [dstq], m3
  RESTORE_GOT
  RET

INIT_XMM ssse3
cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
  GET_GOT     goffsetq
  movq        m0, [leftq]                       ; [0- 7] l1-8 [byte]
  movhps      m0, [aboveq-1]                    ; [8-15] tl, t1-7 [byte]
  pshufb      m1, m0, [GLOBAL(sh_b76543210)]    ; l8-1 [byte]
  pshufb      m2, m0, [GLOBAL(sh_b65432108)]    ; l7-1,tl [byte]
  pshufb      m3, m0, [GLOBAL(sh_b54321089)]    ; l6-1,tl,t1 [byte]
  pshufb      m0, [GLOBAL(sh_b89abcdef)]        ; tl,t1-7 [byte]
  psrldq      m4, m0, 1                         ; t1-7 [byte]
  psrldq      m5, m0, 2                         ; t2-7 [byte]
  ; comments below are for a predictor like this
  ; A1 B1 C1 D1 E1 F1 G1 H1
  ; A2 B2 A1 B1 C1 D1 E1 F1
  ; A3 B3 A2 B2 A1 B1 C1 D1
  ; A4 B4 A3 B3 A2 B2 A1 B1
  ; A5 B5 A4 B4 A3 B3 A2 B2
  ; A6 B6 A5 B5 A4 B4 A3 B3
  ; A7 B7 A6 B6 A5 B5 A4 B4
  ; A8 B8 A7 B7 A6 B6 A5 B5
  pavgb       m6, m1, m2                        ; 2-tap avg A8-A1

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7  ; 3-tap avg C-H1

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0  ; 3-tap avg B8-1

  punpcklbw   m6, m0                            ; A-B8, A-B7 ... A-B2, A-B1

  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]

  movhps      [dstq+stride3q], m6               ; A-B4, A-B3, A-B2, A-B1
  palignr     m0, m7, m6, 10                    ; A-B3, A-B2, A-B1, C-H1
  movq        [dstq+strideq*2], m0
  psrldq      m0, 2                             ; A-B2, A-B1, C-H1
  movq        [dstq+strideq], m0
  psrldq      m0, 2                             ; A-H1
  movq        [dstq], m0
  lea         dstq, [dstq+strideq*4]
  movq        [dstq+stride3q], m6               ; A-B8, A-B7, A-B6, A-B5
  psrldq      m6, 2                             ; A-B7, A-B6, A-B5, A-B4
  movq        [dstq+strideq*2], m6
  psrldq      m6, 2                             ; A-B6, A-B5, A-B4, A-B3
  movq        [dstq+strideq], m6
  psrldq      m6, 2                             ; A-B5, A-B4, A-B3, A-B2
  movq        [dstq], m6
  RESTORE_GOT
  RET

INIT_XMM ssse3
cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
  GET_GOT     goffsetq
  mova        m0, [leftq]                       ; aligned load of left[0..15]
  movu        m7, [aboveq-1]                    ; tl, t1-15 (unaligned)
  ; comments below are for a predictor like this
  ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
  ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
  ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
  ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
  ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
  ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
  ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
  ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
  ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
  ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
  ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
  ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
  ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
  ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
  ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
  ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
  pshufb      m6, m7, [GLOBAL(sh_bfedcba9876543210)] ; above bytes reversed
  palignr     m5, m0, m6, 15                    ; tl, l1-15
  palignr     m3, m0, m6, 14                    ; t1, tl, l1-14

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4  ; 3-tap avg B3-Bg
  pavgb       m5, m0                            ; A1 - Ag

  punpcklbw   m0, m4, m5                        ; A-B8 ... A-B1
  punpckhbw   m4, m5                            ; A-B9 ... A-Bg

  pshufb      m3, m7, [GLOBAL(sh_b123456789abcdeff)]
  pshufb      m5, m7, [GLOBAL(sh_b23456789abcdefff)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1  ; 3-tap avg C1-P1

  pshufb      m6, m0, [GLOBAL(sh_bfedcba9876543210)]
  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]

  ; rows 0-7: window over (m1:m6), advancing 2 bytes per row
  palignr     m2, m1, m6, 14
  mova        [dstq], m2
  palignr     m2, m1, m6, 12
  mova        [dstq+strideq], m2
  palignr     m2, m1, m6, 10
  mova        [dstq+strideq*2], m2
  palignr     m2, m1, m6, 8
  mova        [dstq+stride3q], m2
  lea         dstq, [dstq+strideq*4]
  palignr     m2, m1, m6, 6
  mova        [dstq], m2
  palignr     m2, m1, m6, 4
  mova        [dstq+strideq], m2
  palignr     m2, m1, m6, 2
  mova        [dstq+strideq*2], m2
  pshufb      m4, [GLOBAL(sh_bfedcba9876543210)]
  mova        [dstq+stride3q], m6
  lea         dstq, [dstq+strideq*4]

  ; rows 8-15: window over (m6:m4)
  palignr     m2, m6, m4, 14
  mova        [dstq], m2
  palignr     m2, m6, m4, 12
  mova        [dstq+strideq], m2
  palignr     m2, m6, m4, 10
  mova        [dstq+strideq*2], m2
  palignr     m2, m6, m4, 8
  mova        [dstq+stride3q], m2
  lea         dstq, [dstq+strideq*4]
  palignr     m2, m6, m4, 6
  mova        [dstq], m2
  palignr     m2, m6, m4, 4
  mova        [dstq+strideq], m2
  palignr     m2, m6, m4, 2
  mova        [dstq+strideq*2], m2
  mova        [dstq+stride3q], m4
  RESTORE_GOT
  RET

INIT_XMM ssse3
cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
  GET_GOT     goffsetq
  mova        m0, [leftq]                       ; left[0..15]
  movu        m7, [aboveq-1]                    ; tl, t1-15
  movu        m1, [aboveq+15]                   ; t16-31

  pshufb      m4, m1, [GLOBAL(sh_b123456789abcdeff)]
  pshufb      m6, m1, [GLOBAL(sh_b23456789abcdefff)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2  ; 3-tap avg above [high]

  palignr     m3, m1, m7, 1
  palignr     m5, m1, m7, 2

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1  ; 3-tap avg above [low]

  pshufb      m7, [GLOBAL(sh_bfedcba9876543210)]
  palignr     m5, m0, m7, 15
  palignr     m3, m0, m7, 14

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4  ; 3-tap avg B3-Bg
  pavgb       m5, m0                            ; A1 - Ag
  punpcklbw   m6, m4, m5                        ; A-B8 ... A-B1
  punpckhbw   m4, m5                            ; A-B9 ... A-Bg
  pshufb      m6, [GLOBAL(sh_bfedcba9876543210)]
  pshufb      m4, [GLOBAL(sh_bfedcba9876543210)]

  ; above is finished; left is still needed for rows 16-31.  No loop counter
  ; is used below, so the goffset slot is deliberately left without an alias.
  DEFINE_ARGS dst, stride, stride3, left
  lea         stride3q, [strideq*3]

  ; rows 0-7: left half from (m1:m6), right half from (m2:m1)
  palignr     m5, m2, m1, 14
  palignr     m7, m1, m6, 14
  mova        [dstq], m7
  mova        [dstq+16], m5
  palignr     m5, m2, m1, 12
  palignr     m7, m1, m6, 12
  mova        [dstq+strideq], m7
  mova        [dstq+strideq+16], m5
  palignr     m5, m2, m1, 10
  palignr     m7, m1, m6, 10
  mova        [dstq+strideq*2], m7
  mova        [dstq+strideq*2+16], m5
  palignr     m5, m2, m1, 8
  palignr     m7, m1, m6, 8
  mova        [dstq+stride3q], m7
  mova        [dstq+stride3q+16], m5
  lea         dstq, [dstq+strideq*4]
  palignr     m5, m2, m1, 6
  palignr     m7, m1, m6, 6
  mova        [dstq], m7
  mova        [dstq+16], m5
  palignr     m5, m2, m1, 4
  palignr     m7, m1, m6, 4
  mova        [dstq+strideq], m7
  mova        [dstq+strideq+16], m5
  palignr     m5, m2, m1, 2
  palignr     m7, m1, m6, 2
  mova        [dstq+strideq*2], m7
  mova        [dstq+strideq*2+16], m5
  mova        [dstq+stride3q], m6
  mova        [dstq+stride3q+16], m1
  lea         dstq, [dstq+strideq*4]

  ; rows 8-15: left half from (m6:m4), right half from (m1:m6)
  palignr     m5, m1, m6, 14
  palignr     m3, m6, m4, 14
  mova        [dstq], m3
  mova        [dstq+16], m5
  palignr     m5, m1, m6, 12
  palignr     m3, m6, m4, 12
  mova        [dstq+strideq], m3
  mova        [dstq+strideq+16], m5
  palignr     m5, m1, m6, 10
  palignr     m3, m6, m4, 10
  mova        [dstq+strideq*2], m3
  mova        [dstq+strideq*2+16], m5
  palignr     m5, m1, m6, 8
  palignr     m3, m6, m4, 8
  mova        [dstq+stride3q], m3
  mova        [dstq+stride3q+16], m5
  lea         dstq, [dstq+strideq*4]
  palignr     m5, m1, m6, 6
  palignr     m3, m6, m4, 6
  mova        [dstq], m3
  mova        [dstq+16], m5
  palignr     m5, m1, m6, 4
  palignr     m3, m6, m4, 4
  mova        [dstq+strideq], m3
  mova        [dstq+strideq+16], m5
  palignr     m5, m1, m6, 2
  palignr     m3, m6, m4, 2
  mova        [dstq+strideq*2], m3
  mova        [dstq+strideq*2+16], m5
  mova        [dstq+stride3q], m4
  mova        [dstq+stride3q+16], m6
  lea         dstq, [dstq+strideq*4]

  ; second half of the left column
  mova        m7, [leftq]                       ; left[ 0..15]
  mova        m3, [leftq+16]                    ; left[16..31]
  palignr     m5, m3, m7, 15
  palignr     m0, m3, m7, 14

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2  ; 3-tap avg Bh -
  pavgb       m5, m3                            ; Ah -
  punpcklbw   m3, m2, m5                        ; A-B8 ... A-B1
  punpckhbw   m2, m5                            ; A-B9 ... A-Bg
  pshufb      m3, [GLOBAL(sh_bfedcba9876543210)]
  pshufb      m2, [GLOBAL(sh_bfedcba9876543210)]

  ; rows 16-23: left half from (m4:m3), right half from (m6:m4)
  palignr     m7, m6, m4, 14
  palignr     m0, m4, m3, 14
  mova        [dstq], m0
  mova        [dstq+16], m7
  palignr     m7, m6, m4, 12
  palignr     m0, m4, m3, 12
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m7
  palignr     m7, m6, m4, 10
  palignr     m0, m4, m3, 10
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m7
  palignr     m7, m6, m4, 8
  palignr     m0, m4, m3, 8
  mova        [dstq+stride3q], m0
  mova        [dstq+stride3q+16], m7
  lea         dstq, [dstq+strideq*4]
  palignr     m7, m6, m4, 6
  palignr     m0, m4, m3, 6
  mova        [dstq], m0
  mova        [dstq+16], m7
  palignr     m7, m6, m4, 4
  palignr     m0, m4, m3, 4
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m7
  palignr     m7, m6, m4, 2
  palignr     m0, m4, m3, 2
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m7
  mova        [dstq+stride3q], m3
  mova        [dstq+stride3q+16], m4
  lea         dstq, [dstq+strideq*4]

  ; rows 24-31: left half from (m3:m2), right half from (m4:m3)
  palignr     m7, m4, m3, 14
  palignr     m0, m3, m2, 14
  mova        [dstq], m0
  mova        [dstq+16], m7
  palignr     m7, m4, m3, 12
  palignr     m0, m3, m2, 12
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m7
  palignr     m7, m4, m3, 10
  palignr     m0, m3, m2, 10
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m7
  palignr     m7, m4, m3, 8
  palignr     m0, m3, m2, 8
  mova        [dstq+stride3q], m0
  mova        [dstq+stride3q+16], m7
  lea         dstq, [dstq+strideq*4]
  palignr     m7, m4, m3, 6
  palignr     m0, m3, m2, 6
  mova        [dstq], m0
  mova        [dstq+16], m7
  palignr     m7, m4, m3, 4
  palignr     m0, m3, m2, 4
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m7
  palignr     m7, m4, m3, 2
  palignr     m0, m3, m2, 2
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m7
  mova        [dstq+stride3q], m2
  mova        [dstq+stride3q+16], m3

  RESTORE_GOT
  RET

; ----------------------------------------------------------------------------
; d207_predictor_NxN: reads only `left`; the `above` argument slot is unused
; (named "unused" or immediately reused as stride3).
;
; 2-tap and 3-tap averages along the left column (last sample replicated) are
; byte-interleaved; row r is that stream advanced by 2*r bytes, running into
; the replicated last sample at the bottom.
; ----------------------------------------------------------------------------
INIT_MMX ssse3
cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset
  GET_GOT     goffsetq
  movd        m0, [leftq]                       ; abcd [byte]
  pshufb      m1, m0, [GLOBAL(sh_b1233)]        ; bcdd [byte]
  pshufb      m3, m0, [GLOBAL(sh_b2333)]        ; cddd

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2
  pavgb       m1, m0                            ; ab, bc, cd, d [byte]

  punpcklbw   m1, m2                            ; ab, a2bc, bc, b2cd, cd, c3d, d, d
  movd        [dstq], m1
  psrlq       m1, 16                            ; bc, b2cd, cd, c3d, d, d
  movd        [dstq+strideq], m1
  lea         dstq, [dstq+strideq*2]
  psrlq       m1, 16                            ; cd, c3d, d, d
  movd        [dstq], m1
  pshufw      m1, m1, q1111                     ; d, d, d, d
  movd        [dstq+strideq], m1
  RESTORE_GOT
  RET

INIT_XMM ssse3
cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
  GET_GOT     goffsetq
  movq        m3, [leftq]                       ; abcdefgh [byte]
  lea         stride3q, [strideq*3]

  pshufb      m1, m3, [GLOBAL(sh_b2345677777777777)]
  pshufb      m0, m3, [GLOBAL(sh_b0123456777777777)]
  pshufb      m2, m3, [GLOBAL(sh_b1234567777777777)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3
  pavgb       m0, m2
  punpcklbw   m0, m3                            ; interleaved output

  movq        [dstq], m0
  psrldq      m0, 2
  movq        [dstq+strideq], m0
  psrldq      m0, 2
  movq        [dstq+strideq*2], m0
  psrldq      m0, 2
  movq        [dstq+stride3q], m0
  lea         dstq, [dstq+strideq*4]
  pshufhw     m0, m0, q0000                     ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh
  psrldq      m0, 2
  movq        [dstq], m0
  psrldq      m0, 2
  movq        [dstq+strideq], m0
  psrldq      m0, 2
  movq        [dstq+strideq*2], m0
  psrldq      m0, 2
  movq        [dstq+stride3q], m0
  RESTORE_GOT
  RET

INIT_XMM ssse3
cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset
  GET_GOT     goffsetq
  lea         stride3q, [strideq*3]
  mova        m0, [leftq]                       ; abcdefghijklmnop [byte]
  pshufb      m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp
  pshufb      m2, m0, [GLOBAL(sh_b23456789abcdefff)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
  pavgb       m1, m0                            ; ab, bc, cd .. no, op, pp [byte]

  punpckhbw   m4, m1, m3                        ; interleaved output, high half
  punpcklbw   m1, m3                            ; interleaved output, low half

  ; rows 0-7: window over (m4:m1), advancing 2 bytes per row
  mova        [dstq], m1
  palignr     m3, m4, m1, 2
  mova        [dstq+strideq], m3
  palignr     m3, m4, m1, 4
  mova        [dstq+strideq*2], m3
  palignr     m3, m4, m1, 6
  mova        [dstq+stride3q], m3
  lea         dstq, [dstq+strideq*4]
  palignr     m3, m4, m1, 8
  mova        [dstq], m3
  palignr     m3, m4, m1, 10
  mova        [dstq+strideq], m3
  palignr     m3, m4, m1, 12
  mova        [dstq+strideq*2], m3
  palignr     m3, m4, m1, 14
  mova        [dstq+stride3q], m3

  ; rows 8-15: keep shifting m4 by 2 bytes, replicating its last byte
  DEFINE_ARGS dst, stride, stride3, line        ; left's slot becomes the counter
  mov         lined, 2
  mova        m0, [GLOBAL(sh_b23456789abcdefff)]
.loop:
  lea         dstq, [dstq+strideq*4]
  mova        [dstq], m4
  pshufb      m4, m0
  mova        [dstq+strideq], m4
  pshufb      m4, m0
  mova        [dstq+strideq*2], m4
  pshufb      m4, m0
  mova        [dstq+stride3q], m4
  pshufb      m4, m0
  dec         lined
  jnz .loop
  RESTORE_GOT
  REP_RET

INIT_XMM ssse3
cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset
  GET_GOT     goffsetq
  lea         stride3q, [strideq*3]
  mova        m1, [leftq]                       ;  0-15 [byte]
  mova        m2, [leftq+16]                    ; 16-31 [byte]
  pshufb      m0, m2, [GLOBAL(sh_b23456789abcdefff)]
  pshufb      m4, m2, [GLOBAL(sh_b123456789abcdeff)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3
  palignr     m6, m2, m1, 1
  palignr     m5, m2, m1, 2
  pavgb       m2, m4                            ; high 16px even lines

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0
  pavgb       m1, m6                            ; low 16px even lines

  punpckhbw   m6, m1, m0                        ; interleaved output 2
  punpcklbw   m1, m0                            ; interleaved output 1

  punpckhbw   m7, m2, m3                        ; interleaved output 4
  punpcklbw   m2, m3                            ; interleaved output 3

  ; output 1st 8 lines (and half of 2nd 8 lines)
  ; The right half of row r equals the left half of row r+8, so both are
  ; written together through dstq+16 and dst8q.
  DEFINE_ARGS dst, stride, stride3, dst8        ; left's slot becomes dst8
  lea         dst8q, [dstq+strideq*8]
  mova        [dstq], m1
  mova        [dstq+16], m6
  mova        [dst8q], m6
  palignr     m0, m6, m1, 2
  palignr     m4, m2, m6, 2
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m4
  mova        [dst8q+strideq], m4
  palignr     m0, m6, m1, 4
  palignr     m4, m2, m6, 4
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m4
  mova        [dst8q+strideq*2], m4
  palignr     m0, m6, m1, 6
  palignr     m4, m2, m6, 6
  mova        [dstq+stride3q], m0
  mova        [dstq+stride3q+16], m4
  mova        [dst8q+stride3q], m4
  lea         dstq, [dstq+strideq*4]
  lea         dst8q, [dst8q+strideq*4]
  palignr     m0, m6, m1, 8
  palignr     m4, m2, m6, 8
  mova        [dstq], m0
  mova        [dstq+16], m4
  mova        [dst8q], m4
  palignr     m0, m6, m1, 10
  palignr     m4, m2, m6, 10
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m4
  mova        [dst8q+strideq], m4
  palignr     m0, m6, m1, 12
  palignr     m4, m2, m6, 12
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m4
  mova        [dst8q+strideq*2], m4
  palignr     m0, m6, m1, 14
  palignr     m4, m2, m6, 14
  mova        [dstq+stride3q], m0
  mova        [dstq+stride3q+16], m4
  mova        [dst8q+stride3q], m4
  lea         dstq, [dstq+strideq*4]
  lea         dst8q, [dst8q+strideq*4]

  ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines
  mova        [dstq+16], m2
  mova        [dst8q], m2
  palignr     m4, m7, m2, 2
  mova        [dstq+strideq+16], m4
  mova        [dst8q+strideq], m4
  palignr     m4, m7, m2, 4
  mova        [dstq+strideq*2+16], m4
  mova        [dst8q+strideq*2], m4
  palignr     m4, m7, m2, 6
  mova        [dstq+stride3q+16], m4
  mova        [dst8q+stride3q], m4
  lea         dstq, [dstq+strideq*4]
  lea         dst8q, [dst8q+strideq*4]
  palignr     m4, m7, m2, 8
  mova        [dstq+16], m4
  mova        [dst8q], m4
  palignr     m4, m7, m2, 10
  mova        [dstq+strideq+16], m4
  mova        [dst8q+strideq], m4
  palignr     m4, m7, m2, 12
  mova        [dstq+strideq*2+16], m4
  mova        [dst8q+strideq*2], m4
  palignr     m4, m7, m2, 14
  mova        [dstq+stride3q+16], m4
  mova        [dst8q+stride3q], m4
  lea         dstq, [dstq+strideq*4]
  lea         dst8q, [dst8q+strideq*4]

  ; output 2nd half of 3rd 8 lines and half of 4th 8 lines
  mova        m0, [GLOBAL(sh_b23456789abcdefff)] ; shift-by-2 control
  mova        [dstq+16], m7
  mova        [dst8q], m7
  pshufb      m7, m0
  mova        [dstq+strideq+16], m7
  mova        [dst8q+strideq], m7
  pshufb      m7, m0
  mova        [dstq+strideq*2+16], m7
  mova        [dst8q+strideq*2], m7
  pshufb      m7, m0
  mova        [dstq+stride3q+16], m7
  mova        [dst8q+stride3q], m7
  pshufb      m7, m0
  lea         dstq, [dstq+strideq*4]
  lea         dst8q, [dst8q+strideq*4]
  mova        [dstq+16], m7
  mova        [dst8q], m7
  pshufb      m7, m0
  mova        [dstq+strideq+16], m7
  mova        [dst8q+strideq], m7
  pshufb      m7, m0
  mova        [dstq+strideq*2+16], m7
  mova        [dst8q+strideq*2], m7
  pshufb      m7, m0
  mova        [dstq+stride3q+16], m7
  mova        [dst8q+stride3q], m7
  pshufb      m7, m0
  lea         dstq, [dstq+strideq*4]

  ; output last half of 4th 8 lines
  mova        [dstq+16], m7
  mova        [dstq+strideq+16], m7
  mova        [dstq+strideq*2+16], m7
  mova        [dstq+stride3q+16], m7
  lea         dstq, [dstq+strideq*4]
  mova        [dstq+16], m7
  mova        [dstq+strideq+16], m7
  mova        [dstq+strideq*2+16], m7
  mova        [dstq+stride3q+16], m7

  ; done!
  RESTORE_GOT
  RET