;******************************************************************************
;* MMX/SSE2-optimized functions for the RV40 decoder
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; pmulhrsw multiplier used by the SSSE3 weight functions in place of the
; "add pw_16, shift right by 5" sequence of the other versions.
pw_1024:   times 8 dw 1 << (16 - 6) ; pw_1024

; 6-tap filter coefficients as interleaved byte pairs, consumed with pmaddubsw
; by the SSSE3 functions. Each filter occupies 32 bytes: the (c0,c1) pair that
; is applied to both outer tap pairs, then the (c2,c3) pair for the centre taps.
; The filter is selected by the byte offset passed in mx/my.
sixtap_filter_hb_m:    times 8 db   1, -5
                       times 8 db  52, 20
                       ; multiplied by 2 to have the same shift
                       times 8 db   2, -10
                       times 8 db  40,  40
                       ; back to normal
                       times 8 db   1, -5
                       times 8 db  20, 52

; Same coefficients as 16-bit words, consumed with pmullw by the MMX/SSE2
; functions. Each filter occupies 64 bytes (4 rows of 8 words):
; +0: taps 0 and 5, +16: taps 1 and 4, +32: tap 2, +48: tap 3.
sixtap_filter_v_m:     times 8 dw   1
                       times 8 dw  -5
                       times 8 dw  52
                       times 8 dw  20
                       ; multiplied by 2 to have the same shift
                       times 8 dw   2
                       times 8 dw -10
                       times 8 dw  40
                       times 8 dw  40
                       ; back to normal
                       times 8 dw   1
                       times 8 dw  -5
                       times 8 dw  20
                       times 8 dw  52

; With PIC the table address is first loaded into an extra register (picregq),
; so the "table" name seen by LOAD is that register and one more GPR is needed.
%ifdef PIC
%define sixtap_filter_hb picregq
%define sixtap_filter_v  picregq
%define npicregs 1
%else
%define sixtap_filter_hb sixtap_filter_hb_m
%define sixtap_filter_v  sixtap_filter_v_m
%define npicregs 0
%endif

; pshufb masks for the SSSE3 horizontal filter, applied to 16 bytes loaded
; from src-2: byte pairs (s[-2],s[-1]), (s[0],s[1]) and (s[3],s[2]) for each
; of the 8 output pixels.
filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11

cextern pw_32
cextern pw_16
cextern pw_512

SECTION .text

;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void ff_[put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
;                                         uint8_t *src, int srcstride,
;                                         int len, int m);
;
; len is the number of rows to process; m is a byte offset into the
; coefficient table that selects the filter.
;-----------------------------------------------------------------------------

; LOAD offset_arg, table
; Turns the filter offset argument %1 (mx or my) into the address of the
; selected coefficient set, in place.
;   %1 = argument name (without size suffix) holding the byte offset
;   %2 = table symbol; ignored when PIC, where picregq must already hold the
;        table address
%macro LOAD  2
%if WIN64
   movsxd   %1q, %1d                ; offset is an int: sign-extend it first
%endif
%ifdef PIC
   add      %1q, picregq
%else
   add      %1q, %2
%endif
%endmacro

; STORE result, tmp, put|avg
; Packs the 16-bit results in %1 to unsigned bytes and writes half a register
; to [dstq]. For "avg" the previous contents of [dstq] are loaded into %2 and
; averaged with the result first; %2 is untouched for "put".
%macro STORE 3
%ifidn %3, avg
    movh      %2, [dstq]
%endif
    packuswb  %1, %1
%ifidn %3, avg
    PAVGB     %1, %2
%endif
    movh  [dstq], %1
%endmacro

; FILTER_V put|avg
; Vertical 6-tap filter on 16-bit intermediates. For every output row:
;   dst = clip8((c05*(s[-2]+s[3]) + c14*(s[-1]+s[2]) + c2*s[0] + c3*s[1] + 32) >> 6)
; where s[k] is the source row k lines below the current one.
; Register roles inside the loop: m0..m4 = rows -2..+2 (unpacked to words),
; m5 = newly read row +3, m6 = accumulator, m7 = zero.
%macro FILTER_V 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
%if ARCH_X86_64
    ; The prototype declares the strides as int, but they are used as full
    ; 64-bit registers for addressing: make the upper halves well defined.
    movsxd    dststrideq, dststrided
    movsxd    srcstrideq, srcstrided
%endif
    pxor      m7, m7
    LOAD      my, sixtap_filter_v

    ; read 5 lines
    sub       srcq, srcstrideq
    sub       srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea       srcq, [srcq+srcstrideq*2]
    add       srcq, srcstrideq          ; srcq now points one row below the current one
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

    ; keep the coefficients in registers when m8-m11 exist, else use memory operands
%ifdef m8
    mova      m8, [myq+ 0]
    mova      m9, [myq+16]
    mova      m10, [myq+32]
    mova      m11, [myq+48]
%define COEFF05  m8
%define COEFF14  m9
%define COEFF2   m10
%define COEFF3   m11
%else
%define COEFF05  [myq+ 0]
%define COEFF14  [myq+16]
%define COEFF2   [myq+32]
%define COEFF3   [myq+48]
%endif
.nextrow:
    mova      m6, m1
    movh      m5, [srcq+2*srcstrideq]      ; read new row
    paddw     m6, m4                       ; s[-1] + s[2]
    punpcklbw m5, m7
    pmullw    m6, COEFF14
    paddw     m0, m5                       ; s[-2] + s[3]
    pmullw    m0, COEFF05
    paddw     m6, m0
    mova      m0, m1                       ; rows slide up by one while summing
    paddw     m6, [pw_32]
    mova      m1, m2
    pmullw    m2, COEFF2
    paddw     m6, m2
    mova      m2, m3
    pmullw    m3, COEFF3
    paddw     m6, m3

    ; round/clip/store
    mova      m3, m4
    psraw     m6, 6
    mova      m4, m5
    STORE     m6, m5, %1                   ; m5 is free as scratch: its row now lives in m4

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd                      ; next row
    jg       .nextrow
    REP_RET
%endmacro

; FILTER_H put|avg
; Horizontal counterpart of FILTER_V, using the same word coefficient table:
;   dst[x] = clip8((c05*(s[x-2]+s[x+3]) + c14*(s[x-1]+s[x+2])
;                   + c2*s[x] + c3*s[x+1] + 32) >> 6)
; m6 holds the rounding constant and m7 is zero for the whole loop.
%macro FILTER_H  1
cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
%if ARCH_X86_64
    ; The prototype declares the strides as int, but they are used as full
    ; 64-bit registers for addressing: make the upper halves well defined.
    movsxd    dststrideq, dststrided
    movsxd    srcstrideq, srcstrided
%endif
    pxor      m7, m7
    LOAD      mx, sixtap_filter_v
    mova      m6, [pw_32]
    ; keep the coefficients in registers when m8-m11 exist, else use memory operands
%ifdef m8
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova      m10, [mxq+32]
    mova      m11, [mxq+48]
%define COEFF05  m8
%define COEFF14  m9
%define COEFF2   m10
%define COEFF3   m11
%else
%define COEFF05  [mxq+ 0]
%define COEFF14  [mxq+16]
%define COEFF2   [mxq+32]
%define COEFF3   [mxq+48]
%endif
.nextrow:
    movq      m0, [srcq-2]
    movq      m5, [srcq+3]
    movq      m1, [srcq-1]
    movq      m4, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m5, m7
    punpcklbw m1, m7
    punpcklbw m4, m7
    movq      m2, [srcq-0]
    movq      m3, [srcq+1]
    paddw     m0, m5                       ; s[x-2] + s[x+3]
    paddw     m1, m4                       ; s[x-1] + s[x+2]
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, COEFF05
    pmullw    m1, COEFF14
    pmullw    m2, COEFF2
    pmullw    m3, COEFF3
    paddw     m0, m6
    paddw     m1, m2
    paddw     m0, m3
    paddw     m0, m1
    psraw     m0, 6
    STORE     m0, m1, %1

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd                      ; next row
    jg       .nextrow
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX  mmx
FILTER_V  put
FILTER_H  put

INIT_MMX  mmxext
FILTER_V  avg
FILTER_H  avg

INIT_MMX  3dnow
FILTER_V  avg
FILTER_H  avg
%endif

INIT_XMM  sse2
FILTER_H  put
FILTER_H  avg
FILTER_V  put
FILTER_V  avg

; FILTER_SSSE3 put|avg
; Emits the SSSE3 vertical and horizontal 6-tap functions. They use the byte
; coefficient table with pmaddubsw, and pmulhrsw by pw_512 for the final
; rounding and shift. Prototype:
;   void ff_[put|avg]_rv40_qpel_[h|v]_ssse3(uint8_t *dst, int deststride,
;                                           uint8_t *src, int srcstride,
;                                           int len, int m);
%macro FILTER_SSSE3 1
; Vertical: m0..m4 = rows -2..+2 (still bytes), m5 = outer-tap coefficient
; pair, m6 = accumulator, m7 = newly read row +3.
cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
%if ARCH_X86_64
    ; The prototype declares the strides as int, but they are used as full
    ; 64-bit registers for addressing: make the upper halves well defined.
    movsxd    dststrideq, dststrided
    movsxd    srcstrideq, srcstrided
%endif

    ; read 5 lines
    sub       srcq, srcstrideq
    LOAD      my, sixtap_filter_hb
    sub       srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea       srcq, [srcq+srcstrideq*2]
    add       srcq, srcstrideq
    mova      m5, [myq]
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    lea       srcq, [srcq+2*srcstrideq]    ; srcq now points at row +3

.nextrow:
    mova      m6, m2
    punpcklbw m0, m1                       ; (s[-2], s[-1]) pairs
    punpcklbw m6, m3                       ; (s[0],  s[1])  pairs
    pmaddubsw m0, m5
    pmaddubsw m6, [myq+16]
    movh      m7, [srcq]                   ; read new row
    paddw     m6, m0
    mova      m0, m1                       ; rows slide up by one
    mova      m1, m2
    mova      m2, m3
    mova      m3, m4
    mova      m4, m7
    punpcklbw m7, m3                       ; (s[3], s[2]) pairs
    pmaddubsw m7, m5
    paddw     m6, m7
    pmulhrsw  m6, [pw_512]
    STORE     m6, m7, %1

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd                      ; next row
    jg       .nextrow
    REP_RET

; Horizontal: m3/m4/m7 = shuffle masks, m5/m6 = coefficient pairs.
; Each row loads 16 bytes starting at srcq-2.
cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
%if ARCH_X86_64
    ; The prototype declares the strides as int, but they are used as full
    ; 64-bit registers for addressing: make the upper halves well defined.
    movsxd    dststrideq, dststrided
    movsxd    srcstrideq, srcstrided
%endif
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
    LOAD      mx, sixtap_filter_hb
    mova      m5, [mxq] ; set up 6tap filter in bytes
    mova      m6, [mxq+16]
    mova      m7, [filter_h6_shuf1]

.nextrow:
    movu      m0, [srcq-2]
    mova      m1, m0
    mova      m2, m0
    pshufb    m0, m7                       ; (s[x-2], s[x-1])
    pshufb    m1, m3                       ; (s[x],   s[x+1])
    pshufb    m2, m4                       ; (s[x+3], s[x+2])
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m5
    paddw     m0, m1
    paddw     m0, m2
    pmulhrsw  m0, [pw_512]
    STORE     m0, m1, %1

    ; go to next line
    add       dstq, dststrideq
    add       srcq, srcstrideq
    dec       heightd                      ; next row
    jg       .nextrow
    REP_RET
%endmacro

INIT_XMM ssse3
FILTER_SSSE3  put
FILTER_SSSE3  avg

; RV40_WCORE: weight one group of pixels from two sources and store it.
; %1=5-bit weights?, %2=dst %3=src1 %4=src2 %5=stride if 8x8 block with XMM
; Expects: r6 = current (negative) offset, m0 = zero, m1 = rounding constant,
;          m2/m3 = weights as set up by RV40_WEIGHT.
; Clobbers m4-m7. With 5 arguments r6 is advanced by one stride on exit.
%macro RV40_WCORE  4-5
    movh       m4, [%3 + r6 + 0]
    movh       m5, [%4 + r6 + 0]
%if %0 == 4
    ; second half of the same row
%define OFFSET r6 + mmsize / 2
%else
    ; 8x8 block and XMM, stride was provided: second half is the next row
%define OFFSET r6
    add        r6, r5
%endif
    movh       m6, [%3 + OFFSET]
    movh       m7, [%4 + OFFSET]

%if %1 == 0
    ; 14-bit weights
    punpcklbw  m4, m0
    punpcklbw  m5, m0
    punpcklbw  m6, m0
    punpcklbw  m7, m0

    ; (pixel << 7) * weight >> 16, i.e. pixel * weight >> 9
    psllw      m4, 7
    psllw      m5, 7
    psllw      m6, 7
    psllw      m7, 7
    pmulhw     m4, m3
    pmulhw     m5, m2
    pmulhw     m6, m3
    pmulhw     m7, m2

    paddw      m4, m5
    paddw      m6, m7
%else
    ; 5-bit weights
%if cpuflag(ssse3)
    ; m3 holds both weights as interleaved bytes: one multiply-add does it all
    punpcklbw  m4, m5
    punpcklbw  m6, m7

    pmaddubsw  m4, m3
    pmaddubsw  m6, m3
%else
    punpcklbw  m4, m0
    punpcklbw  m5, m0
    punpcklbw  m6, m0
    punpcklbw  m7, m0

    pmullw     m4, m3
    pmullw     m5, m2
    pmullw     m6, m3
    pmullw     m7, m2
    paddw      m4, m5
    paddw      m6, m7
%endif

%endif

    ; bias and shift down
%if cpuflag(ssse3)
    pmulhrsw   m4, m1
    pmulhrsw   m6, m1
%else
    paddw      m4, m1
    paddw      m6, m1
    psrlw      m4, 5
    psrlw      m6, 5
%endif

    packuswb   m4, m6
%if %0 == 5
    ; Only called for 8x8 blocks and XMM: low half is the previous row,
    ; high half the row r6 currently points at
    sub        r6, r5
    movh       [%2 + r6], m4
    add        r6, r5
    movhps     [%2 + r6], m4
%else
    mova       [%2 + r6], m4
%endif
%endmacro

; MAIN_LOOP: body of one loop iteration of the weight functions.
; %1=block size (8 or 16)  %2=5-bit weights? (forwarded to RV40_WCORE)
; Processes one row, or two rows for 8x8 blocks with XMM registers, then
; advances r6. The final add must stay last: its zero flag ends the loop.
%macro MAIN_LOOP   2
%if mmsize == 8
    RV40_WCORE %2, r0, r1, r2
%if %1 == 16
    RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
%endif
%else
%ifidn %1, 8
    ; two rows per iteration, RV40_WCORE already stepped over the first one
    RV40_WCORE %2, r0, r1, r2, r5
%else
    RV40_WCORE %2, r0, r1, r2
%endif
%endif

    ; Prepare for next loop
    add        r6, r5

%endmacro

; void ff_rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2,
;                                int w1, int w2, int stride)
; %1=rnd|nornd  %2=block size (8 or 16)  %3=log2(block size)
; The weights are FP0.14 notation of fractions depending on pts.
; For timebases without rounding error (i.e. PAL), the fractions
; can be simplified, and several operations can be avoided.
; The "nornd" variants implement that simplified case and take 5-bit weights;
; the "rnd" variants take the full 14-bit weights. The test for weights that
; are multiples of 2^9 is not part of this file; presumably the caller does
; it when choosing the variant (to be confirmed against the C code).
%macro RV40_WEIGHT  3
cglobal rv40_weight_func_%1_%2, 6, 7, 8
%if cpuflag(ssse3)
    mova       m1, [pw_1024]
%else
    mova       m1, [pw_16]
%endif
    pxor       m0, m0
%if ARCH_X86_64
    ; stride is declared as int but used as a full 64-bit register below
    movsxd     r5, r5d
%endif
    ; Set loop counter and increments:
    ; point dst/src1/src2 past the block and count r6 from -stride*size up to 0
    mov        r6, r5
    shl        r6, %3
    add        r0, r6
    add        r1, r6
    add        r2, r6
    neg        r6

    movd       m2, r3d
    movd       m3, r4d
%ifidn %1,rnd
%define  RND   0
    SPLATW     m2, m2
%else
%define  RND   1
%if cpuflag(ssse3)
    punpcklbw  m3, m2                      ; pack (w2, w1) as a byte pair for pmaddubsw
%else
    SPLATW     m2, m2
%endif
%endif
    SPLATW     m3, m3

.loop:
    MAIN_LOOP  %2, RND
    jnz        .loop                       ; flags come from the last add in MAIN_LOOP
    REP_RET
%endmacro

INIT_MMX mmxext
RV40_WEIGHT   rnd,    8, 3
RV40_WEIGHT   rnd,   16, 4
RV40_WEIGHT   nornd,  8, 3
RV40_WEIGHT   nornd, 16, 4

INIT_XMM sse2
RV40_WEIGHT   rnd,    8, 3
RV40_WEIGHT   rnd,   16, 4
RV40_WEIGHT   nornd,  8, 3
RV40_WEIGHT   nornd, 16, 4

INIT_XMM ssse3
RV40_WEIGHT   rnd,    8, 3
RV40_WEIGHT   rnd,   16, 4
RV40_WEIGHT   nornd,  8, 3
RV40_WEIGHT   nornd, 16, 4