;******************************************************************************
;* Core video DSP functions
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

; slow vertical extension loop function. Works with variable-width, and
; does per-line reading/writing of source data

%macro V_COPY_ROW 2 ; type (top/body/bottom), h
.%1_y_loop:                                     ; do {
    mov              wq, r7mp                   ;   initialize w (r7mp = wmp)
.%1_x_loop:                                     ;   do {
    movu             m0, [srcq+wq]              ;     m0 = read($mmsize)
    movu      [dstq+wq], m0                     ;     write(m0, $mmsize)
    add              wq, mmsize                 ;     w -= $mmsize
    cmp              wq, -mmsize                ;   } while (w > $mmsize);
    jl .%1_x_loop
    movu             m0, [srcq-mmsize]          ;   m0 = read($mmsize)
    movu [dstq-mmsize], m0                      ;   write(m0, $mmsize)
%ifidn %1, body                                 ;   if ($type == body) {
    add            srcq, src_strideq            ;     src += src_stride
%endif                                          ;   }
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec              %2                         ; } while (--$h);
    jnz .%1_y_loop
%endmacro
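
; For reference, a rough C-level sketch of one V_COPY_ROW expansion (editorial
; illustration only; "type" and "h" mirror the macro arguments, and the caller
; below has already advanced src/dst to the end of a row and negated wmp):
;
;     do {                                            // per line
;         w = -width;
;         do {                                        // full vector blocks
;             memcpy(dst + w, src + w, mmsize);
;             w += mmsize;
;         } while (w < -mmsize);
;         memcpy(dst - mmsize, src - mmsize, mmsize); // overlapping tail block
;         if (type == body)
;             src += src_stride;                      // top/bottom reuse one line
;         dst += dst_stride;
;     } while (--h);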

%macro vvar_fn 0
; .----.          <- zero
; |    |          <- top is copied from first line in body of source
; |----|          <- start_y
; |    |          <- body is copied verbatim (line-by-line) from source
; |----|          <- end_y
; |    |          <- bottom is copied from last line in body of source
; '----'          <- bh
%if ARCH_X86_64
cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
                                start_y, end_y, bh, w
%else ; x86-32
cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
%define src_strideq r3mp
%define dst_strideq r1mp
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%endif
    sub             bhq, end_yq                 ; bh    -= end_y
    sub          end_yq, start_yq               ; end_y -= start_y
    add            srcq, r7mp                   ; (r7mp = wmp)
    add            dstq, r7mp                   ; (r7mp = wmp)
    neg            r7mp                         ; (r7mp = wmp)
    test       start_yq, start_yq               ; if (start_y) {
    jz .body
    V_COPY_ROW      top, start_yq               ;   v_copy_row(top, start_yq)
.body:                                          ; }
    V_COPY_ROW     body, end_yq                 ; v_copy_row(body, end_yq)
    test            bhq, bhq                    ; if (bh) {
    jz .end
    sub            srcq, src_strideq            ;   src -= src_stride
    V_COPY_ROW   bottom, bhq                    ;   v_copy_row(bottom, bh)
.end:                                           ; }
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
vvar_fn
%endif

INIT_XMM sse
vvar_fn

%macro hvar_fn 0
cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
    lea            dstq, [dstq+n_wordsq*2]
    neg        n_wordsq
    lea        start_xq, [start_xq+n_wordsq*2]
.y_loop:                                        ; do {
%if cpuflag(avx2)
    vpbroadcastb     m0, [dstq+start_xq]
    mov              wq, n_wordsq               ;   initialize w
%else
    movzx            wd, byte [dstq+start_xq]   ;   w = read(1)
    imul             wd, 0x01010101             ;   w *= 0x01010101
    movd             m0, wd
    mov              wq, n_wordsq               ;   initialize w
%if cpuflag(sse2)
    pshufd           m0, m0, q0000              ;   splat
%else ; mmx
    punpckldq        m0, m0                     ;   splat
%endif ; mmx/sse
%endif ; avx2
.x_loop:                                        ;   do {
    movu    [dstq+wq*2], m0                     ;     write($reg, $mmsize)
    add              wq, mmsize/2               ;     w -= $mmsize/2
    cmp              wq, -(mmsize/2)            ;   } while (w > $mmsize/2)
    jl .x_loop
    movu  [dstq-mmsize], m0                     ;   write($reg, $mmsize)
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec              hq                         ; } while (--h)
    jnz .y_loop
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
hvar_fn
%endif

INIT_XMM sse2
hvar_fn

%if HAVE_AVX2_EXTERNAL
INIT_XMM avx2
hvar_fn
%endif
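
; A rough C-level sketch of one emu_edge_hvar line loop (editorial illustration
; only; names follow the cglobal declaration above, and the splatted vector
; stores are shown as a plain memset):
;
;     do {
;         uint8_t v = dst[start_x];       // pixel to replicate
;         memset(dst, v, n_words * 2);    // vector stores; the last store
;                                         // overlaps the preceding one
;         dst += dst_stride;
;     } while (--h);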

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on sse, - fills xmm0-15 for consecutive sets of 16 pixels
;         - if (%2 & 8)  fills 8 bytes into mm$next
;         - if (%2 & 4)  fills 4 bytes into mm$next
;         - if (%2 & 3)  fills 1, 2 or 3 bytes in eax
; on mmx, - fills mm0-7 for consecutive sets of 8 pixels
;         - if (%2 & 4)  fills 4 bytes into mm$next
;         - if (%2 & 3)  fills 1, 2 or 3 bytes in eax
; writing data out is in the same way
%macro READ_NUM_BYTES 2
%assign %%off 0     ; offset in source buffer
%assign %%mmx_idx 0 ; mmx register index
%assign %%xmm_idx 0 ; xmm register index

%rep %2/mmsize
%if mmsize == 16
    movu   xmm %+ %%xmm_idx, [srcq+%%off]
%assign %%xmm_idx %%xmm_idx+1
%else ; mmx
    movu    mm %+ %%mmx_idx, [srcq+%%off]
%assign %%mmx_idx %%mmx_idx+1
%endif
%assign %%off %%off+mmsize
%endrep ; %2/mmsize

%if mmsize == 16
%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
    movu   xmm %+ %%xmm_idx, [srcq+%2-16]
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
    movq    mm %+ %%mmx_idx, [srcq+%%off]
%assign %%mmx_idx %%mmx_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8
%endif

%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
    movq    mm %+ %%mmx_idx, [srcq+%2-8]
%assign %%off %2
%else
    movd    mm %+ %%mmx_idx, [srcq+%%off]
%assign %%off %%off+4
%endif
%assign %%mmx_idx %%mmx_idx+1
%endif ; (%2-%%off) >= 4

%if (%2-%%off) >= 1
%if %2 >= 4
    movd    mm %+ %%mmx_idx, [srcq+%2-4]
%elif (%2-%%off) == 1
    mov    valb, [srcq+%2-1]
%elif (%2-%%off) == 2
    mov    valw, [srcq+%2-2]
%else
    mov    valb, [srcq+%2-1]
    ror    vald, 16
    mov    valw, [srcq+%2-3]
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 2
%assign %%off 0     ; offset in destination buffer
%assign %%mmx_idx 0 ; mmx register index
%assign %%xmm_idx 0 ; xmm register index

%rep %2/mmsize
%if mmsize == 16
    movu   [dstq+%%off], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%else ; mmx
    movu   [dstq+%%off], mm %+ %%mmx_idx
%assign %%mmx_idx %%mmx_idx+1
%endif
%assign %%off %%off+mmsize
%endrep ; %2/mmsize

%if mmsize == 16
%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
    movu   [dstq+%2-16], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
    movq   [dstq+%%off], mm %+ %%mmx_idx
%assign %%mmx_idx %%mmx_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8
%endif

%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
    movq   [dstq+%2-8], mm %+ %%mmx_idx
%assign %%off %2
%else
    movd   [dstq+%%off], mm %+ %%mmx_idx
%assign %%off %%off+4
%endif
%assign %%mmx_idx %%mmx_idx+1
%endif ; (%2-%%off) >= 4

%if (%2-%%off) >= 1
%if %2 >= 4
    movd   [dstq+%2-4], mm %+ %%mmx_idx
%elif (%2-%%off) == 1
    mov    [dstq+%2-1], valb
%elif (%2-%%off) == 2
    mov    [dstq+%2-2], valw
%else
    mov    [dstq+%2-3], valw
    ror    vald, 16
    mov    [dstq+%2-1], valb
%ifnidn %1, body
    ror    vald, 16
%endif
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; WRITE_NUM_BYTES
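
; Expansion examples for the macros above (editorial notes, not exhaustive):
; - READ_NUM_BYTES body, 22 with mmsize == 16 (sse) emits
;       movu xmm0, [srcq+ 0]      ; bytes  0..15
;       movq  mm0, [srcq+14]      ; bytes 14..21, overlapping bytes 14..15
; - READ_NUM_BYTES body, 10 with mmsize == 8 (mmx) emits
;       movu  mm0, [srcq+0]       ; bytes 0..7
;       movd  mm1, [srcq+6]       ; bytes 6..9, overlapping bytes 6..7
; - READ_NUM_BYTES body, 3 only uses vald: one byte from [srcq+2] and one word
;   from [srcq+0], combined with ror.
; WRITE_NUM_BYTES mirrors these reads with the corresponding stores.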

; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
%macro VERTICAL_EXTEND 2
%assign %%n %1
%rep 1+%2-%1
%if %%n <= 3
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
                                       start_y, end_y, val, bh
    mov             bhq, r6mp                   ; r6mp = bhmp
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
    mov            dstq, r0mp
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%else
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
                                       start_y, end_y, bh
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%endif
    ; FIXME move this to c wrapper?
    sub             bhq, end_yq                 ; bh    -= end_y
    sub          end_yq, start_yq               ; end_y -= start_y

    ; extend pixels above body
    test       start_yq, start_yq               ; if (start_y) {
    jz .body_loop
    READ_NUM_BYTES  top, %%n                    ;   $variable_regs = read($n)
.top_loop:                                      ;   do {
    WRITE_NUM_BYTES top, %%n                    ;     write($variable_regs, $n)
    add            dstq, dst_strideq            ;     dst += linesize
    dec        start_yq                         ;   } while (--start_y)
    jnz .top_loop                               ; }

    ; copy body pixels
.body_loop:                                     ; do {
    READ_NUM_BYTES  body, %%n                   ;   $variable_regs = read($n)
    WRITE_NUM_BYTES body, %%n                   ;   write($variable_regs, $n)
    add            dstq, dst_strideq            ;   dst += dst_stride
    add            srcq, src_strideq            ;   src += src_stride
    dec          end_yq                         ; } while (--end_y)
    jnz .body_loop

    ; copy bottom pixels
    test            bhq, bhq                    ; if (block_h) {
    jz .end
    sub            srcq, src_strideq            ;   src -= linesize
    READ_NUM_BYTES  bottom, %%n                 ;   $variable_regs = read($n)
.bottom_loop:                                   ;   do {
    WRITE_NUM_BYTES bottom, %%n                 ;     write($variable_regs, $n)
    add            dstq, dst_strideq            ;     dst += linesize
    dec             bhq                         ;   } while (--bh)
    jnz .bottom_loop                            ; }

.end:
    RET
%assign %%n %%n+1
%endrep ; 1+%2-%1
%endmacro ; VERTICAL_EXTEND

INIT_MMX mmx
VERTICAL_EXTEND 1, 15
%if ARCH_X86_32
VERTICAL_EXTEND 16, 22
%endif

INIT_XMM sse
VERTICAL_EXTEND 16, 22

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.

%macro READ_V_PIXEL 2
%if cpuflag(avx2)
    vpbroadcastb     m0, %2
%else
    movzx          vald, byte %2
    imul           vald, 0x01010101
%if %1 >= 8
    movd             m0, vald
%if mmsize == 16
    pshufd           m0, m0, q0000
%else
    punpckldq        m0, m0
%endif ; mmsize == 16
%endif ; %1 >= 8
%endif ; avx2
%endmacro ; READ_V_PIXEL

%macro WRITE_V_PIXEL 2
%assign %%off 0

%if %1 >= 8

%rep %1/mmsize
    movu     [%2+%%off], m0
%assign %%off %%off+mmsize
%endrep ; %1/mmsize

%if mmsize == 16
%if %1-%%off >= 8
%if %1 > 16 && %1-%%off > 8
    movu     [%2+%1-16], m0
%assign %%off %1
%else
    movq     [%2+%%off], m0
%assign %%off %%off+8
%endif
%endif ; %1-%%off >= 8
%endif ; mmsize == 16

%if %1-%%off >= 4
%if %1 > 8 && %1-%%off > 4
    movq      [%2+%1-8], m0
%assign %%off %1
%else
    movd     [%2+%%off], m0
%assign %%off %%off+4
%endif
%endif ; %1-%%off >= 4

%else ; %1 < 8

%rep %1/4
    mov      [%2+%%off], vald
%assign %%off %%off+4
%endrep ; %1/4

%endif ; %1 >=/< 8

%if %1-%%off == 2
%if cpuflag(avx2)
    movd   [%2+%%off-2], m0
%else
    mov      [%2+%%off], valw
%endif ; avx2
%endif ; %1-%%off == 2
%endmacro ; WRITE_V_PIXEL
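
; Expansion examples for READ_V_PIXEL/WRITE_V_PIXEL (editorial notes):
; - n == 22, sse2: the edge byte is splatted into m0 (imul 0x01010101 +
;   pshufd), then stored as movu [dstq+0] plus an overlapping movq [dstq+14].
; - n == 6, mmx: the splat stays in vald only (n < 8) and is stored as
;   mov [dstq+0], vald followed by mov [dstq+4], valw.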

%macro H_EXTEND 2
%assign %%n %1
%rep 1+(%2-%1)/2
%if cpuflag(avx2)
cglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh
%else
cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
%endif
.loop_y:                                        ; do {
    READ_V_PIXEL    %%n, [dstq+start_xq]        ;   $variable_regs = read($n)
    WRITE_V_PIXEL   %%n, dstq                   ;   write($variable_regs, $n)
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec             bhq                         ; } while (--bh)
    jnz .loop_y
    RET
%assign %%n %%n+2
%endrep ; 1+(%2-%1)/2
%endmacro ; H_EXTEND

INIT_MMX mmx
H_EXTEND 2, 14
%if ARCH_X86_32
H_EXTEND 16, 22
%endif

INIT_XMM sse2
H_EXTEND 16, 22

%if HAVE_AVX2_EXTERNAL
INIT_XMM avx2
H_EXTEND 8, 22
%endif

%macro PREFETCH_FN 1
cglobal prefetch, 3, 3, 0, buf, stride, h
.loop:
    %1     [bufq]
    add    bufq, strideq
    dec      hd
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PREFETCH_FN prefetcht0
%if ARCH_X86_32
INIT_MMX 3dnow
PREFETCH_FN prefetch
%endif
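
; Rough C equivalent of the prefetch loop above (editorial illustration only;
; __builtin_prefetch stands in for the prefetcht0 / 3dnow prefetch instruction):
;
;     do {
;         __builtin_prefetch(buf);
;         buf += stride;
;     } while (--h > 0);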