;******************************************************************************
;* Core video DSP functions
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

; slow vertical extension loop function. Works with variable-width, and
; does per-line reading/writing of source data

%macro V_COPY_ROW 2 ; type (top/body/bottom), h
.%1_y_loop:                                     ; do {
    mov              wq, r7mp                   ;   initialize w (r7mp = wmp)
.%1_x_loop:                                     ;   do {
    movu             m0, [srcq+wq]              ;     m0 = read($mmsize)
    movu      [dstq+wq], m0                     ;     write(m0, $mmsize)
    add              wq, mmsize                 ;     w -= $mmsize
    cmp              wq, -mmsize                ;   } while (w > $mmsize);
    jl .%1_x_loop
    movu             m0, [srcq-mmsize]          ;   m0 = read($mmsize)
    movu  [dstq-mmsize], m0                     ;   write(m0, $mmsize)
%ifidn %1, body                                 ;   if ($type == body) {
    add            srcq, src_strideq            ;     src += src_stride
%endif                                          ;   }
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec              %2                         ; } while (--$h);
    jnz .%1_y_loop
%endmacro
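
; A rough C sketch of one V_COPY_ROW invocation (illustrative only; assumes
; the emu_edge_vvar setup below, which advances src/dst past the row end and
; negates w so that [ptr+wq] walks the row from its left edge):
;
;   do {                                            // y loop
;       w = -width;
;       do {
;           memcpy(dst + w, src + w, mmsize);       // full vector copies
;           w += mmsize;
;       } while (w < -mmsize);
;       memcpy(dst - mmsize, src - mmsize, mmsize); // overlapping tail copy
;       if (type == body) src += src_stride;        // top/bottom re-read one line
;       dst += dst_stride;
;   } while (--h);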

; .----.         <- zero
; |    |         <- top is copied from first line in body of source
; |----|         <- start_y
; |    |         <- body is copied verbatim (line-by-line) from source
; |----|         <- end_y
; |    |         <- bottom is copied from last line in body of source
; '----'         <- bh
INIT_XMM sse
%if ARCH_X86_64
cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
                                start_y, end_y, bh, w
%else ; x86-32
cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
%define src_strideq r3mp
%define dst_strideq r1mp
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%endif
    sub             bhq, end_yq                 ; bh    -= end_y
    sub          end_yq, start_yq               ; end_y -= start_y
    add            srcq, r7mp                   ; (r7mp = wmp)
    add            dstq, r7mp                   ; (r7mp = wmp)
    neg            r7mp                         ; (r7mp = wmp)
    test       start_yq, start_yq               ; if (start_y) {
    jz .body
    V_COPY_ROW      top, start_yq               ;   v_copy_row(top, start_y)
.body:                                          ; }
    V_COPY_ROW     body, end_yq                 ; v_copy_row(body, end_y)
    test            bhq, bhq                    ; if (bh) {
    jz .end
    sub            srcq, src_strideq            ;   src -= src_stride
    V_COPY_ROW   bottom, bhq                    ;   v_copy_row(bottom, bh)
.end:                                           ; }
    RET

%macro hvar_fn 0
cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
    lea            dstq, [dstq+n_wordsq*2]
    neg        n_wordsq
    lea        start_xq, [start_xq+n_wordsq*2]
.y_loop:                                        ; do {
%if cpuflag(avx2)
    vpbroadcastb     m0, [dstq+start_xq]
    mov              wq, n_wordsq               ;   initialize w
%else
    movzx            wd, byte [dstq+start_xq]   ;   w = read(1)
    imul             wd, 0x01010101             ;   w *= 0x01010101
    movd             m0, wd
    mov              wq, n_wordsq               ;   initialize w
    pshufd           m0, m0, q0000              ;   splat
%endif ; avx2
.x_loop:                                        ;   do {
    movu    [dstq+wq*2], m0                     ;     write($reg, $mmsize)
    add              wq, mmsize/2               ;     w -= $mmsize/2
    cmp              wq, -(mmsize/2)            ;   } while (w > $mmsize/2)
    jl .x_loop
    movu  [dstq-mmsize], m0                     ;   write($reg, $mmsize)
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec              hq                         ; } while (--h)
    jnz .y_loop
    RET
%endmacro
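
; Note: in the non-avx2 path above, multiplying the zero-extended edge byte
; by 0x01010101 replicates it into all four bytes of a dword (e.g.
; 0x5A * 0x01010101 = 0x5A5A5A5A), and pshufd then broadcasts that dword
; across the whole register; avx2 does the same splat in one vpbroadcastb.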

INIT_XMM sse2
hvar_fn

%if HAVE_AVX2_EXTERNAL
INIT_XMM avx2
hvar_fn
%endif

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on sse, - fills xmm0-15 for consecutive sets of 16 pixels
;         - if (%2 & 8)  fills 8 bytes into mm$next
;         - if (%2 & 4)  fills 4 bytes into mm$next
;         - if (%2 & 3)  fills 1, 2 or 3 bytes in eax
; on mmx, - fills mm0-7 for consecutive sets of 8 pixels
;         - if (%2 & 4)  fills 4 bytes into mm$next
;         - if (%2 & 3)  fills 1, 2 or 3 bytes in eax
; where %2 is large enough, leftovers are instead merged into one wider,
; overlapping access ending at the last byte; writing data out works the
; same way
%macro READ_NUM_BYTES 2
%assign %%off 0     ; offset in source buffer
%assign %%mmx_idx 0 ; mmx register index
%assign %%xmm_idx 0 ; xmm register index

%rep %2/mmsize
%if mmsize == 16
    movu   xmm %+ %%xmm_idx, [srcq+%%off]
%assign %%xmm_idx %%xmm_idx+1
%else ; mmx
    movu    mm %+ %%mmx_idx, [srcq+%%off]
%assign %%mmx_idx %%mmx_idx+1
%endif
%assign %%off %%off+mmsize
%endrep ; %2/mmsize

%if mmsize == 16
%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
    movu   xmm %+ %%xmm_idx, [srcq+%2-16]
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
    movq    mm %+ %%mmx_idx, [srcq+%%off]
%assign %%mmx_idx %%mmx_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8
%endif

%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
    movq    mm %+ %%mmx_idx, [srcq+%2-8]
%assign %%off %2
%else
    movd    mm %+ %%mmx_idx, [srcq+%%off]
%assign %%off %%off+4
%endif
%assign %%mmx_idx %%mmx_idx+1
%endif ; (%2-%%off) >= 4

%if (%2-%%off) >= 1
%if %2 >= 4
    movd    mm %+ %%mmx_idx, [srcq+%2-4]
%elif (%2-%%off) == 1
    mov            valb, [srcq+%2-1]
%elif (%2-%%off) == 2
    mov            valw, [srcq+%2-2]
%else
    mov            valb, [srcq+%2-1]
    ror            vald, 16
    mov            valw, [srcq+%2-3]
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 2
%assign %%off 0     ; offset in destination buffer
%assign %%mmx_idx 0 ; mmx register index
%assign %%xmm_idx 0 ; xmm register index

%rep %2/mmsize
%if mmsize == 16
    movu   [dstq+%%off], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%else ; mmx
    movu   [dstq+%%off], mm %+ %%mmx_idx
%assign %%mmx_idx %%mmx_idx+1
%endif
%assign %%off %%off+mmsize
%endrep ; %2/mmsize

%if mmsize == 16
%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
    movu   [dstq+%2-16], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
    movq   [dstq+%%off], mm %+ %%mmx_idx
%assign %%mmx_idx %%mmx_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8
%endif

%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
    movq    [dstq+%2-8], mm %+ %%mmx_idx
%assign %%off %2
%else
    movd   [dstq+%%off], mm %+ %%mmx_idx
%assign %%off %%off+4
%endif
%assign %%mmx_idx %%mmx_idx+1
%endif ; (%2-%%off) >= 4

%if (%2-%%off) >= 1
%if %2 >= 4
    movd    [dstq+%2-4], mm %+ %%mmx_idx
%elif (%2-%%off) == 1
    mov     [dstq+%2-1], valb
%elif (%2-%%off) == 2
    mov     [dstq+%2-2], valw
%else
    mov     [dstq+%2-3], valw
    ror            vald, 16
    mov     [dstq+%2-1], valb
%ifnidn %1, body
    ror            vald, 16
%endif
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; WRITE_NUM_BYTES
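
; For illustration (macro expansions, not extra code in the build):
; READ_NUM_BYTES body, 22 under INIT_XMM expands to
;   movu xmm0, [srcq]      ; bytes  0..15
;   movq  mm0, [srcq+14]   ; bytes 14..21, overlapping two bytes already read
; so the matching WRITE_NUM_BYTES never touches a byte past dst+21, while
; READ_NUM_BYTES body, 3 (too narrow for a 4-byte overlapping load) falls
; back to eax:
;   mov valb, [srcq+2]
;   ror vald, 16
;   mov valw, [srcq]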

; vertical top/bottom extend and body copy fast loops
; these are set-width line copy functions, reached through function
; pointers: each reads a fixed number of pixels into a set of registers,
; and writes those out into the destination buffer
%macro VERTICAL_EXTEND 2
%assign %%n %1
%rep 1+%2-%1
%if %%n <= 3
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
                                       start_y, end_y, val, bh
    mov             bhq, r6mp                   ; r6mp = bhmp
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
    mov            dstq, r0mp
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%else
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
                                       start_y, end_y, bh
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
    mov            srcq, r2mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%endif

    ; FIXME move this to c wrapper?
    sub             bhq, end_yq                 ; bh    -= end_y
    sub          end_yq, start_yq               ; end_y -= start_y

    ; extend pixels above body
    test       start_yq, start_yq               ; if (start_y) {
    jz .body_loop
    READ_NUM_BYTES  top, %%n                    ;   $variable_regs = read($n)
.top_loop:                                      ;   do {
    WRITE_NUM_BYTES top, %%n                    ;     write($variable_regs, $n)
    add            dstq, dst_strideq            ;     dst += dst_stride
    dec        start_yq                         ;   } while (--start_y)
    jnz .top_loop                               ; }

    ; copy body pixels
.body_loop:                                     ; do {
    READ_NUM_BYTES body, %%n                    ;   $variable_regs = read($n)
    WRITE_NUM_BYTES body, %%n                   ;   write($variable_regs, $n)
    add            dstq, dst_strideq            ;   dst += dst_stride
    add            srcq, src_strideq            ;   src += src_stride
    dec          end_yq                         ; } while (--end_y)
    jnz .body_loop

    ; copy bottom pixels
    test            bhq, bhq                    ; if (bh) {
    jz .end
    sub            srcq, src_strideq            ;   src -= src_stride
    READ_NUM_BYTES bottom, %%n                  ;   $variable_regs = read($n)
.bottom_loop:                                   ;   do {
    WRITE_NUM_BYTES bottom, %%n                 ;     write($variable_regs, $n)
    add            dstq, dst_strideq            ;     dst += dst_stride
    dec             bhq                         ;   } while (--bh)
    jnz .bottom_loop                            ; }

.end:
    RET
%assign %%n %%n+1
%endrep ; 1+%2-%1
%endmacro ; VERTICAL_EXTEND

INIT_MMX mmx
VERTICAL_EXTEND 1, 15

INIT_XMM sse
VERTICAL_EXTEND 16, 22

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because the number of pixels to extend is
; not the same on both sides

%macro READ_V_PIXEL 2
%if cpuflag(avx2)
    vpbroadcastb     m0, %2
%else
    movzx          vald, byte %2
    imul           vald, 0x01010101
%if %1 >= 8
    movd             m0, vald
%if mmsize == 16
    pshufd           m0, m0, q0000
%else
    punpckldq        m0, m0
%endif ; mmsize == 16
%endif ; %1 >= 8
%endif ; avx2
%endmacro ; READ_V_PIXEL

%macro WRITE_V_PIXEL 2
%assign %%off 0

%if %1 >= 8

%rep %1/mmsize
    movu     [%2+%%off], m0
%assign %%off %%off+mmsize
%endrep ; %1/mmsize

%if mmsize == 16
%if %1-%%off >= 8
%if %1 > 16 && %1-%%off > 8
    movu     [%2+%1-16], m0
%assign %%off %1
%else
    movq     [%2+%%off], m0
%assign %%off %%off+8
%endif
%endif ; %1-%%off >= 8
%endif ; mmsize == 16

%if %1-%%off >= 4
%if %1 > 8 && %1-%%off > 4
    movq      [%2+%1-8], m0
%assign %%off %1
%else
    movd     [%2+%%off], m0
%assign %%off %%off+4
%endif
%endif ; %1-%%off >= 4

%else ; %1 < 8

%rep %1/4
    mov      [%2+%%off], vald
%assign %%off %%off+4
%endrep ; %1/4

%endif ; %1 >=/< 8

%if %1-%%off == 2
%if cpuflag(avx2)
    movd   [%2+%%off-2], m0
%else
    mov      [%2+%%off], valw
%endif ; avx2
%endif ; %1-%%off == 2
%endmacro ; WRITE_V_PIXEL

%macro H_EXTEND 2
%assign %%n %1
%rep 1+(%2-%1)/2
%if cpuflag(avx2)
cglobal emu_edge_hfix %+ %%n, 4, 4, 1, dst, dst_stride, start_x, bh
%else
cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
%endif
.loop_y:                                        ; do {
    READ_V_PIXEL    %%n, [dstq+start_xq]        ;   $variable_regs = read($n)
    WRITE_V_PIXEL   %%n, dstq                   ;   write($variable_regs, $n)
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec             bhq                         ; } while (--bh)
    jnz .loop_y
    RET
%assign %%n %%n+2
%endrep ; 1+(%2-%1)/2
%endmacro ; H_EXTEND
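
; Note on the trailing 2-byte case in WRITE_V_PIXEL: the scalar path stores
; valw exactly at the end of the row, while the avx2 path (which keeps no
; scalar copy of the pixel) issues an overlapping 4-byte movd ending at the
; same address; both are safe because the overlapped bytes hold the same
; splatted value. For example, emu_edge_hfix18 under sse2 becomes one movu
; (bytes 0..15) plus a 2-byte valw store at offset 16.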

INIT_MMX mmx
H_EXTEND 2, 14

INIT_XMM sse2
H_EXTEND 16, 22

%if HAVE_AVX2_EXTERNAL
INIT_XMM avx2
H_EXTEND 8, 22
%endif

INIT_MMX mmxext
cglobal prefetch, 3, 3, 0, buf, stride, h
.loop:
    prefetcht0 [bufq]
    add            bufq, strideq
    dec              hd
    jg .loop
    REP_RET
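
; For reference, the prefetch loop above is roughly this C (illustrative
; only; __builtin_prefetch is the GCC/Clang rough equivalent of prefetcht0):
;   do {
;       __builtin_prefetch(buf);
;       buf += stride;
;   } while (--h > 0);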