;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

max_19bit_int: times 4 dd 0x7ffff
max_19bit_flt: times 4 dd 524287.0
minshort:      times 8 dw 0x8000
unicoeff:      times 4 dd 0x20000000

SECTION .text

;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
;                               (SwsContext *c, int{16,32}_t *dst,
;                                int dstW, const uint{8,16}_t *src,
;                                const int16_t *filter,
;                                const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8 bits or 9-16 bits wide
; ($source_width can be 8, 9, 10, 12, 14 or 16; the difference is whether we
; have to downscale before multiplying). The filter coefficients are 14 bits.
; Output is either 15 bits (in int16_t) or 19 bits (in int32_t), as given by
; $intermediate_nbits. Each output pixel is generated from $filterSize input
; pixels; the position of the first pixel is given in filterPos[nOutputPixel].
;-----------------------------------------------------------------------------
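; A rough scalar reference for the loops below (an illustrative sketch only,
; not the exact C code used elsewhere in swscale; variable names follow the
; prototype above, and FFMIN stands in for the packssdw/PMINSD clipping used
; by the 15-bit and 19-bit paths):
;
;   for (i = 0; i < dstW; i++) {
;       int j, val = 0;
;       for (j = 0; j < filterSize; j++)
;           val += src[filterPos[i] + j] * filter[filterSize * i + j];
;       dst[i] = FFMIN(val >> (14 + source_width - intermediate_nbits),
;                      (1 << intermediate_nbits) - 1);
;   }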

; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
%macro SCALE_FUNC 6
%ifnidn %3, X
cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
%else
cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
%endif
%if ARCH_X86_64
    movsxd        wq, wd
%define mov32 movsxd
%else ; x86-32
%define mov32 mov
%endif ; x86-64
%if %2 == 19
%if mmsize == 8 ; mmx
    mova          m2, [max_19bit_int]
%elif cpuflag(sse4)
    mova          m2, [max_19bit_int]
%else ; ssse3/sse2
    mova          m2, [max_19bit_flt]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
    mova          m6, [minshort]
    mova          m7, [unicoeff]
%elif %1 == 8
    pxor          m3, m3
%endif ; %1 == 8/16

%if %1 == 8
%define movlh movd
%define movbh movh
%define srcmul 1
%else ; %1 == 9-16
%define movlh movq
%define movbh movu
%define srcmul 2
%endif ; %1 == 8/9-16

%ifnidn %3, X

    ; setup loop
%if %3 == 8
    shl           wq, 1                         ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
%define wshr 1
%else ; %3 == 4
%define wshr 0
%endif ; %3 == 8
    lea           filterq, [filterq+wq*8]
%if %2 == 15
    lea           dstq, [dstq+wq*(2>>wshr)]
%else ; %2 == 19
    lea           dstq, [dstq+wq*(4>>wshr)]
%endif ; %2 == 15/19
    lea           fltposq, [fltposq+wq*(4>>wshr)]
    neg           wq

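    ; note: dstq, fltposq and filterq were advanced to their end positions
    ; above and wq was negated, so the loop below indexes them with a
    ; negative offset that counts up towards zero; "jl .loop" stops looping
    ; once all dstW output pixels have been written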
.loop:
%if %3 == 4 ; filterSize == 4 scaling
    ; load 2x4 or 4x4 source pixels into m0/m1
    mov32         pos0q, dword [fltposq+wq*4+ 0] ; filterPos[0]
    mov32         pos1q, dword [fltposq+wq*4+ 4] ; filterPos[1]
    movlh         m0, [srcq+pos0q*srcmul]        ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
    movlh         m1, [srcq+pos1q*srcmul]        ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
    movhps        m0, [srcq+pos1q*srcmul]        ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
    movd          m4, [srcq+pos1q*srcmul]        ; src[filterPos[1] + {0,1,2,3}]
%endif
    mov32         pos0q, dword [fltposq+wq*4+ 8] ; filterPos[2]
    mov32         pos1q, dword [fltposq+wq*4+12] ; filterPos[3]
    movlh         m1, [srcq+pos0q*srcmul]        ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
    movhps        m1, [srcq+pos1q*srcmul]        ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
    movd          m5, [srcq+pos1q*srcmul]        ; src[filterPos[3] + {0,1,2,3}]
    punpckldq     m0, m4
    punpckldq     m1, m5
%endif ; %1 == 8
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                         ; byte -> word
    punpcklbw     m1, m3                         ; byte -> word
%endif ; %1 == 8

    ; multiply with filter coefficients
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq+wq*8+mmsize*0]    ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [filterq+wq*8+mmsize*1]    ; *= filter[{8,9,..,14,15}]

    ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if mmsize == 8 ; mmx
    movq          m4, m0
    punpckldq     m0, m1
    punpckhdq     m4, m1
    paddd         m0, m4
%elif notcpuflag(ssse3) ; sse2
    mova          m4, m0
    shufps        m0, m1, 10001000b
    shufps        m4, m1, 11011101b
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m0, m1                         ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
                                                 ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
                                                 ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
                                                 ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; mmx/sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
    ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
    mov32         pos0q, dword [fltposq+wq*2+0]  ; filterPos[0]
    mov32         pos1q, dword [fltposq+wq*2+4]  ; filterPos[1]
    movbh         m0, [srcq+ pos0q   *srcmul]    ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
    movbh         m1, [srcq+(pos0q+4)*srcmul]    ; src[filterPos[0] + {4,5,6,7}]
    movbh         m4, [srcq+ pos1q   *srcmul]    ; src[filterPos[1] + {0,1,2,3}]
    movbh         m5, [srcq+(pos1q+4)*srcmul]    ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
    movbh         m1, [srcq+ pos1q   *srcmul]    ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
    mov32         pos0q, dword [fltposq+wq*2+8]  ; filterPos[2]
    mov32         pos1q, dword [fltposq+wq*2+12] ; filterPos[3]
    movbh         m4, [srcq+ pos0q   *srcmul]    ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
    movbh         m5, [srcq+ pos1q   *srcmul]    ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                         ; byte -> word
    punpcklbw     m1, m3                         ; byte -> word
    punpcklbw     m4, m3                         ; byte -> word
    punpcklbw     m5, m3                         ; byte -> word
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
    psubw         m4, m6
    psubw         m5, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq+wq*8+mmsize*0]    ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [filterq+wq*8+mmsize*1]    ; *= filter[{8,9,..,14,15}]
    pmaddwd       m4, [filterq+wq*8+mmsize*2]    ; *= filter[{16,17,..,22,23}]
    pmaddwd       m5, [filterq+wq*8+mmsize*3]    ; *= filter[{24,25,..,30,31}]

    ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if mmsize == 8
    paddd         m0, m1
    paddd         m4, m5
    movq          m1, m0
    punpckldq     m0, m4
    punpckhdq     m1, m4
    paddd         m0, m1
%elif notcpuflag(ssse3) ; sse2
%if %1 == 8
%define mex m6
%else
%define mex m3
%endif
    ; emulate horizontal add as transpose + vertical add
    mova          mex, m0
    punpckldq     m0, m1
    punpckhdq     mex, m1
    paddd         m0, mex
    mova          m1, m4
    punpckldq     m4, m5
    punpckhdq     m1, m5
    paddd         m4, m1
    mova          m1, m0
    punpcklqdq    m0, m4
    punpckhqdq    m1, m4
    paddd         m0, m1
%else ; ssse3/sse4
    ; FIXME if we rearrange the filter in pairs of 4, we can
    ; load pixels likewise and use 2 x paddd + phaddd instead
    ; of 3 x phaddd here, faster on older cpus
    phaddd        m0, m1
    phaddd        m4, m5
    phaddd        m0, m4                         ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
                                                 ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
                                                 ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
                                                 ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %3 == 4/8

%else ; %3 == X, i.e. any filterSize scaling

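    ; run-time filterSize: the outer .loop below produces 2 output pixels per
    ; iteration, and .innerloop accumulates mmsize/2 filter taps for each of
    ; them into m4/m5 per pass; the X4 variant (dlt == 4) stops the inner
    ; loop 4 taps early and handles the remaining group of 4 separately,
    ; pairing the last 4 taps of pixel 0 with the first 4 taps of pixel 1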
%ifidn %4, X4
%define dlt 4
%else ; %4 == X || %4 == X8
%define dlt 0
%endif ; %4 ==/!= X4
%if ARCH_X86_64
%define srcq    r8
%define pos1q   r7
%define srcendq r9
    movsxd        fltsizeq, fltsized            ; filterSize
    lea           srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
%else ; x86-32
%define srcq    srcmemq
%define pos1q   dstq
%define srcendq r6m
    lea           pos0q, [srcmemq+(fltsizeq-dlt)*srcmul]   ; &src[filterSize&~4]
    mov           srcendq, pos0q
%endif ; x86-32/64
    lea           fltposq, [fltposq+wq*4]
%if %2 == 15
    lea           dstq, [dstq+wq*2]
%else ; %2 == 19
    lea           dstq, [dstq+wq*4]
%endif ; %2 == 15/19
    movifnidn     dstmp, dstq
    neg           wq

.loop:
    mov32         pos0q, dword [fltposq+wq*4+0] ; filterPos[0]
    mov32         pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
    ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
    pxor          m4, m4
    pxor          m5, m5
    mov           srcq, srcmemmp

.innerloop:
    ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
    movbh         m0, [srcq+ pos0q     *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
    movbh         m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
    punpcklbw     m0, m3
    punpcklbw     m1, m3
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq]                 ; filter[{0,1,2,3(,4,5,6,7)}]
    pmaddwd       m1, [filterq+(fltsizeq+dlt)*2] ; filter[filterSize+{0,1,2,3(,4,5,6,7)}]
    paddd         m4, m0
    paddd         m5, m1
    add           filterq, mmsize
    add           srcq, srcmul*mmsize/2
    cmp           srcq, srcendq                 ; while (src += 4) < &src[filterSize]
    jl .innerloop

%ifidn %4, X4
    mov32         pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
    movlh         m0, [srcq+ pos0q     *srcmul] ; split last 4 srcpx of dstpx[0]
    sub           pos1q, fltsizeq               ; and first 4 srcpx of dstpx[1]
%if %1 > 8
    movhps        m0, [srcq+(pos1q+dlt)*srcmul]
%else ; %1 == 8
    movd          m1, [srcq+(pos1q+dlt)*srcmul]
    punpckldq     m0, m1
%endif ; %1 == 8
%if %1 == 8
    punpcklbw     m0, m3
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq]
%endif ; %4 == X4

    lea           filterq, [filterq+(fltsizeq+dlt)*2]

%if mmsize == 8 ; mmx
    movq          m0, m4
    punpckldq     m4, m5
    punpckhdq     m0, m5
    paddd         m0, m4
%else ; mmsize == 16
%if notcpuflag(ssse3) ; sse2
    mova          m1, m4
    punpcklqdq    m4, m5
    punpckhqdq    m1, m5
    paddd         m4, m1
%else ; ssse3/sse4
    phaddd        m4, m5
%endif ; sse2/ssse3/sse4
%ifidn %4, X4
    paddd         m4, m0
%endif ; %4 == X4
%if notcpuflag(ssse3) ; sse2
    pshufd        m4, m4, 11011000b
    movhlps       m0, m4
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m4, m4
    SWAP           0, 4
%endif ; sse2/ssse3/sse4
%endif ; mmsize == 8/16
%endif ; %3 ==/!= X

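    ; for 16-bit input, the psubw by minshort above biased every sample into
    ; signed range for pmaddwd; assuming the filter coefficients sum to 1.0
    ; in Q14 (0x4000, which is how swscale normally normalizes its filters),
    ; the bias in the sum is 0x8000 * 0x4000 = 0x20000000, which is exactly
    ; the unicoeff constant added back below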
%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
    paddd         m0, m7
%endif ; %1 == 16

    ; clip, store
    psrad         m0, 14 + %1 - %2
%ifidn %3, X
    movifnidn     dstq, dstmp
%endif ; %3 == X
%if %2 == 15
    packssdw      m0, m0
%ifnidn %3, X
    movh          [dstq+wq*(2>>wshr)], m0
%else ; %3 == X
    movd          [dstq+wq*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
    PMINSD        m0, m2, m4
%ifnidn %3, X
    mova          [dstq+wq*(4>>wshr)], m0
%else ; %3 == X
    movq          [dstq+wq*4], m0
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
    add           wq, (mmsize<<wshr)/4          ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
                                                ; per iteration; see "shl wq, 1" above for why we do this
%else ; %3 == X
    add           wq, 2
%endif ; %3 ==/!= X
    jl .loop
    REP_RET
%endmacro

; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
%macro SCALE_FUNCS 3
SCALE_FUNC %1, %2, 4, 4,  6, %3
SCALE_FUNC %1, %2, 8, 8,  6, %3
%if mmsize == 8
SCALE_FUNC %1, %2, X, X,  7, %3
%else
SCALE_FUNC %1, %2, X, X4, 7, %3
SCALE_FUNC %1, %2, X, X8, 7, %3
%endif
%endmacro

; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
%macro SCALE_FUNCS2 3
%if notcpuflag(sse4)
SCALE_FUNCS  8, 15, %1
SCALE_FUNCS  9, 15, %2
SCALE_FUNCS 10, 15, %2
SCALE_FUNCS 12, 15, %2
SCALE_FUNCS 14, 15, %2
SCALE_FUNCS 16, 15, %3
%endif ; !sse4
SCALE_FUNCS  8, 19, %1
SCALE_FUNCS  9, 19, %2
SCALE_FUNCS 10, 19, %2
SCALE_FUNCS 12, 19, %2
SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
SCALE_FUNCS2 0, 0, 0
%endif
INIT_XMM sse2
SCALE_FUNCS2 7, 6, 8
INIT_XMM ssse3
SCALE_FUNCS2 6, 6, 8
INIT_XMM sse4
SCALE_FUNCS2 6, 6, 8
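
; The INIT_*/SCALE_FUNCS2 invocations above expand to one function per
; combination of source width, output depth, filter-size variant and cpu
; flavour, e.g. ff_hscale8to15_4_sse2 or ff_hscale16to19_X8_sse4 (cglobal
; applies the ff_ prefix); the sse4 block only adds the 19-bit versions,
; presumably because pminsd only benefits the 19-bit clipping path.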