;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

max_19bit_int: times 4 dd 0x7ffff
max_19bit_flt: times 4 dd 524287.0
minshort:      times 8 dw 0x8000
unicoeff:      times 4 dd 0x20000000

SECTION .text

;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
;                               (SwsContext *c, int{16,32}_t *dst,
;                                int dstW, const uint{8,16}_t *src,
;                                const int16_t *filter,
;                                const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8 or 16 bits wide
; ($source_width can be 8, 9, 10, 12, 14 or 16; the difference is whether we
; have to downscale before multiplying). The filter is 14 bits. Output is
; either 15 bits (in int16_t) or 19 bits (in int32_t), as given by
; $intermediate_nbits. Each output pixel is generated from $filterSize input
; pixels; the position of the first one is given in filterPos[nOutputPixel].
;-----------------------------------------------------------------------------
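
; As a rough C sketch (illustration only, not FFmpeg's reference code;
; "source_width" and "intermediate_nbits" stand for the template parameters
; described above), each generated function computes, per output pixel:
;
;     for (int i = 0; i < dstW; i++) {
;         int64_t val = 0;
;         for (int j = 0; j < filterSize; j++)
;             val += (int64_t)src[filterPos[i] + j] * filter[filterSize * i + j];
;         val >>= 14 + source_width - intermediate_nbits;
;         dst[i] = intermediate_nbits == 19 ? FFMIN(val, (1 << 19) - 1) // int32_t dst
;                                           : av_clip_int16(val);       // int16_t dst
;     }
;
; The SIMD code below computes the same sums for 2 or 4 output pixels per
; loop iteration.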

; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
%macro SCALE_FUNC 6
%ifnidn %3, X
cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
%else
cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
%endif
%if ARCH_X86_64
    movsxd        wq, wd
%define mov32 movsxd
%else ; x86-32
%define mov32 mov
%endif ; x86-64
%if %2 == 19
%if cpuflag(sse4)
    mova          m2, [max_19bit_int]
%else ; ssse3/sse2
    mova          m2, [max_19bit_flt]
%endif ; sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
    mova          m6, [minshort]
    mova          m7, [unicoeff]
%elif %1 == 8
    pxor          m3, m3
%endif ; %1 == 8/16

%if %1 == 8
%define movlh movd
%define movbh movh
%define srcmul 1
%else ; %1 == 9-16
%define movlh movq
%define movbh movu
%define srcmul 2
%endif ; %1 == 8/9-16

%ifnidn %3, X

    ; setup loop
%if %3 == 8
    shl           wq, 1                          ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
%define wshr 1
%else ; %3 == 4
%define wshr 0
%endif ; %3 == 8
    lea      filterq, [filterq+wq*8]
%if %2 == 15
    lea         dstq, [dstq+wq*(2>>wshr)]
%else ; %2 == 19
    lea         dstq, [dstq+wq*(4>>wshr)]
%endif ; %2 == 15/19
    lea      fltposq, [fltposq+wq*(4>>wshr)]
    neg           wq

.loop:
%if %3 == 4 ; filterSize == 4 scaling
    ; load 2x4 or 4x4 source pixels into m0/m1
    mov32      pos0q, dword [fltposq+wq*4+ 0]    ; filterPos[0]
    mov32      pos1q, dword [fltposq+wq*4+ 4]    ; filterPos[1]
    movlh         m0, [srcq+pos0q*srcmul]        ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
    movlh         m1, [srcq+pos1q*srcmul]        ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
    movhps        m0, [srcq+pos1q*srcmul]        ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
    movd          m4, [srcq+pos1q*srcmul]        ; src[filterPos[1] + {0,1,2,3}]
%endif
    mov32      pos0q, dword [fltposq+wq*4+ 8]    ; filterPos[2]
    mov32      pos1q, dword [fltposq+wq*4+12]    ; filterPos[3]
    movlh         m1, [srcq+pos0q*srcmul]        ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
    movhps        m1, [srcq+pos1q*srcmul]        ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
    movd          m5, [srcq+pos1q*srcmul]        ; src[filterPos[3] + {0,1,2,3}]
    punpckldq     m0, m4
    punpckldq     m1, m5
%endif ; %1 == 8
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                         ; byte -> word
    punpcklbw     m1, m3                         ; byte -> word
%endif ; %1 == 8

    ; multiply with filter coefficients
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
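             ; (with u = s + 0x8000 for each 16-bit sample, sum(u*coef) equals
             ;  sum(s*coef) + 0x8000*sum(coef); assuming the 14-bit filter taps
             ;  sum to 1.0 in Q14, i.e. 0x4000, the correction term is
             ;  0x8000*0x4000 = 0x20000000, which is exactly [unicoeff] in m7)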
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq+wq*8+mmsize*0]    ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [filterq+wq*8+mmsize*1]    ; *= filter[{8,9,..,14,15}]

    ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if notcpuflag(ssse3) ; sse2
    mova          m4, m0
    shufps        m0, m1, 10001000b
    shufps        m4, m1, 11011101b
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m0, m1                         ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
                                                 ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
                                                 ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
                                                 ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
    ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
    mov32      pos0q, dword [fltposq+wq*2+0]     ; filterPos[0]
    mov32      pos1q, dword [fltposq+wq*2+4]     ; filterPos[1]
    movbh         m0, [srcq+ pos0q   *srcmul]    ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
    movbh         m1, [srcq+(pos0q+4)*srcmul]    ; src[filterPos[0] + {4,5,6,7}]
    movbh         m4, [srcq+ pos1q   *srcmul]    ; src[filterPos[1] + {0,1,2,3}]
    movbh         m5, [srcq+(pos1q+4)*srcmul]    ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
    movbh         m1, [srcq+ pos1q   *srcmul]    ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
    mov32      pos0q, dword [fltposq+wq*2+8]     ; filterPos[2]
    mov32      pos1q, dword [fltposq+wq*2+12]    ; filterPos[3]
    movbh         m4, [srcq+ pos0q   *srcmul]    ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
    movbh         m5, [srcq+ pos1q   *srcmul]    ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                         ; byte -> word
    punpcklbw     m1, m3                         ; byte -> word
    punpcklbw     m4, m3                         ; byte -> word
    punpcklbw     m5, m3                         ; byte -> word
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
    psubw         m4, m6
    psubw         m5, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq+wq*8+mmsize*0]    ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [filterq+wq*8+mmsize*1]    ; *= filter[{8,9,..,14,15}]
    pmaddwd       m4, [filterq+wq*8+mmsize*2]    ; *= filter[{16,17,..,22,23}]
    pmaddwd       m5, [filterq+wq*8+mmsize*3]    ; *= filter[{24,25,..,30,31}]

    ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if notcpuflag(ssse3) ; sse2
%if %1 == 8
%define mex m6
%else
%define mex m3
%endif
    ; emulate horizontal add as transpose + vertical add
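    ; (with m0 = {a0,a1,a2,a3}, m1 = {b0..b3}, m4 = {c0..c3}, m5 = {d0..d3},
    ;  the dq unpacks + adds leave m0 = {a0+a2,b0+b2,a1+a3,b1+b3} and m4 the
    ;  same for c/d; the qdq unpacks then pair the low/high halves and the
    ;  final paddd yields m0 = {sum(a),sum(b),sum(c),sum(d)}, i.e. one dword
    ;  per output pixel)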
    mova         mex, m0
    punpckldq     m0, m1
    punpckhdq    mex, m1
    paddd         m0, mex
    mova          m1, m4
    punpckldq     m4, m5
    punpckhdq     m1, m5
    paddd         m4, m1
    mova          m1, m0
    punpcklqdq    m0, m4
    punpckhqdq    m1, m4
    paddd         m0, m1
%else ; ssse3/sse4
    ; FIXME if we rearrange the filter in pairs of 4, we can
    ; load pixels likewise and use 2 x paddd + phaddd instead
    ; of 3 x phaddd here, faster on older cpus
    phaddd        m0, m1
    phaddd        m4, m5
    phaddd        m0, m4                         ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
                                                 ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
                                                 ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
                                                 ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; sse2/ssse3/sse4
%endif ; %3 == 4/8

%else ; %3 == X, i.e. any filterSize scaling

%ifidn %4, X4
%define dlt 4
%else ; %4 == X || %4 == X8
%define dlt 0
%endif ; %4 ==/!= X4
%if ARCH_X86_64
%define srcq    r8
%define pos1q   r7
%define srcendq r9
    movsxd  fltsizeq, fltsized                   ; filterSize
    lea      srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
%else ; x86-32
%define srcq    srcmemq
%define pos1q   dstq
%define srcendq r6m
    lea        pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
    mov      srcendq, pos0q
%endif ; x86-32/64
    lea      fltposq, [fltposq+wq*4]
%if %2 == 15
    lea         dstq, [dstq+wq*2]
%else ; %2 == 19
    lea         dstq, [dstq+wq*4]
%endif ; %2 == 15/19
    movifnidn  dstmp, dstq
    neg           wq

.loop:
    mov32      pos0q, dword [fltposq+wq*4+0]     ; filterPos[0]
    mov32      pos1q, dword [fltposq+wq*4+4]     ; filterPos[1]
    ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
    pxor          m4, m4
    pxor          m5, m5
    mov         srcq, srcmemmp

.innerloop:
    ; load 2x8 (sse) source pixels into m0/m1 -> m4/m5
    movbh         m0, [srcq+ pos0q     *srcmul]  ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
    movbh         m1, [srcq+(pos1q+dlt)*srcmul]  ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
    punpcklbw     m0, m3
    punpcklbw     m1, m3
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq]                  ; filter[{0,1,2,3(,4,5,6,7)}]
    pmaddwd       m1, [filterq+(fltsizeq+dlt)*2] ; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
    paddd         m4, m0
    paddd         m5, m1
    add      filterq, mmsize
    add         srcq, srcmul*mmsize/2
    cmp         srcq, srcendq                    ; while (src += 4) < &src[filterSize]
    jl .innerloop

%ifidn %4, X4
    mov32      pos1q, dword [fltposq+wq*4+4]     ; filterPos[1]
    movlh         m0, [srcq+ pos0q     *srcmul]  ; split last 4 srcpx of dstpx[0]
    sub        pos1q, fltsizeq                   ; and first 4 srcpx of dstpx[1]
%if %1 > 8
    movhps        m0, [srcq+(pos1q+dlt)*srcmul]
%else ; %1 == 8
    movd          m1, [srcq+(pos1q+dlt)*srcmul]
    punpckldq     m0, m1
%endif ; %1 == 8
%if %1 == 8
    punpcklbw     m0, m3
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq]
%endif ; %4 == X4

    lea      filterq, [filterq+(fltsizeq+dlt)*2]

%if notcpuflag(ssse3) ; sse2
    mova          m1, m4
    punpcklqdq    m4, m5
    punpckhqdq    m1, m5
    paddd         m4, m1
%else ; ssse3/sse4
    phaddd        m4, m5
%endif ; sse2/ssse3/sse4
%ifidn %4, X4
    paddd         m4, m0
%endif ; %3 == X4
%if notcpuflag(ssse3) ; sse2
    pshufd        m4, m4, 11011000b
    movhlps       m0, m4
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m4, m4
    SWAP 0, 4
%endif ; sse2/ssse3/sse4
%endif ; %3 ==/!= X

%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
    paddd         m0, m7
%endif ; %1 == 16

    ; clip, store
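    ; (the sum of %1-bit samples times 14-bit coefficients is roughly %1+14
    ;  bits wide; shifting right by 14 + %1 - %2 leaves %2 significant bits,
    ;  e.g. by 7 for 8-bit input / 15-bit output, by 11 for 16-bit / 19-bit)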
    psrad         m0, 14 + %1 - %2
%ifidn %3, X
    movifnidn   dstq, dstmp
%endif ; %3 == X
%if %2 == 15
    packssdw      m0, m0
%ifnidn %3, X
    movh [dstq+wq*(2>>wshr)], m0
%else ; %3 == X
    movd [dstq+wq*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
    PMINSD        m0, m2, m4
%ifnidn %3, X
    mova [dstq+wq*(4>>wshr)], m0
%else ; %3 == X
    movq [dstq+wq*4], m0
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
    add           wq, (mmsize<<wshr)/4           ; both 8tap and 4tap really only do 4 pixels
                                                 ; per iteration; see "shl wq, 1" above for why we do this
%else ; %3 == X
    add           wq, 2
%endif ; %3 ==/!= X
    jl .loop
    REP_RET
%endmacro

; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
%macro SCALE_FUNCS 3
SCALE_FUNC %1, %2, 4, 4,  6, %3
SCALE_FUNC %1, %2, 8, 8,  6, %3
SCALE_FUNC %1, %2, X, X4, 7, %3
SCALE_FUNC %1, %2, X, X8, 7, %3
%endmacro

; SCALE_FUNCS2 8_xmm_args, 9to14_xmm_args, 16_xmm_args
%macro SCALE_FUNCS2 3
%if notcpuflag(sse4)
SCALE_FUNCS  8, 15, %1
SCALE_FUNCS  9, 15, %2
SCALE_FUNCS 10, 15, %2
SCALE_FUNCS 12, 15, %2
SCALE_FUNCS 14, 15, %2
SCALE_FUNCS 16, 15, %3
%endif ; !sse4
SCALE_FUNCS  8, 19, %1
SCALE_FUNCS  9, 19, %2
SCALE_FUNCS 10, 19, %2
SCALE_FUNCS 12, 19, %2
SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro

INIT_XMM sse2
SCALE_FUNCS2 7, 6, 8
INIT_XMM ssse3
SCALE_FUNCS2 6, 6, 8
INIT_XMM sse4
SCALE_FUNCS2 6, 6, 8
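
; The cglobal lines above expand (with the build's symbol prefix) to names
; such as ff_hscale8to15_4_sse2 or ff_hscale16to19_X8_sse4. As a purely
; illustrative sketch (hypothetical dispatch, not FFmpeg's actual init code),
; a caller would pick a variant by filter size roughly like this:
;
;     if      (filterSize == 4) hscale = ff_hscale8to15_4_sse2;
;     else if (filterSize == 8) hscale = ff_hscale8to15_8_sse2;
;     else if (filterSize & 4)  hscale = ff_hscale8to15_X4_sse2; // filterSize % 8 == 4
;     else                      hscale = ff_hscale8to15_X8_sse2; // filterSize % 8 == 0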