;
; jdsample.asm - upsampling (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; Syntax / target: NASM (Intel operand order), 32-bit x86 (BITS 32), SSE2.
; All four routines read their arguments from the stack at [frame+8..+20]
; and preserve ebx/esi/edi/ebp where they use them; eax/ecx/edx are
; treated as scratch.
;
; NOTE(review): the helper macros and symbols used below (EXTN, GLOBAL_DATA,
; GLOBAL_FUNCTION, alignz, alignx, get_GOT, GOTOFF, pushpic, poppic, movpic,
; SEG_CONST, SEG_TEXT, and the SIZEOF_* / JSAMP* / POINTER / INT names) are
; not defined in this file; they are presumably provided by jsimdext.inc.
;
; NOTE(review): every vector load/store here is MOVDQA, so all sample rows
; are assumed to be 16-byte aligned and padded to a whole number of XMM
; words -- confirm against the buffer allocator.

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_sse2)

; Word constants used by the fancy (triangle filter) upsamplers:
; PW_THREE is the centre-tap weight, the others are rounding biases.
EXTN(jconst_fancy_upsample_sse2):

PW_ONE   times 8 dw 1
PW_TWO   times 8 dw 2
PW_THREE times 8 dw 3
PW_SEVEN times 8 dw 7
PW_EIGHT times 8 dw 8

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Per input sample s[i], as computed by the code below:
;   out[2*i]   = (3 * s[i] + s[i-1] + 1) >> 2
;   out[2*i+1] = (3 * s[i] + s[i+1] + 2) >> 2
; At the left edge s[-1] is taken to be s[0]; at the right edge s[i+1] is
; taken to be the last byte of the final 16-byte block.
;
; Register roles:
;   eax  = colctr (rounded up to a multiple of SIZEOF_XMMWORD inside a row)
;   ecx  = rowctr
;   esi  = input_data (row pointer array) / inptr inside a row
;   edi  = output_data (row pointer array) / outptr inside a row
;   ebx  = GOT address (base for GOTOFF constant references)
;   dl   = scratch for the dummy-sample copy
;   xmm0 = all zeros (unpack partner)
;   xmm7 = sample carried in from the previous block (the "-1" neighbour)
;   xmm6 = sample borrowed from the next block (the "16" neighbour)
;
; Returns immediately if downsampled_width or max_v_samp_factor is zero.
;
; NOTE(review): when the width is not a multiple of SIZEOF_XMMWORD, one
; byte is written just past the last input sample of the row, so the input
; row is assumed to have at least one byte of writable padding -- confirm.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)

EXTN(jsimd_h2v1_fancy_upsample_sse2):
    push        ebp
    mov         ebp, esp
    pushpic     ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address

    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
    test        eax, eax
    jz          near .return

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]              ; output_data
    alignx      16, 7
.rowloop:
    push        eax                     ; colctr
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr

    ; If the width is not a whole number of XMM words, replicate the last
    ; real sample one position to the right so that the right-hand
    ; neighbour of the last sample is well defined.
    test        eax, SIZEOF_XMMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl  ; insert a dummy sample
.skip:
    pxor        xmm0, xmm0              ; xmm0=(all 0's)
    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; mask: lowest byte only
    pand        xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
                                        ; xmm7=sample 0, used as sample -1

    ; Round colctr up to a multiple of SIZEOF_XMMWORD.
    add         eax, byte SIZEOF_XMMWORD-1
    and         eax, byte -SIZEOF_XMMWORD
    cmp         eax, byte SIZEOF_XMMWORD
    ja          short .columnloop
    alignx      16, 7

.columnloop_last:
    ; Last block of the row: there is no next block, so the right-hand
    ; neighbour is the block's own highest byte.
    pcmpeqb     xmm6, xmm6
    pslldq      xmm6, (SIZEOF_XMMWORD-1)  ; mask: highest byte only
    pand        xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
    jmp         short .upsample
    alignx      16, 7

.columnloop:
    ; Not the last block: the right-hand neighbour is the first byte of
    ; the next block, moved into the highest byte position.
    movdqa      xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
    pslldq      xmm6, (SIZEOF_XMMWORD-1)

.upsample:
    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm1              ; xmm1=( 0  1  2 ... 13 14 15)
    pslldq      xmm2, 1                 ; xmm2=(--  0  1 ... 12 13 14)
    psrldq      xmm3, 1                 ; xmm3=( 1  2  3 ... 14 15 --)

    por         xmm2, xmm7              ; xmm2=(-1  0  1 ... 12 13 14)
    por         xmm3, xmm6              ; xmm3=( 1  2  3 ... 14 15 16)

    ; Carry this block's last sample into the next iteration.
    movdqa      xmm7, xmm1
    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)

    ; Widen bytes to words.
    movdqa      xmm4, xmm1
    punpcklbw   xmm1, xmm0              ; xmm1=( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm0              ; xmm4=( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm2
    punpcklbw   xmm2, xmm0              ; xmm2=(-1  0  1  2  3  4  5  6)
    punpckhbw   xmm5, xmm0              ; xmm5=( 7  8  9 10 11 12 13 14)
    movdqa      xmm6, xmm3
    punpcklbw   xmm3, xmm0              ; xmm3=( 1  2  3  4  5  6  7  8)
    punpckhbw   xmm6, xmm0              ; xmm6=( 9 10 11 12 13 14 15 16)

    ; centre * 3, neighbours + rounding bias
    pmullw      xmm1, [GOTOFF(ebx,PW_THREE)]
    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
    paddw       xmm2, [GOTOFF(ebx,PW_ONE)]
    paddw       xmm5, [GOTOFF(ebx,PW_ONE)]
    paddw       xmm3, [GOTOFF(ebx,PW_TWO)]
    paddw       xmm6, [GOTOFF(ebx,PW_TWO)]

    paddw       xmm2, xmm1
    paddw       xmm5, xmm4
    psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
    psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
    paddw       xmm3, xmm1
    paddw       xmm6, xmm4
    psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
    psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

    ; Interleave: even outputs stay in the low byte of each word, odd
    ; outputs are shifted into the high byte.
    psllw       xmm3, BYTE_BIT
    psllw       xmm6, BYTE_BIT
    por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
    por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5

    sub         eax, byte SIZEOF_XMMWORD
    add         esi, byte 1*SIZEOF_XMMWORD  ; inptr
    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
    cmp         eax, byte SIZEOF_XMMWORD
    ja          near .columnloop        ; more than one block left
    test        eax, eax
    jnz         near .columnloop_last   ; exactly one block left

    pop         esi
    pop         edi
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         ecx                        ; rowctr
    jg          near .rowloop

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    poppic      ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY *output_data_ptr);
;
; Each loop iteration consumes one input row (plus the rows directly above
; and below it) and produces two output rows.  The work is done in two
; passes over each 16-sample block:
;
;   vertical pass (word precision), stored temporarily in the output rows:
;     Int0[i] = 3 * row[0][i] + row[-1][i]      (upper output row)
;     Int1[i] = 3 * row[0][i] + row[+1][i]      (lower output row)
;   horizontal pass, for Int = Int0 and Int = Int1:
;     out[2*i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;     out[2*i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
;
; Register roles inside the row loop:
;   eax = colctr            ecx = inptr1 (row above)
;   ebx = inptr0            esi = inptr1 (row below)
;   edx = outptr0           edi = outptr1
;   ebx is temporarily swapped for the GOT address (saved in gotptr) around
;   the code that references the PW_* constants.
;
; Work area (16-byte aligned, addressed from the aligned ebp):
;   wk(0) / wk(1) = word carried in from the previous block (upper / lower)
;   wk(2) / wk(3) = word borrowed from the next block       (upper / lower)
;
; Returns immediately if downsampled_width or max_v_samp_factor is zero.
;
; NOTE(review): the code reads the row pointer at input_data[-1] and at
; input_data[+1], so the caller is assumed to supply valid context rows on
; both sides -- confirm.  As in the h2v1 case, one byte past the last sample
; of each of the three input rows is written when the width is not a
; multiple of SIZEOF_XMMWORD.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM        4
%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)

EXTN(jsimd_h2v2_fancy_upsample_sse2):
    ; Build a 16-byte-aligned frame: [ebp] holds the unaligned frame
    ; pointer, wk[] lives directly below the aligned ebp.
    push        ebp
    mov         eax, esp                     ; eax = original ebp
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax
    mov         ebp, esp                     ; ebp = aligned ebp
    lea         esp, [wk(0)]
    pushpic     eax                     ; make a room for GOT address
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address
    movpic      POINTER [gotptr], ebx   ; save GOT address

    mov         edx, eax                ; edx = original ebp
    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
    test        eax, eax
    jz          near .return

    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(edx)]  ; input_data
    mov         edi, POINTER [output_data_ptr(edx)]
    mov         edi, JSAMPARRAY [edi]              ; output_data
    alignx      16, 7
.rowloop:
    push        eax                     ; colctr
    push        ecx
    push        edi
    push        esi

    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1

    ; If the width is not a whole number of XMM words, replicate the last
    ; real sample of each of the three input rows one position to the
    ; right.  edx is saved because dl is used as the scratch byte.
    test        eax, SIZEOF_XMMWORD-1
    jz          short .skip
    push        edx
    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl  ; insert a dummy sample
    pop         edx
.skip:
    ; -- process the first column block

    movdqa      xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
    movdqa      xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
    movdqa      xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    pxor        xmm3, xmm3              ; xmm3=(all 0's)
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

    pmullw      xmm0, [GOTOFF(ebx,PW_THREE)]
    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]

    pcmpeqb     xmm7, xmm7
    psrldq      xmm7, (SIZEOF_XMMWORD-2)  ; mask: lowest word only

    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

    movdqa      XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
    movdqa      XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6

    ; Left edge: the "-1" neighbour is element 0 itself.
    pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
    pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)

    movdqa      XMMWORD [wk(0)], xmm1
    movdqa      XMMWORD [wk(1)], xmm2

    poppic      ebx

    ; Round colctr up to a multiple of SIZEOF_XMMWORD.
    add         eax, byte SIZEOF_XMMWORD-1
    and         eax, byte -SIZEOF_XMMWORD
    cmp         eax, byte SIZEOF_XMMWORD
    ja          short .columnloop
    alignx      16, 7

.columnloop_last:
    ; -- process the last column block
    ; Right edge: the "16" neighbour is the block's own element 15, taken
    ; from the intermediate data already stored in the output rows.

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    pcmpeqb     xmm1, xmm1
    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; mask: highest word only
    movdqa      xmm2, xmm1

    pand        xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
    pand        xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]

    movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
    movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)

    jmp         near .upsample
    alignx      16, 7

.columnloop:
    ; -- process the next column block
    ; Run the vertical pass one block ahead so that the horizontal pass of
    ; the current block can see the first word of the next block.

    movdqa      xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
    movdqa      xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    pxor        xmm3, xmm3              ; xmm3=(all 0's)
    movdqa      xmm4, xmm0
    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

    pmullw      xmm0, [GOTOFF(ebx,PW_THREE)]
    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]

    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

    movdqa      XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
    movdqa      XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6

    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
    pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)

    movdqa      XMMWORD [wk(2)], xmm1
    movdqa      XMMWORD [wk(3)], xmm2

.upsample:
    ; -- process the upper row

    movdqa      xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]

    movdqa      xmm0, xmm7                ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
    movdqa      xmm4, xmm3                ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
    psrldq      xmm0, 2                   ; xmm0=( 1  2  3  4  5  6  7 --)
    pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
    movdqa      xmm5, xmm7
    movdqa      xmm6, xmm3
    psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
    pslldq      xmm6, 2                   ; xmm6=(--  8  9 10 11 12 13 14)

    por         xmm0, xmm4                ; xmm0=( 1  2  3  4  5  6  7  8)
    por         xmm5, xmm6                ; xmm5=( 7  8  9 10 11 12 13 14)

    movdqa      xmm1, xmm7
    movdqa      xmm2, xmm3
    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
    psrldq      xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
    movdqa      xmm4, xmm3
    psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)

    por         xmm1, XMMWORD [wk(0)]     ; xmm1=(-1  0  1  2  3  4  5  6)
    por         xmm2, XMMWORD [wk(2)]     ; xmm2=( 9 10 11 12 13 14 15 16)

    movdqa      XMMWORD [wk(0)], xmm4     ; carry element 15 to the next block

    pmullw      xmm7, [GOTOFF(ebx,PW_THREE)]
    pmullw      xmm3, [GOTOFF(ebx,PW_THREE)]
    paddw       xmm1, [GOTOFF(ebx,PW_EIGHT)]
    paddw       xmm5, [GOTOFF(ebx,PW_EIGHT)]
    paddw       xmm0, [GOTOFF(ebx,PW_SEVEN)]
    paddw       xmm2, [GOTOFF(ebx,PW_SEVEN)]

    paddw       xmm1, xmm7
    paddw       xmm5, xmm3
    psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
    psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
    paddw       xmm0, xmm7
    paddw       xmm2, xmm3
    psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
    psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

    psllw       xmm0, BYTE_BIT
    psllw       xmm2, BYTE_BIT
    por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
    por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)

    ; Overwrite the temporary intermediate data with the final samples.
    movdqa      XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5

    ; -- process the lower row

    movdqa      xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
    movdqa      xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]

    movdqa      xmm7, xmm6                ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
    movdqa      xmm3, xmm4                ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
    psrldq      xmm7, 2                   ; xmm7=( 1  2  3  4  5  6  7 --)
    pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
    movdqa      xmm0, xmm6
    movdqa      xmm2, xmm4
    psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
    pslldq      xmm2, 2                   ; xmm2=(--  8  9 10 11 12 13 14)

    por         xmm7, xmm3                ; xmm7=( 1  2  3  4  5  6  7  8)
    por         xmm0, xmm2                ; xmm0=( 7  8  9 10 11 12 13 14)

    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm4
    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
    psrldq      xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
    movdqa      xmm3, xmm4
    psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)

    por         xmm1, XMMWORD [wk(1)]     ; xmm1=(-1  0  1  2  3  4  5  6)
    por         xmm5, XMMWORD [wk(3)]     ; xmm5=( 9 10 11 12 13 14 15 16)

    movdqa      XMMWORD [wk(1)], xmm3     ; carry element 15 to the next block

    pmullw      xmm6, [GOTOFF(ebx,PW_THREE)]
    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
    paddw       xmm1, [GOTOFF(ebx,PW_EIGHT)]
    paddw       xmm0, [GOTOFF(ebx,PW_EIGHT)]
    paddw       xmm7, [GOTOFF(ebx,PW_SEVEN)]
    paddw       xmm5, [GOTOFF(ebx,PW_SEVEN)]

    paddw       xmm1, xmm6
    paddw       xmm0, xmm4
    psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
    psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
    paddw       xmm7, xmm6
    paddw       xmm5, xmm4
    psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
    psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

    psllw       xmm7, BYTE_BIT
    psllw       xmm5, BYTE_BIT
    por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
    por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0

    poppic      ebx

    sub         eax, byte SIZEOF_XMMWORD
    add         ecx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
    add         ebx, byte 1*SIZEOF_XMMWORD  ; inptr0
    add         esi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
    add         edx, byte 2*SIZEOF_XMMWORD  ; outptr0
    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr1
    cmp         eax, byte SIZEOF_XMMWORD
    ja          near .columnloop        ; more than one block left
    test        eax, eax
    jnz         near .columnloop_last   ; exactly one block left

    pop         esi
    pop         edi
    pop         ecx
    pop         eax

    ; One input row yields two output rows.
    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         ecx, byte 2                  ; rowctr
    jg          near .rowloop

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input sample is simply written twice.  The column loop is unrolled
; by two: each half reads one XMM word of input and writes two XMM words of
; output.
;
; Register roles:
;   edx = output_width rounded up to a multiple of 2*SIZEOF_XMMWORD
;   eax = colctr (output bytes still to produce in the current row)
;   ecx = rowctr
;   esi = input_data / inptr,  edi = output_data / outptr
;
; Returns immediately if the rounded-up width or max_v_samp_factor is zero.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)

EXTN(jsimd_h2v1_upsample_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (2*SIZEOF_XMMWORD)-1
    and         edx, byte -(2*SIZEOF_XMMWORD)
    jz          short .return

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          short .return

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]              ; output_data
    alignx      16, 7
.rowloop:
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr
    mov         eax, edx                ; colctr
    alignx      16, 7
.columnloop:

    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]

    ; Unpacking a register with itself duplicates every byte.
    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0
    punpckhbw   xmm1, xmm1

    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

    sub         eax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]

    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2
    punpckhbw   xmm3, xmm3

    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3

    sub         eax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         edi, byte 4*SIZEOF_XMMWORD  ; outptr
    jmp         short .columnloop
    alignx      16, 7

.nextrow:
    pop         esi
    pop         edi

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         ecx                        ; rowctr
    jg          short .rowloop

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Same horizontal doubling as the h2v1 case, but each expanded block is
; stored to two consecutive output rows, and the row counter steps by two
; per input row.
;
; Register roles:
;   edx = output_width rounded up to a multiple of 2*SIZEOF_XMMWORD
;   eax = colctr (output bytes still to produce in the current row pair)
;   ecx = rowctr
;   esi = input_data / inptr
;   ebx = outptr0,  edi = output_data / outptr1
;
; Returns immediately if the rounded-up width or max_v_samp_factor is zero.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)

EXTN(jsimd_h2v2_upsample_sse2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (2*SIZEOF_XMMWORD)-1
    and         edx, byte -(2*SIZEOF_XMMWORD)
    jz          near .return

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]              ; output_data
    alignx      16, 7
.rowloop:
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]                    ; inptr
    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
    mov         eax, edx                               ; colctr
    alignx      16, 7
.columnloop:

    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]

    ; Unpacking a register with itself duplicates every byte.
    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm0
    punpckhbw   xmm1, xmm1

    movdqa      XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

    sub         eax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]

    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm2
    punpckhbw   xmm3, xmm3

    movdqa      XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3

    sub         eax, byte 2*SIZEOF_XMMWORD
    jz          short .nextrow

    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         ebx, byte 4*SIZEOF_XMMWORD  ; outptr0
    add         edi, byte 4*SIZEOF_XMMWORD  ; outptr1
    jmp         short .columnloop
    alignx      16, 7

.nextrow:
    pop         esi
    pop         edi

    ; One input row yields two output rows.
    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         ecx, byte 2                  ; rowctr
    jg          short .rowloop

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32