;
; jdsample.asm - upsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

; Word constants used by the "fancy" (triangle filter) upsamplers in this
; file.  Each entry is four 16-bit words, so one entry can be used directly
; as the memory operand of a packed-word MMX instruction.
;   PW_THREE       - multiplier for the nearer sample (pmullw)
;   PW_ONE/PW_TWO  - biases added before the ">> 2" in the h2v1 routine
;   PW_EIGHT/SEVEN - biases added before the ">> 4" in the h2v2 routine

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_mmx)

EXTN(jconst_fancy_upsample_mmx):

PW_ONE   times 4 dw 1
PW_TWO   times 4 dw 2
PW_THREE times 4 dw 3
PW_SEVEN times 4 dw 7
PW_EIGHT times 4 dw 8

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_mmx(int max_v_samp_factor,
;                               JDIMENSION downsampled_width,
;                               JSAMPARRAY input_data,
;                               JSAMPARRAY *output_data_ptr);
;
; Arguments are read from the stack relative to ebp (see the %defines below).
; Returns immediately if downsampled_width or max_v_samp_factor is zero.
;
; Register roles inside the loops:
;   eax = colctr (input samples left in the row, rounded up to a multiple
;         of SIZEOF_MMWORD before the column loop starts)
;   ecx = rowctr
;   esi = input_data / inptr      edi = output_data / outptr
;   ebx = GOT address (operand base for GOTOFF)
;   mm0 = all zeros (for byte->word unpacking)
;   mm7 = sample to the left of the current block, in the lowest byte
;   mm6 = sample to the right of the current block, in the highest byte
;
; Per input sample s[i] the code emits two output samples:
;   out[2i]   = (3 * s[i] + s[i-1] + 1) >> 2
;   out[2i+1] = (3 * s[i] + s[i+1] + 2) >> 2
; At the row edges the missing neighbor is replaced by the edge sample itself.
;
; NOTE: when the width is not a multiple of SIZEOF_MMWORD, one byte is written
; just past the last input sample of each row, and whole MMWORDs are always
; read/written, so the row buffers must be padded accordingly.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_mmx)

EXTN(jsimd_h2v1_fancy_upsample_mmx):
    push        ebp
    mov         ebp, esp
    pushpic     ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address

    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
    test        eax, eax
    jz          near .return

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]   ; output_data
    alignx      16, 7
.rowloop:
    push        eax                     ; colctr
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr

    ; If the width is not a multiple of SIZEOF_MMWORD, duplicate the last
    ; sample one position past the end so it serves as the right neighbor.
    test        eax, SIZEOF_MMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
    pxor        mm0, mm0                ; mm0=(all 0's)
    ; mm7 = first sample of the row in the lowest byte: it stands in for the
    ; nonexistent sample "-1" of the first block.
    pcmpeqb     mm7, mm7
    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT
    pand        mm7, MMWORD [esi+0*SIZEOF_MMWORD]

    ; Round colctr up to a whole number of MMWORDs.
    add         eax, byte SIZEOF_MMWORD-1
    and         eax, byte -SIZEOF_MMWORD
    cmp         eax, byte SIZEOF_MMWORD
    ja          short .columnloop
    alignx      16, 7

.columnloop_last:
    ; Last block of the row: the right neighbor "8" is the block's own
    ; sample 7, kept in the highest byte.
    pcmpeqb     mm6, mm6
    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
    pand        mm6, MMWORD [esi+0*SIZEOF_MMWORD]
    jmp         short .upsample
    alignx      16, 7

.columnloop:
    ; Not the last block: the right neighbor "8" is sample 0 of the next
    ; MMWORD, moved into the highest byte.
    movq        mm6, MMWORD [esi+1*SIZEOF_MMWORD]
    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT

.upsample:
    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm2, mm1
    movq        mm3, mm1                ; mm1=( 0  1  2  3  4  5  6  7)
    psllq       mm2, BYTE_BIT           ; mm2=( -  0  1  2  3  4  5  6)
    psrlq       mm3, BYTE_BIT           ; mm3=( 1  2  3  4  5  6  7  -)

    por         mm2, mm7                ; mm2=(-1  0  1  2  3  4  5  6)
    por         mm3, mm6                ; mm3=( 1  2  3  4  5  6  7  8)

    ; Carry this block's sample 7 forward as the next block's "-1".
    movq        mm7, mm1
    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7  -  -  -  -  -  -  -)

    ; Widen bytes to words so the weighted sums cannot overflow.
    movq        mm4, mm1
    punpcklbw   mm1, mm0                ; mm1=( 0  1  2  3)
    punpckhbw   mm4, mm0                ; mm4=( 4  5  6  7)
    movq        mm5, mm2
    punpcklbw   mm2, mm0                ; mm2=(-1  0  1  2)
    punpckhbw   mm5, mm0                ; mm5=( 3  4  5  6)
    movq        mm6, mm3
    punpcklbw   mm3, mm0                ; mm3=( 1  2  3  4)
    punpckhbw   mm6, mm0                ; mm6=( 5  6  7  8)

    pmullw      mm1, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
    paddw       mm2, [GOTOFF(ebx,PW_ONE)]
    paddw       mm5, [GOTOFF(ebx,PW_ONE)]
    paddw       mm3, [GOTOFF(ebx,PW_TWO)]
    paddw       mm6, [GOTOFF(ebx,PW_TWO)]

    paddw       mm2, mm1
    paddw       mm5, mm4
    psrlw       mm2, 2                  ; mm2=OutLE=( 0  2  4  6)
    psrlw       mm5, 2                  ; mm5=OutHE=( 8 10 12 14)
    paddw       mm3, mm1
    paddw       mm6, mm4
    psrlw       mm3, 2                  ; mm3=OutLO=( 1  3  5  7)
    psrlw       mm6, 2                  ; mm6=OutHO=( 9 11 13 15)

    ; Interleave even/odd results: odd samples go to the high byte of each
    ; word, even samples stay in the low byte.
    psllw       mm3, BYTE_BIT
    psllw       mm6, BYTE_BIT
    por         mm2, mm3                ; mm2=OutL=( 0  1  2  3  4  5  6  7)
    por         mm5, mm6                ; mm5=OutH=( 8  9 10 11 12 13 14 15)

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm5

    sub         eax, byte SIZEOF_MMWORD
    add         esi, byte 1*SIZEOF_MMWORD  ; inptr
    add         edi, byte 2*SIZEOF_MMWORD  ; outptr
    cmp         eax, byte SIZEOF_MMWORD
    ja          near .columnloop        ; more than one block left
    test        eax, eax
    jnz         near .columnloop_last   ; exactly one block left

    pop         esi
    pop         edi
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         ecx                        ; rowctr
    jg          near .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    poppic      ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_mmx(int max_v_samp_factor,
;                               JDIMENSION downsampled_width,
;                               JSAMPARRAY input_data,
;                               JSAMPARRAY *output_data_ptr);
;
; Returns immediately if downsampled_width or max_v_samp_factor is zero.
; Each pass of the row loop consumes one input row (plus the rows directly
; above and below it, read through input_data[-1] and input_data[+1]) and
; produces two output rows; rowctr is decremented by 2 per pass.
;
; The filter is applied in two steps:
;   1. vertical:   Int0 = 3 * row[0] + row[-1]     (upper output row)
;                  Int1 = 3 * row[0] + row[+1]     (lower output row)
;      The 16-bit intermediate words are parked in the output rows.
;   2. horizontal: out[2i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;                  out[2i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
;      The final bytes overwrite the parked intermediate data.
;
; Stack frame: ebp is realigned to SIZEOF_MMWORD and the caller's frame base
; is saved at [original_ebp].  Below ebp live WK_NUM MMWORD work slots:
;   wk(0) / wk(1) = left-neighbor word ("-1") for the upper / lower row
;   wk(2) / wk(3) = right-neighbor word ("8") for the upper / lower row
; followed by gotptr, a slot holding the GOT address (ebx doubles as inptr0
; in the column loop, so the GOT address is reloaded from gotptr when needed).
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM        4
%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_mmx)

EXTN(jsimd_h2v2_fancy_upsample_mmx):
    push        ebp
    mov         eax, esp                    ; eax = original ebp
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax
    mov         ebp, esp                    ; ebp = aligned ebp
    lea         esp, [wk(0)]
    pushpic     eax                     ; make a room for GOT address
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address
    movpic      POINTER [gotptr], ebx   ; save GOT address

    mov         edx, eax                ; edx = original ebp
    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
    test        eax, eax
    jz          near .return

    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(edx)]  ; input_data
    mov         edi, POINTER [output_data_ptr(edx)]
    mov         edi, JSAMPARRAY [edi]   ; output_data
    alignx      16, 7
.rowloop:
    push        eax                     ; colctr
    push        ecx
    push        edi
    push        esi

    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1

    ; If the width is not a multiple of SIZEOF_MMWORD, duplicate the last
    ; sample of each of the three input rows one position past the end.
    ; edx (outptr0) is saved because dl is used as the scratch byte.
    test        eax, SIZEOF_MMWORD-1
    jz          short .skip
    push        edx
    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    pop         edx
.skip:
    ; -- process the first column block

    movq        mm0, MMWORD [ebx+0*SIZEOF_MMWORD]  ; mm0=row[ 0][0]
    movq        mm1, MMWORD [ecx+0*SIZEOF_MMWORD]  ; mm1=row[-1][0]
    movq        mm2, MMWORD [esi+0*SIZEOF_MMWORD]  ; mm2=row[+1][0]

    pushpic     ebx                     ; park inptr0 while ebx holds the GOT
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    pxor        mm3, mm3                ; mm3=(all 0's)
    movq        mm4, mm0
    punpcklbw   mm0, mm3                ; mm0=row[ 0][0]( 0 1 2 3)
    punpckhbw   mm4, mm3                ; mm4=row[ 0][0]( 4 5 6 7)
    movq        mm5, mm1
    punpcklbw   mm1, mm3                ; mm1=row[-1][0]( 0 1 2 3)
    punpckhbw   mm5, mm3                ; mm5=row[-1][0]( 4 5 6 7)
    movq        mm6, mm2
    punpcklbw   mm2, mm3                ; mm2=row[+1][0]( 0 1 2 3)
    punpckhbw   mm6, mm3                ; mm6=row[+1][0]( 4 5 6 7)

    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]

    pcmpeqb     mm7, mm7
    psrlq       mm7, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm7=mask for the lowest word

    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)

    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1  ; temporarily save
    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5  ; the intermediate data
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm6

    ; The first block has no left neighbor: use its own word 0 as "-1".
    pand        mm1, mm7                ; mm1=( 0 - - -)
    pand        mm2, mm7                ; mm2=( 0 - - -)

    movq        MMWORD [wk(0)], mm1
    movq        MMWORD [wk(1)], mm2

    poppic      ebx                     ; ebx = inptr0 again

    ; Round colctr up to a whole number of MMWORDs.
    add         eax, byte SIZEOF_MMWORD-1
    and         eax, byte -SIZEOF_MMWORD
    cmp         eax, byte SIZEOF_MMWORD
    ja          short .columnloop
    alignx      16, 7

.columnloop_last:
    ; -- process the last column block

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    ; The last block has no right neighbor: use its own word 7 as "8".
    pcmpeqb     mm1, mm1
    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT
    movq        mm2, mm1

    pand        mm1, MMWORD [edx+1*SIZEOF_MMWORD]  ; mm1=( - - - 7)
    pand        mm2, MMWORD [edi+1*SIZEOF_MMWORD]  ; mm2=( - - - 7)

    movq        MMWORD [wk(2)], mm1
    movq        MMWORD [wk(3)], mm2

    jmp         short .upsample
    alignx      16, 7

.columnloop:
    ; -- process the next column block
    ; (vertical step for the block after the current one, so that its word 0
    ;  is available as the current block's right neighbor)

    movq        mm0, MMWORD [ebx+1*SIZEOF_MMWORD]  ; mm0=row[ 0][1]
    movq        mm1, MMWORD [ecx+1*SIZEOF_MMWORD]  ; mm1=row[-1][1]
    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]  ; mm2=row[+1][1]

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    pxor        mm3, mm3                ; mm3=(all 0's)
    movq        mm4, mm0
    punpcklbw   mm0, mm3                ; mm0=row[ 0][1]( 0 1 2 3)
    punpckhbw   mm4, mm3                ; mm4=row[ 0][1]( 4 5 6 7)
    movq        mm5, mm1
    punpcklbw   mm1, mm3                ; mm1=row[-1][1]( 0 1 2 3)
    punpckhbw   mm5, mm3                ; mm5=row[-1][1]( 4 5 6 7)
    movq        mm6, mm2
    punpcklbw   mm2, mm3                ; mm2=row[+1][1]( 0 1 2 3)
    punpckhbw   mm6, mm3                ; mm6=row[+1][1]( 4 5 6 7)

    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]

    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)

    movq        MMWORD [edx+2*SIZEOF_MMWORD], mm1  ; temporarily save
    movq        MMWORD [edx+3*SIZEOF_MMWORD], mm5  ; the intermediate data
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm6

    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
    psllq       mm2, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)

    movq        MMWORD [wk(2)], mm1
    movq        MMWORD [wk(3)], mm2

.upsample:
    ; -- process the upper row

    movq        mm7, MMWORD [edx+0*SIZEOF_MMWORD]  ; mm7=Int0L=( 0 1 2 3)
    movq        mm3, MMWORD [edx+1*SIZEOF_MMWORD]  ; mm3=Int0H=( 4 5 6 7)

    movq        mm0, mm7
    movq        mm4, mm3
    psrlq       mm0, 2*BYTE_BIT                  ; mm0=( 1 2 3 -)
    psllq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
    movq        mm5, mm7
    movq        mm6, mm3
    psrlq       mm5, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
    psllq       mm6, 2*BYTE_BIT                  ; mm6=( - 4 5 6)

    por         mm0, mm4                         ; mm0=( 1 2 3 4)
    por         mm5, mm6                         ; mm5=( 3 4 5 6)

    movq        mm1, mm7
    movq        mm2, mm3
    psllq       mm1, 2*BYTE_BIT                  ; mm1=( - 0 1 2)
    psrlq       mm2, 2*BYTE_BIT                  ; mm2=( 5 6 7 -)
    movq        mm4, mm3
    psrlq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)

    por         mm1, MMWORD [wk(0)]              ; mm1=(-1 0 1 2)
    por         mm2, MMWORD [wk(2)]              ; mm2=( 5 6 7 8)

    movq        MMWORD [wk(0)], mm4     ; word 7 becomes the next block's "-1"

    pmullw      mm7, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm3, [GOTOFF(ebx,PW_THREE)]
    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]
    paddw       mm5, [GOTOFF(ebx,PW_EIGHT)]
    paddw       mm0, [GOTOFF(ebx,PW_SEVEN)]
    paddw       mm2, [GOTOFF(ebx,PW_SEVEN)]

    paddw       mm1, mm7
    paddw       mm5, mm3
    psrlw       mm1, 4                  ; mm1=Out0LE=( 0  2  4  6)
    psrlw       mm5, 4                  ; mm5=Out0HE=( 8 10 12 14)
    paddw       mm0, mm7
    paddw       mm2, mm3
    psrlw       mm0, 4                  ; mm0=Out0LO=( 1  3  5  7)
    psrlw       mm2, 4                  ; mm2=Out0HO=( 9 11 13 15)

    psllw       mm0, BYTE_BIT
    psllw       mm2, BYTE_BIT
    por         mm1, mm0                ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
    por         mm5, mm2                ; mm5=Out0H=( 8  9 10 11 12 13 14 15)

    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1
    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5

    ; -- process the lower row

    movq        mm6, MMWORD [edi+0*SIZEOF_MMWORD]  ; mm6=Int1L=( 0 1 2 3)
    movq        mm4, MMWORD [edi+1*SIZEOF_MMWORD]  ; mm4=Int1H=( 4 5 6 7)

    movq        mm7, mm6
    movq        mm3, mm4
    psrlq       mm7, 2*BYTE_BIT                  ; mm7=( 1 2 3 -)
    psllq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
    movq        mm0, mm6
    movq        mm2, mm4
    psrlq       mm0, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
    psllq       mm2, 2*BYTE_BIT                  ; mm2=( - 4 5 6)

    por         mm7, mm3                         ; mm7=( 1 2 3 4)
    por         mm0, mm2                         ; mm0=( 3 4 5 6)

    movq        mm1, mm6
    movq        mm5, mm4
    psllq       mm1, 2*BYTE_BIT                  ; mm1=( - 0 1 2)
    psrlq       mm5, 2*BYTE_BIT                  ; mm5=( 5 6 7 -)
    movq        mm3, mm4
    psrlq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)

    por         mm1, MMWORD [wk(1)]              ; mm1=(-1 0 1 2)
    por         mm5, MMWORD [wk(3)]              ; mm5=( 5 6 7 8)

    movq        MMWORD [wk(1)], mm3     ; word 7 becomes the next block's "-1"

    pmullw      mm6, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]
    paddw       mm0, [GOTOFF(ebx,PW_EIGHT)]
    paddw       mm7, [GOTOFF(ebx,PW_SEVEN)]
    paddw       mm5, [GOTOFF(ebx,PW_SEVEN)]

    paddw       mm1, mm6
    paddw       mm0, mm4
    psrlw       mm1, 4                  ; mm1=Out1LE=( 0  2  4  6)
    psrlw       mm0, 4                  ; mm0=Out1HE=( 8 10 12 14)
    paddw       mm7, mm6
    paddw       mm5, mm4
    psrlw       mm7, 4                  ; mm7=Out1LO=( 1  3  5  7)
    psrlw       mm5, 4                  ; mm5=Out1HO=( 9 11 13 15)

    psllw       mm7, BYTE_BIT
    psllw       mm5, BYTE_BIT
    por         mm1, mm7                ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
    por         mm0, mm5                ; mm0=Out1H=( 8  9 10 11 12 13 14 15)

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm1
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm0

    poppic      ebx                     ; ebx = inptr0 again

    sub         eax, byte SIZEOF_MMWORD
    add         ecx, byte 1*SIZEOF_MMWORD  ; inptr1(above)
    add         ebx, byte 1*SIZEOF_MMWORD  ; inptr0
    add         esi, byte 1*SIZEOF_MMWORD  ; inptr1(below)
    add         edx, byte 2*SIZEOF_MMWORD  ; outptr0
    add         edi, byte 2*SIZEOF_MMWORD  ; outptr1
    cmp         eax, byte SIZEOF_MMWORD
    ja          near .columnloop        ; more than one block left
    test        eax, eax
    jnz         near .columnloop_last   ; exactly one block left

    pop         esi
    pop         edi
    pop         ecx
    pop         eax

    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         ecx, byte 2                  ; rowctr
    jg          near .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the
; common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input sample is written twice in a row (punpck?bw reg, reg).
; output_width is rounded up to a multiple of 2*SIZEOF_MMWORD, so each row is
; processed in steps of one input MMWORD -> two output MMWORDs; the column
; loop is unrolled to handle two such steps per iteration.
; Returns immediately if the rounded width or max_v_samp_factor is zero.
;
; Register roles:
;   edx = rounded output width      eax = colctr (output samples left)
;   ecx = rowctr
;   esi = input_data / inptr        edi = output_data / outptr
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_mmx)

EXTN(jsimd_h2v1_upsample_mmx):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (2*SIZEOF_MMWORD)-1
    and         edx, byte -(2*SIZEOF_MMWORD)
    jz          short .return

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          short .return

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]   ; output_data
    alignx      16, 7
.rowloop:
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr
    mov         eax, edx                ; colctr
    alignx      16, 7
.columnloop:

    ; first half of the unrolled step
    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]

    movq        mm1, mm0
    punpcklbw   mm0, mm0                ; duplicate samples 0-3
    punpckhbw   mm1, mm1                ; duplicate samples 4-7

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .nextrow

    ; second half of the unrolled step
    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]

    movq        mm3, mm2
    punpcklbw   mm2, mm2
    punpckhbw   mm3, mm3

    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .nextrow

    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
    add         edi, byte 4*SIZEOF_MMWORD  ; outptr
    jmp         short .columnloop
    alignx      16, 7

.nextrow:
    pop         esi
    pop         edi

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         ecx                        ; rowctr
    jg          short .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
;   pop         ebx                     ; unused
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Same horizontal duplication as the h2v1 routine, but every expanded MMWORD
; is stored to two consecutive output rows (outptr0 and outptr1).  Each pass
; of the row loop consumes one input row and two output rows; rowctr is
; decremented by 2 per pass.
; Returns immediately if the rounded width or max_v_samp_factor is zero.
;
; Register roles:
;   edx = rounded output width      eax = colctr (output samples left)
;   ecx = rowctr                    esi = input_data / inptr
;   ebx = outptr0                   edi = output_data / outptr1
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_mmx)

EXTN(jsimd_h2v2_upsample_mmx):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (2*SIZEOF_MMWORD)-1
    and         edx, byte -(2*SIZEOF_MMWORD)
    jz          near .return

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          short .return

    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]   ; output_data
    alignx      16, 7
.rowloop:
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]                    ; inptr
    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
    mov         eax, edx                               ; colctr
    alignx      16, 7
.columnloop:

    ; first half of the unrolled step
    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]

    movq        mm1, mm0
    punpcklbw   mm0, mm0                ; duplicate samples 0-3
    punpckhbw   mm1, mm1                ; duplicate samples 4-7

    movq        MMWORD [ebx+0*SIZEOF_MMWORD], mm0
    movq        MMWORD [ebx+1*SIZEOF_MMWORD], mm1
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .nextrow

    ; second half of the unrolled step
    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]

    movq        mm3, mm2
    punpcklbw   mm2, mm2
    punpckhbw   mm3, mm3

    movq        MMWORD [ebx+2*SIZEOF_MMWORD], mm2
    movq        MMWORD [ebx+3*SIZEOF_MMWORD], mm3
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .nextrow

    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
    add         ebx, byte 4*SIZEOF_MMWORD  ; outptr0
    add         edi, byte 4*SIZEOF_MMWORD  ; outptr1
    jmp         short .columnloop
    alignx      16, 7

.nextrow:
    pop         esi
    pop         edi

    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         ecx, byte 2                  ; rowctr
    jg          short .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32