;
; jdsample.asm - upsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fancy_upsample_mmx)

; Constant table for the "fancy" (triangle filter) upsampling routines in
; this file.  Each entry is one 16-bit value repeated four times, i.e. one
; value per word lane of an 8-byte operand.  Usage in this file:
;   PW_THREE           - pmullw weight applied to the nearer sample
;   PW_ONE,   PW_TWO   - added before the ">> 2" in the h2v1 fancy routine
;   PW_EIGHT, PW_SEVEN - added before the ">> 4" in the h2v2 fancy routine
EXTN(jconst_fancy_upsample_mmx):

PW_ONE   times 4 dw 1
PW_TWO   times 4 dw 2
PW_THREE times 4 dw 3
PW_SEVEN times 4 dw 7
PW_EIGHT times 4 dw 8

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_mmx(int max_v_samp_factor,
;                               JDIMENSION downsampled_width,
;                               JSAMPARRAY input_data,
;                               JSAMPARRAY *output_data_ptr);
;
; Arguments are read from the stack through ebp (offsets +8 .. +20, see the
; macros below).  The routine returns immediately when downsampled_width or
; max_v_samp_factor is zero.
;
; For every input sample s[i] two output samples are produced:
;   out[2i]   = (3 * s[i] + s[i-1] + 1) >> 2
;   out[2i+1] = (3 * s[i] + s[i+1] + 2) >> 2
; In the first block the left neighbour of s[0] is s[0] itself; in the last
; block the right neighbour of the block's last sample is that sample itself.
;
; Register usage:
;   eax = colctr (rounded up to a multiple of SIZEOF_MMWORD before the
;         column loop)
;   ecx = rowctr
;   esi = input_data, then inptr inside a row
;   edi = output_data, then outptr inside a row
;   ebx = GOT address, used via GOTOFF() to reach the PW_* constants
;   dl  = scratch for the dummy sample
;   mm0 = all zeros (used to unpack bytes to words)
;   mm7 = left neighbour of the current block, in the lowest byte
;   mm6 = right neighbour of the current block, in the highest byte
;
; NOTE: when downsampled_width is not a multiple of SIZEOF_MMWORD the routine
; writes one byte into the input row (at index downsampled_width), and output
; is always stored in whole 2*SIZEOF_MMWORD chunks.  Presumably the caller
; guarantees the row buffers are padded accordingly - confirm against the
; caller.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_mmx)

EXTN(jsimd_h2v1_fancy_upsample_mmx):
    push        ebp
    mov         ebp, esp
    pushpic     ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address

    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
    test        eax, eax
    jz          near .return

    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         edi, JSAMPARRAY [edi]                ; output_data
    alignx      16, 7
.rowloop:
    ; save the per-row state; it is restored after the column loop
    push        eax                     ; colctr
    push        edi
    push        esi

    mov         esi, JSAMPROW [esi]     ; inptr
    mov         edi, JSAMPROW [edi]     ; outptr

    ; if the width is not a multiple of SIZEOF_MMWORD, copy the last real
    ; sample into the slot right after it
    test        eax, SIZEOF_MMWORD-1
    jz          short .skip
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
    pxor        mm0, mm0                ; mm0=(all 0's)
    ; mm7 = first sample of the row in the lowest byte: the left neighbour
    ; used for the first block
    pcmpeqb     mm7, mm7
    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT
    pand        mm7, MMWORD [esi+0*SIZEOF_MMWORD]

    ; round colctr up to a whole number of MMWORD blocks
    add         eax, byte SIZEOF_MMWORD-1
    and         eax, byte -SIZEOF_MMWORD
    cmp         eax, byte SIZEOF_MMWORD
    ja          short .columnloop
    alignx      16, 7

.columnloop_last:
    ; last block: the right neighbour is the block's own last sample
    pcmpeqb     mm6, mm6
    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
    pand        mm6, MMWORD [esi+0*SIZEOF_MMWORD]
    jmp         short .upsample
    alignx      16, 7

.columnloop:
    ; other blocks: the right neighbour is the first sample of the next block
    movq        mm6, MMWORD [esi+1*SIZEOF_MMWORD]
    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT

.upsample:
    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm2, mm1
    movq        mm3, mm1                ; mm1=( 0 1 2 3 4 5 6 7)
    psllq       mm2, BYTE_BIT           ; mm2=( - 0 1 2 3 4 5 6)
    psrlq       mm3, BYTE_BIT           ; mm3=( 1 2 3 4 5 6 7 -)

    por         mm2, mm7                ; mm2=(-1 0 1 2 3 4 5 6)
    por         mm3, mm6                ; mm3=( 1 2 3 4 5 6 7 8)

    ; keep this block's last sample as the next block's left neighbour
    movq        mm7, mm1
    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7 - - - - - - -)

    ; widen bytes to words
    movq        mm4, mm1
    punpcklbw   mm1, mm0                ; mm1=( 0 1 2 3)
    punpckhbw   mm4, mm0                ; mm4=( 4 5 6 7)
    movq        mm5, mm2
    punpcklbw   mm2, mm0                ; mm2=(-1 0 1 2)
    punpckhbw   mm5, mm0                ; mm5=( 3 4 5 6)
    movq        mm6, mm3
    punpcklbw   mm3, mm0                ; mm3=( 1 2 3 4)
    punpckhbw   mm6, mm0                ; mm6=( 5 6 7 8)

    ; mm1/mm4 = 3*current, mm2/mm5 = left+1, mm3/mm6 = right+2
    pmullw      mm1, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
    paddw       mm2, [GOTOFF(ebx,PW_ONE)]
    paddw       mm5, [GOTOFF(ebx,PW_ONE)]
    paddw       mm3, [GOTOFF(ebx,PW_TWO)]
    paddw       mm6, [GOTOFF(ebx,PW_TWO)]

    paddw       mm2, mm1
    paddw       mm5, mm4
    psrlw       mm2, 2                  ; mm2=OutLE=( 0  2  4  6)
    psrlw       mm5, 2                  ; mm5=OutHE=( 8 10 12 14)
    paddw       mm3, mm1
    paddw       mm6, mm4
    psrlw       mm3, 2                  ; mm3=OutLO=( 1  3  5  7)
    psrlw       mm6, 2                  ; mm6=OutHO=( 9 11 13 15)

    ; interleave: even outputs stay in the low byte of each word, odd
    ; outputs are moved to the high byte
    psllw       mm3, BYTE_BIT
    psllw       mm6, BYTE_BIT
    por         mm2, mm3                ; mm2=OutL=( 0  1  2  3  4  5  6  7)
    por         mm5, mm6                ; mm5=OutH=( 8  9 10 11 12 13 14 15)

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm5

    sub         eax, byte SIZEOF_MMWORD
    add         esi, byte 1*SIZEOF_MMWORD  ; inptr
    add         edi, byte 2*SIZEOF_MMWORD  ; outptr
    cmp         eax, byte SIZEOF_MMWORD
    ja          near .columnloop        ; more than one block left
    test        eax, eax
    jnz         near .columnloop_last   ; exactly one block left

    pop         esi
    pop         edi
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW  ; input_data
    add         edi, byte SIZEOF_JSAMPROW  ; output_data
    dec         ecx                        ; rowctr
    jg          near .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    poppic      ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_mmx(int max_v_samp_factor,
;                               JDIMENSION downsampled_width,
;                               JSAMPARRAY input_data,
;                               JSAMPARRAY *output_data_ptr);
;
; Each pass of the row loop reads three input rows (the current one plus the
; rows at index -1 and +1 relative to input_data) and writes two output rows;
; rowctr is decremented by 2 per pass.  The routine returns immediately when
; downsampled_width or max_v_samp_factor is zero.
;
; Vertical step (16-bit intermediates, parked temporarily in the output rows):
;   Int0[i] = 3 * row[0][i] + row[-1][i]        (upper output row)
;   Int1[i] = 3 * row[0][i] + row[+1][i]        (lower output row)
; Horizontal step, applied to each intermediate row Int:
;   out[2i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
;   out[2i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
; In the first block Int[-1] is Int[0]; in the last block the right
; neighbour of the block's last value is that value itself.
;
; Register and stack usage:
;   eax = colctr (rounded up to a multiple of SIZEOF_MMWORD before the
;         column loop)
;   ecx = rowctr outside the column loop, inptr1(above) inside it
;   ebx = inptr0 inside the column loop; swapped with the GOT address
;         between each pushpic/poppic pair
;   esi = input_data, then inptr1(below)
;   edx = outptr0
;   edi = output_data, then outptr1
;   wk(0), wk(1) = left-neighbour intermediate (lowest word) for the upper
;                  and lower output row
;   wk(2), wk(3) = right-neighbour intermediate (highest word) for the upper
;                  and lower output row
;   gotptr       = saved GOT address, just below wk(0)
;
; NOTE: when downsampled_width is not a multiple of SIZEOF_MMWORD the routine
; writes one byte into each of the three input rows (at index
; downsampled_width), and output is stored in whole MMWORD chunks.
; Presumably the caller guarantees the row buffers are padded accordingly -
; confirm against the caller.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM        4
%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_mmx)

EXTN(jsimd_h2v2_fancy_upsample_mmx):
    ; Build an MMWORD-aligned frame.  eax keeps the stack pointer as it was
    ; right after "push ebp", so the arguments are at eax+8 and up; the same
    ; value is stored at [ebp] so the epilogue can restore esp from it.
    push        ebp
    mov         eax, esp                    ; eax = original ebp
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax
    mov         ebp, esp                    ; ebp = aligned ebp
    lea         esp, [wk(0)]                ; reserve wk[0..WK_NUM-1]
    pushpic     eax                     ; make a room for GOT address
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address
    movpic      POINTER [gotptr], ebx   ; save GOT address

    mov         edx, eax                ; edx = original ebp
    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
    test        eax, eax
    jz          near .return

    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
    test        ecx, ecx
    jz          near .return

    mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
    mov         edi, POINTER [output_data_ptr(edx)]
    mov         edi, JSAMPARRAY [edi]                ; output_data
    alignx      16, 7
.rowloop:
    ; save the per-row state; all four registers are reused below
    push        eax                     ; colctr
    push        ecx
    push        edi
    push        esi

    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1

    ; if the width is not a multiple of SIZEOF_MMWORD, copy the last real
    ; sample of each of the three input rows into the slot right after it
    ; (edx is saved because dl is used as scratch)
    test        eax, SIZEOF_MMWORD-1
    jz          short .skip
    push        edx
    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    pop         edx
.skip:
    ; -- process the first column block

    movq        mm0, MMWORD [ebx+0*SIZEOF_MMWORD]  ; mm0=row[ 0][0]
    movq        mm1, MMWORD [ecx+0*SIZEOF_MMWORD]  ; mm1=row[-1][0]
    movq        mm2, MMWORD [esi+0*SIZEOF_MMWORD]  ; mm2=row[+1][0]

    ; ebx (inptr0) is parked on the stack while it holds the GOT address
    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    pxor        mm3, mm3                ; mm3=(all 0's)
    movq        mm4, mm0
    punpcklbw   mm0, mm3                ; mm0=row[ 0][0]( 0 1 2 3)
    punpckhbw   mm4, mm3                ; mm4=row[ 0][0]( 4 5 6 7)
    movq        mm5, mm1
    punpcklbw   mm1, mm3                ; mm1=row[-1][0]( 0 1 2 3)
    punpckhbw   mm5, mm3                ; mm5=row[-1][0]( 4 5 6 7)
    movq        mm6, mm2
    punpcklbw   mm2, mm3                ; mm2=row[+1][0]( 0 1 2 3)
    punpckhbw   mm6, mm3                ; mm6=row[+1][0]( 4 5 6 7)

    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]

    ; mm7 = mask selecting the lowest word
    pcmpeqb     mm7, mm7
    psrlq       mm7, (SIZEOF_MMWORD-2)*BYTE_BIT

    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)

    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1  ; temporarily save
    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5  ; the intermediate data
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm6

    pand        mm1, mm7                ; mm1=( 0 - - -)
    pand        mm2, mm7                ; mm2=( 0 - - -)

    ; left neighbours for the first block: the first intermediate itself
    movq        MMWORD [wk(0)], mm1
    movq        MMWORD [wk(1)], mm2

    poppic      ebx

    ; round colctr up to a whole number of MMWORD blocks
    add         eax, byte SIZEOF_MMWORD-1
    and         eax, byte -SIZEOF_MMWORD
    cmp         eax, byte SIZEOF_MMWORD
    ja          short .columnloop
    alignx      16, 7

.columnloop_last:
    ; -- process the last column block

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    ; right neighbours for the last block: the block's own last intermediate
    pcmpeqb     mm1, mm1
    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT
    movq        mm2, mm1

    pand        mm1, MMWORD [edx+1*SIZEOF_MMWORD]  ; mm1=( - - - 7)
    pand        mm2, MMWORD [edi+1*SIZEOF_MMWORD]  ; mm2=( - - - 7)

    movq        MMWORD [wk(2)], mm1
    movq        MMWORD [wk(3)], mm2

    jmp         short .upsample
    alignx      16, 7

.columnloop:
    ; -- process the next column block

    movq        mm0, MMWORD [ebx+1*SIZEOF_MMWORD]  ; mm0=row[ 0][1]
    movq        mm1, MMWORD [ecx+1*SIZEOF_MMWORD]  ; mm1=row[-1][1]
    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]  ; mm2=row[+1][1]

    pushpic     ebx
    movpic      ebx, POINTER [gotptr]   ; load GOT address

    pxor        mm3, mm3                ; mm3=(all 0's)
    movq        mm4, mm0
    punpcklbw   mm0, mm3                ; mm0=row[ 0][1]( 0 1 2 3)
    punpckhbw   mm4, mm3                ; mm4=row[ 0][1]( 4 5 6 7)
    movq        mm5, mm1
    punpcklbw   mm1, mm3                ; mm1=row[-1][1]( 0 1 2 3)
    punpckhbw   mm5, mm3                ; mm5=row[-1][1]( 4 5 6 7)
    movq        mm6, mm2
    punpcklbw   mm2, mm3                ; mm2=row[+1][1]( 0 1 2 3)
    punpckhbw   mm6, mm3                ; mm6=row[+1][1]( 4 5 6 7)

    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]

    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)

    ; the next block's intermediates go one block ahead in the output rows;
    ; they are picked up at offset 0 after the pointers advance
    movq        MMWORD [edx+2*SIZEOF_MMWORD], mm1  ; temporarily save
    movq        MMWORD [edx+3*SIZEOF_MMWORD], mm5  ; the intermediate data
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm6

    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
    psllq       mm2, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)

    ; right neighbours for the current block: first intermediate of the next
    movq        MMWORD [wk(2)], mm1
    movq        MMWORD [wk(3)], mm2

.upsample:
    ; -- process the upper row

    movq        mm7, MMWORD [edx+0*SIZEOF_MMWORD]  ; mm7=Int0L=( 0 1 2 3)
    movq        mm3, MMWORD [edx+1*SIZEOF_MMWORD]  ; mm3=Int0H=( 4 5 6 7)

    movq        mm0, mm7
    movq        mm4, mm3
    psrlq       mm0, 2*BYTE_BIT                  ; mm0=( 1 2 3 -)
    psllq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
    movq        mm5, mm7
    movq        mm6, mm3
    psrlq       mm5, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
    psllq       mm6, 2*BYTE_BIT                  ; mm6=( - 4 5 6)

    por         mm0, mm4                         ; mm0=( 1 2 3 4)
    por         mm5, mm6                         ; mm5=( 3 4 5 6)

    movq        mm1, mm7
    movq        mm2, mm3
    psllq       mm1, 2*BYTE_BIT                  ; mm1=( - 0 1 2)
    psrlq       mm2, 2*BYTE_BIT                  ; mm2=( 5 6 7 -)
    movq        mm4, mm3
    psrlq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)

    por         mm1, MMWORD [wk(0)]              ; mm1=(-1 0 1 2)
    por         mm2, MMWORD [wk(2)]              ; mm2=( 5 6 7 8)

    ; this block's last intermediate is the next block's left neighbour
    movq        MMWORD [wk(0)], mm4

    pmullw      mm7, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm3, [GOTOFF(ebx,PW_THREE)]
    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]
    paddw       mm5, [GOTOFF(ebx,PW_EIGHT)]
    paddw       mm0, [GOTOFF(ebx,PW_SEVEN)]
    paddw       mm2, [GOTOFF(ebx,PW_SEVEN)]

    paddw       mm1, mm7
    paddw       mm5, mm3
    psrlw       mm1, 4                  ; mm1=Out0LE=( 0  2  4  6)
    psrlw       mm5, 4                  ; mm5=Out0HE=( 8 10 12 14)
    paddw       mm0, mm7
    paddw       mm2, mm3
    psrlw       mm0, 4                  ; mm0=Out0LO=( 1  3  5  7)
    psrlw       mm2, 4                  ; mm2=Out0HO=( 9 11 13 15)

    ; interleave even (low byte) and odd (high byte) outputs
    psllw       mm0, BYTE_BIT
    psllw       mm2, BYTE_BIT
    por         mm1, mm0                ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
    por         mm5, mm2                ; mm5=Out0H=( 8  9 10 11 12 13 14 15)

    ; final samples overwrite the parked intermediates
    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1
    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5

    ; -- process the lower row

    movq        mm6, MMWORD [edi+0*SIZEOF_MMWORD]  ; mm6=Int1L=( 0 1 2 3)
    movq        mm4, MMWORD [edi+1*SIZEOF_MMWORD]  ; mm4=Int1H=( 4 5 6 7)

    movq        mm7, mm6
    movq        mm3, mm4
    psrlq       mm7, 2*BYTE_BIT                  ; mm7=( 1 2 3 -)
    psllq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
    movq        mm0, mm6
    movq        mm2, mm4
    psrlq       mm0, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
    psllq       mm2, 2*BYTE_BIT                  ; mm2=( - 4 5 6)

    por         mm7, mm3                         ; mm7=( 1 2 3 4)
    por         mm0, mm2                         ; mm0=( 3 4 5 6)

    movq        mm1, mm6
    movq        mm5, mm4
    psllq       mm1, 2*BYTE_BIT                  ; mm1=( - 0 1 2)
    psrlq       mm5, 2*BYTE_BIT                  ; mm5=( 5 6 7 -)
    movq        mm3, mm4
    psrlq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)

    por         mm1, MMWORD [wk(1)]              ; mm1=(-1 0 1 2)
    por         mm5, MMWORD [wk(3)]              ; mm5=( 5 6 7 8)

    ; this block's last intermediate is the next block's left neighbour
    movq        MMWORD [wk(1)], mm3

    pmullw      mm6, [GOTOFF(ebx,PW_THREE)]
    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]
    paddw       mm0, [GOTOFF(ebx,PW_EIGHT)]
    paddw       mm7, [GOTOFF(ebx,PW_SEVEN)]
    paddw       mm5, [GOTOFF(ebx,PW_SEVEN)]

    paddw       mm1, mm6
    paddw       mm0, mm4
    psrlw       mm1, 4                  ; mm1=Out1LE=( 0  2  4  6)
    psrlw       mm0, 4                  ; mm0=Out1HE=( 8 10 12 14)
    paddw       mm7, mm6
    paddw       mm5, mm4
    psrlw       mm7, 4                  ; mm7=Out1LO=( 1  3  5  7)
    psrlw       mm5, 4                  ; mm5=Out1HO=( 9 11 13 15)

    psllw       mm7, BYTE_BIT
    psllw       mm5, BYTE_BIT
    por         mm1, mm7                ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
    por         mm0, mm5                ; mm0=Out1H=( 8  9 10 11 12 13 14 15)

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm1
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm0

    poppic      ebx                     ; ebx = inptr0 again

    sub         eax, byte SIZEOF_MMWORD
    add         ecx, byte 1*SIZEOF_MMWORD  ; inptr1(above)
    add         ebx, byte 1*SIZEOF_MMWORD  ; inptr0
    add         esi, byte 1*SIZEOF_MMWORD  ; inptr1(below)
    add         edx, byte 2*SIZEOF_MMWORD  ; outptr0
    add         edi, byte 2*SIZEOF_MMWORD  ; outptr1
    cmp         eax, byte SIZEOF_MMWORD
    ja          near .columnloop        ; more than one block left
    test        eax, eax
    jnz         near .columnloop_last   ; exactly one block left

    pop         esi
    pop         edi
    pop         ecx
    pop         eax

    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
    sub         ecx, byte 2                  ; rowctr
    jg          near .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    ; [ebp] holds the stack pointer saved in the prologue (pointing at the
    ; saved ebp), so the GOT slot and wk[] are dropped in one step
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1
; horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input sample is written twice, side by side, into the matching
; output row.  output_width is rounded up to a multiple of 2*SIZEOF_MMWORD
; and each row is handled in chunks of that many output samples; nothing is
; done when the rounded width or max_v_samp_factor is zero.
;
; Registers: edx = rounded output width, ecx = rows left,
;            eax = output samples left in the current row,
;            esi/edi = input/output row arrays, then the row pointers.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_upsample_mmx)

EXTN(jsimd_h2v1_upsample_mmx):
    push        ebp
    mov         ebp, esp
    ; ebx is not used here; ecx and edx need not be preserved
    push        esi
    push        edi

    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (2*SIZEOF_MMWORD)-1
    and         edx, byte -(2*SIZEOF_MMWORD)    ; edx = rounded output width
    jz          short .done

    mov         ecx, INT [max_v_samp(ebp)]      ; ecx = rows left
    test        ecx, ecx
    jz          short .done

    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> input row array
    mov         edi, JSAMPARRAY [edi]               ; edi -> output row array
    alignx      16, 7
.each_row:
    push        edi                     ; keep the row-array cursors
    push        esi

    mov         eax, edx                ; eax = output samples left
    mov         edi, JSAMPROW [edi]     ; edi = current output row
    mov         esi, JSAMPROW [esi]     ; esi = current input row
    alignx      16, 7
.each_chunk:
    ; first half: 8 input samples -> 16 output samples
    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm1, mm0
    punpcklbw   mm0, mm0                ; samples 0..3, each doubled
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
    punpckhbw   mm1, mm1                ; samples 4..7, each doubled
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .row_done

    ; second half: the next 8 input samples
    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]
    movq        mm3, mm2
    punpcklbw   mm2, mm2
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
    punpckhbw   mm3, mm3
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .row_done

    add         edi, byte 4*SIZEOF_MMWORD   ; advance output row pointer
    add         esi, byte 2*SIZEOF_MMWORD   ; advance input row pointer
    jmp         short .each_chunk
    alignx      16, 7

.row_done:
    pop         esi
    add         esi, byte SIZEOF_JSAMPROW   ; next input row entry
    pop         edi
    add         edi, byte SIZEOF_JSAMPROW   ; next output row entry
    dec         ecx                         ; one row finished
    jg          short .each_row

    emms                                ; leave MMX state clean

.done:
    pop         edi
    pop         esi
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
;
; Every input sample is written twice horizontally, and the resulting row is
; stored into two consecutive output rows.  One input row is consumed and
; two output rows are produced per pass; the row counter drops by 2 each
; time.  Nothing is done when the rounded width or max_v_samp_factor is zero.
;
; Registers: edx = rounded output width, ecx = rows left,
;            eax = output samples left in the current row,
;            esi = input row array, then input row,
;            edi = output row array, then second output row,
;            ebx = first output row.
;

%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
%define output_width(b)     (b) + 12    ; JDIMENSION output_width
%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_upsample_mmx)

EXTN(jsimd_h2v2_upsample_mmx):
    push        ebp
    mov         ebp, esp
    push        ebx
    ; ecx and edx need not be preserved
    push        esi
    push        edi

    mov         edx, JDIMENSION [output_width(ebp)]
    add         edx, byte (2*SIZEOF_MMWORD)-1
    and         edx, byte -(2*SIZEOF_MMWORD)    ; edx = rounded output width
    jz          near .done

    mov         ecx, INT [max_v_samp(ebp)]      ; ecx = rows left
    test        ecx, ecx
    jz          short .done

    mov         edi, POINTER [output_data_ptr(ebp)]
    mov         esi, JSAMPARRAY [input_data(ebp)]   ; esi -> input row array
    mov         edi, JSAMPARRAY [edi]               ; edi -> output row array
    alignx      16, 7
.each_row:
    push        edi                     ; keep the row-array cursors
    push        esi

    mov         eax, edx                               ; output samples left
    mov         esi, JSAMPROW [esi]                    ; input row
    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; first output row
    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; second output row
    alignx      16, 7
.each_chunk:
    ; first half: 8 input samples -> 16 output samples in both rows
    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    movq        mm1, mm0
    punpcklbw   mm0, mm0                ; samples 0..3, each doubled
    movq        MMWORD [ebx+0*SIZEOF_MMWORD], mm0
    punpckhbw   mm1, mm1                ; samples 4..7, each doubled
    movq        MMWORD [ebx+1*SIZEOF_MMWORD], mm1
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .row_done

    ; second half: the next 8 input samples
    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]
    movq        mm3, mm2
    punpcklbw   mm2, mm2
    movq        MMWORD [ebx+2*SIZEOF_MMWORD], mm2
    punpckhbw   mm3, mm3
    movq        MMWORD [ebx+3*SIZEOF_MMWORD], mm3
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3

    sub         eax, byte 2*SIZEOF_MMWORD
    jz          short .row_done

    add         edi, byte 4*SIZEOF_MMWORD   ; advance second output row
    add         ebx, byte 4*SIZEOF_MMWORD   ; advance first output row
    add         esi, byte 2*SIZEOF_MMWORD   ; advance input row
    jmp         short .each_chunk
    alignx      16, 7

.row_done:
    pop         esi
    add         esi, byte 1*SIZEOF_JSAMPROW ; one input row consumed
    pop         edi
    add         edi, byte 2*SIZEOF_JSAMPROW ; two output rows produced
    sub         ecx, byte 2                 ; two output rows finished
    jg          short .each_row

    emms                                ; leave MMX state clean

.done:
    pop         edi
    pop         esi
    pop         ebx
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32