;
; jdmrgext.asm - merged upsampling/color conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v1_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
;                                JDIMENSION in_row_group_ctr,
;                                JSAMPARRAY output_buf);
;
; Target: 32-bit x86, arguments passed on the stack (read at +8..+20 from the
; saved-ebp slot).  The caller removes the arguments (plain "ret" below).
;
; Register roles inside the main loop:
;   esi = inptr0 (Y row)       ebx = inptr1 (Cb row)     edx = inptr2 (Cr row)
;   edi = outptr               ecx = col (output pixels still to be written)
;   eax = GOT address while the constants are in use, then al = Yctr, and
;         finally a scratch register in the 3-bytes-per-pixel tail store.
; Preserved: ebx, esi, edi, ebp.  Clobbered: eax, ecx, edx, mm0-mm7, flags.
;
; Per .columnloop pass, one MMWORD of Cb and one of Cr are converted into
; (R-Y), (G-Y) and (B-Y) terms.  The "L" half stays in mm0/mm2/mm4 and is used
; by the first Y iteration; the "H" half is parked in wk(0..2) and reloaded at
; .Yloop_2nd.  Each Y iteration consumes one MMWORD of Y samples.
;
; NOTE(review): mmA..mmH are register aliases that are not defined in this
; file (presumably supplied via jcolsamp.inc and chosen per output component
; order) - confirm before reordering any of the interleave code.
;

%define output_width(b)      (b) + 8    ; JDIMENSION output_width
%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM        3
%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_mmx)

EXTN(jsimd_h2v1_merged_upsample_mmx):
    push        ebp
    mov         eax, esp                    ; eax = address of the saved-ebp slot
                                            ; (arguments are read relative to eax)
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax                  ; keep the unaligned frame address at [ebp]
    mov         ebp, esp                    ; ebp = aligned ebp
    lea         esp, [wk(0)]                ; reserve wk[WK_NUM] below the aligned ebp
    pushpic     eax                     ; make a room for GOT address
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address
    movpic      POINTER [gotptr], ebx   ; save GOT address

    mov         ecx, JDIMENSION [output_width(eax)]  ; col
    test        ecx, ecx
    jz          near .return            ; nothing to do (no MMX state touched yet)

    push        ecx                     ; ecx is reused below as the row index

    mov         edi, JSAMPIMAGE [input_buf(eax)]
    mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
    mov         edi, JSAMPARRAY [output_buf(eax)]
    mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]  ; inptr0
    mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]  ; inptr1
    mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]  ; inptr2
    mov         edi, JSAMPROW [edi]                      ; outptr

    pop         ecx                     ; col

    alignx      16, 7
.columnloop:
    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)

    movq        mm6, MMWORD [ebx]       ; mm6=Cb(01234567)
    movq        mm7, MMWORD [edx]       ; mm7=Cr(01234567)

    pxor        mm1, mm1                ; mm1=(all 0's)
    pcmpeqw     mm3, mm3
    psllw       mm3, 7                  ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}

    ; Zero-extend the chroma bytes to words.
    movq        mm4, mm6
    punpckhbw   mm6, mm1                ; mm6=Cb(4567)=CbH
    punpcklbw   mm4, mm1                ; mm4=Cb(0123)=CbL
    movq        mm0, mm7
    punpckhbw   mm7, mm1                ; mm7=Cr(4567)=CrH
    punpcklbw   mm0, mm1                ; mm0=Cr(0123)=CrL

    ; Adding 0xFF80 (= -128 as a signed word) re-centres Cb/Cr around zero.
    paddw       mm6, mm3
    paddw       mm4, mm3
    paddw       mm7, mm3
    paddw       mm0, mm3

    ; (Original)
    ; R = Y                + 1.40200 * Cr
    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
    ; B = Y + 1.77200 * Cb
    ;
    ; (This implementation)
    ; R = Y                + 0.40200 * Cr + Cr
    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    ; B = Y - 0.22800 * Cb + Cb + Cb

    movq        mm5, mm6                ; mm5=CbH
    movq        mm2, mm4                ; mm2=CbL
    paddw       mm6, mm6                ; mm6=2*CbH
    paddw       mm4, mm4                ; mm4=2*CbL
    movq        mm1, mm7                ; mm1=CrH
    movq        mm3, mm0                ; mm3=CrL
    paddw       mm7, mm7                ; mm7=2*CrH
    paddw       mm0, mm0                ; mm0=2*CrL

    pmulhw      mm6, [GOTOFF(eax,PW_MF0228)]  ; mm6=(2*CbH * -FIX(0.22800))
    pmulhw      mm4, [GOTOFF(eax,PW_MF0228)]  ; mm4=(2*CbL * -FIX(0.22800))
    pmulhw      mm7, [GOTOFF(eax,PW_F0402)]   ; mm7=(2*CrH * FIX(0.40200))
    pmulhw      mm0, [GOTOFF(eax,PW_F0402)]   ; mm0=(2*CrL * FIX(0.40200))

    ; Add PW_ONE before the arithmetic shift right by 1 (halves the doubled
    ; product; PW_ONE is presumably a vector of 1s giving round-to-nearest).
    paddw       mm6, [GOTOFF(eax,PW_ONE)]
    paddw       mm4, [GOTOFF(eax,PW_ONE)]
    psraw       mm6, 1                  ; mm6=(CbH * -FIX(0.22800))
    psraw       mm4, 1                  ; mm4=(CbL * -FIX(0.22800))
    paddw       mm7, [GOTOFF(eax,PW_ONE)]
    paddw       mm0, [GOTOFF(eax,PW_ONE)]
    psraw       mm7, 1                  ; mm7=(CrH * FIX(0.40200))
    psraw       mm0, 1                  ; mm0=(CrL * FIX(0.40200))

    paddw       mm6, mm5
    paddw       mm4, mm2
    paddw       mm6, mm5                ; mm6=(CbH * FIX(1.77200))=(B-Y)H
    paddw       mm4, mm2                ; mm4=(CbL * FIX(1.77200))=(B-Y)L
    paddw       mm7, mm1                ; mm7=(CrH * FIX(1.40200))=(R-Y)H
    paddw       mm0, mm3                ; mm0=(CrL * FIX(1.40200))=(R-Y)L

    ; Park the H halves; the L halves (mm4, mm0) stay live for .Yloop_1st.
    movq        MMWORD [wk(0)], mm6     ; wk(0)=(B-Y)H
    movq        MMWORD [wk(1)], mm7     ; wk(1)=(R-Y)H

    ; Green: interleave Cb/Cr words so one pmaddwd yields
    ; Cb*(-0.344) + Cr*(0.285) per pixel as a dword.
    movq        mm6, mm5
    movq        mm7, mm2
    punpcklwd   mm5, mm1
    punpckhwd   mm6, mm1
    pmaddwd     mm5, [GOTOFF(eax,PW_MF0344_F0285)]
    pmaddwd     mm6, [GOTOFF(eax,PW_MF0344_F0285)]
    punpcklwd   mm2, mm3
    punpckhwd   mm7, mm3
    pmaddwd     mm2, [GOTOFF(eax,PW_MF0344_F0285)]
    pmaddwd     mm7, [GOTOFF(eax,PW_MF0344_F0285)]

    paddd       mm5, [GOTOFF(eax,PD_ONEHALF)]
    paddd       mm6, [GOTOFF(eax,PD_ONEHALF)]
    psrad       mm5, SCALEBITS
    psrad       mm6, SCALEBITS
    paddd       mm2, [GOTOFF(eax,PD_ONEHALF)]
    paddd       mm7, [GOTOFF(eax,PD_ONEHALF)]
    psrad       mm2, SCALEBITS
    psrad       mm7, SCALEBITS

    packssdw    mm5, mm6                ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
    packssdw    mm2, mm7                ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
    psubw       mm5, mm1                ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
    psubw       mm2, mm3                ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L

    movq        MMWORD [wk(2)], mm5     ; wk(2)=(G-Y)H

    ; The GOT address in eax is no longer needed for this pass; al becomes the
    ; Y-iteration counter (eax is reloaded at the top of .columnloop).
    mov         al, 2                   ; Yctr
    jmp         short .Yloop_1st
    alignx      16, 7

.Yloop_2nd:
    ; Second Y iteration: switch to the H-half chroma terms saved above.
    movq        mm0, MMWORD [wk(1)]     ; mm0=(R-Y)H
    movq        mm2, MMWORD [wk(2)]     ; mm2=(G-Y)H
    movq        mm4, MMWORD [wk(0)]     ; mm4=(B-Y)H
    alignx      16, 7

.Yloop_1st:
    movq        mm7, MMWORD [esi]       ; mm7=Y(01234567)

    ; Split Y into even and odd samples; each pair shares one chroma term.
    pcmpeqw     mm6, mm6
    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
    pand        mm6, mm7                ; mm6=Y(0246)=YE
    psrlw       mm7, BYTE_BIT           ; mm7=Y(1357)=YO

    movq        mm1, mm0                ; mm1=mm0=(R-Y)(L/H)
    movq        mm3, mm2                ; mm3=mm2=(G-Y)(L/H)
    movq        mm5, mm4                ; mm5=mm4=(B-Y)(L/H)

    ; packuswb saturates each result to the unsigned byte range.
    paddw       mm0, mm6                ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
    paddw       mm1, mm7                ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
    packuswb    mm0, mm0                ; mm0=(R0 R2 R4 R6 ** ** ** **)
    packuswb    mm1, mm1                ; mm1=(R1 R3 R5 R7 ** ** ** **)

    paddw       mm2, mm6                ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
    paddw       mm3, mm7                ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
    packuswb    mm2, mm2                ; mm2=(G0 G2 G4 G6 ** ** ** **)
    packuswb    mm3, mm3                ; mm3=(G1 G3 G5 G7 ** ** ** **)

    paddw       mm4, mm6                ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
    paddw       mm5, mm7                ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
    packuswb    mm4, mm4                ; mm4=(B0 B2 B4 B6 ** ** ** **)
    packuswb    mm5, mm5                ; mm5=(B1 B3 B5 B7 ** ** ** **)

%if RGB_PIXELSIZE == 3  ; ---------------

    ; In the diagrams below each two-digit entry appears to be
    ; <output byte within pixel><pixel index>; "**"/"--" are don't-care bytes.

    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
    ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)

    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
    punpcklbw   mmE, mmB                ; mmE=(20 01 22 03 24 05 26 07)
    punpcklbw   mmD, mmF                ; mmD=(11 21 13 23 15 25 17 27)

    movq        mmG, mmA
    movq        mmH, mmA
    punpcklwd   mmA, mmE                ; mmA=(00 10 20 01 02 12 22 03)
    punpckhwd   mmG, mmE                ; mmG=(04 14 24 05 06 16 26 07)

    psrlq       mmH, 2*BYTE_BIT         ; mmH=(02 12 04 14 06 16 -- --)
    psrlq       mmE, 2*BYTE_BIT         ; mmE=(22 03 24 05 26 07 -- --)

    movq        mmC, mmD
    movq        mmB, mmD
    punpcklwd   mmD, mmH                ; mmD=(11 21 02 12 13 23 04 14)
    punpckhwd   mmC, mmH                ; mmC=(15 25 06 16 17 27 -- --)

    psrlq       mmB, 2*BYTE_BIT         ; mmB=(13 23 15 25 17 27 -- --)

    movq        mmF, mmE
    punpcklwd   mmE, mmB                ; mmE=(22 03 13 23 24 05 15 25)
    punpckhwd   mmF, mmB                ; mmF=(26 07 17 27 -- -- -- --)

    punpckldq   mmA, mmD                ; mmA=(00 10 20 01 11 21 02 12)
    punpckldq   mmE, mmG                ; mmE=(22 03 13 23 04 14 24 05)
    punpckldq   mmC, mmF                ; mmC=(15 25 06 16 26 07 17 27)

    ; Fewer than SIZEOF_MMWORD pixels left: take the partial-store path.
    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st16

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC

    sub         ecx, byte SIZEOF_MMWORD
    jz          near .endcolumn

    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
    add         esi, byte SIZEOF_MMWORD                ; inptr0
    dec         al                                     ; Yctr
    jnz         near .Yloop_2nd

    ; Both Y iterations done: advance chroma and start the next pass.
    add         ebx, byte SIZEOF_MMWORD                ; inptr1
    add         edx, byte SIZEOF_MMWORD                ; inptr2
    jmp         near .columnloop
    alignx      16, 7

    ; Tail: ecx is converted from pixels to bytes, then the remaining bytes
    ; are written in progressively smaller pieces (2 mmwords, 1 mmword,
    ; dword, word, byte).  mmA always holds the next unwritten bytes.
.column_st16:
    lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
    cmp         ecx, byte 2*SIZEOF_MMWORD
    jb          short .column_st8
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
    movq        mmA, mmC
    sub         ecx, byte 2*SIZEOF_MMWORD
    add         edi, byte 2*SIZEOF_MMWORD
    jmp         short .column_st4       ; less than one mmword can remain here
.column_st8:
    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st4
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        mmA, mmE
    sub         ecx, byte SIZEOF_MMWORD
    add         edi, byte SIZEOF_MMWORD
.column_st4:
    movd        eax, mmA                ; eax is free here: the loop is finished
    cmp         ecx, byte SIZEOF_DWORD
    jb          short .column_st2
    mov         dword [edi+0*SIZEOF_DWORD], eax
    psrlq       mmA, DWORD_BIT
    movd        eax, mmA
    sub         ecx, byte SIZEOF_DWORD
    add         edi, byte SIZEOF_DWORD
.column_st2:
    cmp         ecx, byte SIZEOF_WORD
    jb          short .column_st1
    mov         word [edi+0*SIZEOF_WORD], ax
    shr         eax, WORD_BIT
    sub         ecx, byte SIZEOF_WORD
    add         edi, byte SIZEOF_WORD
.column_st1:
    cmp         ecx, byte SIZEOF_BYTE
    jb          short .endcolumn
    mov         byte [edi+0*SIZEOF_BYTE], al

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; mm6/mm7 supply the fourth (filler) byte of every pixel.
%ifdef RGBX_FILLER_0XFF
    pcmpeqb     mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
    pcmpeqb     mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
%else
    pxor        mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
    pxor        mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
%endif
    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
    ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)

    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
    punpcklbw   mmE, mmG                ; mmE=(20 30 22 32 24 34 26 36)
    punpcklbw   mmB, mmD                ; mmB=(01 11 03 13 05 15 07 17)
    punpcklbw   mmF, mmH                ; mmF=(21 31 23 33 25 35 27 37)

    movq        mmC, mmA
    punpcklwd   mmA, mmE                ; mmA=(00 10 20 30 02 12 22 32)
    punpckhwd   mmC, mmE                ; mmC=(04 14 24 34 06 16 26 36)
    movq        mmG, mmB
    punpcklwd   mmB, mmF                ; mmB=(01 11 21 31 03 13 23 33)
    punpckhwd   mmG, mmF                ; mmG=(05 15 25 35 07 17 27 37)

    movq        mmD, mmA
    punpckldq   mmA, mmB                ; mmA=(00 10 20 30 01 11 21 31)
    punpckhdq   mmD, mmB                ; mmD=(02 12 22 32 03 13 23 33)
    movq        mmH, mmC
    punpckldq   mmC, mmG                ; mmC=(04 14 24 34 05 15 25 35)
    punpckhdq   mmH, mmG                ; mmH=(06 16 26 36 07 17 27 37)

    ; Fewer than SIZEOF_MMWORD pixels left: take the partial-store path.
    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st16

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mmH

    sub         ecx, byte SIZEOF_MMWORD
    jz          short .endcolumn

    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
    add         esi, byte SIZEOF_MMWORD                ; inptr0
    dec         al                                     ; Yctr
    jnz         near .Yloop_2nd

    ; Both Y iterations done: advance chroma and start the next pass.
    add         ebx, byte SIZEOF_MMWORD                ; inptr1
    add         edx, byte SIZEOF_MMWORD                ; inptr2
    jmp         near .columnloop
    alignx      16, 7

    ; Tail: ecx stays in pixels here.  Remaining pixels are written as
    ; 2 mmwords, then 1 mmword, then 1 dword; mmA (and mmD) are refilled with
    ; the next unwritten data after each store.
.column_st16:
    cmp         ecx, byte SIZEOF_MMWORD/2
    jb          short .column_st8
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
    movq        mmA, mmC
    movq        mmD, mmH
    sub         ecx, byte SIZEOF_MMWORD/2
    add         edi, byte 2*SIZEOF_MMWORD
.column_st8:
    cmp         ecx, byte SIZEOF_MMWORD/4
    jb          short .column_st4
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        mmA, mmD
    sub         ecx, byte SIZEOF_MMWORD/4
    add         edi, byte 1*SIZEOF_MMWORD
.column_st4:
    cmp         ecx, byte SIZEOF_MMWORD/8
    jb          short .endcolumn
    movd        dword [edi+0*SIZEOF_DWORD], mmA

%endif  ; RGB_PIXELSIZE ; ---------------

.endcolumn:
    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- value stored at [ebp] in the
                                        ; prologue (address of the saved-ebp slot)
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v2_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
;                                JDIMENSION in_row_group_ctr,
;                                JSAMPARRAY output_buf);
;
; Implemented as two calls to jsimd_h2v1_merged_upsample_mmx, one per output
; row, using a temporary 3-entry plane-pointer array built on the stack.
;
; The h2v1 routine indexes every plane pointer by in_row_group_ctr.  The
; luma entry of the temporary array is therefore pre-advanced by
; in_row_group_ctr rows, so the callee's own indexing lands on luma row
; 2*in_row_group_ctr; for the second call the entry is advanced by one more
; row (2*in_row_group_ctr + 1).  The chroma entries are passed unchanged, so
; both calls use chroma row in_row_group_ctr.  The output_buf argument is
; advanced by one row pointer for the second call.
;
; This relies on the callee preserving ebx, esi and edi and leaving its
; stack arguments in place, which the h2v1 routine above does.
;

%define output_width(b)      (b) + 8    ; JDIMENSION output_width
%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_mmx)

EXTN(jsimd_h2v2_merged_upsample_mmx):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         eax, JDIMENSION [output_width(ebp)]

    mov         edi, JSAMPIMAGE [input_buf(ebp)]
    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
    mov         edi, JSAMPARRAY [output_buf(ebp)]
    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]  ; pre-advance luma plane pointer

    ; Build the temporary plane-pointer array { luma, inptr1, inptr2 }.
    push        edx                     ; inptr2
    push        ebx                     ; inptr1
    push        esi                     ; inptr00
    mov         ebx, esp                ; ebx = address of the temporary array

    ; Arguments for the h2v1 routine, pushed right to left.
    push        edi                     ; output_buf (outptr0)
    push        ecx                     ; in_row_group_ctr
    push        ebx                     ; input_buf
    push        eax                     ; output_width

    call        near EXTN(jsimd_h2v1_merged_upsample_mmx)

    ; Patch the stack in place for the second row: array entry 0 (luma) and
    ; the output_buf argument, which sits one pointer below the array.
    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
    mov         POINTER [ebx+0*SIZEOF_POINTER], esi
    mov         POINTER [ebx-1*SIZEOF_POINTER], edi

    call        near EXTN(jsimd_h2v1_merged_upsample_mmx)

    add         esp, byte 7*SIZEOF_DWORD   ; drop 4 arguments + 3 array entries

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32