;
; jdcolext.asm - colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
;                            JSAMPIMAGE input_buf, JDIMENSION input_row,
;                            JSAMPARRAY output_buf, int num_rows)
;
; Stack-argument routine (arguments are read at +8..+24 from the frame base
; set up in the prologue).  Returns nothing.
;
; Does nothing when out_width == 0 or num_rows <= 0.
;
; Register roles:
;   prologue      eax = frame base: [eax] = caller's ebp, arguments at
;                       out_width(eax) .. num_rows(eax)
;   row loop      eax = rows remaining, ecx = out_width,
;                 esi/ebx/edx = cursors into the row-pointer arrays
;                       input_buf[0]/[1]/[2] (starting at input_row),
;                 edi = cursor into output_buf
;   column loop   esi = Y row (inptr0), ebx = Cb row (inptr1),
;                 edx = Cr row (inptr2), edi = output row (outptr),
;                 ecx = columns remaining, eax = GOT address
;
; Preserved: ebx, esi, edi, ebp (pushed/popped here).  ecx and edx are
; modified and not restored, as noted next to the commented-out pushes.
; eax and mm0-mm7 are modified; the MMX state is cleared with emms once
; the row loop has run.
;
; NOTE(review): every column-loop iteration loads a full MMWORD from each of
; the three input rows *before* checking how many columns remain, so input
; rows are read in whole-MMWORD steps even for a partial final group.
; Presumably the caller's row buffers are padded accordingly - confirm.
; Output stores, in contrast, are trimmed to the remaining column count.
;
; The constants PW_MF0228, PW_F0402, PW_ONE, PW_MF0344_F0285, PD_ONEHALF and
; SCALEBITS, the register aliases mmA..mmH, RGB_PIXELSIZE, and the macros
; EXTN, pushpic, movpic, get_GOT, GOTOFF and alignx are not defined in this
; file (presumably they come from jcolsamp.inc / the file that includes this
; one - confirm).
;

; Argument offsets relative to the frame base b, where [b] is the saved ebp
; and [b+4] the return address.
%define out_width(b)    (b)+8           ; JDIMENSION out_width
%define input_buf(b)    (b)+12          ; JSAMPIMAGE input_buf
%define input_row(b)    (b)+16          ; JDIMENSION input_row
%define output_buf(b)   (b)+20          ; JSAMPARRAY output_buf
%define num_rows(b)     (b)+24          ; int num_rows

; Local frame, addressed from the MMWORD-aligned ebp:
;   [ebp+0]          frame base stored by the prologue (original_ebp; the
;                    visible code never reads it through this name)
;   wk(0)..wk(1)     two MMWORD scratch slots directly below ebp
;   gotptr           one pointer-sized slot directly below wk(0)
%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM          2
%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr

        align   16
        global  EXTN(jsimd_ycc_rgb_convert_mmx)

EXTN(jsimd_ycc_rgb_convert_mmx):
        ; ---- prologue: build an MMWORD-aligned frame ----------------------
        push    ebp
        mov     eax,esp                         ; eax = frame base ([eax] = caller's ebp)
        sub     esp, byte 4                     ; room to store the frame base
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     [esp],eax                       ; remember the unaligned frame base
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [wk(0)]                    ; reserve the wk[] scratch area
        pushpic eax                             ; make room for the GOT address
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx                             ; get GOT address
        movpic  POINTER [gotptr], ebx           ; save GOT address

        mov     ecx, JDIMENSION [out_width(eax)]        ; num_cols
        test    ecx,ecx
        jz      near .return                    ; zero width: nothing to convert

        push    ecx                             ; ecx is borrowed for input_row below

        ; esi/ebx/edx = &input_buf[0..2][input_row]
        mov     edi, JSAMPIMAGE [input_buf(eax)]
        mov     ecx, JDIMENSION [input_row(eax)]
        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]
        lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW]
        lea     edx, [edx+ecx*SIZEOF_JSAMPROW]

        pop     ecx                             ; ecx = num_cols again

        mov     edi, JSAMPARRAY [output_buf(eax)]
        mov     eax, INT [num_rows(eax)]        ; eax stops being the frame base here
        test    eax,eax
        jle     near .return                    ; signed test: zero or negative row count
        alignx  16,7
.rowloop:
        ; Save the per-row state; .nextrow pops it in the reverse order.
        push    eax
        push    edi
        push    edx
        push    ebx
        push    esi
        push    ecx                     ; col

        mov     esi, JSAMPROW [esi]     ; inptr0
        mov     ebx, JSAMPROW [ebx]     ; inptr1
        mov     edx, JSAMPROW [edx]     ; inptr2
        mov     edi, JSAMPROW [edi]     ; outptr
        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
        alignx  16,7
.columnloop:
        ; One iteration converts one MMWORD of samples per component.
        ; Samples are split into even/odd columns so each fits a 16-bit lane.

        movq    mm5, MMWORD [ebx]       ; mm5=Cb(01234567)
        movq    mm1, MMWORD [edx]       ; mm1=Cr(01234567)

        pcmpeqw mm4,mm4                 ; all ones
        pcmpeqw mm7,mm7                 ; all ones
        psrlw   mm4,BYTE_BIT            ; low-byte mask for every word
        psllw   mm7,7                   ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
        movq    mm0,mm4                 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}

        pand    mm4,mm5                 ; mm4=Cb(0246)=CbE
        psrlw   mm5,BYTE_BIT            ; mm5=Cb(1357)=CbO
        pand    mm0,mm1                 ; mm0=Cr(0246)=CrE
        psrlw   mm1,BYTE_BIT            ; mm1=Cr(1357)=CrO

        ; 0xFF80 is -128 as a signed word: re-centre the chroma around zero.
        paddw   mm4,mm7
        paddw   mm5,mm7
        paddw   mm0,mm7
        paddw   mm1,mm7

        ; (Original)
        ; R = Y                + 1.40200 * Cr
        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
        ; B = Y + 1.77200 * Cb
        ;
        ; (This implementation)
        ; R = Y                + 0.40200 * Cr + Cr
        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
        ; B = Y - 0.22800 * Cb + Cb + Cb

        movq    mm2,mm4                 ; mm2=CbE
        movq    mm3,mm5                 ; mm3=CbO
        paddw   mm4,mm4                 ; mm4=2*CbE
        paddw   mm5,mm5                 ; mm5=2*CbO
        movq    mm6,mm0                 ; mm6=CrE
        movq    mm7,mm1                 ; mm7=CrO
        paddw   mm0,mm0                 ; mm0=2*CrE
        paddw   mm1,mm1                 ; mm1=2*CrO

        ; pmulhw keeps only the high word of each product.  The inputs were
        ; doubled above and the result is rounded with (x+1)>>1 below, so
        ; one extra bit of the product takes part in the rounding.
        pmulhw  mm4,[GOTOFF(eax,PW_MF0228)]     ; mm4=(2*CbE * -FIX(0.22800))
        pmulhw  mm5,[GOTOFF(eax,PW_MF0228)]     ; mm5=(2*CbO * -FIX(0.22800))
        pmulhw  mm0,[GOTOFF(eax,PW_F0402)]      ; mm0=(2*CrE * FIX(0.40200))
        pmulhw  mm1,[GOTOFF(eax,PW_F0402)]      ; mm1=(2*CrO * FIX(0.40200))

        paddw   mm4,[GOTOFF(eax,PW_ONE)]
        paddw   mm5,[GOTOFF(eax,PW_ONE)]
        psraw   mm4,1                   ; mm4=(CbE * -FIX(0.22800))
        psraw   mm5,1                   ; mm5=(CbO * -FIX(0.22800))
        paddw   mm0,[GOTOFF(eax,PW_ONE)]
        paddw   mm1,[GOTOFF(eax,PW_ONE)]
        psraw   mm0,1                   ; mm0=(CrE * FIX(0.40200))
        psraw   mm1,1                   ; mm1=(CrO * FIX(0.40200))

        paddw   mm4,mm2
        paddw   mm5,mm3
        paddw   mm4,mm2                 ; mm4=(CbE * FIX(1.77200))=(B-Y)E
        paddw   mm5,mm3                 ; mm5=(CbO * FIX(1.77200))=(B-Y)O
        paddw   mm0,mm6                 ; mm0=(CrE * FIX(1.40200))=(R-Y)E
        paddw   mm1,mm7                 ; mm1=(CrO * FIX(1.40200))=(R-Y)O

        ; Park (B-Y) in the scratch slots; mm4/mm5 are needed for the G term.
        movq    MMWORD [wk(0)], mm4     ; wk(0)=(B-Y)E
        movq    MMWORD [wk(1)], mm5     ; wk(1)=(B-Y)O

        ; G term: interleave Cb and Cr words so that each pmaddwd forms
        ; Cb*c0 + Cr*c1 as a 32-bit sum (c0/c1 = the word pair stored in
        ; PW_MF0344_F0285), then round, scale back and repack to words.
        movq      mm4,mm2
        movq      mm5,mm3
        punpcklwd mm2,mm6
        punpckhwd mm4,mm6
        pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
        pmaddwd   mm4,[GOTOFF(eax,PW_MF0344_F0285)]
        punpcklwd mm3,mm7
        punpckhwd mm5,mm7
        pmaddwd   mm3,[GOTOFF(eax,PW_MF0344_F0285)]
        pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]

        paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
        paddd     mm4,[GOTOFF(eax,PD_ONEHALF)]
        psrad     mm2,SCALEBITS
        psrad     mm4,SCALEBITS
        paddd     mm3,[GOTOFF(eax,PD_ONEHALF)]
        paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
        psrad     mm3,SCALEBITS
        psrad     mm5,SCALEBITS

        packssdw  mm2,mm4       ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
        packssdw  mm3,mm5       ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
        psubw     mm2,mm6       ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
        psubw     mm3,mm7       ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O

        movq      mm5, MMWORD [esi]     ; mm5=Y(01234567)

        pcmpeqw   mm4,mm4
        psrlw     mm4,BYTE_BIT          ; mm4={0xFF 0x00 0xFF 0x00 ..}
        pand      mm4,mm5               ; mm4=Y(0246)=YE
        psrlw     mm5,BYTE_BIT          ; mm5=Y(1357)=YO

        ; Add luma to each colour difference; packuswb saturates every
        ; signed word to an unsigned byte.
        paddw     mm0,mm4               ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
        paddw     mm1,mm5               ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
        packuswb  mm0,mm0               ; mm0=(R0 R2 R4 R6 ** ** ** **)
        packuswb  mm1,mm1               ; mm1=(R1 R3 R5 R7 ** ** ** **)

        paddw     mm2,mm4               ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
        paddw     mm3,mm5               ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
        packuswb  mm2,mm2               ; mm2=(G0 G2 G4 G6 ** ** ** **)
        packuswb  mm3,mm3               ; mm3=(G1 G3 G5 G7 ** ** ** **)

        paddw     mm4, MMWORD [wk(0)]   ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
        paddw     mm5, MMWORD [wk(1)]   ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
        packuswb  mm4,mm4               ; mm4=(B0 B2 B4 B6 ** ** ** **)
        packuswb  mm5,mm5               ; mm5=(B1 B3 B5 B7 ** ** ** **)

        ; From here on registers are named through the aliases mmA..mmH.
        ; Notation in the tables below: each two-digit entry is
        ; <byte position within the output pixel><pixel index 0-7>.
        ; NOTE(review): which of mm0-mm7 each alias selects is decided
        ; outside this file - confirm against the alias definitions.

%if RGB_PIXELSIZE == 3 ; ---------------

        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
        ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)

        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
        punpcklbw mmE,mmB               ; mmE=(20 01 22 03 24 05 26 07)
        punpcklbw mmD,mmF               ; mmD=(11 21 13 23 15 25 17 27)

        movq      mmG,mmA
        movq      mmH,mmA
        punpcklwd mmA,mmE               ; mmA=(00 10 20 01 02 12 22 03)
        punpckhwd mmG,mmE               ; mmG=(04 14 24 05 06 16 26 07)

        psrlq     mmH,2*BYTE_BIT        ; mmH=(02 12 04 14 06 16 -- --)
        psrlq     mmE,2*BYTE_BIT        ; mmE=(22 03 24 05 26 07 -- --)

        movq      mmC,mmD
        movq      mmB,mmD
        punpcklwd mmD,mmH               ; mmD=(11 21 02 12 13 23 04 14)
        punpckhwd mmC,mmH               ; mmC=(15 25 06 16 17 27 -- --)

        psrlq     mmB,2*BYTE_BIT        ; mmB=(13 23 15 25 17 27 -- --)

        movq      mmF,mmE
        punpcklwd mmE,mmB               ; mmE=(22 03 13 23 24 05 15 25)
        punpckhwd mmF,mmB               ; mmF=(26 07 17 27 -- -- -- --)

        ; mmA, mmE, mmC now hold the packed 3-byte pixels in memory order.
        punpckldq mmA,mmD               ; mmA=(00 10 20 01 11 21 02 12)
        punpckldq mmE,mmG               ; mmE=(22 03 13 23 04 14 24 05)
        punpckldq mmC,mmF               ; mmC=(15 25 06 16 26 07 17 27)

        cmp     ecx, byte SIZEOF_MMWORD
        jb      short .column_st16      ; partial final group: trimmed store

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC

        sub     ecx, byte SIZEOF_MMWORD
        jz      short .nextrow          ; row finished exactly on a group boundary

        add     esi, byte SIZEOF_MMWORD                 ; inptr0
        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
        add     edx, byte SIZEOF_MMWORD                 ; inptr2
        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
        jmp     near .columnloop
        alignx  16,7

        ; Tail store (3-byte pixels): ecx is converted from columns to
        ; bytes, then written in shrinking power-of-two pieces so nothing
        ; past the last pixel is touched.  eax is reused as a byte
        ; shuttle here; the GOT address is reloaded at the next row.
.column_st16:
        lea     ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
        cmp     ecx, byte 2*SIZEOF_MMWORD
        jb      short .column_st8
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
        movq    mmA,mmC                 ; remaining bytes now live in mmA
        sub     ecx, byte 2*SIZEOF_MMWORD
        add     edi, byte 2*SIZEOF_MMWORD
        jmp     short .column_st4       ; less than one MMWORD can remain here
.column_st8:
        cmp     ecx, byte SIZEOF_MMWORD
        jb      short .column_st4
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    mmA,mmE
        sub     ecx, byte SIZEOF_MMWORD
        add     edi, byte SIZEOF_MMWORD
.column_st4:
        movd    eax,mmA                 ; low dword of the pending bytes
        cmp     ecx, byte SIZEOF_DWORD
        jb      short .column_st2
        mov     DWORD [edi+0*SIZEOF_DWORD], eax
        psrlq   mmA,DWORD_BIT
        movd    eax,mmA                 ; next dword of the pending bytes
        sub     ecx, byte SIZEOF_DWORD
        add     edi, byte SIZEOF_DWORD
.column_st2:
        cmp     ecx, byte SIZEOF_WORD
        jb      short .column_st1
        mov     WORD [edi+0*SIZEOF_WORD], ax
        shr     eax,WORD_BIT
        sub     ecx, byte SIZEOF_WORD
        add     edi, byte SIZEOF_WORD
.column_st1:
        cmp     ecx, byte SIZEOF_BYTE
        jb      short .nextrow
        mov     BYTE [edi+0*SIZEOF_BYTE], al

%else  ; RGB_PIXELSIZE == 4 ; -----------

        ; Filler byte for the fourth position: all ones when
        ; RGBX_FILLER_0XFF is defined, zero otherwise.
%ifdef RGBX_FILLER_0XFF
        pcmpeqb   mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
        pcmpeqb   mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
%else
        pxor      mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
        pxor      mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
%endif
        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
        ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)

        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
        punpcklbw mmE,mmG               ; mmE=(20 30 22 32 24 34 26 36)
        punpcklbw mmB,mmD               ; mmB=(01 11 03 13 05 15 07 17)
        punpcklbw mmF,mmH               ; mmF=(21 31 23 33 25 35 27 37)

        movq      mmC,mmA
        punpcklwd mmA,mmE               ; mmA=(00 10 20 30 02 12 22 32)
        punpckhwd mmC,mmE               ; mmC=(04 14 24 34 06 16 26 36)
        movq      mmG,mmB
        punpcklwd mmB,mmF               ; mmB=(01 11 21 31 03 13 23 33)
        punpckhwd mmG,mmF               ; mmG=(05 15 25 35 07 17 27 37)

        ; mmA, mmD, mmC, mmH end up holding two whole pixels each, in
        ; memory order.
        movq      mmD,mmA
        punpckldq mmA,mmB               ; mmA=(00 10 20 30 01 11 21 31)
        punpckhdq mmD,mmB               ; mmD=(02 12 22 32 03 13 23 33)
        movq      mmH,mmC
        punpckldq mmC,mmG               ; mmC=(04 14 24 34 05 15 25 35)
        punpckhdq mmH,mmG               ; mmH=(06 16 26 36 07 17 27 37)

        cmp     ecx, byte SIZEOF_MMWORD
        jb      short .column_st16      ; partial final group: trimmed store

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mmH

        sub     ecx, byte SIZEOF_MMWORD
        jz      short .nextrow          ; row finished exactly on a group boundary

        add     esi, byte SIZEOF_MMWORD                 ; inptr0
        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
        add     edx, byte SIZEOF_MMWORD                 ; inptr2
        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
        jmp     near .columnloop
        alignx  16,7

        ; Tail store (4-byte pixels): ecx stays in columns.  Each register
        ; carries two pixels, so the thresholds are half, a quarter and an
        ; eighth of SIZEOF_MMWORD columns.
.column_st16:
        cmp     ecx, byte SIZEOF_MMWORD/2
        jb      short .column_st8
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
        movq    mmA,mmC                 ; shift the unwritten pixels down
        movq    mmD,mmH
        sub     ecx, byte SIZEOF_MMWORD/2
        add     edi, byte 2*SIZEOF_MMWORD
.column_st8:
        cmp     ecx, byte SIZEOF_MMWORD/4
        jb      short .column_st4
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    mmA,mmD
        sub     ecx, byte SIZEOF_MMWORD/4
        add     edi, byte 1*SIZEOF_MMWORD
.column_st4:
        cmp     ecx, byte SIZEOF_MMWORD/8
        jb      short .nextrow
        movd    DWORD [edi+0*SIZEOF_DWORD], mmA         ; final single pixel

%endif ; RGB_PIXELSIZE ; ---------------

        alignx  16,7

.nextrow:
        ; Restore the state saved at .rowloop (reverse push order).
        pop     ecx
        pop     esi
        pop     ebx
        pop     edx
        pop     edi
        pop     eax

        ; Step every row-pointer array cursor to its next entry.
        add     esi, byte SIZEOF_JSAMPROW
        add     ebx, byte SIZEOF_JSAMPROW
        add     edx, byte SIZEOF_JSAMPROW
        add     edi, byte SIZEOF_JSAMPROW       ; output_buf
        dec     eax                             ; num_rows
        jg      near .rowloop

        emms            ; empty MMX state

.return:
        ; Both early exits arrive here before any MMX instruction has run,
        ; so skipping the emms above is harmless for them.
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        mov     esp,ebp         ; esp <- aligned ebp
        pop     esp             ; esp <- frame base stored by the prologue
        pop     ebp             ; restore the caller's ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16