;
; jdcolext.asm - colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_ycc_rgb_convert_mmx(JDIMENSION out_width, JSAMPIMAGE input_buf,
;                           JDIMENSION input_row, JSAMPARRAY output_buf,
;                           int num_rows)
;
; Syntax:  NASM (Intel operand order), 32-bit x86, MMX.
; Args:    read from the stack relative to the frame base captured in eax
;          right after "push ebp" ([eax+8] is the first argument).
; Saved:   ebp, ebx, esi, edi are saved and restored here.
; Clobbers: eax, ecx, edx, flags, mm0-mm7 (emms is executed after the row
;          loop, so the FPU/MMX state is clean on return).
;
; Stack frame (addresses relative to the 8-byte-aligned ebp):
;   [ebp+0]            caller-side esp captured after "push ebp"
;   wk(0), wk(1)       two mmword scratch slots holding (B-Y)E / (B-Y)O
;   gotptr             slot for the GOT address (only meaningful in PIC
;                      builds; pushpic/movpic/get_GOT come from the include)
;
; Register roles inside .rowloop / .columnloop:
;   esi, ebx, edx      row-pointer cursors for input components 0, 1, 2
;                      (outer loop); dereferenced to inptr0/1/2 (inner loop)
;   edi                output row-pointer cursor / outptr
;   ecx                columns remaining in the current row
;   eax                rows remaining (outer loop); GOT base (inner loop)
;
; NOTE(review): every column iteration loads a full MMWORD from each input
; row even when fewer than SIZEOF_MMWORD columns remain; presumably the
; caller guarantees suitably padded row buffers -- confirm.  Stores to the
; output row, by contrast, are trimmed to the exact remaining size.
;
; mmA..mmH, the PW_*/PD_* constants, SCALEBITS and RGB_PIXELSIZE are not
; defined in this file; they are expected to come from the includer.
;

%define out_width(b)   (b) + 8          ; JDIMENSION out_width
%define input_buf(b)   (b) + 12         ; JSAMPIMAGE input_buf
%define input_row(b)   (b) + 16         ; JDIMENSION input_row
%define output_buf(b)  (b) + 20         ; JSAMPARRAY output_buf
%define num_rows(b)    (b) + 24         ; int num_rows

%define original_ebp   ebp + 0
%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
                                        ; mmword wk[WK_NUM]
%define WK_NUM         2
%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_mmx)

EXTN(jsimd_ycc_rgb_convert_mmx):
    push        ebp
    mov         eax, esp                ; eax = frame base: [eax]=saved ebp,
                                        ; arguments start at [eax+8]
    sub         esp, byte 4             ; room to stash eax below
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax              ; [original_ebp] = frame base
    mov         ebp, esp                ; ebp = aligned ebp
    lea         esp, [wk(0)]            ; reserve wk[WK_NUM]
    pushpic     eax                     ; make a room for GOT address
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address
    movpic      POINTER [gotptr], ebx   ; save GOT address

    mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
    test        ecx, ecx
    jz          near .return            ; nothing to do for zero width

    push        ecx                     ; ecx is reused for input_row below

    mov         edi, JSAMPIMAGE [input_buf(eax)]
    mov         ecx, JDIMENSION [input_row(eax)]
    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]  ; &input_buf[0][input_row]
    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]  ; &input_buf[1][input_row]
    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]  ; &input_buf[2][input_row]

    pop         ecx                     ; ecx = num_cols again

    mov         edi, JSAMPARRAY [output_buf(eax)]
    mov         eax, INT [num_rows(eax)]
    test        eax, eax
    jle         near .return            ; signed: num_rows <= 0 -> no work
    alignx      16, 7
.rowloop:
    ; Save the per-row cursors; the column loop reuses all six registers.
    push        eax
    push        edi
    push        edx
    push        ebx
    push        esi
    push        ecx                     ; col

    mov         esi, JSAMPROW [esi]     ; inptr0
    mov         ebx, JSAMPROW [ebx]     ; inptr1
    mov         edx, JSAMPROW [edx]     ; inptr2
    mov         edi, JSAMPROW [edi]     ; outptr
    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
    alignx      16, 7
.columnloop:

    movq        mm5, MMWORD [ebx]       ; mm5=Cb(01234567)
    movq        mm1, MMWORD [edx]       ; mm1=Cr(01234567)

    ; Build the low-byte mask and the -128 bias, then split Cb/Cr into
    ; even/odd samples widened to words and re-centred around zero.
    pcmpeqw     mm4, mm4
    pcmpeqw     mm7, mm7
    psrlw       mm4, BYTE_BIT
    psllw       mm7, 7                  ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
    movq        mm0, mm4                ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}

    pand        mm4, mm5                ; mm4=Cb(0246)=CbE
    psrlw       mm5, BYTE_BIT           ; mm5=Cb(1357)=CbO
    pand        mm0, mm1                ; mm0=Cr(0246)=CrE
    psrlw       mm1, BYTE_BIT           ; mm1=Cr(1357)=CrO

    paddw       mm4, mm7                ; 0xFF80 == -128 per word lane
    paddw       mm5, mm7
    paddw       mm0, mm7
    paddw       mm1, mm7

    ; (Original)
    ; R = Y                + 1.40200 * Cr
    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
    ; B = Y + 1.77200 * Cb
    ;
    ; (This implementation)
    ; R = Y                + 0.40200 * Cr + Cr
    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    ; B = Y - 0.22800 * Cb + Cb + Cb

    movq        mm2, mm4                ; mm2=CbE
    movq        mm3, mm5                ; mm3=CbO
    paddw       mm4, mm4                ; mm4=2*CbE
    paddw       mm5, mm5                ; mm5=2*CbO
    movq        mm6, mm0                ; mm6=CrE
    movq        mm7, mm1                ; mm7=CrO
    paddw       mm0, mm0                ; mm0=2*CrE
    paddw       mm1, mm1                ; mm1=2*CrO

    ; pmulhw keeps the high word of the product; the inputs were doubled so
    ; that the following "+1, >>1" rounds the result instead of truncating.
    pmulhw      mm4, [GOTOFF(eax,PW_MF0228)]  ; mm4=(2*CbE * -FIX(0.22800))
    pmulhw      mm5, [GOTOFF(eax,PW_MF0228)]  ; mm5=(2*CbO * -FIX(0.22800))
    pmulhw      mm0, [GOTOFF(eax,PW_F0402)]   ; mm0=(2*CrE * FIX(0.40200))
    pmulhw      mm1, [GOTOFF(eax,PW_F0402)]   ; mm1=(2*CrO * FIX(0.40200))

    paddw       mm4, [GOTOFF(eax,PW_ONE)]
    paddw       mm5, [GOTOFF(eax,PW_ONE)]
    psraw       mm4, 1                  ; mm4=(CbE * -FIX(0.22800))
    psraw       mm5, 1                  ; mm5=(CbO * -FIX(0.22800))
    paddw       mm0, [GOTOFF(eax,PW_ONE)]
    paddw       mm1, [GOTOFF(eax,PW_ONE)]
    psraw       mm0, 1                  ; mm0=(CrE * FIX(0.40200))
    psraw       mm1, 1                  ; mm1=(CrO * FIX(0.40200))

    paddw       mm4, mm2
    paddw       mm5, mm3
    paddw       mm4, mm2                ; mm4=(CbE * FIX(1.77200))=(B-Y)E
    paddw       mm5, mm3                ; mm5=(CbO * FIX(1.77200))=(B-Y)O
    paddw       mm0, mm6                ; mm0=(CrE * FIX(1.40200))=(R-Y)E
    paddw       mm1, mm7                ; mm1=(CrO * FIX(1.40200))=(R-Y)O

    ; All eight MMX registers are needed for the G term, so park (B-Y).
    movq        MMWORD [wk(0)], mm4     ; wk(0)=(B-Y)E
    movq        MMWORD [wk(1)], mm5     ; wk(1)=(B-Y)O

    ; Interleave Cb/Cr words so one pmaddwd yields Cb*c0 + Cr*c1 per dword.
    movq        mm4, mm2
    movq        mm5, mm3
    punpcklwd   mm2, mm6
    punpckhwd   mm4, mm6
    pmaddwd     mm2, [GOTOFF(eax,PW_MF0344_F0285)]
    pmaddwd     mm4, [GOTOFF(eax,PW_MF0344_F0285)]
    punpcklwd   mm3, mm7
    punpckhwd   mm5, mm7
    pmaddwd     mm3, [GOTOFF(eax,PW_MF0344_F0285)]
    pmaddwd     mm5, [GOTOFF(eax,PW_MF0344_F0285)]

    paddd       mm2, [GOTOFF(eax,PD_ONEHALF)]
    paddd       mm4, [GOTOFF(eax,PD_ONEHALF)]
    psrad       mm2, SCALEBITS
    psrad       mm4, SCALEBITS
    paddd       mm3, [GOTOFF(eax,PD_ONEHALF)]
    paddd       mm5, [GOTOFF(eax,PD_ONEHALF)]
    psrad       mm3, SCALEBITS
    psrad       mm5, SCALEBITS

    packssdw    mm2, mm4        ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
    packssdw    mm3, mm5        ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
    psubw       mm2, mm6        ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
    psubw       mm3, mm7        ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O

    movq        mm5, MMWORD [esi]       ; mm5=Y(01234567)

    pcmpeqw     mm4, mm4
    psrlw       mm4, BYTE_BIT           ; mm4={0xFF 0x00 0xFF 0x00 ..}
    pand        mm4, mm5                ; mm4=Y(0246)=YE
    psrlw       mm5, BYTE_BIT           ; mm5=Y(1357)=YO

    ; Add luma and saturate each channel to unsigned bytes.
    paddw       mm0, mm4                ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
    paddw       mm1, mm5                ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
    packuswb    mm0, mm0                ; mm0=(R0 R2 R4 R6 ** ** ** **)
    packuswb    mm1, mm1                ; mm1=(R1 R3 R5 R7 ** ** ** **)

    paddw       mm2, mm4                ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
    paddw       mm3, mm5                ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
    packuswb    mm2, mm2                ; mm2=(G0 G2 G4 G6 ** ** ** **)
    packuswb    mm3, mm3                ; mm3=(G1 G3 G5 G7 ** ** ** **)

    paddw       mm4, MMWORD [wk(0)]     ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
    paddw       mm5, MMWORD [wk(1)]     ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
    packuswb    mm4, mm4                ; mm4=(B0 B2 B4 B6 ** ** ** **)
    packuswb    mm5, mm5                ; mm5=(B1 B3 B5 B7 ** ** ** **)

%if RGB_PIXELSIZE == 3  ; ---------------

    ; In the lane diagrams below "cp" means byte c of pixel p, in output
    ; byte order.
    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
    ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)

    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
    punpcklbw   mmE, mmB                ; mmE=(20 01 22 03 24 05 26 07)
    punpcklbw   mmD, mmF                ; mmD=(11 21 13 23 15 25 17 27)

    movq        mmG, mmA
    movq        mmH, mmA
    punpcklwd   mmA, mmE                ; mmA=(00 10 20 01 02 12 22 03)
    punpckhwd   mmG, mmE                ; mmG=(04 14 24 05 06 16 26 07)

    psrlq       mmH, 2*BYTE_BIT         ; mmH=(02 12 04 14 06 16 -- --)
    psrlq       mmE, 2*BYTE_BIT         ; mmE=(22 03 24 05 26 07 -- --)

    movq        mmC, mmD
    movq        mmB, mmD
    punpcklwd   mmD, mmH                ; mmD=(11 21 02 12 13 23 04 14)
    punpckhwd   mmC, mmH                ; mmC=(15 25 06 16 17 27 -- --)

    psrlq       mmB, 2*BYTE_BIT         ; mmB=(13 23 15 25 17 27 -- --)

    movq        mmF, mmE
    punpcklwd   mmE, mmB                ; mmE=(22 03 13 23 24 05 15 25)
    punpckhwd   mmF, mmB                ; mmF=(26 07 17 27 -- -- -- --)

    punpckldq   mmA, mmD                ; mmA=(00 10 20 01 11 21 02 12)
    punpckldq   mmE, mmG                ; mmE=(22 03 13 23 04 14 24 05)
    punpckldq   mmC, mmF                ; mmC=(15 25 06 16 26 07 17 27)

    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st16      ; partial group: trimmed stores

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC

    sub         ecx, byte SIZEOF_MMWORD
    jz          short .nextrow

    add         esi, byte SIZEOF_MMWORD  ; inptr0
    add         ebx, byte SIZEOF_MMWORD  ; inptr1
    add         edx, byte SIZEOF_MMWORD  ; inptr2
    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
    jmp         near .columnloop
    alignx      16, 7

    ; Tail: from here on ecx counts remaining output BYTES, and the pending
    ; data is always shifted down into mmA before the next smaller store.
.column_st16:
    lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
    cmp         ecx, byte 2*SIZEOF_MMWORD
    jb          short .column_st8
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
    movq        mmA, mmC
    sub         ecx, byte 2*SIZEOF_MMWORD
    add         edi, byte 2*SIZEOF_MMWORD
    jmp         short .column_st4
.column_st8:
    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st4
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        mmA, mmE
    sub         ecx, byte SIZEOF_MMWORD
    add         edi, byte SIZEOF_MMWORD
.column_st4:
    movd        eax, mmA                ; eax (GOT base) is dead from here;
                                        ; .nextrow pops a fresh eax
    cmp         ecx, byte SIZEOF_DWORD
    jb          short .column_st2
    mov         DWORD [edi+0*SIZEOF_DWORD], eax
    psrlq       mmA, DWORD_BIT
    movd        eax, mmA
    sub         ecx, byte SIZEOF_DWORD
    add         edi, byte SIZEOF_DWORD
.column_st2:
    cmp         ecx, byte SIZEOF_WORD
    jb          short .column_st1
    mov         WORD [edi+0*SIZEOF_WORD], ax
    shr         eax, WORD_BIT
    sub         ecx, byte SIZEOF_WORD
    add         edi, byte SIZEOF_WORD
.column_st1:
    cmp         ecx, byte SIZEOF_BYTE
    jb          short .nextrow
    mov         BYTE [edi+0*SIZEOF_BYTE], al

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; mm6/mm7 are free again here and supply the fourth (filler) byte.
%ifdef RGBX_FILLER_0XFF
    pcmpeqb     mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
    pcmpeqb     mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
%else
    pxor        mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
    pxor        mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
%endif
    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
    ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)

    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
    punpcklbw   mmE, mmG                ; mmE=(20 30 22 32 24 34 26 36)
    punpcklbw   mmB, mmD                ; mmB=(01 11 03 13 05 15 07 17)
    punpcklbw   mmF, mmH                ; mmF=(21 31 23 33 25 35 27 37)

    movq        mmC, mmA
    punpcklwd   mmA, mmE                ; mmA=(00 10 20 30 02 12 22 32)
    punpckhwd   mmC, mmE                ; mmC=(04 14 24 34 06 16 26 36)
    movq        mmG, mmB
    punpcklwd   mmB, mmF                ; mmB=(01 11 21 31 03 13 23 33)
    punpckhwd   mmG, mmF                ; mmG=(05 15 25 35 07 17 27 37)

    movq        mmD, mmA
    punpckldq   mmA, mmB                ; mmA=(00 10 20 30 01 11 21 31)
    punpckhdq   mmD, mmB                ; mmD=(02 12 22 32 03 13 23 33)
    movq        mmH, mmC
    punpckldq   mmC, mmG                ; mmC=(04 14 24 34 05 15 25 35)
    punpckhdq   mmH, mmG                ; mmH=(06 16 26 36 07 17 27 37)

    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st16      ; partial group: trimmed stores

    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mmH

    sub         ecx, byte SIZEOF_MMWORD
    jz          short .nextrow

    add         esi, byte SIZEOF_MMWORD  ; inptr0
    add         ebx, byte SIZEOF_MMWORD  ; inptr1
    add         edx, byte SIZEOF_MMWORD  ; inptr2
    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
    jmp         near .columnloop
    alignx      16, 7

    ; Tail: ecx still counts remaining PIXELS here; each step stores half
    ; as many pixels as the previous one and shifts pending data into mmA.
.column_st16:
    cmp         ecx, byte SIZEOF_MMWORD/2
    jb          short .column_st8
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
    movq        mmA, mmC
    movq        mmD, mmH
    sub         ecx, byte SIZEOF_MMWORD/2
    add         edi, byte 2*SIZEOF_MMWORD
.column_st8:
    cmp         ecx, byte SIZEOF_MMWORD/4
    jb          short .column_st4
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        mmA, mmD
    sub         ecx, byte SIZEOF_MMWORD/4
    add         edi, byte 1*SIZEOF_MMWORD
.column_st4:
    cmp         ecx, byte SIZEOF_MMWORD/8
    jb          short .nextrow
    movd        DWORD [edi+0*SIZEOF_DWORD], mmA

%endif  ; RGB_PIXELSIZE ; ---------------

    alignx      16, 7

.nextrow:
    ; Restore the per-row cursors in the reverse order of the pushes.
    pop         ecx
    pop         esi
    pop         ebx
    pop         edx
    pop         edi
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    add         ebx, byte SIZEOF_JSAMPROW
    add         edx, byte SIZEOF_JSAMPROW
    add         edi, byte SIZEOF_JSAMPROW  ; output_buf
    dec         eax                        ; num_rows
    jg          near .rowloop

    emms                                ; empty MMX state

.return:
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- frame base saved at [ebp]
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32