;
; jdcolext.asm - colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_ycc_rgb_convert_mmx(JDIMENSION out_width, JSAMPIMAGE input_buf,
;                           JDIMENSION input_row, JSAMPARRAY output_buf,
;                           int num_rows)
;
; Syntax/target: NASM, 32-bit x86 with MMX; all arguments are read from the
; stack at the offsets defined below.
;
; In:
;   out_width   number of columns to convert per row; 0 returns immediately
;   input_buf   array of three row-pointer arrays, read as [0]=Y, [1]=Cb,
;               [2]=Cr (per the inptr0/1/2 loads in the column loop)
;   input_row   index of the first row to read in each of those arrays
;   output_buf  array of output row pointers, consumed from its first entry
;   num_rows    number of rows to convert; <= 0 returns immediately
;
; Register roles once the row loop is entered:
;   eax = rows remaining (saved on the stack inside the loop, where eax is
;         reused for the GOT address and as scratch in the tail stores)
;   ecx = columns remaining in the current row
;   esi / ebx / edx = cursors into the Y / Cb / Cr row-pointer arrays
;                     (inside the loop: the current row pointers themselves)
;   edi = cursor into output_buf (inside the loop: the output row pointer)
;
; Preserved: ebx, esi, edi, ebp, esp.  Not preserved: eax, ecx, edx, flags,
; mm0-mm7.  emms is executed before returning whenever MMX code has run.
;
; Each column-loop pass loads one full MMWORD from each input row before it
; checks how many columns are left, so the final partial group still reads a
; whole MMWORD of input.
; NOTE(review): this presumably relies on the caller padding the input rows
; to a multiple of SIZEOF_MMWORD samples -- confirm against the caller.
;
; mmA..mmH are not defined in this file.
; NOTE(review): presumably register aliases from jcolsamp.inc that select the
; component order of the output pixel format -- confirm.
;

; Argument offsets relative to b = the frame base captured in eax by the
; prologue ([b+0] = saved ebp, first argument at [b+8]).
%define out_width(b)   (b) + 8          ; JDIMENSION out_width
%define input_buf(b)   (b) + 12         ; JSAMPIMAGE input_buf
%define input_row(b)   (b) + 16         ; JDIMENSION input_row
%define output_buf(b)  (b) + 20         ; JSAMPARRAY output_buf
%define num_rows(b)    (b) + 24         ; int num_rows

; Locals, addressed from the realigned ebp set up by the prologue.
; [ebp + 0] holds the frame base that the epilogue restores with "pop esp".
%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
                                        ; mmword wk[WK_NUM]
%define WK_NUM        2
; gotptr is the pointer-sized slot immediately below wk(0).
%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_mmx)

EXTN(jsimd_ycc_rgb_convert_mmx):
    ; ---- Prologue: build an MMWORD-aligned frame -------------------------
    push        ebp
    mov         eax, esp                    ; eax = original ebp
                                            ; (frame base for the arg macros)
    sub         esp, byte 4                 ; slot for the saved frame base
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax                  ; [original_ebp] = frame base
    mov         ebp, esp                    ; ebp = aligned ebp
    lea         esp, [wk(0)]                ; reserve wk[WK_NUM] below ebp
    pushpic     eax                         ; make room for GOT address
                                            ; NOTE(review): pushpic is a macro
                                            ; defined elsewhere -- presumably
                                            ; a push only in PIC builds
    push        ebx
;   push        ecx                         ; need not be preserved
;   push        edx                         ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                         ; get GOT address
    movpic      POINTER [gotptr], ebx       ; save GOT address

    ; ---- Load arguments ---------------------------------------------------
    mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
    test        ecx, ecx
    jz          near .return                ; nothing to do for zero width
                                            ; (no MMX used yet, emms skipped)

    push        ecx                         ; ecx is borrowed for input_row

    mov         edi, JSAMPIMAGE [input_buf(eax)]
    mov         ecx, JDIMENSION [input_row(eax)]
    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]  ; esi = &input_buf[0][input_row]
    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]  ; ebx = &input_buf[1][input_row]
    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]  ; edx = &input_buf[2][input_row]

    pop         ecx                         ; ecx = num_cols again

    mov         edi, JSAMPARRAY [output_buf(eax)]
    mov         eax, INT [num_rows(eax)]    ; last use of eax as the arg base
    test        eax, eax
    jle         near .return                ; signed: zero or negative rows
    alignx      16, 7
.rowloop:
    ; Save the per-row cursors and counters; .nextrow pops them in exactly
    ; the reverse order.
    push        eax
    push        edi
    push        edx
    push        ebx
    push        esi
    push        ecx                     ; col

    mov         esi, JSAMPROW [esi]     ; inptr0
    mov         ebx, JSAMPROW [ebx]     ; inptr1
    mov         edx, JSAMPROW [edx]     ; inptr2
    mov         edi, JSAMPROW [edi]     ; outptr
    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
    alignx      16, 7
.columnloop:
    ; One pass converts one MMWORD of samples from each input row.

    movq        mm5, MMWORD [ebx]       ; mm5=Cb(01234567)
    movq        mm1, MMWORD [edx]       ; mm1=Cr(01234567)

    ; Build the two constants in registers: a low-byte mask per word, and
    ; 0xFF80, which is -128 as a signed word.
    pcmpeqw     mm4, mm4
    pcmpeqw     mm7, mm7
    psrlw       mm4, BYTE_BIT
    psllw       mm7, 7                  ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
    movq        mm0, mm4                ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}

    ; Split each chroma row into even and odd samples, widened to words.
    pand        mm4, mm5                ; mm4=Cb(0246)=CbE
    psrlw       mm5, BYTE_BIT           ; mm5=Cb(1357)=CbO
    pand        mm0, mm1                ; mm0=Cr(0246)=CrE
    psrlw       mm1, BYTE_BIT           ; mm1=Cr(1357)=CrO

    ; Adding 0xFF80 subtracts 128 from every chroma word.
    paddw       mm4, mm7
    paddw       mm5, mm7
    paddw       mm0, mm7
    paddw       mm1, mm7

    ; (Original)
    ; R = Y                + 1.40200 * Cr
    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
    ; B = Y + 1.77200 * Cb
    ;
    ; (This implementation)
    ; R = Y                + 0.40200 * Cr + Cr
    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    ; B = Y - 0.22800 * Cb + Cb + Cb

    ; Keep unscaled copies in mm2/mm3/mm6/mm7 and double the working copies;
    ; the doubled product is halved again, with rounding, after pmulhw.
    movq        mm2, mm4                ; mm2=CbE
    movq        mm3, mm5                ; mm3=CbO
    paddw       mm4, mm4                ; mm4=2*CbE
    paddw       mm5, mm5                ; mm5=2*CbO
    movq        mm6, mm0                ; mm6=CrE
    movq        mm7, mm1                ; mm7=CrO
    paddw       mm0, mm0                ; mm0=2*CrE
    paddw       mm1, mm1                ; mm1=2*CrO

    pmulhw      mm4, [GOTOFF(eax,PW_MF0228)]  ; mm4=(2*CbE * -FIX(0.22800))
    pmulhw      mm5, [GOTOFF(eax,PW_MF0228)]  ; mm5=(2*CbO * -FIX(0.22800))
    pmulhw      mm0, [GOTOFF(eax,PW_F0402)]   ; mm0=(2*CrE * FIX(0.40200))
    pmulhw      mm1, [GOTOFF(eax,PW_F0402)]   ; mm1=(2*CrO * FIX(0.40200))

    ; (x + 1) >> 1: halve the doubled product, rounding to nearest.
    paddw       mm4, [GOTOFF(eax,PW_ONE)]
    paddw       mm5, [GOTOFF(eax,PW_ONE)]
    psraw       mm4, 1                  ; mm4=(CbE * -FIX(0.22800))
    psraw       mm5, 1                  ; mm5=(CbO * -FIX(0.22800))
    paddw       mm0, [GOTOFF(eax,PW_ONE)]
    paddw       mm1, [GOTOFF(eax,PW_ONE)]
    psraw       mm0, 1                  ; mm0=(CrE * FIX(0.40200))
    psraw       mm1, 1                  ; mm1=(CrO * FIX(0.40200))

    ; Add back the whole multiples: Cb twice for B, Cr once for R.
    paddw       mm4, mm2
    paddw       mm5, mm3
    paddw       mm4, mm2                ; mm4=(CbE * FIX(1.77200))=(B-Y)E
    paddw       mm5, mm3                ; mm5=(CbO * FIX(1.77200))=(B-Y)O
    paddw       mm0, mm6                ; mm0=(CrE * FIX(1.40200))=(R-Y)E
    paddw       mm1, mm7                ; mm1=(CrO * FIX(1.40200))=(R-Y)O

    ; Spill (B-Y): mm4/mm5 are needed as scratch for the green computation.
    movq        MMWORD [wk(0)], mm4     ; wk(0)=(B-Y)E
    movq        MMWORD [wk(1)], mm5     ; wk(1)=(B-Y)O

    ; Green: interleave Cb and Cr words so that each pmaddwd produces one
    ; dword sum Cb*c0 + Cr*c1 from the constant pair PW_MF0344_F0285.
    movq        mm4, mm2
    movq        mm5, mm3
    punpcklwd   mm2, mm6
    punpckhwd   mm4, mm6
    pmaddwd     mm2, [GOTOFF(eax,PW_MF0344_F0285)]
    pmaddwd     mm4, [GOTOFF(eax,PW_MF0344_F0285)]
    punpcklwd   mm3, mm7
    punpckhwd   mm5, mm7
    pmaddwd     mm3, [GOTOFF(eax,PW_MF0344_F0285)]
    pmaddwd     mm5, [GOTOFF(eax,PW_MF0344_F0285)]

    ; Add PD_ONEHALF, then shift the fixed-point sums down by SCALEBITS.
    paddd       mm2, [GOTOFF(eax,PD_ONEHALF)]
    paddd       mm4, [GOTOFF(eax,PD_ONEHALF)]
    psrad       mm2, SCALEBITS
    psrad       mm4, SCALEBITS
    paddd       mm3, [GOTOFF(eax,PD_ONEHALF)]
    paddd       mm5, [GOTOFF(eax,PD_ONEHALF)]
    psrad       mm3, SCALEBITS
    psrad       mm5, SCALEBITS

    packssdw    mm2, mm4                ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
    packssdw    mm3, mm5                ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
    psubw       mm2, mm6                ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
    psubw       mm3, mm7                ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O

    movq        mm5, MMWORD [esi]       ; mm5=Y(01234567)

    pcmpeqw     mm4, mm4
    psrlw       mm4, BYTE_BIT           ; mm4={0xFF 0x00 0xFF 0x00 ..}
    pand        mm4, mm5                ; mm4=Y(0246)=YE
    psrlw       mm5, BYTE_BIT           ; mm5=Y(1357)=YO

    ; Add luma to each colour difference; packuswb saturates every word to
    ; an unsigned byte.
    paddw       mm0, mm4                ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
    paddw       mm1, mm5                ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
    packuswb    mm0, mm0                ; mm0=(R0 R2 R4 R6 ** ** ** **)
    packuswb    mm1, mm1                ; mm1=(R1 R3 R5 R7 ** ** ** **)

    paddw       mm2, mm4                ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
    paddw       mm3, mm5                ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
    packuswb    mm2, mm2                ; mm2=(G0 G2 G4 G6 ** ** ** **)
    packuswb    mm3, mm3                ; mm3=(G1 G3 G5 G7 ** ** ** **)

    paddw       mm4, MMWORD [wk(0)]     ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
    paddw       mm5, MMWORD [wk(1)]     ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
    packuswb    mm4, mm4                ; mm4=(B0 B2 B4 B6 ** ** ** **)
    packuswb    mm5, mm5                ; mm5=(B1 B3 B5 B7 ** ** ** **)

%if RGB_PIXELSIZE == 3  ; ---------------

    ; In the lane diagrams below each entry is <component><pixel>: "12" is
    ; component 1 of pixel 2, "**" is don't-care, "--" is shifted-in zeros.

    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
    ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)

    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
    punpcklbw   mmE, mmB                ; mmE=(20 01 22 03 24 05 26 07)
    punpcklbw   mmD, mmF                ; mmD=(11 21 13 23 15 25 17 27)

    movq        mmG, mmA
    movq        mmH, mmA
    punpcklwd   mmA, mmE                ; mmA=(00 10 20 01 02 12 22 03)
    punpckhwd   mmG, mmE                ; mmG=(04 14 24 05 06 16 26 07)

    psrlq       mmH, 2*BYTE_BIT         ; mmH=(02 12 04 14 06 16 -- --)
    psrlq       mmE, 2*BYTE_BIT         ; mmE=(22 03 24 05 26 07 -- --)

    movq        mmC, mmD
    movq        mmB, mmD
    punpcklwd   mmD, mmH                ; mmD=(11 21 02 12 13 23 04 14)
    punpckhwd   mmC, mmH                ; mmC=(15 25 06 16 17 27 -- --)

    psrlq       mmB, 2*BYTE_BIT         ; mmB=(13 23 15 25 17 27 -- --)

    movq        mmF, mmE
    punpcklwd   mmE, mmB                ; mmE=(22 03 13 23 24 05 15 25)
    punpckhwd   mmF, mmB                ; mmF=(26 07 17 27 -- -- -- --)

    punpckldq   mmA, mmD                ; mmA=(00 10 20 01 11 21 02 12)
    punpckldq   mmE, mmG                ; mmE=(22 03 13 23 04 14 24 05)
    punpckldq   mmC, mmF                ; mmC=(15 25 06 16 26 07 17 27)

    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st16      ; partial final group: tail stores

    ; Full group: three MMWORDs of packed 3-byte pixels.
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC

    sub         ecx, byte SIZEOF_MMWORD
    jz          short .nextrow

    add         esi, byte SIZEOF_MMWORD   ; inptr0
    add         ebx, byte SIZEOF_MMWORD   ; inptr1
    add         edx, byte SIZEOF_MMWORD   ; inptr2
    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
    jmp         near .columnloop
    alignx      16, 7

    ; Tail: ecx becomes the number of output BYTES still to write, which are
    ; stored in decreasing chunk sizes (2 MMWORDs, 1 MMWORD, dword, word,
    ; byte).  mmA always holds the next unwritten data; eax is scratch here
    ; and is restored by the pop in .nextrow.
.column_st16:
    lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
    cmp         ecx, byte 2*SIZEOF_MMWORD
    jb          short .column_st8
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
    movq        mmA, mmC
    sub         ecx, byte 2*SIZEOF_MMWORD
    add         edi, byte 2*SIZEOF_MMWORD
    jmp         short .column_st4       ; less than one MMWORD can remain
.column_st8:
    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st4
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        mmA, mmE
    sub         ecx, byte SIZEOF_MMWORD
    add         edi, byte SIZEOF_MMWORD
.column_st4:
    movd        eax, mmA                ; eax = low dword of pending data
    cmp         ecx, byte SIZEOF_DWORD
    jb          short .column_st2
    mov         dword [edi+0*SIZEOF_DWORD], eax
    psrlq       mmA, DWORD_BIT
    movd        eax, mmA                ; eax = next dword of pending data
    sub         ecx, byte SIZEOF_DWORD
    add         edi, byte SIZEOF_DWORD
.column_st2:
    cmp         ecx, byte SIZEOF_WORD
    jb          short .column_st1
    mov         word [edi+0*SIZEOF_WORD], ax
    shr         eax, WORD_BIT
    sub         ecx, byte SIZEOF_WORD
    add         edi, byte SIZEOF_WORD
.column_st1:
    cmp         ecx, byte SIZEOF_BYTE
    jb          short .nextrow
    mov         byte [edi+0*SIZEOF_BYTE], al

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; Fourth (filler) byte of every pixel: all ones when RGBX_FILLER_0XFF is
    ; defined, zero otherwise.
%ifdef RGBX_FILLER_0XFF
    pcmpeqb     mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
    pcmpeqb     mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
%else
    pxor        mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
    pxor        mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
%endif
    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
    ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)

    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
    punpcklbw   mmE, mmG                ; mmE=(20 30 22 32 24 34 26 36)
    punpcklbw   mmB, mmD                ; mmB=(01 11 03 13 05 15 07 17)
    punpcklbw   mmF, mmH                ; mmF=(21 31 23 33 25 35 27 37)

    movq        mmC, mmA
    punpcklwd   mmA, mmE                ; mmA=(00 10 20 30 02 12 22 32)
    punpckhwd   mmC, mmE                ; mmC=(04 14 24 34 06 16 26 36)
    movq        mmG, mmB
    punpcklwd   mmB, mmF                ; mmB=(01 11 21 31 03 13 23 33)
    punpckhwd   mmG, mmF                ; mmG=(05 15 25 35 07 17 27 37)

    movq        mmD, mmA
    punpckldq   mmA, mmB                ; mmA=(00 10 20 30 01 11 21 31)
    punpckhdq   mmD, mmB                ; mmD=(02 12 22 32 03 13 23 33)
    movq        mmH, mmC
    punpckldq   mmC, mmG                ; mmC=(04 14 24 34 05 15 25 35)
    punpckhdq   mmH, mmG                ; mmH=(06 16 26 36 07 17 27 37)

    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st16      ; partial final group: tail stores

    ; Full group: four MMWORDs, two 4-byte pixels each.
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
    movq        MMWORD [edi+3*SIZEOF_MMWORD], mmH

    sub         ecx, byte SIZEOF_MMWORD
    jz          short .nextrow

    add         esi, byte SIZEOF_MMWORD   ; inptr0
    add         ebx, byte SIZEOF_MMWORD   ; inptr1
    add         edx, byte SIZEOF_MMWORD   ; inptr2
    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
    jmp         near .columnloop
    alignx      16, 7

    ; Tail: ecx stays a COLUMN count here.  Each MMWORD holds two pixels, so
    ; the thresholds SIZEOF_MMWORD/2, /4 and /8 columns correspond to stores
    ; of two MMWORDs, one MMWORD and one dword.
    ; NOTE(review): that mapping assumes SIZEOF_MMWORD == 8, consistent with
    ; the "align to 64 bits" note in the prologue -- confirm.
.column_st16:
    cmp         ecx, byte SIZEOF_MMWORD/2
    jb          short .column_st8
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
    movq        mmA, mmC
    movq        mmD, mmH
    sub         ecx, byte SIZEOF_MMWORD/2
    add         edi, byte 2*SIZEOF_MMWORD
.column_st8:
    cmp         ecx, byte SIZEOF_MMWORD/4
    jb          short .column_st4
    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    movq        mmA, mmD
    sub         ecx, byte SIZEOF_MMWORD/4
    add         edi, byte 1*SIZEOF_MMWORD
.column_st4:
    cmp         ecx, byte SIZEOF_MMWORD/8
    jb          short .nextrow
    movd        dword [edi+0*SIZEOF_DWORD], mmA

%endif  ; RGB_PIXELSIZE ; ---------------

    alignx      16, 7

.nextrow:
    ; Restore what .rowloop saved, then step every row-pointer cursor to the
    ; next row.
    pop         ecx
    pop         esi
    pop         ebx
    pop         edx
    pop         edi
    pop         eax

    add         esi, byte SIZEOF_JSAMPROW
    add         ebx, byte SIZEOF_JSAMPROW
    add         edx, byte SIZEOF_JSAMPROW
    add         edi, byte SIZEOF_JSAMPROW  ; output_buf
    dec         eax                        ; num_rows
    jg          near .rowloop              ; loop while rows remain (signed)

    emms                                ; empty MMX state

.return:
    ; Epilogue: also the target of both early exits, which are taken before
    ; any MMX instruction has executed.
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32