;
; jdmrgext.asm - merged upsampling/color conversion (SSE2)
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2012, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
;                                  JSAMPIMAGE input_buf,
;                                  JDIMENSION in_row_group_ctr,
;                                  JSAMPARRAY output_buf);
;
; Target: 32-bit x86, stack-passed arguments (read at +8..+20 from the
; address of the saved ebp).
;
; Register roles inside the loops:
;   esi = inptr0 (Y row)        ebx = inptr1 (Cb row)
;   edx = inptr2 (Cr row)       edi = outptr
;   ecx = col (output pixels still to produce)
;   eax = GOT address while converting chroma; al = Yctr afterwards
;
; Each pass of .columnloop converts 16 Cb/Cr samples; each of them is shared
; by two Y samples, so the Y loop runs twice (low half, then high half) and
; emits 16 pixels per run.
;
; Preserves ebx, esi, edi and ebp; eax, ecx, edx and xmm0-xmm7 are clobbered.
; The input rows are read with movdqa, so they must be 16-byte aligned.
; NOTE(review): rows are always read in 16-byte units, so the caller
; presumably pads them to a multiple of 16 samples - confirm.
;

%define output_width(b)         (b)+8   ; JDIMENSION output_width
%define input_buf(b)            (b)+12  ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b)     (b)+16  ; JDIMENSION in_row_group_ctr
%define output_buf(b)           (b)+20  ; JSAMPARRAY output_buf

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          3
%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr

        align   16
        global  EXTN(jsimd_h2v1_merged_upsample_sse2)

EXTN(jsimd_h2v1_merged_upsample_sse2):
        push    ebp
        mov     eax,esp                         ; eax -> saved ebp (argument base)
        sub     esp, byte 4                     ; room for the pointer stored below
        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [esp],eax                       ; [original_ebp] = address of saved ebp
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [wk(0)]                    ; reserve wk[WK_NUM] below ebp
        pushpic eax                             ; make room for the GOT address
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx                             ; get GOT address
        movpic  POINTER [gotptr], ebx           ; save GOT address

        mov     ecx, JDIMENSION [output_width(eax)]     ; col
        test    ecx,ecx
        jz      near .return                    ; nothing to do for zero width

        push    ecx                             ; ecx is needed as the row index

        mov     edi, JSAMPIMAGE [input_buf(eax)]
        mov     ecx, JDIMENSION [in_row_group_ctr(eax)]
        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
        mov     edi, JSAMPARRAY [output_buf(eax)]
        mov     esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]         ; inptr0
        mov     ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]         ; inptr1
        mov     edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]         ; inptr2
        mov     edi, JSAMPROW [edi]                             ; outptr

        pop     ecx                             ; col

        alignx  16,7
.columnloop:
        movpic  eax, POINTER [gotptr]           ; load GOT address (eax)

        movdqa    xmm6, XMMWORD [ebx]           ; xmm6=Cb(0123456789ABCDEF)
        movdqa    xmm7, XMMWORD [edx]           ; xmm7=Cr(0123456789ABCDEF)

        pxor      xmm1,xmm1                     ; xmm1=(all 0's)
        pcmpeqw   xmm3,xmm3
        psllw     xmm3,7                        ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}

        movdqa    xmm4,xmm6
        punpckhbw xmm6,xmm1                     ; xmm6=Cb(89ABCDEF)=CbH
        punpcklbw xmm4,xmm1                     ; xmm4=Cb(01234567)=CbL
        movdqa    xmm0,xmm7
        punpckhbw xmm7,xmm1                     ; xmm7=Cr(89ABCDEF)=CrH
        punpcklbw xmm0,xmm1                     ; xmm0=Cr(01234567)=CrL

        ; 0xFF80 is -128 as a signed word: re-centre the chroma around zero.
        paddw     xmm6,xmm3
        paddw     xmm4,xmm3
        paddw     xmm7,xmm3
        paddw     xmm0,xmm3

        ; (Original)
        ; R = Y                + 1.40200 * Cr
        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
        ; B = Y + 1.77200 * Cb
        ;
        ; (This implementation)
        ; R = Y                + 0.40200 * Cr + Cr
        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
        ; B = Y - 0.22800 * Cb + Cb + Cb

        movdqa  xmm5,xmm6                       ; xmm5=CbH
        movdqa  xmm2,xmm4                       ; xmm2=CbL
        paddw   xmm6,xmm6                       ; xmm6=2*CbH
        paddw   xmm4,xmm4                       ; xmm4=2*CbL
        movdqa  xmm1,xmm7                       ; xmm1=CrH
        movdqa  xmm3,xmm0                       ; xmm3=CrL
        paddw   xmm7,xmm7                       ; xmm7=2*CrH
        paddw   xmm0,xmm0                       ; xmm0=2*CrL

        pmulhw  xmm6,[GOTOFF(eax,PW_MF0228)]    ; xmm6=(2*CbH * -FIX(0.22800))
        pmulhw  xmm4,[GOTOFF(eax,PW_MF0228)]    ; xmm4=(2*CbL * -FIX(0.22800))
        pmulhw  xmm7,[GOTOFF(eax,PW_F0402)]     ; xmm7=(2*CrH * FIX(0.40200))
        pmulhw  xmm0,[GOTOFF(eax,PW_F0402)]     ; xmm0=(2*CrL * FIX(0.40200))

        ; Add one and shift right by one: halve the doubled products with
        ; rounding.
        paddw   xmm6,[GOTOFF(eax,PW_ONE)]
        paddw   xmm4,[GOTOFF(eax,PW_ONE)]
        psraw   xmm6,1                          ; xmm6=(CbH * -FIX(0.22800))
        psraw   xmm4,1                          ; xmm4=(CbL * -FIX(0.22800))
        paddw   xmm7,[GOTOFF(eax,PW_ONE)]
        paddw   xmm0,[GOTOFF(eax,PW_ONE)]
        psraw   xmm7,1                          ; xmm7=(CrH * FIX(0.40200))
        psraw   xmm0,1                          ; xmm0=(CrL * FIX(0.40200))

        paddw   xmm6,xmm5
        paddw   xmm4,xmm2
        paddw   xmm6,xmm5                       ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
        paddw   xmm4,xmm2                       ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
        paddw   xmm7,xmm1                       ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
        paddw   xmm0,xmm3                       ; xmm0=(CrL * FIX(1.40200))=(R-Y)L

        ; The high halves are parked in wk[] until the second Y pass.
        movdqa  XMMWORD [wk(0)], xmm6           ; wk(0)=(B-Y)H
        movdqa  XMMWORD [wk(1)], xmm7           ; wk(1)=(R-Y)H

        ; Interleave Cb with Cr so that each pmaddwd yields one 32-bit
        ; (Cb * -FIX(0.344) + Cr * FIX(0.285)) sum per sample.
        movdqa    xmm6,xmm5
        movdqa    xmm7,xmm2
        punpcklwd xmm5,xmm1
        punpckhwd xmm6,xmm1
        pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
        pmaddwd   xmm6,[GOTOFF(eax,PW_MF0344_F0285)]
        punpcklwd xmm2,xmm3
        punpckhwd xmm7,xmm3
        pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
        pmaddwd   xmm7,[GOTOFF(eax,PW_MF0344_F0285)]

        paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
        paddd     xmm6,[GOTOFF(eax,PD_ONEHALF)]
        psrad     xmm5,SCALEBITS
        psrad     xmm6,SCALEBITS
        paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
        paddd     xmm7,[GOTOFF(eax,PD_ONEHALF)]
        psrad     xmm2,SCALEBITS
        psrad     xmm7,SCALEBITS

        packssdw  xmm5,xmm6     ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
        packssdw  xmm2,xmm7     ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
        psubw     xmm5,xmm1     ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
        psubw     xmm2,xmm3     ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L

        movdqa  XMMWORD [wk(2)], xmm5           ; wk(2)=(G-Y)H

        ; From here on the GOT address in eax is no longer needed; al is
        ; reused as the Y pass counter.
        mov     al,2                            ; Yctr
        jmp     short .Yloop_1st                ; low halves are already in xmm0/2/4
        alignx  16,7

.Yloop_2nd:
        movdqa  xmm0, XMMWORD [wk(1)]           ; xmm0=(R-Y)H
        movdqa  xmm2, XMMWORD [wk(2)]           ; xmm2=(G-Y)H
        movdqa  xmm4, XMMWORD [wk(0)]           ; xmm4=(B-Y)H
        alignx  16,7

.Yloop_1st:
        movdqa  xmm7, XMMWORD [esi]             ; xmm7=Y(0123456789ABCDEF)

        pcmpeqw xmm6,xmm6
        psrlw   xmm6,BYTE_BIT                   ; xmm6={0xFF 0x00 0xFF 0x00 ..}
        pand    xmm6,xmm7                       ; xmm6=Y(02468ACE)=YE
        psrlw   xmm7,BYTE_BIT                   ; xmm7=Y(13579BDF)=YO

        movdqa  xmm1,xmm0                       ; xmm1=xmm0=(R-Y)(L/H)
        movdqa  xmm3,xmm2                       ; xmm3=xmm2=(G-Y)(L/H)
        movdqa  xmm5,xmm4                       ; xmm5=xmm4=(B-Y)(L/H)

        ; packuswb saturates each sum to the 0..255 sample range.
        paddw     xmm0,xmm6             ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
        paddw     xmm1,xmm7             ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
        packuswb  xmm0,xmm0             ; xmm0=R(02468ACE********)
        packuswb  xmm1,xmm1             ; xmm1=R(13579BDF********)

        paddw     xmm2,xmm6             ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
        paddw     xmm3,xmm7             ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
        packuswb  xmm2,xmm2             ; xmm2=G(02468ACE********)
        packuswb  xmm3,xmm3             ; xmm3=G(13579BDF********)

        paddw     xmm4,xmm6             ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
        paddw     xmm5,xmm7             ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
        packuswb  xmm4,xmm4             ; xmm4=B(02468ACE********)
        packuswb  xmm5,xmm5             ; xmm5=B(13579BDF********)

%if RGB_PIXELSIZE == 3 ; ---------------

        ; In the layout comments below, each two-digit entry is
        ; (output byte within the pixel)(pixel index 0..F).

        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
        ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)

        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
        punpcklbw xmmE,xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
        punpcklbw xmmD,xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)

        movdqa    xmmG,xmmA
        movdqa    xmmH,xmmA
        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
        punpckhwd xmmG,xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)

        psrldq    xmmH,2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
        psrldq    xmmE,2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)

        movdqa    xmmC,xmmD
        movdqa    xmmB,xmmD
        punpcklwd xmmD,xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
        punpckhwd xmmC,xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)

        psrldq    xmmB,2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)

        movdqa    xmmF,xmmE
        punpcklwd xmmE,xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
        punpckhwd xmmF,xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)

        pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
        movdqa    xmmB,xmmE
        punpckldq xmmA,xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
        punpckldq xmmE,xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
        punpckhdq xmmD,xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)

        pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
        movdqa    xmmB,xmmF
        punpckldq xmmG,xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
        punpckldq xmmF,xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
        punpckhdq xmmC,xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)

        punpcklqdq xmmA,xmmE    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
        punpcklqdq xmmD,xmmG    ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
        punpcklqdq xmmF,xmmC    ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

        cmp     ecx, byte SIZEOF_XMMWORD
        jb      short .column_st32              ; fewer than 16 pixels left: partial store

        test    edi, SIZEOF_XMMWORD-1
        jnz     short .out1
        ; --(aligned)-------------------
        movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
        movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
        movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
        jmp     short .out0
.out1:  ; --(unaligned)-----------------
        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
        movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0:
        add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        sub     ecx, byte SIZEOF_XMMWORD
        jz      near .endcolumn

        add     esi, byte SIZEOF_XMMWORD        ; inptr0
        dec     al                              ; Yctr
        jnz     near .Yloop_2nd

        add     ebx, byte SIZEOF_XMMWORD        ; inptr1
        add     edx, byte SIZEOF_XMMWORD        ; inptr2
        jmp     near .columnloop
        alignx  16,7

        ; Partial store: ecx becomes the number of output BYTES remaining
        ; (fewer than 48) and is written in progressively smaller pieces.
.column_st32:
        lea     ecx, [ecx+ecx*2]                ; imul ecx, RGB_PIXELSIZE
        cmp     ecx, byte 2*SIZEOF_XMMWORD
        jb      short .column_st16
        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
        movdqa  xmmA,xmmF
        sub     ecx, byte 2*SIZEOF_XMMWORD
        jmp     short .column_st15
.column_st16:
        cmp     ecx, byte SIZEOF_XMMWORD
        jb      short .column_st15
        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
        add     edi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA,xmmD
        sub     ecx, byte SIZEOF_XMMWORD
.column_st15:
        ; Store the lower 8 bytes of xmmA to the output when it has enough
        ; space.
        cmp     ecx, byte SIZEOF_MMWORD
        jb      short .column_st7
        movq    XMM_MMWORD [edi], xmmA
        add     edi, byte SIZEOF_MMWORD
        sub     ecx, byte SIZEOF_MMWORD
        psrldq  xmmA, SIZEOF_MMWORD
.column_st7:
        ; Store the lower 4 bytes of xmmA to the output when it has enough
        ; space.
        cmp     ecx, byte SIZEOF_DWORD
        jb      short .column_st3
        movd    XMM_DWORD [edi], xmmA
        add     edi, byte SIZEOF_DWORD
        sub     ecx, byte SIZEOF_DWORD
        psrldq  xmmA, SIZEOF_DWORD
.column_st3:
        ; Store the lower 2 bytes of eax to the output when it has enough
        ; space.
        movd    eax, xmmA
        cmp     ecx, byte SIZEOF_WORD
        jb      short .column_st1
        mov     WORD [edi], ax
        add     edi, byte SIZEOF_WORD
        sub     ecx, byte SIZEOF_WORD
        shr     eax, 16
.column_st1:
        ; Store the lower 1 byte of eax to the output when it has enough
        ; space.
        test    ecx, ecx
        jz      short .endcolumn
        mov     BYTE [edi], al

%else ; RGB_PIXELSIZE == 4 ; -----------

%ifdef RGBX_FILLER_0XFF
        pcmpeqb   xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
        pcmpeqb   xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
%else
        pxor      xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
        pxor      xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
%endif
        ; In the layout comments below, each two-digit entry is
        ; (output byte within the pixel)(pixel index 0..F).

        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
        ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)

        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
        punpcklbw xmmE,xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
        punpcklbw xmmB,xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
        punpcklbw xmmF,xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)

        movdqa    xmmC,xmmA
        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
        punpckhwd xmmC,xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
        movdqa    xmmG,xmmB
        punpcklwd xmmB,xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
        punpckhwd xmmG,xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)

        movdqa    xmmD,xmmA
        punpckldq xmmA,xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
        punpckhdq xmmD,xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
        movdqa    xmmH,xmmC
        punpckldq xmmC,xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
        punpckhdq xmmH,xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

        cmp     ecx, byte SIZEOF_XMMWORD
        jb      short .column_st32              ; fewer than 16 pixels left: partial store

        test    edi, SIZEOF_XMMWORD-1
        jnz     short .out1
        ; --(aligned)-------------------
        movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
        movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
        movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
        movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
        jmp     short .out0
.out1:  ; --(unaligned)-----------------
        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
        movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
        movdqu  XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0:
        add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        sub     ecx, byte SIZEOF_XMMWORD
        jz      near .endcolumn

        add     esi, byte SIZEOF_XMMWORD        ; inptr0
        dec     al                              ; Yctr
        jnz     near .Yloop_2nd

        add     ebx, byte SIZEOF_XMMWORD        ; inptr1
        add     edx, byte SIZEOF_XMMWORD        ; inptr2
        jmp     near .columnloop
        alignx  16,7

        ; Partial store: ecx stays a PIXEL count here (4 bytes per pixel),
        ; so 8 pixels = 2 xmmwords, 4 pixels = 1 xmmword, and so on.
.column_st32:
        cmp     ecx, byte SIZEOF_XMMWORD/2
        jb      short .column_st16
        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
        movdqa  xmmA,xmmC
        movdqa  xmmD,xmmH
        sub     ecx, byte SIZEOF_XMMWORD/2
.column_st16:
        cmp     ecx, byte SIZEOF_XMMWORD/4
        jb      short .column_st15
        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
        add     edi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA,xmmD
        sub     ecx, byte SIZEOF_XMMWORD/4
.column_st15:
        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
        ; space.
        cmp     ecx, byte SIZEOF_XMMWORD/8
        jb      short .column_st7
        movq    XMM_MMWORD [edi], xmmA
        add     edi, byte SIZEOF_XMMWORD/8*4
        sub     ecx, byte SIZEOF_XMMWORD/8
        psrldq  xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
        ; Store one pixel (4 bytes) of xmmA to the output when it has enough
        ; space.
        test    ecx, ecx
        jz      short .endcolumn
        movd    XMM_DWORD [edi], xmmA

%endif ; RGB_PIXELSIZE ; ---------------

.endcolumn:
        sfence          ; flush the write buffer (movntdq stores above)

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        mov     esp,ebp         ; esp <- aligned ebp
        pop     esp             ; esp <- address of saved ebp (stored at [ebp])
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
;                                  JSAMPIMAGE input_buf,
;                                  JDIMENSION in_row_group_ctr,
;                                  JSAMPARRAY output_buf);
;
; Implemented as two calls to jsimd_h2v1_merged_upsample_sse2, one per
; output row, sharing the same Cb/Cr rows.  A temporary three-entry
; input_buf is built on the stack for those calls:
;
;   [ebx+2*SIZEOF_POINTER] = inptr2 (input_buf[2])
;   [ebx+1*SIZEOF_POINTER] = inptr1 (input_buf[1])
;   [ebx+0*SIZEOF_POINTER] = inptr00, then inptr01
;   [ebx-1*SIZEOF_POINTER] = output_buf argument (outptr0, then outptr1)
;
; The Y entry is input_buf[0] advanced by in_row_group_ctr rows; the callee
; indexes it by in_row_group_ctr again, which selects Y row
; 2*in_row_group_ctr (and 2*in_row_group_ctr+1 on the second call).
;
; This relies on the callee preserving ebx, esi and edi (it saves and
; restores them) and on it leaving its stack arguments unmodified, so that
; the same four argument slots can be reused for the second call.
;

%define output_width(b)         (b)+8   ; JDIMENSION output_width
%define input_buf(b)            (b)+12  ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b)     (b)+16  ; JDIMENSION in_row_group_ctr
%define output_buf(b)           (b)+20  ; JSAMPARRAY output_buf

        align   16
        global  EXTN(jsimd_h2v2_merged_upsample_sse2)

EXTN(jsimd_h2v2_merged_upsample_sse2):
        push    ebp
        mov     ebp,esp
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     eax, JDIMENSION [output_width(ebp)]

        mov     edi, JSAMPIMAGE [input_buf(ebp)]
        mov     ecx, JDIMENSION [in_row_group_ctr(ebp)]
        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
        mov     edi, JSAMPARRAY [output_buf(ebp)]
        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]

        push    edx                             ; inptr2
        push    ebx                             ; inptr1
        push    esi                             ; inptr00
        mov     ebx,esp                         ; ebx -> temporary input_buf

        push    edi                             ; output_buf (outptr0)
        push    ecx                             ; in_row_group_ctr
        push    ebx                             ; input_buf
        push    eax                             ; output_width

        call    near EXTN(jsimd_h2v1_merged_upsample_sse2)

        add     esi, byte SIZEOF_JSAMPROW       ; inptr01
        add     edi, byte SIZEOF_JSAMPROW       ; outptr1
        mov     POINTER [ebx+0*SIZEOF_POINTER], esi     ; patch the Y entry
        mov     POINTER [ebx-1*SIZEOF_POINTER], edi     ; patch the output_buf argument

        call    near EXTN(jsimd_h2v1_merged_upsample_sse2)

        add     esp, byte 7*SIZEOF_DWORD        ; drop 4 arguments + 3 input_buf entries

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16