;
; jdmrgext.asm - merged upsampling/color conversion (SSE2)
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2012, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;
; All four arguments are read from the stack, relative to the frame base
; that the prologue keeps in eax (see the %define's below).
;
; Rows used:
;   inptr0 = input_buf[0][in_row_group_ctr]   (Y)
;   inptr1 = input_buf[1][in_row_group_ctr]   (Cb)
;   inptr2 = input_buf[2][in_row_group_ctr]   (Cr)
;   outptr = output_buf[0]
;
; Register roles inside the loops:
;   esi = inptr0, ebx = inptr1, edx = inptr2, edi = outptr
;   ecx = number of output columns still to be written
;   eax = GOT address while the chroma terms are computed; afterwards
;         al = Yctr (the RGB_PIXELSIZE == 3 tail also uses eax as scratch)
;   wk(0) / wk(1) / wk(2) = (B-Y)H / (R-Y)H / (G-Y)H, kept for the 2nd Y pass
;
; Each .columnloop iteration reads 16 Cb and 16 Cr samples and pairs them
; with up to 2 x 16 Y samples (two trips through .Yloop).  All three input
; rows are read with movdqa in whole 16-byte units, including the final
; partial group, so they must be 16-byte aligned and readable up to the
; next multiple of 16 bytes.  The output row may be unaligned.
;
; ebx, esi and edi are saved and restored; ecx and edx are not.
;

%define output_width(b)      (b) + 8    ; JDIMENSION output_width
%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM        3
%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)

EXTN(jsimd_h2v1_merged_upsample_sse2):
    ; ---- Prologue: build a 16-byte-aligned frame for the wk[] scratch ----
    push        ebp
    mov         eax, esp                     ; eax = unaligned frame base
                                             ; (address of the saved ebp);
                                             ; arguments live at eax+8 ...
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax                   ; [original_ebp] = frame base,
                                             ; popped back into esp on exit
    mov         ebp, esp                     ; ebp = aligned ebp
    lea         esp, [wk(0)]                 ; reserve wk[0..WK_NUM-1]
    pushpic     eax                     ; make a room for GOT address
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address
    movpic      POINTER [gotptr], ebx   ; save GOT address

    mov         ecx, JDIMENSION [output_width(eax)]  ; col
    test        ecx, ecx
    jz          near .return            ; nothing to do for a zero width

    push        ecx                     ; ecx is needed as the row index below

    ; ---- Resolve the three input row pointers and the output row ----
    mov         edi, JSAMPIMAGE [input_buf(eax)]
    mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
    mov         edi, JSAMPARRAY [output_buf(eax)]
    mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]  ; inptr0
    mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]  ; inptr1
    mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]  ; inptr2
    mov         edi, JSAMPROW [edi]                      ; outptr

    pop         ecx                     ; col

    alignx      16, 7
.columnloop:
    ; ---- Per 16 chroma samples: compute (R-Y), (G-Y), (B-Y) ----
    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)

    movdqa      xmm6, XMMWORD [ebx]     ; xmm6=Cb(0123456789ABCDEF)
    movdqa      xmm7, XMMWORD [edx]     ; xmm7=Cr(0123456789ABCDEF)

    pxor        xmm1, xmm1              ; xmm1=(all 0's)
    pcmpeqw     xmm3, xmm3
    psllw       xmm3, 7                 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
                                        ; i.e. -128 in every word

    ; Zero-extend the chroma bytes to words (low and high halves).
    movdqa      xmm4, xmm6
    punpckhbw   xmm6, xmm1              ; xmm6=Cb(89ABCDEF)=CbH
    punpcklbw   xmm4, xmm1              ; xmm4=Cb(01234567)=CbL
    movdqa      xmm0, xmm7
    punpckhbw   xmm7, xmm1              ; xmm7=Cr(89ABCDEF)=CrH
    punpcklbw   xmm0, xmm1              ; xmm0=Cr(01234567)=CrL

    ; Re-center the chroma samples around zero (subtract 128).
    paddw       xmm6, xmm3
    paddw       xmm4, xmm3
    paddw       xmm7, xmm3
    paddw       xmm0, xmm3

    ; (Original)
    ; R = Y                + 1.40200 * Cr
    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
    ; B = Y + 1.77200 * Cb
    ;
    ; (This implementation)
    ; R = Y                + 0.40200 * Cr + Cr
    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    ; B = Y - 0.22800 * Cb + Cb + Cb

    movdqa      xmm5, xmm6              ; xmm5=CbH
    movdqa      xmm2, xmm4              ; xmm2=CbL
    paddw       xmm6, xmm6              ; xmm6=2*CbH
    paddw       xmm4, xmm4              ; xmm4=2*CbL
    movdqa      xmm1, xmm7              ; xmm1=CrH
    movdqa      xmm3, xmm0              ; xmm3=CrL
    paddw       xmm7, xmm7              ; xmm7=2*CrH
    paddw       xmm0, xmm0              ; xmm0=2*CrL

    ; The inputs were doubled above so that pmulhw keeps one extra bit;
    ; that bit is rounded away by the "+ PW_ONE, >> 1" sequence below.
    ; NOTE(review): PW_* / PD_* constants are defined outside this file;
    ; their values are assumed to match the names - confirm.
    pmulhw      xmm6, [GOTOFF(eax,PW_MF0228)]  ; xmm6=(2*CbH * -FIX(0.22800))
    pmulhw      xmm4, [GOTOFF(eax,PW_MF0228)]  ; xmm4=(2*CbL * -FIX(0.22800))
    pmulhw      xmm7, [GOTOFF(eax,PW_F0402)]   ; xmm7=(2*CrH * FIX(0.40200))
    pmulhw      xmm0, [GOTOFF(eax,PW_F0402)]   ; xmm0=(2*CrL * FIX(0.40200))

    paddw       xmm6, [GOTOFF(eax,PW_ONE)]
    paddw       xmm4, [GOTOFF(eax,PW_ONE)]
    psraw       xmm6, 1                 ; xmm6=(CbH * -FIX(0.22800))
    psraw       xmm4, 1                 ; xmm4=(CbL * -FIX(0.22800))
    paddw       xmm7, [GOTOFF(eax,PW_ONE)]
    paddw       xmm0, [GOTOFF(eax,PW_ONE)]
    psraw       xmm7, 1                 ; xmm7=(CrH * FIX(0.40200))
    psraw       xmm0, 1                 ; xmm0=(CrL * FIX(0.40200))

    paddw       xmm6, xmm5
    paddw       xmm4, xmm2
    paddw       xmm6, xmm5              ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
    paddw       xmm4, xmm2              ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
    paddw       xmm7, xmm1              ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
    paddw       xmm0, xmm3              ; xmm0=(CrL * FIX(1.40200))=(R-Y)L

    ; The high-half results are parked in wk[] for the second Y pass.
    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H

    ; Green: interleave Cb/Cr words so one pmaddwd yields
    ; Cb * -FIX(0.344) + Cr * FIX(0.285) per pixel as a 32-bit sum.
    movdqa      xmm6, xmm5
    movdqa      xmm7, xmm2
    punpcklwd   xmm5, xmm1
    punpckhwd   xmm6, xmm1
    pmaddwd     xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
    pmaddwd     xmm6, [GOTOFF(eax,PW_MF0344_F0285)]
    punpcklwd   xmm2, xmm3
    punpckhwd   xmm7, xmm3
    pmaddwd     xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
    pmaddwd     xmm7, [GOTOFF(eax,PW_MF0344_F0285)]

    ; Round and descale the 32-bit sums.
    paddd       xmm5, [GOTOFF(eax,PD_ONEHALF)]
    paddd       xmm6, [GOTOFF(eax,PD_ONEHALF)]
    psrad       xmm5, SCALEBITS
    psrad       xmm6, SCALEBITS
    paddd       xmm2, [GOTOFF(eax,PD_ONEHALF)]
    paddd       xmm7, [GOTOFF(eax,PD_ONEHALF)]
    psrad       xmm2, SCALEBITS
    psrad       xmm7, SCALEBITS

    packssdw    xmm5, xmm6              ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
    packssdw    xmm2, xmm7              ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
    psubw       xmm5, xmm1              ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
    psubw       xmm2, xmm3              ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L

    movdqa      XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H

    ; First pass enters with xmm0/xmm2/xmm4 = (R-Y)L/(G-Y)L/(B-Y)L.
    ; Writing al overwrites the low byte of the GOT address in eax; it is
    ; reloaded at the top of .columnloop.
    mov         al, 2                   ; Yctr
    jmp         short .Yloop_1st
    alignx      16, 7

.Yloop_2nd:
    ; Second pass: reload the high-half chroma terms saved above.
    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
    alignx      16, 7

.Yloop_1st:
    ; ---- Per 16 luma samples: add Y and saturate to bytes ----
    movdqa      xmm7, XMMWORD [esi]     ; xmm7=Y(0123456789ABCDEF)

    ; Split Y into even and odd samples; each chroma word then serves one
    ; even and one odd output pixel (the 2:1 horizontal upsampling).
    pcmpeqw     xmm6, xmm6
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    pand        xmm6, xmm7              ; xmm6=Y(02468ACE)=YE
    psrlw       xmm7, BYTE_BIT          ; xmm7=Y(13579BDF)=YO

    movdqa      xmm1, xmm0              ; xmm1=xmm0=(R-Y)(L/H)
    movdqa      xmm3, xmm2              ; xmm3=xmm2=(G-Y)(L/H)
    movdqa      xmm5, xmm4              ; xmm5=xmm4=(B-Y)(L/H)

    paddw       xmm0, xmm6              ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
    paddw       xmm1, xmm7              ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)

    paddw       xmm2, xmm6              ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
    paddw       xmm3, xmm7              ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)

    paddw       xmm4, xmm6              ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
    paddw       xmm5, xmm7              ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)

    ; From here on the code names registers xmmA..xmmH instead of
    ; xmm0..xmm7.  NOTE(review): these aliases are not defined in this
    ; file; presumably the include maps them onto xmm0..xmm7 according to
    ; the output component order - confirm.  In the layout comments below,
    ; "cn" means output component c (0..3) of pixel n (0..F).

%if RGB_PIXELSIZE == 3  ; ---------------

    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)

    ; Interleave the six planar half-registers into 48 bytes of packed
    ; 3-byte pixels, ending up in xmmA, xmmD, xmmF (in output order).
    punpcklbw   xmmA, xmmC              ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmB              ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
    punpcklbw   xmmD, xmmF              ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)

    movdqa      xmmG, xmmA
    movdqa      xmmH, xmmA
    punpcklwd   xmmA, xmmE              ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
    punpckhwd   xmmG, xmmE              ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)

    psrldq      xmmH, 2                 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
    psrldq      xmmE, 2                 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)

    movdqa      xmmC, xmmD
    movdqa      xmmB, xmmD
    punpcklwd   xmmD, xmmH              ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
    punpckhwd   xmmC, xmmH              ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)

    psrldq      xmmB, 2                 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)

    movdqa      xmmF, xmmE
    punpcklwd   xmmE, xmmB              ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
    punpckhwd   xmmF, xmmB              ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)

    pshufd      xmmH, xmmA, 0x4E        ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
    movdqa      xmmB, xmmE
    punpckldq   xmmA, xmmD              ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
    punpckldq   xmmE, xmmH              ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
    punpckhdq   xmmD, xmmB              ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)

    pshufd      xmmH, xmmG, 0x4E        ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
    movdqa      xmmB, xmmF
    punpckldq   xmmG, xmmC              ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
    punpckldq   xmmF, xmmH              ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
    punpckhdq   xmmC, xmmB              ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)

    punpcklqdq  xmmA, xmmE              ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    punpcklqdq  xmmD, xmmG              ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    punpcklqdq  xmmF, xmmC              ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    ; Fewer than 16 columns left: take the partial-store path.
    cmp         ecx, byte SIZEOF_XMMWORD
    jb          short .column_st32

    ; Full group of 16 pixels: non-temporal stores if outptr is aligned.
    test        edi, SIZEOF_XMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0:
    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         ecx, byte SIZEOF_XMMWORD
    jz          near .endcolumn         ; exactly finished

    add         esi, byte SIZEOF_XMMWORD  ; inptr0
    dec         al                        ; Yctr
    jnz         near .Yloop_2nd           ; 2nd half of this chroma group

    ; Both Y halves done: advance chroma and start the next group.
    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
    add         edx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .columnloop
    alignx      16, 7

.column_st32:
    ; Partial group: ecx = columns left (1..15).  Convert it to a byte
    ; count and store 32 / 16 / 8 / 4 / 2 / 1 bytes as the count allows,
    ; shifting the remaining data down into xmmA after each store.
    lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
    cmp         ecx, byte 2*SIZEOF_XMMWORD
    jb          short .column_st16
    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmF
    sub         ecx, byte 2*SIZEOF_XMMWORD
    jmp         short .column_st15          ; fewer than 16 bytes remain
.column_st16:
    cmp         ecx, byte SIZEOF_XMMWORD
    jb          short .column_st15
    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
    add         edi, byte SIZEOF_XMMWORD    ; outptr
    movdqa      xmmA, xmmD
    sub         ecx, byte SIZEOF_XMMWORD
.column_st15:
    ; Store the lower 8 bytes of xmmA to the output when it has enough
    ; space.
    cmp         ecx, byte SIZEOF_MMWORD
    jb          short .column_st7
    movq        XMM_MMWORD [edi], xmmA
    add         edi, byte SIZEOF_MMWORD
    sub         ecx, byte SIZEOF_MMWORD
    psrldq      xmmA, SIZEOF_MMWORD
.column_st7:
    ; Store the lower 4 bytes of xmmA to the output when it has enough
    ; space.
    cmp         ecx, byte SIZEOF_DWORD
    jb          short .column_st3
    movd        XMM_DWORD [edi], xmmA
    add         edi, byte SIZEOF_DWORD
    sub         ecx, byte SIZEOF_DWORD
    psrldq      xmmA, SIZEOF_DWORD
.column_st3:
    ; Store the lower 2 bytes of eax to the output when it has enough
    ; space.
    movd        eax, xmmA               ; eax = last (up to) 3 bytes
    cmp         ecx, byte SIZEOF_WORD
    jb          short .column_st1
    mov         word [edi], ax
    add         edi, byte SIZEOF_WORD
    sub         ecx, byte SIZEOF_WORD
    shr         eax, 16
.column_st1:
    ; Store the lower 1 byte of eax to the output when it has enough
    ; space.
    test        ecx, ecx
    jz          short .endcolumn
    mov         byte [edi], al

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; Fill the fourth (X) component with all ones or all zeros.
%ifdef RGBX_FILLER_0XFF
    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%else
    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%endif
    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)

    ; Interleave the eight planar half-registers into 64 bytes of packed
    ; 4-byte pixels, ending up in xmmA, xmmD, xmmC, xmmH (in output order).
    punpcklbw   xmmA, xmmC              ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmG              ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
    punpcklbw   xmmB, xmmD              ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
    punpcklbw   xmmF, xmmH              ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)

    movdqa      xmmC, xmmA
    punpcklwd   xmmA, xmmE              ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
    punpckhwd   xmmC, xmmE              ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
    movdqa      xmmG, xmmB
    punpcklwd   xmmB, xmmF              ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
    punpckhwd   xmmG, xmmF              ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpckldq   xmmA, xmmB              ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    punpckhdq   xmmD, xmmB              ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    movdqa      xmmH, xmmC
    punpckldq   xmmC, xmmG              ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    punpckhdq   xmmH, xmmG              ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    ; Fewer than 16 columns left: take the partial-store path.
    cmp         ecx, byte SIZEOF_XMMWORD
    jb          short .column_st32

    ; Full group of 16 pixels: non-temporal stores if outptr is aligned.
    test        edi, SIZEOF_XMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
    movntdq     XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
    movdqu      XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0:
    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         ecx, byte SIZEOF_XMMWORD
    jz          near .endcolumn         ; exactly finished

    add         esi, byte SIZEOF_XMMWORD  ; inptr0
    dec         al                        ; Yctr
    jnz         near .Yloop_2nd           ; 2nd half of this chroma group

    ; Both Y halves done: advance chroma and start the next group.
    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
    add         edx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .columnloop
    alignx      16, 7

.column_st32:
    ; Partial group: ecx = pixels left (1..15).  Store 8 / 4 / 2 / 1
    ; pixels as the count allows, moving the remaining data down into
    ; xmmA (and xmmD) after each store.
    cmp         ecx, byte SIZEOF_XMMWORD/2
    jb          short .column_st16
    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmC
    movdqa      xmmD, xmmH
    sub         ecx, byte SIZEOF_XMMWORD/2
.column_st16:
    cmp         ecx, byte SIZEOF_XMMWORD/4
    jb          short .column_st15
    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
    add         edi, byte SIZEOF_XMMWORD    ; outptr
    movdqa      xmmA, xmmD
    sub         ecx, byte SIZEOF_XMMWORD/4
.column_st15:
    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
    ; space.
    cmp         ecx, byte SIZEOF_XMMWORD/8
    jb          short .column_st7
    movq        XMM_MMWORD [edi], xmmA
    add         edi, byte SIZEOF_XMMWORD/8*4
    sub         ecx, byte SIZEOF_XMMWORD/8
    psrldq      xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
    ; space.
    test        ecx, ecx
    jz          short .endcolumn
    movd        XMM_DWORD [edi], xmmA

%endif  ; RGB_PIXELSIZE ; ---------------

.endcolumn:
    sfence                              ; flush the write buffer
                                        ; (orders the movntdq stores above)

.return:
    ; ---- Epilogue: undo the aligned frame built in the prologue ----
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;
; Implemented as two calls to jsimd_h2v1_merged_upsample_sse2, one per
; output row.  A temporary 3-entry input_buf array is built on the stack:
;
;   [ebx+0] = address of input_buf[0][in_row_group_ctr]     (1st call)
;             address of input_buf[0][in_row_group_ctr] + 1 row entry
;                                                            (2nd call)
;   [ebx+4] = input_buf[1]
;   [ebx+8] = input_buf[2]
;
; The callee indexes every component with in_row_group_ctr again, so it
; reads Y rows 2*in_row_group_ctr and 2*in_row_group_ctr+1 while both
; calls share chroma row in_row_group_ctr.  The output_buf argument is
; advanced by one row entry for the second call.
;
; The argument area is reused for the second call, and esi/edi/ebx are
; expected to survive the first call; the callee saves and restores those
; three registers.  ebx, esi and edi are preserved for this routine's
; caller; ecx and edx are not.
;

%define output_width(b)      (b) + 8    ; JDIMENSION output_width
%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)

EXTN(jsimd_h2v2_merged_upsample_sse2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    ; NOTE(review): output_width is declared JDIMENSION but is loaded with
    ; the POINTER size specifier; presumably both are dword here - confirm.
    mov         eax, POINTER [output_width(ebp)]

    mov         edi, JSAMPIMAGE [input_buf(ebp)]
    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
    mov         edi, JSAMPARRAY [output_buf(ebp)]
    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]  ; pre-offset the Y row array

    ; Temporary input_buf[] for the callee (pushed last-to-first).
    push        edx                     ; inptr2
    push        ebx                     ; inptr1
    push        esi                     ; inptr00
    mov         ebx, esp                ; ebx = &temporary input_buf[0]

    ; Callee arguments, pushed right to left.
    push        edi                     ; output_buf (outptr0)
    push        ecx                     ; in_row_group_ctr
    push        ebx                     ; input_buf
    push        eax                     ; output_width

    call        near EXTN(jsimd_h2v1_merged_upsample_sse2)

    ; Patch the stacked arguments in place for the second output row.
    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
    mov         POINTER [ebx+0*SIZEOF_POINTER], esi  ; temporary input_buf[0]
    mov         POINTER [ebx-1*SIZEOF_POINTER], edi  ; stacked output_buf arg

    call        near EXTN(jsimd_h2v1_merged_upsample_sse2)

    add         esp, byte 7*SIZEOF_DWORD  ; drop 4 args + 3 array entries

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32