;
; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;
; Overview of the code below:
;   * Row in_row_group_ctr is taken from each of the three planes in
;     input_buf (inptr0 = Y, inptr1 = Cb, inptr2 = Cr); the output row is
;     output_buf[0].
;   * Each .columnloop iteration loads 16 Cb and 16 Cr samples and turns them
;     into (R-Y), (G-Y) and (B-Y) word vectors for a low and a high group of
;     8 chroma samples.
;   * The Y loop then runs up to twice per chroma load (al = Yctr = 2).  Each
;     pass loads 16 Y samples, splits them into even/odd columns and adds the
;     SAME chroma vector to both, which is what gives every chroma sample two
;     adjacent output pixels.  Pass 1 uses the low chroma group straight from
;     registers; pass 2 reloads the high group from wk(0..2).
;   * Input is always read in full 16-byte blocks with movdqa (which requires
;     16-byte-aligned addresses); only the final output store is trimmed to
;     the number of columns left in rcx.
;
; Register roles after setup:
;   rcx = output columns still to be written
;   rsi = inptr0 (Y), rbx = inptr1 (Cb), rdx = inptr2 (Cr), rdi = outptr
;   al  = Yctr;  rax is also used as scratch in the RGB_PIXELSIZE == 3 tail
;   rbp = 16-byte-aligned frame base; wk(i) are scratch slots just below it
;
; NOTE(review): collect_args / uncollect_args, GLOBAL_FUNCTION, EXTN, the
; PW_* / PD_* constants and xmmA..xmmH are defined outside this file.  The
; code modifies rsi, rdi, xmm6 and xmm7, which are callee-saved under the
; Microsoft x64 convention; presumably collect_args preserves them on WIN64 -
; confirm in the include files.
;

; Argument registers as expected after "collect_args 4":
; r10d = JDIMENSION output_width
; r11 = JSAMPIMAGE input_buf
; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf

%define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 3

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)

EXTN(jsimd_h2v1_merged_upsample_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = rsp after the push (address of the saved rbp)
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; remember the unaligned rsp for the epilogue
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]                 ; reserve WK_NUM xmmword slots below rbp
    collect_args 4
    push        rbx

    mov         ecx, r10d               ; col
    test        rcx, rcx
    jz          near .return            ; zero width: nothing to convert

    push        rcx                     ; rcx is reused as the row index below

    mov         rdi, r11
    mov         ecx, r12d
    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
    mov         rdi, r13
    mov         rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
    mov         rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
    mov         rdi, JSAMPROW [rdi]                      ; outptr

    pop         rcx                     ; col

.columnloop:

    movdqa      xmm6, XMMWORD [rbx]     ; xmm6=Cb(0123456789ABCDEF)
    movdqa      xmm7, XMMWORD [rdx]     ; xmm7=Cr(0123456789ABCDEF)

    pxor        xmm1, xmm1              ; xmm1=(all 0's)
    pcmpeqw     xmm3, xmm3
    psllw       xmm3, 7                 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    ; Zero-extend the chroma bytes to words (interleave with zero).
    movdqa      xmm4, xmm6
    punpckhbw   xmm6, xmm1              ; xmm6=Cb(89ABCDEF)=CbH
    punpcklbw   xmm4, xmm1              ; xmm4=Cb(01234567)=CbL
    movdqa      xmm0, xmm7
    punpckhbw   xmm7, xmm1              ; xmm7=Cr(89ABCDEF)=CrH
    punpcklbw   xmm0, xmm1              ; xmm0=Cr(01234567)=CrL

    ; 0xFF80 is -128 as a signed word, so these adds subtract 128 from every
    ; chroma sample.
    paddw       xmm6, xmm3
    paddw       xmm4, xmm3
    paddw       xmm7, xmm3
    paddw       xmm0, xmm3

    ; (Original)
    ; R = Y                + 1.40200 * Cr
    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
    ; B = Y + 1.77200 * Cb
    ;
    ; (This implementation)
    ; R = Y                + 0.40200 * Cr + Cr
    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    ; B = Y - 0.22800 * Cb + Cb + Cb

    movdqa      xmm5, xmm6              ; xmm5=CbH
    movdqa      xmm2, xmm4              ; xmm2=CbL
    paddw       xmm6, xmm6              ; xmm6=2*CbH
    paddw       xmm4, xmm4              ; xmm4=2*CbL
    movdqa      xmm1, xmm7              ; xmm1=CrH
    movdqa      xmm3, xmm0              ; xmm3=CrL
    paddw       xmm7, xmm7              ; xmm7=2*CrH
    paddw       xmm0, xmm0              ; xmm0=2*CrL

    ; pmulhw keeps the high 16 bits of each signed product.  The inputs were
    ; doubled above so that the "+ PW_ONE, >> 1" step below rounds the result
    ; (PW_ONE is presumably 1 in every word - defined outside this file).
    pmulhw      xmm6, [rel PW_MF0228]   ; xmm6=(2*CbH * -FIX(0.22800))
    pmulhw      xmm4, [rel PW_MF0228]   ; xmm4=(2*CbL * -FIX(0.22800))
    pmulhw      xmm7, [rel PW_F0402]    ; xmm7=(2*CrH * FIX(0.40200))
    pmulhw      xmm0, [rel PW_F0402]    ; xmm0=(2*CrL * FIX(0.40200))

    paddw       xmm6, [rel PW_ONE]
    paddw       xmm4, [rel PW_ONE]
    psraw       xmm6, 1                 ; xmm6=(CbH * -FIX(0.22800))
    psraw       xmm4, 1                 ; xmm4=(CbL * -FIX(0.22800))
    paddw       xmm7, [rel PW_ONE]
    paddw       xmm0, [rel PW_ONE]
    psraw       xmm7, 1                 ; xmm7=(CrH * FIX(0.40200))
    psraw       xmm0, 1                 ; xmm0=(CrL * FIX(0.40200))

    paddw       xmm6, xmm5
    paddw       xmm4, xmm2
    paddw       xmm6, xmm5              ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
    paddw       xmm4, xmm2              ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
    paddw       xmm7, xmm1              ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
    paddw       xmm0, xmm3              ; xmm0=(CrL * FIX(1.40200))=(R-Y)L

    ; Park the high-group results; xmm6/xmm7 are needed as temporaries below
    ; and the high group is only consumed on the second Y pass.
    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H

    ; Interleave Cb and Cr words so that each pmaddwd dword lane computes
    ; Cb * (-FIX(0.344)) + Cr * FIX(0.285) in 32-bit precision.
    movdqa      xmm6, xmm5
    movdqa      xmm7, xmm2
    punpcklwd   xmm5, xmm1
    punpckhwd   xmm6, xmm1
    pmaddwd     xmm5, [rel PW_MF0344_F0285]
    pmaddwd     xmm6, [rel PW_MF0344_F0285]
    punpcklwd   xmm2, xmm3
    punpckhwd   xmm7, xmm3
    pmaddwd     xmm2, [rel PW_MF0344_F0285]
    pmaddwd     xmm7, [rel PW_MF0344_F0285]

    ; Round and scale the 32-bit sums back down by SCALEBITS.
    paddd       xmm5, [rel PD_ONEHALF]
    paddd       xmm6, [rel PD_ONEHALF]
    psrad       xmm5, SCALEBITS
    psrad       xmm6, SCALEBITS
    paddd       xmm2, [rel PD_ONEHALF]
    paddd       xmm7, [rel PD_ONEHALF]
    psrad       xmm2, SCALEBITS
    psrad       xmm7, SCALEBITS

    packssdw    xmm5, xmm6              ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
    packssdw    xmm2, xmm7              ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
    psubw       xmm5, xmm1              ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
    psubw       xmm2, xmm3              ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L

    movdqa      XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H

    ; At this point xmm0/xmm2/xmm4 hold (R-Y)L/(G-Y)L/(B-Y)L, so the first Y
    ; pass can skip the reload done at .Yloop_2nd.
    mov         al, 2                   ; Yctr
    jmp         short .Yloop_1st

.Yloop_2nd:
    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H

.Yloop_1st:
    movdqa      xmm7, XMMWORD [rsi]     ; xmm7=Y(0123456789ABCDEF)

    ; Split Y into even and odd columns, each zero-extended to a word.
    pcmpeqw     xmm6, xmm6
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    pand        xmm6, xmm7              ; xmm6=Y(02468ACE)=YE
    psrlw       xmm7, BYTE_BIT          ; xmm7=Y(13579BDF)=YO

    movdqa      xmm1, xmm0              ; xmm1=xmm0=(R-Y)(L/H)
    movdqa      xmm3, xmm2              ; xmm3=xmm2=(G-Y)(L/H)
    movdqa      xmm5, xmm4              ; xmm5=xmm4=(B-Y)(L/H)

    ; packuswb saturates each signed word to the 0..255 byte range.
    paddw       xmm0, xmm6              ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
    paddw       xmm1, xmm7              ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)

    paddw       xmm2, xmm6              ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
    paddw       xmm3, xmm7              ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)

    paddw       xmm4, xmm6              ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
    paddw       xmm5, xmm7              ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)

    ; From here on the registers are referred to as xmmA..xmmH.  In the lane
    ; diagrams "cp" means component c of pixel p (e.g. 1A = component 1 of
    ; pixel 0xA).  NOTE(review): the xmmA..xmmH aliases are not defined in
    ; this file; presumably jcolsamp.inc maps them onto xmm0..xmm7 according
    ; to the configured output byte order - confirm there.

%if RGB_PIXELSIZE == 3  ; ---------------

    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)

    punpcklbw   xmmA, xmmC              ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmB              ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
    punpcklbw   xmmD, xmmF              ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)

    movdqa      xmmG, xmmA
    movdqa      xmmH, xmmA
    punpcklwd   xmmA, xmmE              ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
    punpckhwd   xmmG, xmmE              ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)

    psrldq      xmmH, 2                 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
    psrldq      xmmE, 2                 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)

    movdqa      xmmC, xmmD
    movdqa      xmmB, xmmD
    punpcklwd   xmmD, xmmH              ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
    punpckhwd   xmmC, xmmH              ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)

    psrldq      xmmB, 2                 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)

    movdqa      xmmF, xmmE
    punpcklwd   xmmE, xmmB              ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
    punpckhwd   xmmF, xmmB              ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)

    pshufd      xmmH, xmmA, 0x4E        ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
    movdqa      xmmB, xmmE
    punpckldq   xmmA, xmmD              ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
    punpckldq   xmmE, xmmH              ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
    punpckhdq   xmmD, xmmB              ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)

    pshufd      xmmH, xmmG, 0x4E        ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
    movdqa      xmmB, xmmF
    punpckldq   xmmG, xmmC              ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
    punpckldq   xmmF, xmmH              ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
    punpckhdq   xmmC, xmmB              ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)

    ; xmmA, xmmD, xmmF now hold the 48 output bytes (16 pixels * 3) in order.
    punpcklqdq  xmmA, xmmE              ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    punpcklqdq  xmmD, xmmG              ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    punpcklqdq  xmmF, xmmC              ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st32      ; fewer than 16 columns left: partial store

    test        rdi, SIZEOF_XMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         rcx, byte SIZEOF_XMMWORD
    jz          near .endcolumn

    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
    dec         al                        ; Yctr
    jnz         near .Yloop_2nd           ; second Y block for the same chroma

    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .columnloop

    ; Partial store: rcx (< SIZEOF_XMMWORD columns) is converted to a byte
    ; count and written out in 32/16/8/4/2/1-byte pieces, shifting the
    ; remaining data down into xmmA as it goes.
.column_st32:
    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
    cmp         rcx, byte 2*SIZEOF_XMMWORD
    jb          short .column_st16
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmF
    sub         rcx, byte 2*SIZEOF_XMMWORD
    jmp         short .column_st15
.column_st16:
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st15
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    add         rdi, byte SIZEOF_XMMWORD    ; outptr
    movdqa      xmmA, xmmD
    sub         rcx, byte SIZEOF_XMMWORD
.column_st15:
    ; Store the lower 8 bytes of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_MMWORD
    jb          short .column_st7
    movq        XMM_MMWORD [rdi], xmmA
    add         rdi, byte SIZEOF_MMWORD
    sub         rcx, byte SIZEOF_MMWORD
    psrldq      xmmA, SIZEOF_MMWORD
.column_st7:
    ; Store the lower 4 bytes of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_DWORD
    jb          short .column_st3
    movd        XMM_DWORD [rdi], xmmA
    add         rdi, byte SIZEOF_DWORD
    sub         rcx, byte SIZEOF_DWORD
    psrldq      xmmA, SIZEOF_DWORD
.column_st3:
    ; Store the lower 2 bytes of rax to the output when it has enough
    ; space.
    movd        eax, xmmA               ; overwrites Yctr (al); the loop is finished here
    cmp         rcx, byte SIZEOF_WORD
    jb          short .column_st1
    mov         word [rdi], ax
    add         rdi, byte SIZEOF_WORD
    sub         rcx, byte SIZEOF_WORD
    shr         rax, 16
.column_st1:
    ; Store the lower 1 byte of rax to the output when it has enough
    ; space.
    test        rcx, rcx
    jz          short .endcolumn
    mov         byte [rdi], al

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; xmm6/xmm7 become the filler ("X") bytes for the fourth channel.
%ifdef RGBX_FILLER_0XFF
    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%else
    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%endif
    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)

    punpcklbw   xmmA, xmmC              ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmG              ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
    punpcklbw   xmmB, xmmD              ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
    punpcklbw   xmmF, xmmH              ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)

    movdqa      xmmC, xmmA
    punpcklwd   xmmA, xmmE              ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
    punpckhwd   xmmC, xmmE              ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
    movdqa      xmmG, xmmB
    punpcklwd   xmmB, xmmF              ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
    punpckhwd   xmmG, xmmF              ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)

    ; xmmA, xmmD, xmmC, xmmH end up holding the 64 output bytes (16 pixels * 4)
    ; in order.
    movdqa      xmmD, xmmA
    punpckldq   xmmA, xmmB              ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    punpckhdq   xmmD, xmmB              ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    movdqa      xmmH, xmmC
    punpckldq   xmmC, xmmG              ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    punpckhdq   xmmH, xmmG              ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st32      ; fewer than 16 columns left: partial store

    test        rdi, SIZEOF_XMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
    movntdq     XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
    movdqu      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         rcx, byte SIZEOF_XMMWORD
    jz          near .endcolumn

    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
    dec         al                        ; Yctr
    jnz         near .Yloop_2nd           ; second Y block for the same chroma

    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .columnloop

    ; Partial store: unlike the 3-byte case, rcx stays a PIXEL count here.
    ; Each xmmword holds SIZEOF_XMMWORD/4 pixels, so SIZEOF_XMMWORD/2 pixels
    ; fill two xmmwords, SIZEOF_XMMWORD/4 fill one, and SIZEOF_XMMWORD/8 fill
    ; half of one.
.column_st32:
    cmp         rcx, byte SIZEOF_XMMWORD/2
    jb          short .column_st16
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmC
    movdqa      xmmD, xmmH
    sub         rcx, byte SIZEOF_XMMWORD/2
.column_st16:
    cmp         rcx, byte SIZEOF_XMMWORD/4
    jb          short .column_st15
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    add         rdi, byte SIZEOF_XMMWORD    ; outptr
    movdqa      xmmA, xmmD
    sub         rcx, byte SIZEOF_XMMWORD/4
.column_st15:
    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_XMMWORD/8
    jb          short .column_st7
    movq        XMM_MMWORD [rdi], xmmA
    add         rdi, byte SIZEOF_XMMWORD/8*4
    sub         rcx, byte SIZEOF_XMMWORD/8
    psrldq      xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
    ; space.
    test        rcx, rcx
    jz          short .endcolumn
    movd        XMM_DWORD [rdi], xmmA

%endif  ; RGB_PIXELSIZE ; ---------------

.endcolumn:
    sfence                              ; flush the write buffer

.return:
    pop         rbx
    uncollect_args 4
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- value saved in the prologue (address of the saved rbp)
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;
; Implemented as two calls to jsimd_h2v1_merged_upsample_sse2, one per output
; row.  For each call a temporary three-entry pointer array
; { inptr0x, inptr1, inptr2 } is built on the stack and passed as input_buf:
;   * inptr0x = &input_buf[0][in_row_group_ctr] (+ one row pointer for the
;     second call).  The callee indexes this array with in_row_group_ctr
;     again, so it ends up reading Y rows 2*ctr and 2*ctr+1.
;   * inptr1 / inptr2 are input_buf[1] / input_buf[2] unchanged, so both
;     calls use the same chroma row.
;   * output_buf is advanced by one row pointer for the second call.
;
; NOTE(review): no shadow space is reserved before the calls in the WIN64
; branch, and the rsp alignment at each call depends on how much
; collect_args pushes (not visible in this file).  The callee re-aligns its
; own frame, so this looks deliberate - confirm against jsimdext.inc.
;

; Argument registers as expected after "collect_args 4":
; r10d = JDIMENSION output_width
; r11 = JSAMPIMAGE input_buf
; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)

EXTN(jsimd_h2v2_merged_upsample_sse2):
    push        rbp
    mov         rax, rsp                ; NOTE(review): overwritten by "mov eax, r10d" below;
                                        ; only matters if collect_args reads rax - confirm
    mov         rbp, rsp
    collect_args 4
    push        rbx

    mov         eax, r10d               ; eax = output_width

    mov         rdi, r11
    mov         ecx, r12d               ; ecx = in_row_group_ctr
    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
    mov         rdi, r13                ; rdi = output_buf
    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]  ; rsi = &input_buf[0][in_row_group_ctr]

    ; Build the temporary input_buf array for the first row (lowest address
    ; = entry 0).
    push        rdx                     ; inptr2
    push        rbx                     ; inptr1
    push        rsi                     ; inptr00
    mov         rbx, rsp                ; rbx = address of the temporary array

    ; Preserve the values needed for the second call.
    push        rdi
    push        rcx
    push        rax

    ; Marshal (output_width, temp array, in_row_group_ctr, output_buf) into
    ; the argument registers.  The order of the moves matters: each source
    ; is read before it is overwritten.
    %ifdef WIN64
    mov         r8, rcx
    mov         r9, rdi
    mov         rcx, rax
    mov         rdx, rbx
    %else
    mov         rdx, rcx
    mov         rcx, rdi
    mov         rdi, rax
    mov         rsi, rbx
    %endif

    call        EXTN(jsimd_h2v1_merged_upsample_sse2)

    ; Pop in reverse push order; this also tears down the temporary array
    ; and restores rbx to inptr1.
    pop         rax
    pop         rcx
    pop         rdi
    pop         rsi
    pop         rbx
    pop         rdx

    add         rdi, byte SIZEOF_JSAMPROW  ; outptr1
    add         rsi, byte SIZEOF_JSAMPROW  ; inptr01

    ; Same sequence again for the second output row.
    push        rdx                     ; inptr2
    push        rbx                     ; inptr1
    push        rsi                     ; inptr00
    mov         rbx, rsp                ; rbx = address of the temporary array

    push        rdi
    push        rcx
    push        rax

    %ifdef WIN64
    mov         r8, rcx
    mov         r9, rdi
    mov         rcx, rax
    mov         rdx, rbx
    %else
    mov         rdx, rcx
    mov         rcx, rdi
    mov         rdi, rax
    mov         rsi, rbx
    %endif

    call        EXTN(jsimd_h2v1_merged_upsample_sse2)

    pop         rax
    pop         rcx
    pop         rdi
    pop         rsi
    pop         rbx
    pop         rdx

    pop         rbx                     ; matches the "push rbx" in the prologue
    uncollect_args 4
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32