;
; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2012, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;
; Overview (as implemented below):
;   * One XMMWORD of Cb and one of Cr are loaded per outer iteration and
;     turned into the luma-independent terms (R-Y), (G-Y), (B-Y), split into
;     a low half (L) and a high half (H) of 16-bit words.
;   * Each chroma XMMWORD serves two XMMWORDs of Y (inner loop, Yctr = 2):
;     the first pass uses the L terms still live in xmm0/xmm2/xmm4, the
;     second pass reloads the H terms from the wk[] scratch slots.
;   * Within a pass every chroma-derived word is added to both an even and
;     an odd Y sample, which is the 2:1 horizontal upsampling.
;   * Full groups are stored with movntdq when the output pointer is
;     XMMWORD-aligned and with movdqu otherwise; a final partial group is
;     stored piecewise by the .column_st* tail.
;
; NOTE(review): collect_args/uncollect_args, EXTN, GLOBAL_FUNCTION, the
; xmmA..xmmH aliases, RGB_PIXELSIZE, the PW_*/PD_* constants and the
; SIZEOF_* symbols are not defined in this file; presumably they come from
; the %include above.  The register assignments listed next are taken from
; the existing comments and depend on collect_args - verify there.
;

; r10d = JDIMENSION output_width
; r11 = JSAMPIMAGE input_buf
; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf

; wk(i) addresses scratch slot i of WK_NUM XMMWORDs located directly below
; the aligned frame pointer: wk(0) is the lowest slot, wk(WK_NUM-1) ends
; at rbp.
%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  3

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)

EXTN(jsimd_h2v1_merged_upsample_sse2):
    ; ---- Prologue: build an XMMWORD-aligned frame -------------------------
    ; The unaligned stack pointer is parked at [aligned rbp] so the epilogue
    ; can recover it with a single "pop rsp".
    push        rbp
    mov         rax, rsp                     ; rax = rsp after the push
                                             ; (address of the saved rbp)
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; remember the unaligned rsp
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]                 ; reserve wk[0..WK_NUM-1]
    collect_args 4
    push        rbx

    mov         ecx, r10d               ; col
    test        rcx, rcx
    jz          near .return            ; output_width == 0: nothing to do

    push        rcx                     ; rcx is reused as the row index below

    ; ---- Resolve the row pointers ----------------------------------------
    mov         rdi, r11
    mov         ecx, r12d
    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
    mov         rdi, r13
    mov         rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
    mov         rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
    mov         rdip, JSAMPROW [rdi]                      ; outptr

    pop         rcx                     ; col

    ; From here on:
    ;   rsi = inptr0, rbx = inptr1, rdx = inptr2, rdi = outptr,
    ;   rcx = output columns still to be written (> 0),
    ;   al  = Yctr (set below).
    ; The movdqa loads from rsi/rbx/rdx require those row pointers to be
    ; XMMWORD-aligned.

.columnloop:

    movdqa      xmm6, XMMWORD [rbx]     ; xmm6=Cb(0123456789ABCDEF)
    movdqa      xmm7, XMMWORD [rdx]     ; xmm7=Cr(0123456789ABCDEF)

    pxor        xmm1, xmm1              ; xmm1=(all 0's)
    pcmpeqw     xmm3, xmm3
    psllw       xmm3, 7                 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    ; Zero-extend the chroma bytes to words.
    movdqa      xmm4, xmm6
    punpckhbw   xmm6, xmm1              ; xmm6=Cb(89ABCDEF)=CbH
    punpcklbw   xmm4, xmm1              ; xmm4=Cb(01234567)=CbL
    movdqa      xmm0, xmm7
    punpckhbw   xmm7, xmm1              ; xmm7=Cr(89ABCDEF)=CrH
    punpcklbw   xmm0, xmm1              ; xmm0=Cr(01234567)=CrL

    ; Adding 0xFF80 (= -128 as a signed word) centers the chroma on zero.
    paddw       xmm6, xmm3
    paddw       xmm4, xmm3
    paddw       xmm7, xmm3
    paddw       xmm0, xmm3

    ; (Original)
    ; R = Y                + 1.40200 * Cr
    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
    ; B = Y + 1.77200 * Cb
    ;
    ; (This implementation)
    ; R = Y                + 0.40200 * Cr + Cr
    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    ; B = Y - 0.22800 * Cb + Cb + Cb

    movdqa      xmm5, xmm6              ; xmm5=CbH
    movdqa      xmm2, xmm4              ; xmm2=CbL
    paddw       xmm6, xmm6              ; xmm6=2*CbH
    paddw       xmm4, xmm4              ; xmm4=2*CbL
    movdqa      xmm1, xmm7              ; xmm1=CrH
    movdqa      xmm3, xmm0              ; xmm3=CrL
    paddw       xmm7, xmm7              ; xmm7=2*CrH
    paddw       xmm0, xmm0              ; xmm0=2*CrL

    ; pmulhw keeps the high word of each product.  The inputs were doubled
    ; above, so the "+1, arithmetic shift right by 1" that follows halves
    ; the result again with rounding instead of truncation.
    pmulhw      xmm6, [rel PW_MF0228]   ; xmm6=(2*CbH * -FIX(0.22800))
    pmulhw      xmm4, [rel PW_MF0228]   ; xmm4=(2*CbL * -FIX(0.22800))
    pmulhw      xmm7, [rel PW_F0402]    ; xmm7=(2*CrH * FIX(0.40200))
    pmulhw      xmm0, [rel PW_F0402]    ; xmm0=(2*CrL * FIX(0.40200))

    paddw       xmm6, [rel PW_ONE]
    paddw       xmm4, [rel PW_ONE]
    psraw       xmm6, 1                 ; xmm6=(CbH * -FIX(0.22800))
    psraw       xmm4, 1                 ; xmm4=(CbL * -FIX(0.22800))
    paddw       xmm7, [rel PW_ONE]
    paddw       xmm0, [rel PW_ONE]
    psraw       xmm7, 1                 ; xmm7=(CrH * FIX(0.40200))
    psraw       xmm0, 1                 ; xmm0=(CrL * FIX(0.40200))

    paddw       xmm6, xmm5
    paddw       xmm4, xmm2
    paddw       xmm6, xmm5              ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
    paddw       xmm4, xmm2              ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
    paddw       xmm7, xmm1              ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
    paddw       xmm0, xmm3              ; xmm0=(CrL * FIX(1.40200))=(R-Y)L

    ; Park the H halves; xmm6/xmm7 are needed as temporaries for green.
    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H

    ; Green: interleave Cb and Cr words so that one pmaddwd per dword forms
    ; Cb*coef0 + Cr*coef1 in 32-bit precision.
    movdqa      xmm6, xmm5
    movdqa      xmm7, xmm2
    punpcklwd   xmm5, xmm1
    punpckhwd   xmm6, xmm1
    pmaddwd     xmm5, [rel PW_MF0344_F0285]
    pmaddwd     xmm6, [rel PW_MF0344_F0285]
    punpcklwd   xmm2, xmm3
    punpckhwd   xmm7, xmm3
    pmaddwd     xmm2, [rel PW_MF0344_F0285]
    pmaddwd     xmm7, [rel PW_MF0344_F0285]

    ; Round (add PD_ONEHALF) and descale back by SCALEBITS.
    paddd       xmm5, [rel PD_ONEHALF]
    paddd       xmm6, [rel PD_ONEHALF]
    psrad       xmm5, SCALEBITS
    psrad       xmm6, SCALEBITS
    paddd       xmm2, [rel PD_ONEHALF]
    paddd       xmm7, [rel PD_ONEHALF]
    psrad       xmm2, SCALEBITS
    psrad       xmm7, SCALEBITS

    packssdw    xmm5, xmm6              ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
    packssdw    xmm2, xmm7              ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
    psubw       xmm5, xmm1              ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
    psubw       xmm2, xmm3              ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L

    movdqa      XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H

    ; First pass: xmm0/xmm2/xmm4 already hold the L halves of
    ; (R-Y)/(G-Y)/(B-Y), so the reload at .Yloop_2nd is skipped.
    mov         al, 2                   ; Yctr
    jmp         short .Yloop_1st

.Yloop_2nd:
    ; Second pass: fetch the H halves saved before the green computation.
    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H

.Yloop_1st:
    movdqa      xmm7, XMMWORD [rsi]     ; xmm7=Y(0123456789ABCDEF)

    ; Split Y into even and odd samples, each zero-extended to a word.
    pcmpeqw     xmm6, xmm6
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    pand        xmm6, xmm7              ; xmm6=Y(02468ACE)=YE
    psrlw       xmm7, BYTE_BIT          ; xmm7=Y(13579BDF)=YO

    ; The same chroma term is applied to the even and the odd neighbour.
    movdqa      xmm1, xmm0              ; xmm1=xmm0=(R-Y)(L/H)
    movdqa      xmm3, xmm2              ; xmm3=xmm2=(G-Y)(L/H)
    movdqa      xmm5, xmm4              ; xmm5=xmm4=(B-Y)(L/H)

    ; packuswb saturates each word to the 0..255 byte range.
    paddw       xmm0, xmm6              ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
    paddw       xmm1, xmm7              ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)

    paddw       xmm2, xmm6              ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
    paddw       xmm3, xmm7              ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)

    paddw       xmm4, xmm6              ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
    paddw       xmm5, xmm7              ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)

    ; In the layout comments below a two-digit value "cp" means component
    ; slot c of pixel p (hex).  xmmA..xmmH are aliases defined outside this
    ; file - presumably mapped onto xmm0..xmm7 according to the output
    ; component order; verify in the include.

%if RGB_PIXELSIZE == 3  ; ---------------

    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)

    punpcklbw   xmmA, xmmC              ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmB              ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
    punpcklbw   xmmD, xmmF              ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)

    movdqa      xmmG, xmmA
    movdqa      xmmH, xmmA
    punpcklwd   xmmA, xmmE              ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
    punpckhwd   xmmG, xmmE              ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)

    psrldq      xmmH, 2                 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
    psrldq      xmmE, 2                 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)

    movdqa      xmmC, xmmD
    movdqa      xmmB, xmmD
    punpcklwd   xmmD, xmmH              ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
    punpckhwd   xmmC, xmmH              ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)

    psrldq      xmmB, 2                 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)

    movdqa      xmmF, xmmE
    punpcklwd   xmmE, xmmB              ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
    punpckhwd   xmmF, xmmB              ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)

    pshufd      xmmH, xmmA, 0x4E        ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
    movdqa      xmmB, xmmE
    punpckldq   xmmA, xmmD              ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
    punpckldq   xmmE, xmmH              ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
    punpckhdq   xmmD, xmmB              ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)

    pshufd      xmmH, xmmG, 0x4E        ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
    movdqa      xmmB, xmmF
    punpckldq   xmmG, xmmC              ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
    punpckldq   xmmF, xmmH              ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
    punpckhdq   xmmC, xmmB              ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)

    punpcklqdq  xmmA, xmmE              ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    punpcklqdq  xmmD, xmmG              ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    punpcklqdq  xmmF, xmmC              ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    ; xmmA, xmmD, xmmF now hold SIZEOF_XMMWORD packed 3-byte pixels.
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st32      ; fewer columns left than a full group

    test        rdi, SIZEOF_XMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         rcx, byte SIZEOF_XMMWORD
    jz          near .endcolumn

    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
    dec         al                        ; Yctr
    jnz         near .Yloop_2nd           ; reuse this chroma block once more

    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .columnloop

    ; ---- Tail: 0 < rcx < SIZEOF_XMMWORD columns remain --------------------
    ; rcx is converted to a byte count and the pending data is written in
    ; successively smaller pieces; after each piece the next unwritten bytes
    ; are moved to the bottom of xmmA.
.column_st32:
    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
    cmp         rcx, byte 2*SIZEOF_XMMWORD
    jb          short .column_st16
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmF
    sub         rcx, byte 2*SIZEOF_XMMWORD
    jmp         short .column_st15
.column_st16:
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st15
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    add         rdi, byte SIZEOF_XMMWORD    ; outptr
    movdqa      xmmA, xmmD
    sub         rcx, byte SIZEOF_XMMWORD
.column_st15:
    ; Store the lower 8 bytes of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_MMWORD
    jb          short .column_st7
    movq        XMM_MMWORD [rdi], xmmA
    add         rdi, byte SIZEOF_MMWORD
    sub         rcx, byte SIZEOF_MMWORD
    psrldq      xmmA, SIZEOF_MMWORD
.column_st7:
    ; Store the lower 4 bytes of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_DWORD
    jb          short .column_st3
    movd        XMM_DWORD [rdi], xmmA
    add         rdi, byte SIZEOF_DWORD
    sub         rcx, byte SIZEOF_DWORD
    psrldq      xmmA, SIZEOF_DWORD
.column_st3:
    ; Store the lower 2 bytes of rax to the output when it has enough
    ; space.  (rax is free here: Yctr in al is no longer needed.)
    movd        eax, xmmA
    cmp         rcx, byte SIZEOF_WORD
    jb          short .column_st1
    mov         word [rdi], ax
    add         rdi, byte SIZEOF_WORD
    sub         rcx, byte SIZEOF_WORD
    shr         rax, 16
.column_st1:
    ; Store the lower 1 byte of rax to the output when it has enough
    ; space.
    test        rcx, rcx
    jz          short .endcolumn
    mov         byte [rdi], al

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; The fourth (filler) byte of every pixel is either 0xFF or 0x00.
%ifdef RGBX_FILLER_0XFF
    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%else
    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
%endif
    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)

    punpcklbw   xmmA, xmmC              ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
    punpcklbw   xmmE, xmmG              ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
    punpcklbw   xmmB, xmmD              ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
    punpcklbw   xmmF, xmmH              ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)

    movdqa      xmmC, xmmA
    punpcklwd   xmmA, xmmE              ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
    punpckhwd   xmmC, xmmE              ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
    movdqa      xmmG, xmmB
    punpcklwd   xmmB, xmmF              ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
    punpckhwd   xmmG, xmmF              ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpckldq   xmmA, xmmB              ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    punpckhdq   xmmD, xmmB              ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    movdqa      xmmH, xmmC
    punpckldq   xmmC, xmmG              ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    punpckhdq   xmmH, xmmG              ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    ; xmmA, xmmD, xmmC, xmmH now hold SIZEOF_XMMWORD packed 4-byte pixels.
    cmp         rcx, byte SIZEOF_XMMWORD
    jb          short .column_st32      ; fewer columns left than a full group

    test        rdi, SIZEOF_XMMWORD-1
    jnz         short .out1
    ; --(aligned)-------------------
    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
    movntdq     XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
    jmp         short .out0
.out1:  ; --(unaligned)-----------------
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
    movdqu      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
    sub         rcx, byte SIZEOF_XMMWORD
    jz          near .endcolumn

    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
    dec         al                        ; Yctr
    jnz         near .Yloop_2nd           ; reuse this chroma block once more

    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
    jmp         near .columnloop

    ; ---- Tail: 0 < rcx < SIZEOF_XMMWORD columns remain --------------------
    ; Here rcx stays a pixel count; each XMMWORD carries SIZEOF_XMMWORD/4
    ; pixels, so the thresholds below are expressed in pixels.
.column_st32:
    cmp         rcx, byte SIZEOF_XMMWORD/2
    jb          short .column_st16
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
    movdqa      xmmA, xmmC
    movdqa      xmmD, xmmH
    sub         rcx, byte SIZEOF_XMMWORD/2
.column_st16:
    cmp         rcx, byte SIZEOF_XMMWORD/4
    jb          short .column_st15
    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
    add         rdi, byte SIZEOF_XMMWORD    ; outptr
    movdqa      xmmA, xmmD
    sub         rcx, byte SIZEOF_XMMWORD/4
.column_st15:
    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
    ; space.
    cmp         rcx, byte SIZEOF_XMMWORD/8
    jb          short .column_st7
    movq        XMM_MMWORD [rdi], xmmA
    add         rdi, byte SIZEOF_XMMWORD/8*4
    sub         rcx, byte SIZEOF_XMMWORD/8
    psrldq      xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
    ; space.
    test        rcx, rcx
    jz          short .endcolumn
    movd        XMM_DWORD [rdi], xmmA

%endif  ; RGB_PIXELSIZE ; ---------------

.endcolumn:
    sfence                              ; flush the write buffer
                                        ; (orders the movntdq stores above)

.return:
    pop         rbx
    uncollect_args 4
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- unaligned rsp parked at
                                        ; [aligned rbp] by the prologue
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;
; Implemented as two calls to jsimd_h2v1_merged_upsample_sse2, one per
; output row, sharing the same Cb/Cr rows.  A temporary plane-pointer table
; is built on the stack and passed as input_buf:
;
;   slot 0 = input_buf[0] + in_row_group_ctr   (pointer into the Y row table)
;   slot 1 = input_buf[1]
;   slot 2 = input_buf[2]
;
; The callee indexes every slot with in_row_group_ctr again, so slot 0
; resolves to Y row 2*in_row_group_ctr on the first call.  Before the second
; call slot 0 is advanced by one JSAMPROW (Y row 2*in_row_group_ctr + 1) and
; output_buf is advanced by one JSAMPROW (second output row).
;
; NOTE(review): four slots are reserved but only three are written; the
; fourth looks like padding for stack alignment - confirm.
;

; r10d = JDIMENSION output_width
; r11 = JSAMPIMAGE input_buf
; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)

EXTN(jsimd_h2v2_merged_upsample_sse2):
    push        rbp
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 4
    push        rbx

    mov         eax, r10d               ; eax = output_width

    mov         rdi, r11
    mov         ecx, r12d               ; ecx = in_row_group_ctr
    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
    mov         rdi, r13                ; rdi = output_buf
    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]  ; pre-bias the Y row table

    ; ---- First output row --------------------------------------------------
    sub         rsp, SIZEOF_JSAMPARRAY*4
    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; inptr00
    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; inptr1
    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; inptr2
    mov         rbx, rsp                ; rbx = &temporary plane table

    ; The callee uses rax, rcx and rdi as scratch, so keep copies.
    push        rdi
    push        rcx
    push        rax

    ; Marshal (output_width, table, in_row_group_ctr, output_buf) into the
    ; ABI argument registers.  The move order matters: each source register
    ; is read before it is overwritten.
    %ifdef WIN64
    mov         r8, rcx
    mov         r9, rdi
    mov         rcx, rax
    mov         rdx, rbx
    %else
    mov         rdx, rcx
    mov         rcx, rdi
    mov         rdi, rax
    mov         rsi, rbx
    %endif

    call        EXTN(jsimd_h2v1_merged_upsample_sse2)

    pop         rax
    pop         rcx
    pop         rdi
    ; rsi/rdx were clobbered by the call and rbx was repurposed as the table
    ; pointer, so the plane pointers are reloaded from the table.
    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]

    ; ---- Second output row -------------------------------------------------
    add         rdi, byte SIZEOF_JSAMPROW  ; outptr1
    add         rsi, byte SIZEOF_JSAMPROW  ; inptr01

    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; inptr01
    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; inptr1
    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; inptr2
    mov         rbx, rsp                ; rbx = &temporary plane table

    push        rdi
    push        rcx
    push        rax

    %ifdef WIN64
    mov         r8, rcx
    mov         r9, rdi
    mov         rcx, rax
    mov         rdx, rbx
    %else
    mov         rdx, rcx
    mov         rcx, rdi
    mov         rdi, rax
    mov         rsi, rbx
    %endif

    call        EXTN(jsimd_h2v1_merged_upsample_sse2)

    pop         rax
    pop         rcx
    pop         rdi
    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
    add         rsp, SIZEOF_JSAMPARRAY*4   ; release the temporary table

    pop         rbx
    uncollect_args 4
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32