;
; jdcolext.asm - colorspace conversion (64-bit SSE2)
;
; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2012, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
;                             JSAMPARRAY output_buf, int num_rows)
;
; Arguments as seen after the collect_args macro:
;   r10 = JDIMENSION out_width
;   r11 = JSAMPIMAGE input_buf
;   r12 = JDIMENSION input_row
;   r13 = JSAMPARRAY output_buf
;   r14 = int num_rows
;
; Register roles inside the loops:
;   rax = rows remaining            rcx = columns remaining in this row
;   rsi = inptr0 (Y)                rbx = inptr1 (Cb)
;   rdx = inptr2 (Cr)               rdi = outptr
;   wk(0), wk(1) = (B-Y) even/odd words spilled to the aligned stack area
;
; The routine returns immediately when out_width == 0 or num_rows <= 0.
;
; NOTE(review): the input rows are read with movdqa in whole 16-byte
; groups, even for the final partial group of a row, so the caller is
; presumably expected to supply 16-byte-aligned, padded sample rows --
; confirm against the allocator.
; NOTE(review): xmmA..xmmH are not defined in this file; they presumably
; come from jcolsamp.inc and select the output byte order.
;

%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          2

        align   16
        global  EXTN(jsimd_ycc_rgb_convert_sse2)

EXTN(jsimd_ycc_rgb_convert_sse2):
        ; ---- prologue: build a 16-byte-aligned frame holding wk[] --------
        push        rbp
        mov         rax,rsp                     ; rax = rsp after the push (points at saved rbp)
        sub         rsp, byte 4                 ; step below the saved-rbp slot before aligning
                                                ; (relies on rsp being >= 8-byte aligned here)
        and         rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
        mov         [rsp],rax                   ; remember the unaligned rsp for the epilogue
        mov         rbp,rsp                     ; rbp = aligned frame base
        lea         rsp, [wk(0)]                ; reserve wk[WK_NUM] below rbp
        collect_args
        push        rbx

        mov         ecx, r10d                   ; num_cols
        test        rcx,rcx
        jz          near .return                ; nothing to do for zero width

        push        rcx                         ; rcx is reused as the row index below

        ; Advance each component's row-pointer array to input_row.
        mov         rdi, r11
        mov         ecx, r12d
        mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
        mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
        mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
        lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
        lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
        lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]

        pop         rcx                         ; rcx = num_cols again

        mov         rdi, r13
        mov         eax, r14d                   ; zero-extends into rax
        test        eax,eax                     ; signed test on the 32-bit int: a 64-bit
                                                ; test could never see a negative value
        jle         near .return                ; num_rows <= 0
.rowloop:
        ; Save the per-row cursors; the column loop consumes these registers.
        push        rax
        push        rdi
        push        rdx
        push        rbx
        push        rsi
        push        rcx                         ; col

        mov         rsi, JSAMPROW [rsi]         ; inptr0
        mov         rbx, JSAMPROW [rbx]         ; inptr1
        mov         rdx, JSAMPROW [rdx]         ; inptr2
        mov         rdi, JSAMPROW [rdi]         ; outptr
.columnloop:

        movdqa      xmm5, XMMWORD [rbx]         ; xmm5=Cb(0123456789ABCDEF)
        movdqa      xmm1, XMMWORD [rdx]         ; xmm1=Cr(0123456789ABCDEF)

        pcmpeqw     xmm4,xmm4
        pcmpeqw     xmm7,xmm7
        psrlw       xmm4,BYTE_BIT
        psllw       xmm7,7                      ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
        movdqa      xmm0,xmm4                   ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}

        ; Split the chroma bytes into even/odd 16-bit lanes.
        pand        xmm4,xmm5                   ; xmm4=Cb(02468ACE)=CbE
        psrlw       xmm5,BYTE_BIT               ; xmm5=Cb(13579BDF)=CbO
        pand        xmm0,xmm1                   ; xmm0=Cr(02468ACE)=CrE
        psrlw       xmm1,BYTE_BIT               ; xmm1=Cr(13579BDF)=CrO

        ; Adding 0xFF80 (-128) re-centres the chroma samples around zero.
        paddw       xmm4,xmm7
        paddw       xmm5,xmm7
        paddw       xmm0,xmm7
        paddw       xmm1,xmm7

        ; (Original)
        ; R = Y                + 1.40200 * Cr
        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
        ; B = Y + 1.77200 * Cb
        ;
        ; (This implementation)
        ; R = Y                + 0.40200 * Cr + Cr
        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
        ; B = Y - 0.22800 * Cb + Cb + Cb

        movdqa      xmm2,xmm4                   ; xmm2=CbE
        movdqa      xmm3,xmm5                   ; xmm3=CbO
        paddw       xmm4,xmm4                   ; xmm4=2*CbE
        paddw       xmm5,xmm5                   ; xmm5=2*CbO
        movdqa      xmm6,xmm0                   ; xmm6=CrE
        movdqa      xmm7,xmm1                   ; xmm7=CrO
        paddw       xmm0,xmm0                   ; xmm0=2*CrE
        paddw       xmm1,xmm1                   ; xmm1=2*CrO

        pmulhw      xmm4,[rel PW_MF0228]        ; xmm4=(2*CbE * -FIX(0.22800))
        pmulhw      xmm5,[rel PW_MF0228]        ; xmm5=(2*CbO * -FIX(0.22800))
        pmulhw      xmm0,[rel PW_F0402]         ; xmm0=(2*CrE * FIX(0.40200))
        pmulhw      xmm1,[rel PW_F0402]         ; xmm1=(2*CrO * FIX(0.40200))

        ; Add one, then halve: rounds the doubled products back to scale.
        paddw       xmm4,[rel PW_ONE]
        paddw       xmm5,[rel PW_ONE]
        psraw       xmm4,1                      ; xmm4=(CbE * -FIX(0.22800))
        psraw       xmm5,1                      ; xmm5=(CbO * -FIX(0.22800))
        paddw       xmm0,[rel PW_ONE]
        paddw       xmm1,[rel PW_ONE]
        psraw       xmm0,1                      ; xmm0=(CrE * FIX(0.40200))
        psraw       xmm1,1                      ; xmm1=(CrO * FIX(0.40200))

        paddw       xmm4,xmm2
        paddw       xmm5,xmm3
        paddw       xmm4,xmm2                   ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
        paddw       xmm5,xmm3                   ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
        paddw       xmm0,xmm6                   ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
        paddw       xmm1,xmm7                   ; xmm1=(CrO * FIX(1.40200))=(R-Y)O

        ; Spill (B-Y): xmm4/xmm5 are needed as scratch for the green path.
        movdqa      XMMWORD [wk(0)], xmm4       ; wk(0)=(B-Y)E
        movdqa      XMMWORD [wk(1)], xmm5       ; wk(1)=(B-Y)O

        ; Interleave Cb/Cr words so pmaddwd forms
        ; Cb*-FIX(0.344)+Cr*FIX(0.285) as one 32-bit sum per pixel.
        movdqa      xmm4,xmm2
        movdqa      xmm5,xmm3
        punpcklwd   xmm2,xmm6
        punpckhwd   xmm4,xmm6
        pmaddwd     xmm2,[rel PW_MF0344_F0285]
        pmaddwd     xmm4,[rel PW_MF0344_F0285]
        punpcklwd   xmm3,xmm7
        punpckhwd   xmm5,xmm7
        pmaddwd     xmm3,[rel PW_MF0344_F0285]
        pmaddwd     xmm5,[rel PW_MF0344_F0285]

        paddd       xmm2,[rel PD_ONEHALF]
        paddd       xmm4,[rel PD_ONEHALF]
        psrad       xmm2,SCALEBITS
        psrad       xmm4,SCALEBITS
        paddd       xmm3,[rel PD_ONEHALF]
        paddd       xmm5,[rel PD_ONEHALF]
        psrad       xmm3,SCALEBITS
        psrad       xmm5,SCALEBITS

        packssdw    xmm2,xmm4                   ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
        packssdw    xmm3,xmm5                   ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
        psubw       xmm2,xmm6                   ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
        psubw       xmm3,xmm7                   ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O

        movdqa      xmm5, XMMWORD [rsi]         ; xmm5=Y(0123456789ABCDEF)

        pcmpeqw     xmm4,xmm4
        psrlw       xmm4,BYTE_BIT               ; xmm4={0xFF 0x00 0xFF 0x00 ..}
        pand        xmm4,xmm5                   ; xmm4=Y(02468ACE)=YE
        psrlw       xmm5,BYTE_BIT               ; xmm5=Y(13579BDF)=YO

        ; packuswb saturates each channel to the unsigned byte range.
        paddw       xmm0,xmm4                   ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
        paddw       xmm1,xmm5                   ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
        packuswb    xmm0,xmm0                   ; xmm0=R(02468ACE********)
        packuswb    xmm1,xmm1                   ; xmm1=R(13579BDF********)

        paddw       xmm2,xmm4                   ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
        paddw       xmm3,xmm5                   ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
        packuswb    xmm2,xmm2                   ; xmm2=G(02468ACE********)
        packuswb    xmm3,xmm3                   ; xmm3=G(13579BDF********)

        paddw       xmm4, XMMWORD [wk(0)]       ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
        paddw       xmm5, XMMWORD [wk(1)]       ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
        packuswb    xmm4,xmm4                   ; xmm4=B(02468ACE********)
        packuswb    xmm5,xmm5                   ; xmm5=B(13579BDF********)

%if RGB_PIXELSIZE == 3 ; ---------------

        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
        ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)

        punpcklbw   xmmA,xmmC           ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
        punpcklbw   xmmE,xmmB           ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
        punpcklbw   xmmD,xmmF           ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)

        movdqa      xmmG,xmmA
        movdqa      xmmH,xmmA
        punpcklwd   xmmA,xmmE           ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
        punpckhwd   xmmG,xmmE           ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)

        psrldq      xmmH,2              ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
        psrldq      xmmE,2              ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)

        movdqa      xmmC,xmmD
        movdqa      xmmB,xmmD
        punpcklwd   xmmD,xmmH           ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
        punpckhwd   xmmC,xmmH           ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)

        psrldq      xmmB,2              ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)

        movdqa      xmmF,xmmE
        punpcklwd   xmmE,xmmB           ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
        punpckhwd   xmmF,xmmB           ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)

        pshufd      xmmH,xmmA,0x4E      ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
        movdqa      xmmB,xmmE
        punpckldq   xmmA,xmmD           ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
        punpckldq   xmmE,xmmH           ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
        punpckhdq   xmmD,xmmB           ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)

        pshufd      xmmH,xmmG,0x4E      ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
        movdqa      xmmB,xmmF
        punpckldq   xmmG,xmmC           ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
        punpckldq   xmmF,xmmH           ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
        punpckhdq   xmmC,xmmB           ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)

        punpcklqdq  xmmA,xmmE           ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
        punpcklqdq  xmmD,xmmG           ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
        punpcklqdq  xmmF,xmmC           ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

        cmp         rcx, byte SIZEOF_XMMWORD
        jb          short .column_st32          ; fewer than 16 pixels left: partial store

        test        rdi, SIZEOF_XMMWORD-1
        jnz         short .out1
        ; --(aligned)-------------------
        movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
        movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
        jmp         short .out0
.out1:  ; --(unaligned)-----------------
        movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
        movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
        add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        sub         rcx, byte SIZEOF_XMMWORD
        jz          near .nextrow

        add         rsi, byte SIZEOF_XMMWORD    ; inptr0
        add         rbx, byte SIZEOF_XMMWORD    ; inptr1
        add         rdx, byte SIZEOF_XMMWORD    ; inptr2
        jmp         near .columnloop

.column_st32:
        ; From here on rcx counts output BYTES remaining, not pixels.
        lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
        cmp         rcx, byte 2*SIZEOF_XMMWORD
        jb          short .column_st16
        movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
        add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
        movdqa      xmmA,xmmF
        sub         rcx, byte 2*SIZEOF_XMMWORD
        jmp         short .column_st15
.column_st16:
        cmp         rcx, byte SIZEOF_XMMWORD
        jb          short .column_st15
        movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        add         rdi, byte SIZEOF_XMMWORD    ; outptr
        movdqa      xmmA,xmmD
        sub         rcx, byte SIZEOF_XMMWORD
.column_st15:
        ; Store the lower 8 bytes of xmmA to the output when it has enough
        ; space.
        cmp         rcx, byte SIZEOF_MMWORD
        jb          short .column_st7
        movq        XMM_MMWORD [rdi], xmmA
        add         rdi, byte SIZEOF_MMWORD
        sub         rcx, byte SIZEOF_MMWORD
        psrldq      xmmA, SIZEOF_MMWORD
.column_st7:
        ; Store the lower 4 bytes of xmmA to the output when it has enough
        ; space.
        cmp         rcx, byte SIZEOF_DWORD
        jb          short .column_st3
        movd        XMM_DWORD [rdi], xmmA
        add         rdi, byte SIZEOF_DWORD
        sub         rcx, byte SIZEOF_DWORD
        psrldq      xmmA, SIZEOF_DWORD
.column_st3:
        ; Store the lower 2 bytes of rax to the output when it has enough
        ; space.
        movd        eax, xmmA
        cmp         rcx, byte SIZEOF_WORD
        jb          short .column_st1
        mov         WORD [rdi], ax
        add         rdi, byte SIZEOF_WORD
        sub         rcx, byte SIZEOF_WORD
        shr         rax, 16
.column_st1:
        ; Store the lower 1 byte of rax to the output when it has enough
        ; space.
        test        rcx, rcx
        jz          short .nextrow
        mov         BYTE [rdi], al

%else ; RGB_PIXELSIZE == 4 ; -----------

%ifdef RGBX_FILLER_0XFF
        pcmpeqb     xmm6,xmm6           ; xmm6=XE=X(02468ACE********)
        pcmpeqb     xmm7,xmm7           ; xmm7=XO=X(13579BDF********)
%else
        pxor        xmm6,xmm6           ; xmm6=XE=X(02468ACE********)
        pxor        xmm7,xmm7           ; xmm7=XO=X(13579BDF********)
%endif
        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
        ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)

        punpcklbw   xmmA,xmmC           ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
        punpcklbw   xmmE,xmmG           ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
        punpcklbw   xmmB,xmmD           ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
        punpcklbw   xmmF,xmmH           ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)

        movdqa      xmmC,xmmA
        punpcklwd   xmmA,xmmE           ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
        punpckhwd   xmmC,xmmE           ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
        movdqa      xmmG,xmmB
        punpcklwd   xmmB,xmmF           ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
        punpckhwd   xmmG,xmmF           ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)

        movdqa      xmmD,xmmA
        punpckldq   xmmA,xmmB           ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
        punpckhdq   xmmD,xmmB           ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
        movdqa      xmmH,xmmC
        punpckldq   xmmC,xmmG           ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
        punpckhdq   xmmH,xmmG           ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

        cmp         rcx, byte SIZEOF_XMMWORD
        jb          short .column_st32          ; fewer than 16 pixels left: partial store

        test        rdi, SIZEOF_XMMWORD-1
        jnz         short .out1
        ; --(aligned)-------------------
        movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
        movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
        movntdq     XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
        jmp         short .out0
.out1:  ; --(unaligned)-----------------
        movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
        movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
        movdqu      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
        add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        sub         rcx, byte SIZEOF_XMMWORD
        jz          near .nextrow

        add         rsi, byte SIZEOF_XMMWORD    ; inptr0
        add         rbx, byte SIZEOF_XMMWORD    ; inptr1
        add         rdx, byte SIZEOF_XMMWORD    ; inptr2
        jmp         near .columnloop

.column_st32:
        ; In this branch rcx keeps counting PIXELS (4 output bytes each).
        cmp         rcx, byte SIZEOF_XMMWORD/2
        jb          short .column_st16
        movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
        add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
        movdqa      xmmA,xmmC
        movdqa      xmmD,xmmH
        sub         rcx, byte SIZEOF_XMMWORD/2
.column_st16:
        cmp         rcx, byte SIZEOF_XMMWORD/4
        jb          short .column_st15
        movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        add         rdi, byte SIZEOF_XMMWORD    ; outptr
        movdqa      xmmA,xmmD
        sub         rcx, byte SIZEOF_XMMWORD/4
.column_st15:
        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
        ; space.
        cmp         rcx, byte SIZEOF_XMMWORD/8
        jb          short .column_st7
        movq        XMM_MMWORD [rdi], xmmA      ; same operand-size macro as the 3-byte path
        add         rdi, byte SIZEOF_XMMWORD/8*4
        sub         rcx, byte SIZEOF_XMMWORD/8
        psrldq      xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
        ; Store one pixel (4 bytes) of xmmA to the output when it has enough
        ; space.
        test        rcx, rcx
        jz          short .nextrow
        movd        XMM_DWORD [rdi], xmmA

%endif ; RGB_PIXELSIZE ; ---------------

.nextrow:
        ; Restore the per-row cursors saved at .rowloop (reverse push order).
        pop         rcx
        pop         rsi
        pop         rbx
        pop         rdx
        pop         rdi
        pop         rax

        add         rsi, byte SIZEOF_JSAMPROW
        add         rbx, byte SIZEOF_JSAMPROW
        add         rdx, byte SIZEOF_JSAMPROW
        add         rdi, byte SIZEOF_JSAMPROW   ; output_buf
        dec         rax                         ; num_rows
        jg          near .rowloop

        sfence                                  ; flush the write buffer (orders the movntdq stores)

.return:
        pop         rbx
        uncollect_args
        mov         rsp,rbp                     ; rsp <- aligned frame base
        pop         rsp                         ; rsp <- unaligned rsp saved in the prologue
        pop         rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16