;
; jccolext.asm - colorspace conversion (64-bit SSE2)
;
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
;                            int num_rows);
;
; NOTE(review): collect_args (from jsimdext.inc) marshals the platform
; calling convention (SysV or Win64) into the fixed registers below, so the
; body is ABI-independent.  xmmA..xmmH and the PW_*/PD_* constants are
; defined in jcolsamp.inc according to RGB_PIXELSIZE/RGB_RED etc., allowing
; this one body to be instantiated per RGB ordering.

; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows

; Scratch area of WK_NUM aligned xmmwords, addressed downward from the
; aligned frame pointer.
%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  8

    align 32
    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)

EXTN(jsimd_rgb_ycc_convert_sse2):
    ; --- Prologue: build a 16-byte-aligned frame so movdqa on wk(i) is safe.
    push    rbp
    mov     rax, rsp                        ; rax = original rbp
    sub     rsp, byte 4
    and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
    mov     [rsp], rax
    mov     rbp, rsp                        ; rbp = aligned rbp
    lea     rsp, [wk(0)]
    collect_args 5
    push    rbx

    mov     ecx, r10d                       ; rcx = img_width (zero-extended)
    test    rcx, rcx
    jz      near .return                    ; nothing to do for width 0

    push    rcx

    ; Compute the three output row pointers for output_row:
    ; rdi/rbx/rdx -> Y / Cb / Cr JSAMPROW arrays.
    mov     rsi, r12
    mov     ecx, r13d
    mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    mov     rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
    mov     rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
    lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]
    lea     rbx, [rbx+rcx*SIZEOF_JSAMPROW]
    lea     rdx, [rdx+rcx*SIZEOF_JSAMPROW]

    pop     rcx

    mov     rsi, r11                        ; rsi = input_buf
    mov     eax, r14d                       ; rax = num_rows
    test    rax, rax
    jle     near .return
.rowloop:
    ; Save the row-pointer cursors; inside the loop the same registers hold
    ; the per-row sample pointers.
    push    rdx
    push    rbx
    push    rdi
    push    rsi
    push    rcx                             ; col

    mov     rsi, JSAMPROW [rsi]             ; inptr
    mov     rdi, JSAMPROW [rdi]             ; outptr0
    mov     rbx, JSAMPROW [rbx]             ; outptr1
    mov     rdx, JSAMPROW [rdx]             ; outptr2

    cmp     rcx, byte SIZEOF_XMMWORD
    jae     near .columnloop                ; full 16-pixel blocks available

%if RGB_PIXELSIZE == 3  ; ---------------

    ; Partial-block load for a tail of fewer than 16 pixels (rcx pixels,
    ; 3*rcx bytes).  Bytes are gathered from the end of the row backward in
    ; power-of-two chunks so that no read goes past the buffer, then shifted
    ; together so xmmA/xmmF/xmmB look exactly like a .columnloop load.
.column_ld1:
    push    rax
    push    rdx
    lea     rcx, [rcx+rcx*2]                ; imul ecx,RGB_PIXELSIZE
    test    cl, SIZEOF_BYTE
    jz      short .column_ld2
    sub     rcx, byte SIZEOF_BYTE
    movzx   rax, byte [rsi+rcx]
.column_ld2:
    test    cl, SIZEOF_WORD
    jz      short .column_ld4
    sub     rcx, byte SIZEOF_WORD
    movzx   rdx, word [rsi+rcx]
    shl     rax, WORD_BIT
    or      rax, rdx
.column_ld4:
    movd    xmmA, eax
    pop     rdx
    pop     rax
    test    cl, SIZEOF_DWORD
    jz      short .column_ld8
    sub     rcx, byte SIZEOF_DWORD
    movd    xmmF, XMM_DWORD [rsi+rcx]
    pslldq  xmmA, SIZEOF_DWORD
    por     xmmA, xmmF
.column_ld8:
    test    cl, SIZEOF_MMWORD
    jz      short .column_ld16
    sub     rcx, byte SIZEOF_MMWORD
    movq    xmmB, XMM_MMWORD [rsi+rcx]
    pslldq  xmmA, SIZEOF_MMWORD
    por     xmmA, xmmB
.column_ld16:
    test    cl, SIZEOF_XMMWORD
    jz      short .column_ld32
    movdqa  xmmF, xmmA
    movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    mov     rcx, SIZEOF_XMMWORD
    jmp     short .rgb_ycc_cnv
.column_ld32:
    test    cl, 2*SIZEOF_XMMWORD
    mov     rcx, SIZEOF_XMMWORD
    jz      short .rgb_ycc_cnv
    movdqa  xmmB, xmmA
    movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp     short .rgb_ycc_cnv

.columnloop:
    ; Fast path: load 16 interleaved RGB pixels (48 bytes) unaligned.
    movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
    ; De-interleave the packed RGB bytes into per-component, even/odd-pixel
    ; word vectors via three rounds of shift+punpck (a byte transpose).
    ; Digit pairs below are (component,pixel) sample indices.
    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    movdqa    xmmG, xmmA
    pslldq    xmmA, 8       ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    psrldq    xmmG, 8       ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

    punpckhbw xmmA, xmmF    ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    pslldq    xmmF, 8       ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

    punpcklbw xmmG, xmmB    ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    punpckhbw xmmF, xmmB    ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

    movdqa    xmmD, xmmA
    pslldq    xmmA, 8       ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    psrldq    xmmD, 8       ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

    punpckhbw xmmA, xmmG    ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    pslldq    xmmG, 8       ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

    punpcklbw xmmD, xmmF    ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    punpckhbw xmmG, xmmF    ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

    movdqa    xmmE, xmmA
    pslldq    xmmA, 8       ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    psrldq    xmmE, 8       ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

    punpckhbw xmmA, xmmD    ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    pslldq    xmmD, 8       ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

    punpcklbw xmmE, xmmG    ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    punpckhbw xmmD, xmmG    ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

    ; Widen bytes to words (zero-extend against xmmH=0).
    pxor      xmmH, xmmH

    movdqa    xmmC, xmmA
    punpcklbw xmmA, xmmH    ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw xmmC, xmmH    ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa    xmmB, xmmE
    punpcklbw xmmE, xmmH    ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw xmmB, xmmH    ; xmmB=(01 03 05 07 09 0B 0D 0F)

    movdqa    xmmF, xmmD
    punpcklbw xmmD, xmmH    ; xmmD=(11 13 15 17 19 1B 1D 1F)
    punpckhbw xmmF, xmmH    ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else ; RGB_PIXELSIZE == 4 ; -----------

    ; Partial-block load for RGBX pixels (4 bytes each): gather 1/2/4/8-pixel
    ; chunks from the end of the row backward, as in the 3-byte case above.
.column_ld1:
    test    cl, SIZEOF_XMMWORD/16
    jz      short .column_ld2
    sub     rcx, byte SIZEOF_XMMWORD/16
    movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test    cl, SIZEOF_XMMWORD/8
    jz      short .column_ld4
    sub     rcx, byte SIZEOF_XMMWORD/8
    movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    pslldq  xmmA, SIZEOF_MMWORD
    por     xmmA, xmmE
.column_ld4:
    test    cl, SIZEOF_XMMWORD/4
    jz      short .column_ld8
    sub     rcx, byte SIZEOF_XMMWORD/4
    movdqa  xmmE, xmmA
    movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
    test    cl, SIZEOF_XMMWORD/2
    mov     rcx, SIZEOF_XMMWORD
    jz      short .rgb_ycc_cnv
    movdqa  xmmF, xmmA
    movdqa  xmmH, xmmE
    movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp     short .rgb_ycc_cnv

.columnloop:
    ; Fast path: load 16 interleaved RGBX pixels (64 bytes) unaligned.
    movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
    ; De-interleave 4-byte pixels into per-component even/odd vectors; the
    ; power-of-two pixel size allows a clean punpck-only transpose.
    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    movdqa    xmmD, xmmA
    punpcklbw xmmA, xmmE    ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    punpckhbw xmmD, xmmE    ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

    movdqa    xmmC, xmmF
    punpcklbw xmmF, xmmH    ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    punpckhbw xmmC, xmmH    ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

    movdqa    xmmB, xmmA
    punpcklwd xmmA, xmmF    ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    punpckhwd xmmB, xmmF    ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

    movdqa    xmmG, xmmD
    punpcklwd xmmD, xmmC    ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    punpckhwd xmmG, xmmC    ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

    movdqa    xmmE, xmmA
    punpcklbw xmmA, xmmD    ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    punpckhbw xmmE, xmmD    ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

    movdqa    xmmH, xmmB
    punpcklbw xmmB, xmmG    ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    punpckhbw xmmH, xmmG    ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

    ; Widen bytes to words (the 4th "X" component, rows 3x, is discarded).
    pxor      xmmF, xmmF

    movdqa    xmmC, xmmA
    punpcklbw xmmA, xmmF    ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw xmmC, xmmF    ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa    xmmD, xmmB
    punpcklbw xmmB, xmmF    ; xmmB=(01 03 05 07 09 0B 0D 0F)
    punpckhbw xmmD, xmmF    ; xmmD=(11 13 15 17 19 1B 1D 1F)

    movdqa    xmmG, xmmE
    punpcklbw xmmE, xmmF    ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw xmmG, xmmF    ; xmmG=(30 32 34 36 38 3A 3C 3E)

    ; xmmH still holds interleaved bytes; widen via shift-right-by-byte.
    punpcklbw xmmF, xmmH
    punpckhbw xmmH, xmmH
    psrlw     xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
    psrlw     xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif ; RGB_PIXELSIZE ; ---------------

    ; From here on the physical registers are used directly (xmm0..xmm7):
    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; The 0.587*G term is split as 0.337*G + 0.250*G so each pair of
    ; coefficients fits a pmaddwd word pair; the 0.5 factors are computed
    ; exactly with a shift (punpck into high word + psrld 1).

    movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=RE
    movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=RO
    movdqa  XMMWORD [wk(2)], xmm4   ; wk(2)=BE
    movdqa  XMMWORD [wk(3)], xmm5   ; wk(3)=BO

    ; --- Odd pixels: R/G dot products for Y and Cb.
    movdqa  xmm6, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm6, xmm3
    movdqa  xmm7, xmm1
    movdqa  xmm4, xmm6
    pmaddwd xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    pmaddwd xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    pmaddwd xmm7, [rel PW_MF016_MF033]  ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    pmaddwd xmm4, [rel PW_MF016_MF033]  ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

    movdqa  XMMWORD [wk(4)], xmm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    movdqa  XMMWORD [wk(5)], xmm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

    pxor    xmm1, xmm1
    pxor    xmm6, xmm6
    punpcklwd xmm1, xmm5            ; xmm1=BOL
    punpckhwd xmm6, xmm5            ; xmm6=BOH
    psrld   xmm1, 1                 ; xmm1=BOL*FIX(0.500)
    psrld   xmm6, 1                 ; xmm6=BOH*FIX(0.500)

    movdqa  xmm5, [rel PD_ONEHALFM1_CJ]  ; xmm5=[PD_ONEHALFM1_CJ]

    paddd   xmm7, xmm1
    paddd   xmm4, xmm6
    paddd   xmm7, xmm5
    paddd   xmm4, xmm5
    psrld   xmm7, SCALEBITS         ; xmm7=CbOL
    psrld   xmm4, SCALEBITS         ; xmm4=CbOH
    packssdw xmm7, xmm4             ; xmm7=CbO

    movdqa  xmm1, XMMWORD [wk(2)]   ; xmm1=BE

    ; --- Even pixels: same Y/Cb partial sums.
    movdqa  xmm6, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm6, xmm2
    movdqa  xmm5, xmm0
    movdqa  xmm4, xmm6
    pmaddwd xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    pmaddwd xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
    pmaddwd xmm5, [rel PW_MF016_MF033]  ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    pmaddwd xmm4, [rel PW_MF016_MF033]  ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

    movdqa  XMMWORD [wk(6)], xmm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    movdqa  XMMWORD [wk(7)], xmm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

    pxor    xmm0, xmm0
    pxor    xmm6, xmm6
    punpcklwd xmm0, xmm1            ; xmm0=BEL
    punpckhwd xmm6, xmm1            ; xmm6=BEH
    psrld   xmm0, 1                 ; xmm0=BEL*FIX(0.500)
    psrld   xmm6, 1                 ; xmm6=BEH*FIX(0.500)

    movdqa  xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]

    paddd   xmm5, xmm0
    paddd   xmm4, xmm6
    paddd   xmm5, xmm1
    paddd   xmm4, xmm1
    psrld   xmm5, SCALEBITS         ; xmm5=CbEL
    psrld   xmm4, SCALEBITS         ; xmm4=CbEH
    packssdw xmm5, xmm4             ; xmm5=CbE

    ; Re-interleave even/odd bytes (odd in high byte of each word) and store.
    psllw   xmm7, BYTE_BIT
    por     xmm5, xmm7              ; xmm5=Cb
    movdqa  XMMWORD [rbx], xmm5     ; Save Cb

    movdqa  xmm0, XMMWORD [wk(3)]   ; xmm0=BO
    movdqa  xmm6, XMMWORD [wk(2)]   ; xmm6=BE
    movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=RO

    ; --- Odd pixels: B/G dot products for Y and Cr.
    movdqa  xmm4, xmm0
    punpcklwd xmm0, xmm3
    punpckhwd xmm4, xmm3
    movdqa  xmm7, xmm0
    movdqa  xmm5, xmm4
    pmaddwd xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    pmaddwd xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    pmaddwd xmm7, [rel PW_MF008_MF041]  ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    pmaddwd xmm5, [rel PW_MF008_MF041]  ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

    movdqa  xmm3, [rel PD_ONEHALF]  ; xmm3=[PD_ONEHALF]

    paddd   xmm0, XMMWORD [wk(4)]
    paddd   xmm4, XMMWORD [wk(5)]
    paddd   xmm0, xmm3
    paddd   xmm4, xmm3
    psrld   xmm0, SCALEBITS         ; xmm0=YOL
    psrld   xmm4, SCALEBITS         ; xmm4=YOH
    packssdw xmm0, xmm4             ; xmm0=YO

    pxor    xmm3, xmm3
    pxor    xmm4, xmm4
    punpcklwd xmm3, xmm1            ; xmm3=ROL
    punpckhwd xmm4, xmm1            ; xmm4=ROH
    psrld   xmm3, 1                 ; xmm3=ROL*FIX(0.500)
    psrld   xmm4, 1                 ; xmm4=ROH*FIX(0.500)

    movdqa  xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]

    paddd   xmm7, xmm3
    paddd   xmm5, xmm4
    paddd   xmm7, xmm1
    paddd   xmm5, xmm1
    psrld   xmm7, SCALEBITS         ; xmm7=CrOL
    psrld   xmm5, SCALEBITS         ; xmm5=CrOH
    packssdw xmm7, xmm5             ; xmm7=CrO

    movdqa  xmm3, XMMWORD [wk(0)]   ; xmm3=RE

    ; --- Even pixels: finish Y and Cr.
    movdqa  xmm4, xmm6
    punpcklwd xmm6, xmm2
    punpckhwd xmm4, xmm2
    movdqa  xmm1, xmm6
    movdqa  xmm5, xmm4
    pmaddwd xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    pmaddwd xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    pmaddwd xmm1, [rel PW_MF008_MF041]  ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    pmaddwd xmm5, [rel PW_MF008_MF041]  ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

    movdqa  xmm2, [rel PD_ONEHALF]  ; xmm2=[PD_ONEHALF]

    paddd   xmm6, XMMWORD [wk(6)]
    paddd   xmm4, XMMWORD [wk(7)]
    paddd   xmm6, xmm2
    paddd   xmm4, xmm2
    psrld   xmm6, SCALEBITS         ; xmm6=YEL
    psrld   xmm4, SCALEBITS         ; xmm4=YEH
    packssdw xmm6, xmm4             ; xmm6=YE

    psllw   xmm0, BYTE_BIT
    por     xmm6, xmm0              ; xmm6=Y
    movdqa  XMMWORD [rdi], xmm6     ; Save Y

    pxor    xmm2, xmm2
    pxor    xmm4, xmm4
    punpcklwd xmm2, xmm3            ; xmm2=REL
    punpckhwd xmm4, xmm3            ; xmm4=REH
    psrld   xmm2, 1                 ; xmm2=REL*FIX(0.500)
    psrld   xmm4, 1                 ; xmm4=REH*FIX(0.500)

    movdqa  xmm0, [rel PD_ONEHALFM1_CJ]  ; xmm0=[PD_ONEHALFM1_CJ]

    paddd   xmm1, xmm2
    paddd   xmm5, xmm4
    paddd   xmm1, xmm0
    paddd   xmm5, xmm0
    psrld   xmm1, SCALEBITS         ; xmm1=CrEL
    psrld   xmm5, SCALEBITS         ; xmm5=CrEH
    packssdw xmm1, xmm5             ; xmm1=CrE

    psllw   xmm7, BYTE_BIT
    por     xmm1, xmm7              ; xmm1=Cr
    movdqa  XMMWORD [rdx], xmm1     ; Save Cr

    ; Advance pointers one 16-pixel block; loop while a full block remains,
    ; then take the partial-load path for any tail pixels.
    sub     rcx, byte SIZEOF_XMMWORD
    add     rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
    add     rdi, byte SIZEOF_XMMWORD                ; outptr0
    add     rbx, byte SIZEOF_XMMWORD                ; outptr1
    add     rdx, byte SIZEOF_XMMWORD                ; outptr2
    cmp     rcx, byte SIZEOF_XMMWORD
    jae     near .columnloop
    test    rcx, rcx
    jnz     near .column_ld1

    ; Restore row cursors and step to the next row.
    pop     rcx                     ; col
    pop     rsi
    pop     rdi
    pop     rbx
    pop     rdx

    add     rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add     rdi, byte SIZEOF_JSAMPROW
    add     rbx, byte SIZEOF_JSAMPROW
    add     rdx, byte SIZEOF_JSAMPROW
    dec     rax                        ; num_rows
    jg      near .rowloop

.return:
    ; --- Epilogue: undo push rbx / collect_args, then restore the caller's
    ; rsp (saved at [rbp]) and rbp.
    pop     rbx
    uncollect_args 5
    mov     rsp, rbp                ; rsp <- aligned rbp
    pop     rsp                     ; rsp <- original rbp
    pop     rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align 32