;
; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
;
; Copyright (C) 2011, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and can *not*
; be assembled with Microsoft's MASM or any compatible assembler (including
; Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
;                             int num_rows);
;

; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  2

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)

EXTN(jsimd_rgb_gray_convert_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]
    collect_args 5
    push        rbx

    mov         ecx, r10d
    test        rcx, rcx
    jz          near .return

    push        rcx

    mov         rsi, r12
    mov         ecx, r13d
    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]

    pop         rcx

    mov         rsi, r11
    mov         eax, r14d
    test        rax, rax
    jle         near .return
.rowloop:
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr0

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop

%if RGB_PIXELSIZE == 3  ; ---------------

.column_ld1:
    push        rax
    push        rdx
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, byte [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, word [rsi+rcx]
    shl         rax, WORD_BIT
    or          rax, rdx
.column_ld4:
    movd        xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    movd        xmmF, XMM_DWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_DWORD
    por         xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    movq        xmmB, XMM_MMWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    movdqa      xmmF, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .rgb_gray_cnv
.column_ld32:
    test        cl, 2*SIZEOF_XMMWORD
    mov         rcx, SIZEOF_XMMWORD
    jz          short .rgb_gray_cnv
    movdqa      xmmB, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_gray_cnv:
    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
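    ;
    ; In these register diagrams, each two-digit value "cp" denotes byte c
    ; (the component's position within the pixel, 0..2) of pixel p (0..F
    ; within the current group of 16 pixels); "--" marks bytes whose
    ; contents do not matter.  The shift/unpack sequence below is, in
    ; effect, a byte-wise transpose: it turns the 48 interleaved RGB bytes
    ; in xmmA/xmmF/xmmB into planar form, with the even-numbered and
    ; odd-numbered samples of each component in separate registers, and
    ; finally zero-extends them to 16-bit words (punpck*bw against the
    ; zeroed xmmH) so that they can be fed to pmaddwd.  The xmmA..xmmH
    ; names are aliases that jcolsamp.inc maps onto xmm0..xmm7 according
    ; to the RGB component order selected at build time.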

    movdqa      xmmG, xmmA
    pslldq      xmmA, 8                 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    psrldq      xmmG, 8                 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmF              ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    pslldq      xmmF, 8                 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

    punpcklbw   xmmG, xmmB              ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    punpckhbw   xmmF, xmmB              ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

    movdqa      xmmD, xmmA
    pslldq      xmmA, 8                 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    psrldq      xmmD, 8                 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmG              ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    pslldq      xmmG, 8                 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

    punpcklbw   xmmD, xmmF              ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    punpckhbw   xmmG, xmmF              ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

    movdqa      xmmE, xmmA
    pslldq      xmmA, 8                 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    psrldq      xmmE, 8                 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmD              ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    pslldq      xmmD, 8                 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

    punpcklbw   xmmE, xmmG              ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmG              ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

    pxor        xmmH, xmmH

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmH              ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmH              ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmB, xmmE
    punpcklbw   xmmE, xmmH              ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmB, xmmH              ; xmmB=(01 03 05 07 09 0B 0D 0F)

    movdqa      xmmF, xmmD
    punpcklbw   xmmD, xmmH              ; xmmD=(11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmF, xmmH              ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else  ; RGB_PIXELSIZE == 4 ; -----------

.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmE
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    movdqa      xmmE, xmmA
    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    mov         rcx, SIZEOF_XMMWORD
    jz          short .rgb_gray_cnv
    movdqa      xmmF, xmmA
    movdqa      xmmH, xmmE
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_gray_cnv:
    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
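    ;
    ; Same notation as above, but each pixel now occupies four bytes, so a
    ; fourth component row (3p, the pad/alpha byte of pixel p) appears.
    ; The unpack sequence below performs the equivalent byte-wise transpose
    ; for the 4-byte-per-pixel case using punpck{l,h}bw and punpck{l,h}wd,
    ; producing the same planar even/odd word layout as the 3-byte path.
    ; The fourth component is deinterleaved along with the others but takes
    ; no part in the Y computation.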

    movdqa      xmmD, xmmA
    punpcklbw   xmmA, xmmE              ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    punpckhbw   xmmD, xmmE              ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

    movdqa      xmmC, xmmF
    punpcklbw   xmmF, xmmH              ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    punpckhbw   xmmC, xmmH              ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

    movdqa      xmmB, xmmA
    punpcklwd   xmmA, xmmF              ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    punpckhwd   xmmB, xmmF              ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

    movdqa      xmmG, xmmD
    punpcklwd   xmmD, xmmC              ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    punpckhwd   xmmG, xmmC              ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

    movdqa      xmmE, xmmA
    punpcklbw   xmmA, xmmD              ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    punpckhbw   xmmE, xmmD              ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

    movdqa      xmmH, xmmB
    punpcklbw   xmmB, xmmG              ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmH, xmmG              ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

    pxor        xmmF, xmmF

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmF              ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmF              ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmD, xmmB
    punpcklbw   xmmB, xmmF              ; xmmB=(01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmF              ; xmmD=(11 13 15 17 19 1B 1D 1F)

    movdqa      xmmG, xmmE
    punpcklbw   xmmE, xmmF              ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmG, xmmF              ; xmmG=(30 32 34 36 38 3A 3C 3E)

    punpcklbw   xmmF, xmmH
    punpckhbw   xmmH, xmmH
    psrlw       xmmF, BYTE_BIT          ; xmmF=(21 23 25 27 29 2B 2D 2F)
    psrlw       xmmH, BYTE_BIT          ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

    ; (Original)
    ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
    ;
    ; (This implementation)
    ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
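    ;
    ; The G coefficient is split because pmaddwd multiplies pairs of signed
    ; 16-bit words: with the usual libjpeg fixed-point scale (SCALEBITS
    ; assumed to be 16, as in the scalar jccolor.c code), FIX(0.58700) =
    ; 38470 does not fit in a signed word, whereas FIX(0.33700) = 22086 and
    ; FIX(0.25000) = 16384 both do.  Each Y sample is therefore computed as
    ;
    ;   Y = (R * FIX(0.299) + G * FIX(0.337)     ; PW_F0299_F0337 pair
    ;        + B * FIX(0.114) + G * FIX(0.250)   ; PW_F0114_F0250 pair
    ;        + ONE_HALF) >> SCALEBITS
    ;
    ; where ONE_HALF = 1 << (SCALEBITS - 1) (the PD_ONEHALF constant)
    ; provides rounding before the final right shift.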

    movdqa      xmm6, xmm1
    punpcklwd   xmm1, xmm3
    punpckhwd   xmm6, xmm3
    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)

    movdqa      xmm7, xmm6                  ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)

    movdqa      xmm6, xmm0
    punpcklwd   xmm0, xmm2
    punpckhwd   xmm6, xmm2
    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)

    movdqa      XMMWORD [wk(0)], xmm0       ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    movdqa      XMMWORD [wk(1)], xmm6       ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

    movdqa      xmm0, xmm5                  ; xmm0=BO
    movdqa      xmm6, xmm4                  ; xmm6=BE

    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm3
    punpckhwd   xmm4, xmm3
    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)

    movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]

    paddd       xmm0, xmm1
    paddd       xmm4, xmm7
    paddd       xmm0, xmm3
    paddd       xmm4, xmm3
    psrld       xmm0, SCALEBITS             ; xmm0=YOL
    psrld       xmm4, SCALEBITS             ; xmm4=YOH
    packssdw    xmm0, xmm4                  ; xmm0=YO

    movdqa      xmm4, xmm6
    punpcklwd   xmm6, xmm2
    punpckhwd   xmm4, xmm2
    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)

    movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]

    paddd       xmm6, XMMWORD [wk(0)]
    paddd       xmm4, XMMWORD [wk(1)]
    paddd       xmm6, xmm2
    paddd       xmm4, xmm2
    psrld       xmm6, SCALEBITS             ; xmm6=YEL
    psrld       xmm4, SCALEBITS             ; xmm4=YEH
    packssdw    xmm6, xmm4                  ; xmm6=YE

    psllw       xmm0, BYTE_BIT
    por         xmm6, xmm0                  ; xmm6=Y
    movdqa      XMMWORD [rdi], xmm6         ; Save Y

    sub         rcx, byte SIZEOF_XMMWORD
    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .column_ld1

    pop         rcx                     ; col
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    uncollect_args 5
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32