;
; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
;
; Copyright (C) 2011, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and can *not*
; be assembled with Microsoft's MASM or any compatible assembler (including
; Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
;                             int num_rows);
;

; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  2

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)

EXTN(jsimd_rgb_gray_convert_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]
    collect_args 5
    push        rbx

    mov         ecx, r10d
    test        rcx, rcx
    jz          near .return

    push        rcx

    mov         rsi, r12
    mov         ecx, r13d
    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]

    pop         rcx

    mov         rsi, r11
    mov         eax, r14d
    test        rax, rax
    jle         near .return
.rowloop:
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr0

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop

%if RGB_PIXELSIZE == 3  ; ---------------

.column_ld1:
    push        rax
    push        rdx
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, byte [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, word [rsi+rcx]
    shl         rax, WORD_BIT
    or          rax, rdx
.column_ld4:
    movd        xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    movd        xmmF, XMM_DWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_DWORD
    por         xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    movq        xmmB, XMM_MMWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    movdqa      xmmF, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .rgb_gray_cnv
.column_ld32:
    test        cl, 2*SIZEOF_XMMWORD
    mov         rcx, SIZEOF_XMMWORD
    jz          short .rgb_gray_cnv
    movdqa      xmmB, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_gray_cnv:
    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
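    ;
    ; The two-digit labels in these layout comments read as
    ; <component><pixel>: the first hex digit is the component index within
    ; the pixel (0, 1, 2 -- e.g. 2 is blue when the pixel order is RGB) and
    ; the second is the pixel index within the 16-pixel group.  The
    ; shift/unpack sequence below is a byte transpose: it separates the
    ; packed 3-byte pixels into planar registers, one component per
    ; register, split into even- and odd-numbered pixels (the RE/GE/BE and
    ; RO/GO/BO registers noted before the luma arithmetic).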

    movdqa      xmmG, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

    movdqa      xmmD, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

    movdqa      xmmE, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

    pxor        xmmH, xmmH

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmB, xmmE
    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)

    movdqa      xmmF, xmmD
    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else  ; RGB_PIXELSIZE == 4 ; -----------

.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmE
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    movdqa      xmmE, xmmA
    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    mov         rcx, SIZEOF_XMMWORD
    jz          short .rgb_gray_cnv
    movdqa      xmmF, xmmA
    movdqa      xmmH, xmmE
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_gray_cnv:
    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpcklbw   xmmA, xmmE  ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    punpckhbw   xmmD, xmmE  ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

    movdqa      xmmC, xmmF
    punpcklbw   xmmF, xmmH  ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    punpckhbw   xmmC, xmmH  ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

    movdqa      xmmB, xmmA
    punpcklwd   xmmA, xmmF  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    punpckhwd   xmmB, xmmF  ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

    movdqa      xmmG, xmmD
    punpcklwd   xmmD, xmmC  ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    punpckhwd   xmmG, xmmC  ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

    movdqa      xmmE, xmmA
    punpcklbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    punpckhbw   xmmE, xmmD  ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

    movdqa      xmmH, xmmB
    punpcklbw   xmmB, xmmG  ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmH, xmmG  ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

    pxor        xmmF, xmmF

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmF  ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmF  ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmD, xmmB
    punpcklbw   xmmB, xmmF  ; xmmB=(01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmF  ; xmmD=(11 13 15 17 19 1B 1D 1F)

    movdqa      xmmG, xmmE
    punpcklbw   xmmE, xmmF  ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmG, xmmF  ; xmmG=(30 32 34 36 38 3A 3C 3E)

    punpcklbw   xmmF, xmmH
    punpckhbw   xmmH, xmmH
    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

    ; (Original)
    ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
    ;
    ; (This implementation)
    ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
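    ;
    ; pmaddwd multiplies pairs of signed 16-bit words, so every coefficient
    ; must fit in [-32768, 32767].  Assuming the usual SCALEBITS = 16
    ; scaling of the FIX() constants, FIX(0.58700) = 38470 would not fit,
    ; which is presumably why the green contribution is split into
    ; FIX(0.33700) + FIX(0.25000) (22086 + 16384) across the
    ; PW_F0299_F0337 and PW_F0114_F0250 constant pairs.  Each dword lane
    ; below accumulates R*FIX(0.299) + G*FIX(0.337) + B*FIX(0.114) +
    ; G*FIX(0.250), is rounded by adding PD_ONEHALF = 1 << (SCALEBITS - 1),
    ; and is shifted right by SCALEBITS to yield an 8-bit Y sample.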

    movdqa      xmm6, xmm1
    punpcklwd   xmm1, xmm3
    punpckhwd   xmm6, xmm3
    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)

    movdqa      xmm7, xmm6              ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)

    movdqa      xmm6, xmm0
    punpcklwd   xmm0, xmm2
    punpckhwd   xmm6, xmm2
    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)

    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

    movdqa      xmm0, xmm5              ; xmm0=BO
    movdqa      xmm6, xmm4              ; xmm6=BE

    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm3
    punpckhwd   xmm4, xmm3
    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)

    movdqa      xmm3, [rel PD_ONEHALF]  ; xmm3=[PD_ONEHALF]

    paddd       xmm0, xmm1
    paddd       xmm4, xmm7
    paddd       xmm0, xmm3
    paddd       xmm4, xmm3
    psrld       xmm0, SCALEBITS         ; xmm0=YOL
    psrld       xmm4, SCALEBITS         ; xmm4=YOH
    packssdw    xmm0, xmm4              ; xmm0=YO

    movdqa      xmm4, xmm6
    punpcklwd   xmm6, xmm2
    punpckhwd   xmm4, xmm2
    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)

    movdqa      xmm2, [rel PD_ONEHALF]  ; xmm2=[PD_ONEHALF]

    paddd       xmm6, XMMWORD [wk(0)]
    paddd       xmm4, XMMWORD [wk(1)]
    paddd       xmm6, xmm2
    paddd       xmm4, xmm2
    psrld       xmm6, SCALEBITS         ; xmm6=YEL
    psrld       xmm4, SCALEBITS         ; xmm4=YEH
    packssdw    xmm6, xmm4              ; xmm6=YE
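
    ; At this point xmm6 holds the even-pixel luma (YE) and xmm0 the
    ; odd-pixel luma (YO), one Y sample per 16-bit word.  Shifting YO left
    ; by BYTE_BIT and OR-ing it into YE makes each word (YO << 8) | YE, so
    ; the 16 bytes stored below are Y0, Y1, ..., Y15 in pixel order.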
XMMWORD [wk(1)] 326 paddd xmm6, xmm2 327 paddd xmm4, xmm2 328 psrld xmm6, SCALEBITS ; xmm6=YEL 329 psrld xmm4, SCALEBITS ; xmm4=YEH 330 packssdw xmm6, xmm4 ; xmm6=YE 331 332 psllw xmm0, BYTE_BIT 333 por xmm6, xmm0 ; xmm6=Y 334 movdqa XMMWORD [rdi], xmm6 ; Save Y 335 336 sub rcx, byte SIZEOF_XMMWORD 337 add rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr 338 add rdi, byte SIZEOF_XMMWORD ; outptr0 339 cmp rcx, byte SIZEOF_XMMWORD 340 jae near .columnloop 341 test rcx, rcx 342 jnz near .column_ld1 343 344 pop rcx ; col 345 pop rsi 346 pop rdi 347 348 add rsi, byte SIZEOF_JSAMPROW ; input_buf 349 add rdi, byte SIZEOF_JSAMPROW 350 dec rax ; num_rows 351 jg near .rowloop 352 353.return: 354 pop rbx 355 uncollect_args 5 356 mov rsp, rbp ; rsp <- aligned rbp 357 pop rsp ; rsp <- original rbp 358 pop rbp 359 ret 360 361; For some reason, the OS X linker does not honor the request to align the 362; segment unless we do this. 363 align 32 364