;
; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2)
;
; Copyright (C) 2011, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
;                             int num_rows);
;
; For each of num_rows rows, reads interleaved pixels (RGB_PIXELSIZE bytes
; each, 3 or 4) from input_buf[row] and writes one luminance byte per pixel
; to output_buf[0][output_row + row].  Pixels are processed in groups of
; SIZEOF_YMMWORD; a final partial group is loaded piecewise so that no input
; byte past the end of the row is read, but a full SIZEOF_YMMWORD bytes of
; output are always stored for it.
;
; Returns immediately when img_width == 0 or num_rows <= 0.
;
; NOTE(review): the argument registers below are filled by the collect_args
; macro, and ymmA..ymmH / rdip / rsip are aliases; all of these are defined
; outside this file (presumably via jcolsamp.inc) -- confirm there.
;

; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows

; wk(i) addresses the i-th YMM-sized scratch slot just below the aligned rbp.
%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
%define WK_NUM  2

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)

EXTN(jsimd_rgb_gray_convert_avx2):
    ; ---- Prologue: build a 32-byte-aligned frame with WK_NUM scratch slots.
    push        rbp
    mov         rax, rsp                     ; rax = rsp after the push
                                             ;       (address of the saved rbp)
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
    mov         [rsp], rax                   ; remember unaligned rsp for epilogue
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [wk(0)]                 ; reserve wk[0..WK_NUM-1] below rbp
    collect_args 5
    push        rbx

    mov         ecx, r10d                    ; rcx = img_width (zero-extended)
    test        rcx, rcx
    jz          near .return                 ; nothing to do for an empty row

    push        rcx                          ; rcx is reused for output_row below

    mov         rsi, r12
    mov         ecx, r13d
    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]  ; rdi = output_buf[0]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; rdi = &output_buf[0][output_row]

    pop         rcx                          ; rcx = img_width again

    mov         rsi, r11                     ; rsi = input_buf
    mov         eax, r14d                    ; rax = num_rows (zero-extended)
    ; Test the 32-bit value: the mov above zero-extends into rax, so a 64-bit
    ; test could never see the sign of a negative int and the signed branch
    ; below would only ever catch zero.
    test        eax, eax
    jle         near .return
.rowloop:
    push        rdi                          ; save row-pointer cursors and the
    push        rsi                          ; column count; all three are
    push        rcx                     ; col  ; consumed by the column loop

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr0

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop             ; at least one full group available

%if RGB_PIXELSIZE == 3  ; ---------------

    ; ---- Partial group, 3 bytes/pixel.
    ; rcx becomes the number of remaining input bytes.  Each step tests one
    ; bit of that count; when the bit is set, rcx is lowered by that size and
    ; that many bytes are loaded from [rsi+rcx], so the tail of the row is
    ; read first and only bytes inside the row are touched.
.column_ld1:
    push        rax                          ; rax (row counter) and rdx are
    push        rdx                          ; borrowed as scratch
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, byte [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, word [rsi+rcx]
    shl         rax, WORD_BIT                ; earlier byte moves above the word
    or          rax, rdx
.column_ld4:
    vmovd       xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    vmovd       xmmF, XMM_DWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_DWORD
    vpor        xmmA, xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    sub         rcx, byte SIZEOF_XMMWORD
    vmovdqu     xmmB, XMMWORD [rsi+rcx]      ; full XMM-register load
    vperm2i128  ymmA, ymmA, ymmA, 1          ; swap the 128-bit lanes of ymmA
    vpor        ymmA, ymmA, ymmB             ; merge the new bytes into ymmA
.column_ld32:
    test        cl, SIZEOF_YMMWORD
    jz          short .column_ld64
    sub         rcx, byte SIZEOF_YMMWORD
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
.column_ld64:
    test        cl, 2*SIZEOF_YMMWORD
    ; mov leaves the flags from the test intact.  Setting rcx to exactly one
    ; group makes the loop bookkeeping after the store terminate this row.
    mov         rcx, SIZEOF_YMMWORD
    jz          short .rgb_gray_cnv
    vmovdqa     ymmB, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_gray_cnv

    ; ---- Full group, 3 bytes/pixel: three YMM loads.
.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]

    ; ---- De-interleave.  In the layout comments "cp" means component c of
    ; pixel p (p runs 0..9, A..V).
.rgb_gray_cnv:
    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)

    vmovdqa     ymmC, ymmA
    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)

    vmovdqa     ymmG, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)

    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)

    vmovdqa     ymmD, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)

    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)

    vmovdqa     ymmE, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)

    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)

    vpxor       ymmH, ymmH, ymmH  ; zero register used to widen bytes to words

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmB, ymmE
    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)

    vmovdqa     ymmF, ymmD
    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; ---- Partial group, 4 bytes/pixel.
    ; rcx stays a pixel count here.  Each step tests one bit of it; when the
    ; bit is set, rcx is lowered by that many pixels and they are loaded from
    ; [rsi+rcx*RGB_PIXELSIZE], i.e. from the tail of the row towards its start.
.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmF
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    vmovdqa     xmmF, xmmA
    vperm2i128  ymmF, ymmF, ymmF, 1          ; swap the 128-bit lanes of ymmF
    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpor        ymmA, ymmA, ymmF
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_XMMWORD/2
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    ; mov leaves the flags from the test intact.  Setting rcx to exactly one
    ; group makes the loop bookkeeping after the store terminate this row.
    mov         rcx, SIZEOF_YMMWORD
    jz          short .rgb_gray_cnv
    vmovdqa     ymmE, ymmA
    vmovdqa     ymmH, ymmF
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_gray_cnv

    ; ---- Full group, 4 bytes/pixel: four YMM loads.
.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]

    ; ---- De-interleave.  In the layout comments "cp" means component c of
    ; pixel p (p runs 0..9, A..V).
.rgb_gray_cnv:
    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmB, ymmA
    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)

    vmovdqa     ymmB, ymmF
    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmD, ymmA
    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)

    vmovdqa     ymmC, ymmF
    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)

    vmovdqa     ymmB, ymmA
    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)

    vmovdqa     ymmG, ymmD
    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)

    vmovdqa     ymmE, ymmA
    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)

    vmovdqa     ymmH, ymmB
    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)

    vpxor       ymmF, ymmF, ymmF      ; zero register used to widen bytes to words

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmD, ymmB
    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)

    vmovdqa     ymmG, ymmE
    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)

    ; ymmF is the zero register and is also the destination here, so the
    ; bytes are first placed in the high half of each word and then shifted
    ; down to obtain zero-extended words.
    vpunpcklbw  ymmF, ymmF, ymmH
    vpunpckhbw  ymmH, ymmH, ymmH
    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; ---- Luminance computation on 16-bit samples, even (E) and odd (O)
    ; pixels handled separately; L/H denote the low/high halves produced by
    ; the word unpacks.
    ; NOTE(review): which of ymmA..ymmH corresponds to ymm0..ymm5 is decided
    ; by the aliases from the include file -- confirm there.

    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G

    vmovdqa     ymm6, ymm1
    vpunpcklwd  ymm1, ymm1, ymm3
    vpunpckhwd  ymm6, ymm6, ymm3
    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)

    vmovdqa     ymm7, ymm6              ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)

    vmovdqa     ymm6, ymm0
    vpunpcklwd  ymm0, ymm0, ymm2
    vpunpckhwd  ymm6, ymm6, ymm2
    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)

    ; Spill the even-pixel partial sums; ymm0/ymm6 are reused below.
    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    vmovdqa     YMMWORD [wk(1)], ymm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

    vmovdqa     ymm0, ymm5              ; ymm0=BO
    vmovdqa     ymm6, ymm4              ; ymm6=BE

    vmovdqa     ymm4, ymm0
    vpunpcklwd  ymm0, ymm0, ymm3
    vpunpckhwd  ymm4, ymm4, ymm3
    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)

    vmovdqa     ymm3, [rel PD_ONEHALF]  ; ymm3=[PD_ONEHALF]

    vpaddd      ymm0, ymm0, ymm1
    vpaddd      ymm4, ymm4, ymm7
    vpaddd      ymm0, ymm0, ymm3        ; add the rounding term before the shift
    vpaddd      ymm4, ymm4, ymm3
    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO

    vmovdqa     ymm4, ymm6
    vpunpcklwd  ymm6, ymm6, ymm2
    vpunpckhwd  ymm4, ymm4, ymm2
    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)

    vmovdqa     ymm2, [rel PD_ONEHALF]  ; ymm2=[PD_ONEHALF]

    vpaddd      ymm6, ymm6, YMMWORD [wk(0)]
    vpaddd      ymm4, ymm4, YMMWORD [wk(1)]
    vpaddd      ymm6, ymm6, ymm2        ; add the rounding term before the shift
    vpaddd      ymm4, ymm4, ymm2
    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE

    ; Re-interleave: odd results go to the high byte of each word, even
    ; results stay in the low byte.
    vpsllw      ymm0, ymm0, BYTE_BIT
    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y

    ; ---- Column-loop bookkeeping (rcx = pixels still to convert).
    sub         rcx, byte SIZEOF_YMMWORD
    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop        ; another full group remains
    test        rcx, rcx
    jnz         near .column_ld1        ; 1..SIZEOF_YMMWORD-1 pixels remain

    pop         rcx                     ; col
    pop         rsi
    pop         rdi

    ; ---- Advance both row-pointer cursors to the next row.
    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper
    uncollect_args 5
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- value stored by the prologue
                                        ;        (rsp just after 'push rbp')
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32