;
; jccolext.asm - colorspace conversion (64-bit AVX2)
;
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
;                            int num_rows);
;

; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
%define WK_NUM  8

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)

EXTN(jsimd_rgb_ycc_convert_avx2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]
    collect_args 5
    push        rbx

    mov         ecx, r10d
    test        rcx, rcx
    jz          near .return

    push        rcx

    mov         rsi, r12
    mov         ecx, r13d
    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    mov         rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
    mov         rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]

    pop         rcx

    mov         rsi, r11
    mov         eax, r14d
    test        rax, rax
    jle         near .return
.rowloop:
    push        rdx
    push        rbx
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr0
    mov         rbx, JSAMPROW [rbx]     ; outptr1
    mov         rdx, JSAMPROW [rdx]     ; outptr2

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop

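    ; In outline, the tail-handling code below (reached when fewer than
    ; SIZEOF_YMMWORD columns remain) tests the low bits of the remaining
    ; sample count and loads progressively larger chunks (byte, word, dword,
    ; qword, xmmword, ymmword) from the end of the row, so that a partial
    ; block can be pushed through the same conversion code as a full
    ; 32-column block.  For example, with RGB_PIXELSIZE == 3 and six
    ; leftover columns, 6 * 3 = 18 = 16 + 2 bytes, which is gathered with
    ; one xmmword load and one word load.
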
%if RGB_PIXELSIZE == 3  ; ---------------

.column_ld1:
    push        rax
    push        rdx
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, BYTE [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, WORD [rsi+rcx]
    shl         rax, WORD_BIT
    or          rax, rdx
.column_ld4:
    vmovd       xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    vmovd       xmmF, XMM_DWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_DWORD
    vpor        xmmA, xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    sub         rcx, byte SIZEOF_XMMWORD
    vmovdqu     xmmB, XMM_MMWORD [rsi+rcx]
    vperm2i128  ymmA, ymmA, ymmA, 1
    vpor        ymmA, ymmB
.column_ld32:
    test        cl, SIZEOF_YMMWORD
    jz          short .column_ld64
    sub         rcx, byte SIZEOF_YMMWORD
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
.column_ld64:
    test        cl, 2*SIZEOF_YMMWORD
    mov         rcx, SIZEOF_YMMWORD
    jz          short .rgb_ycc_cnv
    vmovdqa     ymmB, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_ycc_cnv

.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]

.rgb_ycc_cnv:
    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)

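    ; In the register diagrams above and below, each two-character code is
    ; one byte sample:  the first digit is the component's position within
    ; the pixel (mapped to R/G/B by the RGB_* definitions via jcolsamp.inc),
    ; and the second is the column index in base 32 (0-9, A-V).  Loosely
    ; speaking, the unpack/shift sequence that follows is a byte-wise
    ; transpose:  it turns the interleaved pixels into planar registers, one
    ; holding the even-numbered and one the odd-numbered columns of each
    ; component, zero-extended to 16 bits for the multiplies further down.
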
    vmovdqu     ymmC, ymmA
    vinserti128 ymmA, ymmF, xmmA, 0     ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    vinserti128 ymmC, ymmC, xmmB, 0     ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    vinserti128 ymmB, ymmB, xmmF, 0     ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
    vperm2i128  ymmF, ymmC, ymmC, 1     ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)

    vmovdqa     ymmG, ymmA
    vpslldq     ymmA, ymmA, 8           ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
                                        ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
    vpsrldq     ymmG, ymmG, 8           ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
                                        ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmF        ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
                                        ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
    vpslldq     ymmF, ymmF, 8           ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
                                        ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)

    vpunpcklbw  ymmG, ymmG, ymmB        ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
                                        ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
    vpunpckhbw  ymmF, ymmF, ymmB        ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
                                        ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)

    vmovdqa     ymmD, ymmA
    vpslldq     ymmA, ymmA, 8           ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
                                        ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
    vpsrldq     ymmD, ymmD, 8           ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
                                        ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmG        ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
                                        ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
    vpslldq     ymmG, ymmG, 8           ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
                                        ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)

    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
                                        ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
    vpunpckhbw  ymmG, ymmG, ymmF        ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
                                        ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)

    vmovdqa     ymmE, ymmA
    vpslldq     ymmA, ymmA, 8           ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
                                        ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
    vpsrldq     ymmE, ymmE, 8           ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
                                        ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmD        ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                        ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpslldq     ymmD, ymmD, 8           ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
                                        ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)

    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
                                        ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmG        ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
                                        ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)

    vpxor       ymmH, ymmH, ymmH

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmH        ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmH        ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmB, ymmE
    vpunpcklbw  ymmE, ymmE, ymmH        ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmB, ymmB, ymmH        ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)

    vmovdqa     ymmF, ymmD
    vpunpcklbw  ymmD, ymmD, ymmH        ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmF, ymmF, ymmH        ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)

%else  ; RGB_PIXELSIZE == 4 ; -----------

.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmF
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    vmovdqa     xmmF, xmmA
    vperm2i128  ymmF, ymmF, ymmF, 1
    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpor        ymmA, ymmA, ymmF
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_XMMWORD/2
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    mov         rcx, SIZEOF_YMMWORD
    jz          short .rgb_ycc_cnv
    vmovdqa     ymmE, ymmA
    vmovdqa     ymmH, ymmF
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_ycc_cnv

.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]

.rgb_ycc_cnv:
    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

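    ; In the 4-byte-per-pixel case, the fourth byte of each pixel (the codes
    ; with a leading 3, i.e. the X/alpha channel) is carried along by the
    ; transpose below but, as far as the arithmetic afterward is concerned,
    ; appears to go unused:  only components 0-2 feed the Y/Cb/Cr math.
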
    vmovdqa     ymmB, ymmA
    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)

    vmovdqa     ymmB, ymmF
    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmD, ymmA
    vpunpcklbw  ymmA, ymmA, ymmE        ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
                                        ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
    vpunpckhbw  ymmD, ymmD, ymmE        ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
                                        ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)

    vmovdqa     ymmC, ymmF
    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
                                        ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
    vpunpckhbw  ymmC, ymmC, ymmH        ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
                                        ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)

    vmovdqa     ymmB, ymmA
    vpunpcklwd  ymmA, ymmA, ymmF        ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
                                        ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
    vpunpckhwd  ymmB, ymmB, ymmF        ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
                                        ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)

    vmovdqa     ymmG, ymmD
    vpunpcklwd  ymmD, ymmD, ymmC        ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
                                        ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
    vpunpckhwd  ymmG, ymmG, ymmC        ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
                                        ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)

    vmovdqa     ymmE, ymmA
    vpunpcklbw  ymmA, ymmA, ymmD        ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                        ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpunpckhbw  ymmE, ymmE, ymmD        ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
                                        ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)

    vmovdqa     ymmH, ymmB
    vpunpcklbw  ymmB, ymmB, ymmG        ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
                                        ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmH, ymmH, ymmG        ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
                                        ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)

    vpxor       ymmF, ymmF, ymmF

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmF        ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmF        ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmD, ymmB
    vpunpcklbw  ymmB, ymmB, ymmF        ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmF        ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)

    vmovdqa     ymmG, ymmE
    vpunpcklbw  ymmE, ymmE, ymmF        ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmG, ymmG, ymmF        ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)

    vpunpcklbw  ymmF, ymmF, ymmH
    vpunpckhbw  ymmH, ymmH, ymmH
    vpsrlw      ymmF, ymmF, BYTE_BIT    ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
    vpsrlw      ymmH, ymmH, BYTE_BIT    ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE

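    ; The only change from the textbook equations is the split of the G term
    ; in Y:  0.58700 = 0.33700 + 0.25000.  Each coefficient pair can then be
    ; applied with a single vpmaddwd against an interleaved R/G (or B/G)
    ; word vector (the PW_F0299_F0337 and PW_F0114_F0250 constants), while
    ; the 0.50000 terms in Cb and Cr need no multiply at all:  unpacking a
    ; 16-bit sample into the upper half of a 32-bit lane scales it by 2^16,
    ; and a right shift by 1 then gives sample * 2^15 = sample * FIX(0.5)
    ; (the FIX() constants here imply SCALEBITS = 16).  PD_ONEHALFM1_CJ
    ; presumably folds the rounding term and the CENTERJSAMPLE offset into
    ; one addition before the final descale.
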
    vmovdqa     YMMWORD [wk(0)], ymm0              ; wk(0)=RE
    vmovdqa     YMMWORD [wk(1)], ymm1              ; wk(1)=RO
    vmovdqa     YMMWORD [wk(2)], ymm4              ; wk(2)=BE
    vmovdqa     YMMWORD [wk(3)], ymm5              ; wk(3)=BO

    vmovdqa     ymm6, ymm1
    vpunpcklwd  ymm1, ymm1, ymm3
    vpunpckhwd  ymm6, ymm6, ymm3
    vmovdqa     ymm7, ymm1
    vmovdqa     ymm4, ymm6
    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]   ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]   ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    vpmaddwd    ymm7, ymm7, [rel PW_MF016_MF033]   ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]   ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

    vmovdqa     YMMWORD [wk(4)], ymm1              ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    vmovdqa     YMMWORD [wk(5)], ymm6              ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

    vpxor       ymm1, ymm1, ymm1
    vpxor       ymm6, ymm6, ymm6
    vpunpcklwd  ymm1, ymm1, ymm5                   ; ymm1=BOL
    vpunpckhwd  ymm6, ymm6, ymm5                   ; ymm6=BOH
    vpsrld      ymm1, ymm1, 1                      ; ymm1=BOL*FIX(0.500)
    vpsrld      ymm6, ymm6, 1                      ; ymm6=BOH*FIX(0.500)

    vmovdqa     ymm5, [rel PD_ONEHALFM1_CJ]        ; ymm5=[PD_ONEHALFM1_CJ]

    vpaddd      ymm7, ymm7, ymm1
    vpaddd      ymm4, ymm4, ymm6
    vpaddd      ymm7, ymm7, ymm5
    vpaddd      ymm4, ymm4, ymm5
    vpsrld      ymm7, ymm7, SCALEBITS              ; ymm7=CbOL
    vpsrld      ymm4, ymm4, SCALEBITS              ; ymm4=CbOH
    vpackssdw   ymm7, ymm7, ymm4                   ; ymm7=CbO

    vmovdqa     ymm1, YMMWORD [wk(2)]              ; ymm1=BE

    vmovdqa     ymm6, ymm0
    vpunpcklwd  ymm0, ymm0, ymm2
    vpunpckhwd  ymm6, ymm6, ymm2
    vmovdqa     ymm5, ymm0
    vmovdqa     ymm4, ymm6
    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]   ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]   ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
    vpmaddwd    ymm5, ymm5, [rel PW_MF016_MF033]   ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]   ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

    vmovdqa     YMMWORD [wk(6)], ymm0              ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    vmovdqa     YMMWORD [wk(7)], ymm6              ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

    vpxor       ymm0, ymm0, ymm0
    vpxor       ymm6, ymm6, ymm6
    vpunpcklwd  ymm0, ymm0, ymm1                   ; ymm0=BEL
    vpunpckhwd  ymm6, ymm6, ymm1                   ; ymm6=BEH
    vpsrld      ymm0, ymm0, 1                      ; ymm0=BEL*FIX(0.500)
    vpsrld      ymm6, ymm6, 1                      ; ymm6=BEH*FIX(0.500)

    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]        ; ymm1=[PD_ONEHALFM1_CJ]

    vpaddd      ymm5, ymm5, ymm0
    vpaddd      ymm4, ymm4, ymm6
    vpaddd      ymm5, ymm5, ymm1
    vpaddd      ymm4, ymm4, ymm1
    vpsrld      ymm5, ymm5, SCALEBITS              ; ymm5=CbEL
    vpsrld      ymm4, ymm4, SCALEBITS              ; ymm4=CbEH
    vpackssdw   ymm5, ymm5, ymm4                   ; ymm5=CbE

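    ; The even-column and odd-column results are re-interleaved without any
    ; shuffle:  shifting the odd words left by BYTE_BIT and OR-ing them with
    ; the even words leaves each 16-bit lane holding (odd << 8) | even, i.e.
    ; the output bytes in their original column order.  This is done here
    ; for Cb and repeated below for Y and Cr.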
    vpsllw      ymm7, ymm7, BYTE_BIT
    vpor        ymm5, ymm5, ymm7                   ; ymm5=Cb
    vmovdqu     YMMWORD [rbx], ymm5                ; Save Cb

    vmovdqa     ymm0, YMMWORD [wk(3)]              ; ymm0=BO
    vmovdqa     ymm6, YMMWORD [wk(2)]              ; ymm6=BE
    vmovdqa     ymm1, YMMWORD [wk(1)]              ; ymm1=RO

    vmovdqa     ymm4, ymm0
    vpunpcklwd  ymm0, ymm0, ymm3
    vpunpckhwd  ymm4, ymm4, ymm3
    vmovdqa     ymm7, ymm0
    vmovdqa     ymm5, ymm4
    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]   ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]   ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    vpmaddwd    ymm7, ymm7, [rel PW_MF008_MF041]   ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]   ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

    vmovdqa     ymm3, [rel PD_ONEHALF]             ; ymm3=[PD_ONEHALF]

    vpaddd      ymm0, ymm0, YMMWORD [wk(4)]
    vpaddd      ymm4, ymm4, YMMWORD [wk(5)]
    vpaddd      ymm0, ymm0, ymm3
    vpaddd      ymm4, ymm4, ymm3
    vpsrld      ymm0, ymm0, SCALEBITS              ; ymm0=YOL
    vpsrld      ymm4, ymm4, SCALEBITS              ; ymm4=YOH
    vpackssdw   ymm0, ymm0, ymm4                   ; ymm0=YO

    vpxor       ymm3, ymm3, ymm3
    vpxor       ymm4, ymm4, ymm4
    vpunpcklwd  ymm3, ymm3, ymm1                   ; ymm3=ROL
    vpunpckhwd  ymm4, ymm4, ymm1                   ; ymm4=ROH
    vpsrld      ymm3, ymm3, 1                      ; ymm3=ROL*FIX(0.500)
    vpsrld      ymm4, ymm4, 1                      ; ymm4=ROH*FIX(0.500)

    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]        ; ymm1=[PD_ONEHALFM1_CJ]

    vpaddd      ymm7, ymm7, ymm3
    vpaddd      ymm5, ymm5, ymm4
    vpaddd      ymm7, ymm7, ymm1
    vpaddd      ymm5, ymm5, ymm1
    vpsrld      ymm7, ymm7, SCALEBITS              ; ymm7=CrOL
    vpsrld      ymm5, ymm5, SCALEBITS              ; ymm5=CrOH
    vpackssdw   ymm7, ymm7, ymm5                   ; ymm7=CrO

    vmovdqa     ymm3, YMMWORD [wk(0)]              ; ymm3=RE

    vmovdqa     ymm4, ymm6
    vpunpcklwd  ymm6, ymm6, ymm2
    vpunpckhwd  ymm4, ymm4, ymm2
    vmovdqa     ymm1, ymm6
    vmovdqa     ymm5, ymm4
    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]   ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]   ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    vpmaddwd    ymm1, ymm1, [rel PW_MF008_MF041]   ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]   ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

    vmovdqa     ymm2, [rel PD_ONEHALF]             ; ymm2=[PD_ONEHALF]

    vpaddd      ymm6, ymm6, YMMWORD [wk(6)]
    vpaddd      ymm4, ymm4, YMMWORD [wk(7)]
    vpaddd      ymm6, ymm6, ymm2
    vpaddd      ymm4, ymm4, ymm2
    vpsrld      ymm6, ymm6, SCALEBITS              ; ymm6=YEL
    vpsrld      ymm4, ymm4, SCALEBITS              ; ymm4=YEH
    vpackssdw   ymm6, ymm6, ymm4                   ; ymm6=YE

    vpsllw      ymm0, ymm0, BYTE_BIT
    vpor        ymm6, ymm6, ymm0                   ; ymm6=Y
    vmovdqu     YMMWORD [rdi], ymm6                ; Save Y

    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm4, ymm4, ymm4
    vpunpcklwd  ymm2, ymm2, ymm3                   ; ymm2=REL
    vpunpckhwd  ymm4, ymm4, ymm3                   ; ymm4=REH
    vpsrld      ymm2, ymm2, 1                      ; ymm2=REL*FIX(0.500)
    vpsrld      ymm4, ymm4, 1                      ; ymm4=REH*FIX(0.500)

    vmovdqa     ymm0, [rel PD_ONEHALFM1_CJ]        ; ymm0=[PD_ONEHALFM1_CJ]

    vpaddd      ymm1, ymm1, ymm2
    vpaddd      ymm5, ymm5, ymm4
    vpaddd      ymm1, ymm1, ymm0
    vpaddd      ymm5, ymm5, ymm0
    vpsrld      ymm1, ymm1, SCALEBITS              ; ymm1=CrEL
    vpsrld      ymm5, ymm5, SCALEBITS              ; ymm5=CrEH
    vpackssdw   ymm1, ymm1, ymm5                   ; ymm1=CrE

    vpsllw      ymm7, ymm7, BYTE_BIT
    vpor        ymm1, ymm1, ymm7                   ; ymm1=Cr
    vmovdqu     YMMWORD [rdx], ymm1                ; Save Cr

    sub         rcx, byte SIZEOF_YMMWORD
    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
    add         rbx, byte SIZEOF_YMMWORD           ; outptr1
    add         rdx, byte SIZEOF_YMMWORD           ; outptr2
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .column_ld1

    pop         rcx                     ; col
    pop         rsi
    pop         rdi
    pop         rbx
    pop         rdx

    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    add         rbx, byte SIZEOF_JSAMPROW
    add         rdx, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper
    uncollect_args 5
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32