;
; jccolext.asm - colorspace conversion (64-bit AVX2)
;
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
;                            int num_rows);
;
; Returns immediately when img_width == 0 or num_rows <= 0.
;
; Register roles inside the row/column loops:
;   rax = rows remaining            rcx = columns remaining in this row
;   rsi = inptr   (interleaved input samples)
;   rdi = outptr0 (Y is stored here)
;   rbx = outptr1 (Cb is stored here)
;   rdx = outptr2 (Cr is stored here)
;
; Every iteration of the column loop stores one full YMMWORD to each output
; row, including the final partial iteration.
; NOTE(review): this presumably relies on the output rows being padded to a
; multiple of SIZEOF_YMMWORD samples - confirm against the caller.
;
; collect_args / uncollect_args, EXTN, GLOBAL_FUNCTION, the rdip/rsip/rbxp/
; rdxp register aliases, the ymmA..ymmH names and the PW_*/PD_* constants
; are not defined in this file (presumably jcolsamp.inc / jsimdext.inc).
;

; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
%define WK_NUM  8

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)

EXTN(jsimd_rgb_ycc_convert_avx2):
    ; Build a frame whose rbp is aligned to SIZEOF_YMMWORD.  wk[] sits directly
    ; below that aligned rbp, which is what allows vmovdqa on wk(n) later on.
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
    mov         [rsp], rax                   ; remembered for "pop rsp" at exit
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]                 ; reserve wk[WK_NUM]
    collect_args 5
    push        rbx

    mov         ecx, r10d                    ; rcx = img_width (zero-extended)
    test        rcx, rcx
    jz          near .return

    push        rcx                          ; rcx is reused as the row index

    ; rdi/rbx/rdx = &output_buf[0..2][output_row]
    mov         rsi, r12
    mov         ecx, r13d
    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    mov         rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
    mov         rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]

    pop         rcx                          ; rcx = img_width again

    mov         rsi, r11
    mov         eax, r14d
    ; num_rows is a signed int.  The 32-bit mov above zero-extends into rax,
    ; so the sign has to be tested on eax; testing rax could only ever detect
    ; zero and a negative count would be run as a huge positive one.
    test        eax, eax
    jle         near .return
.rowloop:
    ; Save the row-pointer cursors and the column count; the same registers
    ; are reused as sample pointers / column counter within the row.
    push        rdx
    push        rbx
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr0
    mov         rbxp, JSAMPROW [rbx]    ; outptr1
    mov         rdxp, JSAMPROW [rdx]    ; outptr2

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop

%if RGB_PIXELSIZE == 3  ; ---------------

    ; Tail loader (fewer than SIZEOF_YMMWORD pixels left).  rcx becomes the
    ; remaining byte count; each set bit of that count triggers one load of
    ; that size, working from the end of the data toward its start, and the
    ; pieces are shifted/merged so that the result has the same layout as the
    ; three full loads done at .columnloop.
.column_ld1:
    push        rax                     ; rax (row counter) and rdx (outptr2)
    push        rdx                     ; are borrowed as scratch
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, byte [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, word [rsi+rcx]
    shl         rax, WORD_BIT
    or          rax, rdx
.column_ld4:
    vmovd       xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    vmovd       xmmF, XMM_DWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_DWORD
    vpor        xmmA, xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    sub         rcx, byte SIZEOF_XMMWORD
    vmovdqu     xmmB, XMMWORD [rsi+rcx]     ; 16-byte load
    vperm2i128  ymmA, ymmA, ymmA, 1         ; move what was gathered so far
                                            ; into the upper 128-bit lane
    vpor        ymmA, ymmA, ymmB
.column_ld32:
    test        cl, SIZEOF_YMMWORD
    jz          short .column_ld64
    sub         rcx, byte SIZEOF_YMMWORD
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
.column_ld64:
    test        cl, 2*SIZEOF_YMMWORD
    ; Pretend exactly one full group is left so that the common loop tail
    ; (sub rcx, SIZEOF_YMMWORD) reaches zero after this iteration.  mov does
    ; not modify flags, so the jz below still acts on the test above.
    mov         rcx, SIZEOF_YMMWORD
    jz          short .rgb_ycc_cnv
    vmovdqa     ymmB, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_ycc_cnv

.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]

.rgb_ycc_cnv:
    ; In the layout comments below each entry is <component><pixel>: the first
    ; digit is the byte's position within the pixel (0..2), the second
    ; character is the pixel index (0..9, A..V).
    ;
    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)

    vmovdqa     ymmC, ymmA
    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)

    vmovdqa     ymmG, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)

    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)

    vmovdqa     ymmD, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)

    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)

    vmovdqa     ymmE, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)

    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)

    ; Widen bytes to words by interleaving with zero.
    vpxor       ymmH, ymmH, ymmH

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmB, ymmE
    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)

    vmovdqa     ymmF, ymmD
    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; Tail loader (fewer than SIZEOF_YMMWORD pixels left).  rcx stays a pixel
    ; count here; each set bit selects one load of 1, 2, 4, 8 or 16 pixels,
    ; working from the end of the data toward its start, so that the result
    ; has the same layout as the four full loads done at .columnloop.
.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmF
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    vmovdqa     xmmF, xmmA
    vperm2i128  ymmF, ymmF, ymmF, 1     ; gathered data -> upper 128-bit lane
    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpor        ymmA, ymmA, ymmF
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_XMMWORD/2
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    ; Pretend exactly one full group is left so that the common loop tail
    ; (sub rcx, SIZEOF_YMMWORD) reaches zero after this iteration.  mov does
    ; not modify flags, so the jz below still acts on the test above.
    mov         rcx, SIZEOF_YMMWORD
    jz          short .rgb_ycc_cnv
    vmovdqa     ymmE, ymmA
    vmovdqa     ymmH, ymmF
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_ycc_cnv

.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]

.rgb_ycc_cnv:
    ; In the layout comments below each entry is <component><pixel>: the first
    ; digit is the byte's position within the pixel (0..3), the second
    ; character is the pixel index (0..9, A..V).
    ;
    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmB, ymmA
    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)

    vmovdqa     ymmB, ymmF
    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmD, ymmA
    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)

    vmovdqa     ymmC, ymmF
    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)

    vmovdqa     ymmB, ymmA
    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)

    vmovdqa     ymmG, ymmD
    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)

    vmovdqa     ymmE, ymmA
    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)

    vmovdqa     ymmH, ymmB
    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)

    ; Widen bytes to words by interleaving with zero.
    vpxor       ymmF, ymmF, ymmF

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmD, ymmB
    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)

    vmovdqa     ymmG, ymmE
    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)

    ; No spare register holds a copy of ymmH, so the last pair is widened by
    ; placing each byte in the HIGH half of a word and shifting it back down.
    vpunpcklbw  ymmF, ymmF, ymmH
    vpunpckhbw  ymmH, ymmH, ymmH
    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; From here on the numbered register names are used.
    ; NOTE(review): which of ymmA..ymmH corresponds to which ymm0..ymm7 (and
    ; therefore which byte position is R, G or B) is decided outside this
    ; file, presumably in jcolsamp.inc - confirm there.
    ;
    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; Suffix convention in the comments below: E/O = even/odd pixels,
    ; L/H = result of the low/high word-unpack of that vector.

    ; Spill the inputs that are overwritten before their last use.
    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=RE
    vmovdqa     YMMWORD [wk(1)], ymm1   ; wk(1)=RO
    vmovdqa     YMMWORD [wk(2)], ymm4   ; wk(2)=BE
    vmovdqa     YMMWORD [wk(3)], ymm5   ; wk(3)=BO

    ; ---- odd pixels: R/G part of Y (kept in wk(4..5)) and of Cb ----
    vmovdqa     ymm6, ymm1
    vpunpcklwd  ymm1, ymm1, ymm3
    vpunpckhwd  ymm6, ymm6, ymm3
    vmovdqa     ymm7, ymm1
    vmovdqa     ymm4, ymm6
    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    vpmaddwd    ymm7, ymm7, [rel PW_MF016_MF033]  ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

    vmovdqa     YMMWORD [wk(4)], ymm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    vmovdqa     YMMWORD [wk(5)], ymm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

    ; Interleaving B above a zero word puts it in the upper half of each
    ; dword (B << 16); the shift right by one then gives B << 15.
    vpxor       ymm1, ymm1, ymm1
    vpxor       ymm6, ymm6, ymm6
    vpunpcklwd  ymm1, ymm1, ymm5        ; ymm1=BOL
    vpunpckhwd  ymm6, ymm6, ymm5        ; ymm6=BOH
    vpsrld      ymm1, ymm1, 1           ; ymm1=BOL*FIX(0.500)
    vpsrld      ymm6, ymm6, 1           ; ymm6=BOH*FIX(0.500)

    vmovdqa     ymm5, [rel PD_ONEHALFM1_CJ]  ; ymm5=[PD_ONEHALFM1_CJ]

    vpaddd      ymm7, ymm7, ymm1
    vpaddd      ymm4, ymm4, ymm6
    vpaddd      ymm7, ymm7, ymm5
    vpaddd      ymm4, ymm4, ymm5
    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CbOL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbOH
    vpackssdw   ymm7, ymm7, ymm4        ; ymm7=CbO

    vmovdqa     ymm1, YMMWORD [wk(2)]   ; ymm1=BE

    ; ---- even pixels: R/G part of Y (kept in wk(6..7)) and of Cb ----
    vmovdqa     ymm6, ymm0
    vpunpcklwd  ymm0, ymm0, ymm2
    vpunpckhwd  ymm6, ymm6, ymm2
    vmovdqa     ymm5, ymm0
    vmovdqa     ymm4, ymm6
    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
    vpmaddwd    ymm5, ymm5, [rel PW_MF016_MF033]  ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

    vmovdqa     YMMWORD [wk(6)], ymm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    vmovdqa     YMMWORD [wk(7)], ymm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

    vpxor       ymm0, ymm0, ymm0
    vpxor       ymm6, ymm6, ymm6
    vpunpcklwd  ymm0, ymm0, ymm1        ; ymm0=BEL
    vpunpckhwd  ymm6, ymm6, ymm1        ; ymm6=BEH
    vpsrld      ymm0, ymm0, 1           ; ymm0=BEL*FIX(0.500)
    vpsrld      ymm6, ymm6, 1           ; ymm6=BEH*FIX(0.500)

    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]

    vpaddd      ymm5, ymm5, ymm0
    vpaddd      ymm4, ymm4, ymm6
    vpaddd      ymm5, ymm5, ymm1
    vpaddd      ymm4, ymm4, ymm1
    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CbEL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbEH
    vpackssdw   ymm5, ymm5, ymm4        ; ymm5=CbE

    ; Re-interleave: odd-pixel results go to the high byte of each word,
    ; even-pixel results stay in the low byte.
    vpsllw      ymm7, ymm7, BYTE_BIT
    vpor        ymm5, ymm5, ymm7        ; ymm5=Cb
    vmovdqu     YMMWORD [rbx], ymm5     ; Save Cb

    vmovdqa     ymm0, YMMWORD [wk(3)]   ; ymm0=BO
    vmovdqa     ymm6, YMMWORD [wk(2)]   ; ymm6=BE
    vmovdqa     ymm1, YMMWORD [wk(1)]   ; ymm1=RO

    ; ---- odd pixels: B/G part of Y and of Cr ----
    vmovdqa     ymm4, ymm0
    vpunpcklwd  ymm0, ymm0, ymm3
    vpunpckhwd  ymm4, ymm4, ymm3
    vmovdqa     ymm7, ymm0
    vmovdqa     ymm5, ymm4
    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    vpmaddwd    ymm7, ymm7, [rel PW_MF008_MF041]  ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

    vmovdqa     ymm3, [rel PD_ONEHALF]  ; ymm3=[PD_ONEHALF]

    vpaddd      ymm0, ymm0, YMMWORD [wk(4)]
    vpaddd      ymm4, ymm4, YMMWORD [wk(5)]
    vpaddd      ymm0, ymm0, ymm3
    vpaddd      ymm4, ymm4, ymm3
    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO

    vpxor       ymm3, ymm3, ymm3
    vpxor       ymm4, ymm4, ymm4
    vpunpcklwd  ymm3, ymm3, ymm1        ; ymm3=ROL
    vpunpckhwd  ymm4, ymm4, ymm1        ; ymm4=ROH
    vpsrld      ymm3, ymm3, 1           ; ymm3=ROL*FIX(0.500)
    vpsrld      ymm4, ymm4, 1           ; ymm4=ROH*FIX(0.500)

    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]

    vpaddd      ymm7, ymm7, ymm3
    vpaddd      ymm5, ymm5, ymm4
    vpaddd      ymm7, ymm7, ymm1
    vpaddd      ymm5, ymm5, ymm1
    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CrOL
    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrOH
    vpackssdw   ymm7, ymm7, ymm5        ; ymm7=CrO

    vmovdqa     ymm3, YMMWORD [wk(0)]   ; ymm3=RE

    ; ---- even pixels: B/G part of Y and of Cr ----
    vmovdqa     ymm4, ymm6
    vpunpcklwd  ymm6, ymm6, ymm2
    vpunpckhwd  ymm4, ymm4, ymm2
    vmovdqa     ymm1, ymm6
    vmovdqa     ymm5, ymm4
    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    vpmaddwd    ymm1, ymm1, [rel PW_MF008_MF041]  ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

    vmovdqa     ymm2, [rel PD_ONEHALF]  ; ymm2=[PD_ONEHALF]

    vpaddd      ymm6, ymm6, YMMWORD [wk(6)]
    vpaddd      ymm4, ymm4, YMMWORD [wk(7)]
    vpaddd      ymm6, ymm6, ymm2
    vpaddd      ymm4, ymm4, ymm2
    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE

    vpsllw      ymm0, ymm0, BYTE_BIT
    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y

    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm4, ymm4, ymm4
    vpunpcklwd  ymm2, ymm2, ymm3        ; ymm2=REL
    vpunpckhwd  ymm4, ymm4, ymm3        ; ymm4=REH
    vpsrld      ymm2, ymm2, 1           ; ymm2=REL*FIX(0.500)
    vpsrld      ymm4, ymm4, 1           ; ymm4=REH*FIX(0.500)

    vmovdqa     ymm0, [rel PD_ONEHALFM1_CJ]  ; ymm0=[PD_ONEHALFM1_CJ]

    vpaddd      ymm1, ymm1, ymm2
    vpaddd      ymm5, ymm5, ymm4
    vpaddd      ymm1, ymm1, ymm0
    vpaddd      ymm5, ymm5, ymm0
    vpsrld      ymm1, ymm1, SCALEBITS   ; ymm1=CrEL
    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrEH
    vpackssdw   ymm1, ymm1, ymm5        ; ymm1=CrE

    vpsllw      ymm7, ymm7, BYTE_BIT
    vpor        ymm1, ymm1, ymm7        ; ymm1=Cr
    vmovdqu     YMMWORD [rdx], ymm1     ; Save Cr

    ; Advance by one group of SIZEOF_YMMWORD pixels.  Full groups loop back to
    ; .columnloop; a non-zero remainder goes through the tail loader once.
    sub         rcx, byte SIZEOF_YMMWORD
    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
    add         rbx, byte SIZEOF_YMMWORD           ; outptr1
    add         rdx, byte SIZEOF_YMMWORD           ; outptr2
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .column_ld1

    pop         rcx                     ; col
    pop         rsi
    pop         rdi
    pop         rbx
    pop         rdx

    ; Step every row-pointer cursor to the next row.
    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    add         rbx, byte SIZEOF_JSAMPROW
    add         rdx, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper
    uncollect_args 5
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32