;
; jccolext.asm - colorspace conversion (64-bit AVX2)
;
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2015, Intel Corporation.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
;                            int num_rows);
;
; For each of num_rows rows, reads img_width interleaved pixels
; (RGB_PIXELSIZE bytes each, 3 or 4) from input_buf[row] and writes one
; sample per pixel to each of the three planes output_buf[0..2], starting at
; row index output_row.
;
; Returns immediately when img_width == 0 or num_rows <= 0.
;
; Pixels are processed SIZEOF_YMMWORD at a time.  A final partial group is
; gathered by the .column_ld* ladder and then converted and stored as a FULL
; vector, so each output row is written in whole-YMMWORD units.
; NOTE(review): this presumably relies on the caller padding the output rows
; to a multiple of SIZEOF_YMMWORD samples -- confirm against the allocator.
;
; Register roles inside the loops:
;   rax = rows remaining            rcx = columns remaining in this row
;   rsi = input row list / inptr    rdi = plane 0 row list / outptr0
;   rbx = plane 1 row list / outptr1
;   rdx = plane 2 row list / outptr2
;
; Stack: rbp is realigned to SIZEOF_YMMWORD and wk(0)..wk(WK_NUM-1) live
; directly below it, which is what allows vmovdqa on the wk() slots.
;
; collect_args / uncollect_args, EXTN, GLOBAL_FUNCTION, the ymmA..ymmH names
; and the PW_* / PD_* constants are defined outside this file (presumably by
; jsimdext.inc / jcolsamp.inc and the including file -- TODO confirm).
;

; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
%define WK_NUM  8

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)

EXTN(jsimd_rgb_ycc_convert_avx2):
    push        rbp
    mov         rax, rsp                     ; rax = rsp after the push (-> saved rbp)
    sub         rsp, byte 4                  ; guarantee the aligned slot is below rax
    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
    mov         [rsp], rax                   ; keep the unaligned rsp for the epilogue
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]                 ; reserve wk[0..WK_NUM-1] below rbp
    collect_args 5
    push        rbx

    mov         ecx, r10d                    ; rcx = img_width (zero-extended)
    test        rcx, rcx
    jz          near .return

    push        rcx                          ; rcx is reused for output_row below

    mov         rsi, r12
    mov         ecx, r13d
    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    mov         rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
    mov         rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]   ; &output_buf[0][output_row]
    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]   ; &output_buf[1][output_row]
    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]   ; &output_buf[2][output_row]

    pop         rcx                          ; rcx = img_width again

    mov         rsi, r11
    mov         eax, r14d
    ; num_rows is a C int.  The mov above zero-extends it into rax, so the
    ; sign must be tested on eax: a 64-bit test would never set SF and a
    ; negative count would fall through into the row loop.
    test        eax, eax
    jle         near .return
.rowloop:
    push        rdx
    push        rbx
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsip, JSAMPROW [rsi]    ; inptr
    mov         rdip, JSAMPROW [rdi]    ; outptr0
    mov         rbxp, JSAMPROW [rbx]    ; outptr1
    mov         rdxp, JSAMPROW [rdx]    ; outptr2

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop

%if RGB_PIXELSIZE == 3  ; ---------------

    ; Partial group (fewer than SIZEOF_YMMWORD pixels left).  rcx becomes the
    ; remaining BYTE count; each set bit of it selects one load of that size,
    ; taken from the end of the row backwards, and the pieces are shifted
    ; together so the data ends up in the same layout as a full load.
    ; NOTE(review): if the byte/word steps are skipped, rax still holds the row
    ; counter when it is moved into xmmA; those bytes appear to land only in
    ; lanes above the valid pixels.
.column_ld1:
    push        rax
    push        rdx
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, byte [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, word [rsi+rcx]
    shl         rax, WORD_BIT
    or          rax, rdx
.column_ld4:
    vmovd       xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    vmovd       xmmF, XMM_DWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_DWORD
    vpor        xmmA, xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    sub         rcx, byte SIZEOF_XMMWORD
    vmovdqu     xmmB, XMMWORD [rsi+rcx]
    vperm2i128  ymmA, ymmA, ymmA, 1     ; move what was gathered so far to the high lane
    vpor        ymmA, ymmA, ymmB
.column_ld32:
    test        cl, SIZEOF_YMMWORD
    jz          short .column_ld64
    sub         rcx, byte SIZEOF_YMMWORD
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
.column_ld64:
    test        cl, 2*SIZEOF_YMMWORD
    mov         rcx, SIZEOF_YMMWORD     ; convert one full vector; mov leaves the flags intact
    jz          short .rgb_ycc_cnv
    vmovdqa     ymmB, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_ycc_cnv

.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]

.rgb_ycc_cnv:
    ; In the lane diagrams below, the first digit is the component index
    ; within a pixel and the second character is the pixel index (0-9, A-V).

    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)

    vmovdqu     ymmC, ymmA
    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)

    vmovdqa     ymmG, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)

    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)

    vmovdqa     ymmD, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)

    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)

    vmovdqa     ymmE, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)

    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)

    vpxor       ymmH, ymmH, ymmH  ; zero, used to widen bytes to words below

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmB, ymmE
    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)

    vmovdqa     ymmF, ymmD
    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; Partial group (fewer than SIZEOF_YMMWORD pixels left).  Here rcx stays a
    ; PIXEL count; each set bit selects one load of that many pixels, taken
    ; from the end of the row backwards.
.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmF
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    vmovdqa     xmmF, xmmA
    vperm2i128  ymmF, ymmF, ymmF, 1     ; move what was gathered so far to the high lane
    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpor        ymmA, ymmA, ymmF
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_XMMWORD/2
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    mov         rcx, SIZEOF_YMMWORD     ; convert one full vector; mov leaves the flags intact
    jz          short .rgb_ycc_cnv
    vmovdqa     ymmE, ymmA
    vmovdqa     ymmH, ymmF
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_ycc_cnv

.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]

.rgb_ycc_cnv:
    ; In the lane diagrams below, the first digit is the component index
    ; within a pixel and the second character is the pixel index (0-9, A-V).

    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmB, ymmA
    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)

    vmovdqa     ymmB, ymmF
    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmD, ymmA
    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)

    vmovdqa     ymmC, ymmF
    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)

    vmovdqa     ymmB, ymmA
    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)

    vmovdqa     ymmG, ymmD
    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)

    vmovdqa     ymmE, ymmA
    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)

    vmovdqa     ymmH, ymmB
    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)

    vpxor       ymmF, ymmF, ymmF      ; zero, used to widen bytes to words below

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmD, ymmB
    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)

    vmovdqa     ymmG, ymmE
    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)

    ; ymmF is the zero register here, so the last pair is widened by placing
    ; each byte in the HIGH half of a word and shifting it back down.
    vpunpcklbw  ymmF, ymmF, ymmH
    vpunpckhbw  ymmH, ymmH, ymmH
    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; Even (E) and odd (O) pixels are converted separately as 16-bit words
    ; widened to 32-bit products (L/H = low/high half), then recombined.
    ; The 0.5 terms are formed by placing the sample in the high word of a
    ; dword and shifting right by one (presumably equal to FIX(0.5) for the
    ; SCALEBITS value defined elsewhere -- TODO confirm).

    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=RE
    vmovdqa     YMMWORD [wk(1)], ymm1   ; wk(1)=RO
    vmovdqa     YMMWORD [wk(2)], ymm4   ; wk(2)=BE
    vmovdqa     YMMWORD [wk(3)], ymm5   ; wk(3)=BO

    vmovdqa     ymm6, ymm1
    vpunpcklwd  ymm1, ymm1, ymm3
    vpunpckhwd  ymm6, ymm6, ymm3
    vmovdqa     ymm7, ymm1
    vmovdqa     ymm4, ymm6
    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    vpmaddwd    ymm7, ymm7, [rel PW_MF016_MF033]  ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

    vmovdqa     YMMWORD [wk(4)], ymm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    vmovdqa     YMMWORD [wk(5)], ymm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

    vpxor       ymm1, ymm1, ymm1
    vpxor       ymm6, ymm6, ymm6
    vpunpcklwd  ymm1, ymm1, ymm5        ; ymm1=BOL
    vpunpckhwd  ymm6, ymm6, ymm5        ; ymm6=BOH
    vpsrld      ymm1, ymm1, 1           ; ymm1=BOL*FIX(0.500)
    vpsrld      ymm6, ymm6, 1           ; ymm6=BOH*FIX(0.500)

    vmovdqa     ymm5, [rel PD_ONEHALFM1_CJ]  ; ymm5=[PD_ONEHALFM1_CJ]

    vpaddd      ymm7, ymm7, ymm1
    vpaddd      ymm4, ymm4, ymm6
    vpaddd      ymm7, ymm7, ymm5
    vpaddd      ymm4, ymm4, ymm5
    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CbOL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbOH
    vpackssdw   ymm7, ymm7, ymm4        ; ymm7=CbO

    vmovdqa     ymm1, YMMWORD [wk(2)]   ; ymm1=BE

    vmovdqa     ymm6, ymm0
    vpunpcklwd  ymm0, ymm0, ymm2
    vpunpckhwd  ymm6, ymm6, ymm2
    vmovdqa     ymm5, ymm0
    vmovdqa     ymm4, ymm6
    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
    vpmaddwd    ymm5, ymm5, [rel PW_MF016_MF033]  ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

    vmovdqa     YMMWORD [wk(6)], ymm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    vmovdqa     YMMWORD [wk(7)], ymm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

    vpxor       ymm0, ymm0, ymm0
    vpxor       ymm6, ymm6, ymm6
    vpunpcklwd  ymm0, ymm0, ymm1        ; ymm0=BEL
    vpunpckhwd  ymm6, ymm6, ymm1        ; ymm6=BEH
    vpsrld      ymm0, ymm0, 1           ; ymm0=BEL*FIX(0.500)
    vpsrld      ymm6, ymm6, 1           ; ymm6=BEH*FIX(0.500)

    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]

    vpaddd      ymm5, ymm5, ymm0
    vpaddd      ymm4, ymm4, ymm6
    vpaddd      ymm5, ymm5, ymm1
    vpaddd      ymm4, ymm4, ymm1
    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CbEL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbEH
    vpackssdw   ymm5, ymm5, ymm4        ; ymm5=CbE

    vpsllw      ymm7, ymm7, BYTE_BIT    ; odd samples go to the high byte of each word
    vpor        ymm5, ymm5, ymm7        ; ymm5=Cb
    vmovdqu     YMMWORD [rbx], ymm5     ; Save Cb

    vmovdqa     ymm0, YMMWORD [wk(3)]   ; ymm0=BO
    vmovdqa     ymm6, YMMWORD [wk(2)]   ; ymm6=BE
    vmovdqa     ymm1, YMMWORD [wk(1)]   ; ymm1=RO

    vmovdqa     ymm4, ymm0
    vpunpcklwd  ymm0, ymm0, ymm3
    vpunpckhwd  ymm4, ymm4, ymm3
    vmovdqa     ymm7, ymm0
    vmovdqa     ymm5, ymm4
    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    vpmaddwd    ymm7, ymm7, [rel PW_MF008_MF041]  ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

    vmovdqa     ymm3, [rel PD_ONEHALF]            ; ymm3=[PD_ONEHALF]

    vpaddd      ymm0, ymm0, YMMWORD [wk(4)]
    vpaddd      ymm4, ymm4, YMMWORD [wk(5)]
    vpaddd      ymm0, ymm0, ymm3
    vpaddd      ymm4, ymm4, ymm3
    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO

    vpxor       ymm3, ymm3, ymm3
    vpxor       ymm4, ymm4, ymm4
    vpunpcklwd  ymm3, ymm3, ymm1        ; ymm3=ROL
    vpunpckhwd  ymm4, ymm4, ymm1        ; ymm4=ROH
    vpsrld      ymm3, ymm3, 1           ; ymm3=ROL*FIX(0.500)
    vpsrld      ymm4, ymm4, 1           ; ymm4=ROH*FIX(0.500)

    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]

    vpaddd      ymm7, ymm7, ymm3
    vpaddd      ymm5, ymm5, ymm4
    vpaddd      ymm7, ymm7, ymm1
    vpaddd      ymm5, ymm5, ymm1
    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CrOL
    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrOH
    vpackssdw   ymm7, ymm7, ymm5        ; ymm7=CrO

    vmovdqa     ymm3, YMMWORD [wk(0)]   ; ymm3=RE

    vmovdqa     ymm4, ymm6
    vpunpcklwd  ymm6, ymm6, ymm2
    vpunpckhwd  ymm4, ymm4, ymm2
    vmovdqa     ymm1, ymm6
    vmovdqa     ymm5, ymm4
    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    vpmaddwd    ymm1, ymm1, [rel PW_MF008_MF041]  ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

    vmovdqa     ymm2, [rel PD_ONEHALF]            ; ymm2=[PD_ONEHALF]

    vpaddd      ymm6, ymm6, YMMWORD [wk(6)]
    vpaddd      ymm4, ymm4, YMMWORD [wk(7)]
    vpaddd      ymm6, ymm6, ymm2
    vpaddd      ymm4, ymm4, ymm2
    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE

    vpsllw      ymm0, ymm0, BYTE_BIT    ; odd samples go to the high byte of each word
    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y

    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm4, ymm4, ymm4
    vpunpcklwd  ymm2, ymm2, ymm3        ; ymm2=REL
    vpunpckhwd  ymm4, ymm4, ymm3        ; ymm4=REH
    vpsrld      ymm2, ymm2, 1           ; ymm2=REL*FIX(0.500)
    vpsrld      ymm4, ymm4, 1           ; ymm4=REH*FIX(0.500)

    vmovdqa     ymm0, [rel PD_ONEHALFM1_CJ]  ; ymm0=[PD_ONEHALFM1_CJ]

    vpaddd      ymm1, ymm1, ymm2
    vpaddd      ymm5, ymm5, ymm4
    vpaddd      ymm1, ymm1, ymm0
    vpaddd      ymm5, ymm5, ymm0
    vpsrld      ymm1, ymm1, SCALEBITS   ; ymm1=CrEL
    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrEH
    vpackssdw   ymm1, ymm1, ymm5        ; ymm1=CrE

    vpsllw      ymm7, ymm7, BYTE_BIT    ; odd samples go to the high byte of each word
    vpor        ymm1, ymm1, ymm7        ; ymm1=Cr
    vmovdqu     YMMWORD [rdx], ymm1     ; Save Cr

    sub         rcx, byte SIZEOF_YMMWORD
    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
    add         rbx, byte SIZEOF_YMMWORD           ; outptr1
    add         rdx, byte SIZEOF_YMMWORD           ; outptr2
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .column_ld1        ; 1..SIZEOF_YMMWORD-1 pixels left: partial load

    pop         rcx                     ; col
    pop         rsi
    pop         rdi
    pop         rbx
    pop         rdx

    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    add         rbx, byte SIZEOF_JSAMPROW
    add         rdx, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper
    uncollect_args 5
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- rsp saved by the prologue (-> saved rbp)
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
